├── .gitignore
├── 01
    └── chapter01.txt
├── 02
    ├── abaloneCorrHeat.py
    ├── abaloneCorrMat.txt
    ├── abaloneParallelPlot.py
    ├── abaloneSummary.py
    ├── abaloneSummaryOutput.txt
    ├── chapter02.zip
    ├── corrCalc.py
    ├── corrPlot.py
    ├── glassCorrHeatMap.py
    ├── glassParallelPlot.py
    ├── glassSummary.py
    ├── glassSummary.txt
    ├── linePlots.py
    ├── outputRocksVMinesContents.txt
    ├── outputSummaryStats.txt
    ├── pandasReadSummarize.py
    ├── pandasReadSummarizeOutput.txt
    ├── qqplotAttribute.py
    ├── rVMSummaryStats.py
    ├── rockVmineContents.py
    ├── rockVmineSummaries.py
    ├── sampleCorrHeatMap.py
    ├── targetCorr.py
    ├── wineCorrHeatMap.py
    ├── wineParallelPlot.py
    ├── wineSummary.py
    └── wineSummary.txt
├── 03
    ├── chapter03.zip
    ├── classifierPerformance_RocksVMines.py
    ├── classifierPerformance_RocksVMinesOutput.txt
    ├── classifierRidgeRocksVMines.py
    ├── classifierRidgeRocksVMinesOutput.txt
    ├── fwdStepwiseWine.py
    ├── fwdStepwiseWineOutput.txt
    ├── regressionErrorMeasures.py
    ├── ridgeWine.py
    └── ridgeWineOutput.txt
├── 04
    ├── chapter04.zip
    ├── cvCurveDetails.txt
    ├── glmnetOrderedNamesList.txt
    ├── glmnetWine.py
    ├── larsAbalone.py
    ├── larsAbaloneOutput.txt
    ├── larsRocksVMines.py
    ├── larsWine.py
    ├── larsWine2.py
    ├── larsWineCV.py
    ├── orderedNamesList.txt
    ├── rocksVMinesCoefOrder.txt
    └── wineBasisExpand.py
├── 05
    ├── chapter05.zip
    ├── glass
    │   └── glassENetRegCV.py
    ├── rocksVMines
    │   ├── rocksVMinesCoefCurves.py
    │   ├── rocksVMinesCoefCurvesPrintedOutput.txt
    │   ├── rocksVMinesENetRegCV.py
    │   ├── rocksVMinesENetRegCVPrintedOutput.txt
    │   ├── rocksVMinesGlmnet.py
    │   └── rocksVMinesGlmnetPrintedOutput.txt
    └── wineCS
    │   ├── wineExpandedLassoCV.py
    │   ├── wineLassoCV.py
    │   ├── wineLassoCVPrintedOutputNormalizedX.txt
    │   ├── wineLassoCVPrintedOutputNormalizedXandY.txt
    │   ├── wineLassoCVPrintedOutputUn-NormalizedX.txt
    │   ├── wineLassoCoefCurves.py
    │   ├── wineLassoCoefCurvesPrintedOutput.txt
    │   └── wineLassoExpandedCVPrintedOutput.txt
├── 06
    ├── chapter06.zip
    ├── simpleBagging.py
    ├── simpleGBM.py
    ├── simpleTree.py
    ├── simpleTreeCV.py
    ├── wineBagging.py
    ├── wineGBM.py
    ├── wineRF.py
    └── wineTree.py
├── 07
    ├── abaloneGBM.py
    ├── abaloneRF.py
    ├── glassGbm.py
    ├── glassRF.py
    ├── rocksVMinesGBM.py
    ├── rocksVMinesRF.py
    ├── timingComparisons.txt
    ├── wineBagging.py
    ├── wineGBM.py
    └── wineRF.py
└── README.md
/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # dotenv 85 | .env 86 | 87 | # virtualenv 88 | .venv 89 | venv/ 90 | ENV/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | ### JetBrains template 98 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 99 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 100 | 101 | # User-specific stuff: 102 | .idea/**/workspace.xml 103 | .idea/**/tasks.xml 104 | .idea/dictionaries 105 | 106 | # Sensitive or high-churn files: 107 | .idea/**/dataSources/ 108 | .idea/**/dataSources.ids 109 | .idea/**/dataSources.xml 110 | .idea/**/dataSources.local.xml 111 | .idea/**/sqlDataSources.xml 112 | .idea/**/dynamic.xml 113 | .idea/**/uiDesigner.xml 114 | 115 | # Gradle: 116 | .idea/**/gradle.xml 117 | .idea/**/libraries 118 | 119 | # Mongo Explorer plugin: 120 | .idea/**/mongoSettings.xml 121 | 122 | ## File-based project format: 123 | *.iws 124 | 125 | ## Plugin-specific files: 126 | 127 | # IntelliJ 128 | /out/ 129 | .idea/* 130 | 131 | # mpeltonen/sbt-idea plugin 132 | .idea_modules/ 133 | 134 | # JIRA plugin 135 | atlassian-ide-plugin.xml 136 | 137 | # Crashlytics plugin (for Android Studio and IntelliJ) 138 | com_crashlytics_export_strings.xml 139 | crashlytics.properties 140 | crashlytics-build.properties 141 | fabric.properties 142 | 143 | -------------------------------------------------------------------------------- /01/chapter01.txt: -------------------------------------------------------------------------------- 1 | Chapter 1 of Machine Learning in Python has no code associated with it. 
2 | -------------------------------------------------------------------------------- /02/abaloneCorrHeat.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | import matplotlib.pyplot as plot 5 | 6 | target_url = ("http://archive.ics.uci.edu/ml/machine-" 7 | "learning-databases/abalone/abalone.data") 8 | #read abalone data 9 | abalone = pd.read_csv(target_url,header=None, prefix="V") 10 | abalone.columns = ['Sex', 'Length', 'Diameter', 'Height', 11 | 'Whole weight', 'Shucked weight', 12 | 'Viscera weight', 'Shell weight', 'Rings'] 13 | 14 | #calculate correlation matrix 15 | corMat = DataFrame(abalone.iloc[:,1:9].corr()) 16 | #print correlation matrix 17 | print(corMat) 18 | 19 | #visualize correlations using heatmap 20 | plot.pcolor(corMat) 21 | plot.show() 22 | 23 | -------------------------------------------------------------------------------- /02/abaloneCorrMat.txt: -------------------------------------------------------------------------------- 1 | Length Diameter Height Whole Wt Shucked Wt 2 | Length 1.000000 0.986812 0.827554 0.925261 0.897914 3 | Diameter 0.986812 1.000000 0.833684 0.925452 0.893162 4 | Height 0.827554 0.833684 1.000000 0.819221 0.774972 5 | Whole weight 0.925261 0.925452 0.819221 1.000000 0.969405 6 | Shucked weight 0.897914 0.893162 0.774972 0.969405 1.000000 7 | Viscera weight 0.903018 0.899724 0.798319 0.966375 0.931961 8 | Shell weight 0.897706 0.905330 0.817338 0.955355 0.882617 9 | Rings 0.556720 0.574660 0.557467 0.540390 0.420884 10 | 11 | Viscera weight Shell weight Rings 12 | Length 0.903018 0.897706 0.556720 13 | Diameter 0.899724 0.905330 0.574660 14 | Height 0.798319 0.817338 0.557467 15 | Whole weight 0.966375 0.955355 0.540390 16 | Shucked weight 0.931961 0.882617 0.420884 17 | Viscera weight 1.000000 0.907656 0.503819 18 | Shell weight 0.907656 1.000000 0.627574 19 | Rings 0.503819 0.627574 1.000000 20 | -------------------------------------------------------------------------------- /02/abaloneParallelPlot.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | import matplotlib.pyplot as plot 5 | from math import exp 6 | target_url = ("http://archive.ics.uci.edu/ml/machine-" 7 | "learning-databases/abalone/abalone.data") 8 | #read abalone data 9 | abalone = pd.read_csv(target_url,header=None, prefix="V") 10 | abalone.columns = ['Sex', 'Length', 'Diameter', 'Height', 11 | 'Whole Wt', 'Shucked Wt', 12 | 'Viscera Wt', 'Shell Wt', 'Rings'] 13 | #get summary to use for scaling 14 | summary = abalone.describe() 15 | minRings = summary.iloc[3,7] 16 | maxRings = summary.iloc[7,7] 17 | nrows = len(abalone.index) 18 | 19 | for i in range(nrows): 20 | #plot rows of data as if they were series data 21 | dataRow = abalone.iloc[i,1:8] 22 | labelColor = (abalone.iloc[i,8] - minRings) / (maxRings - minRings) 23 | dataRow.plot(color=plot.cm.RdYlBu(labelColor), alpha=0.5) 24 | 25 | plot.xlabel("Attribute Index") 26 | plot.ylabel(("Attribute Values")) 27 | plot.show() 28 | 29 | #renormalize using mean and standard variation, then compress 30 | # with logit function 31 | 32 | meanRings = summary.iloc[1,7] 33 | sdRings = summary.iloc[2,7] 34 | 35 | for i in range(nrows): 36 | #plot rows of data as if they were series data 37 | dataRow = abalone.iloc[i,1:8] 38 | normTarget = (abalone.iloc[i,8] - meanRings)/sdRings 39 | 
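# --- editor's note (added comment; not in the original script) ---
# The line above computes a z-score for the ring count and the line below
# squashes it with the logistic function 1.0/(1.0 + exp(-z)), so the color
# value always lands in (0, 1) for the RdYlBu colormap: an average abalone
# maps to 0.5, while unusually young or old ones approach 0 or 1 instead of
# saturating the scale the way the raw min-max scaling above can.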
labelColor = 1.0/(1.0 + exp(-normTarget)) 40 | dataRow.plot(color=plot.cm.RdYlBu(labelColor), alpha=0.5) 41 | 42 | plot.xlabel("Attribute Index") 43 | plot.ylabel(("Attribute Values")) 44 | plot.show() -------------------------------------------------------------------------------- /02/abaloneSummary.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | from pylab import * 5 | import matplotlib.pyplot as plot 6 | 7 | target_url = ("http://archive.ics.uci.edu/ml/machine-" 8 | "learning-databases/abalone/abalone.data") 9 | #read abalone data 10 | abalone = pd.read_csv(target_url,header=None, prefix="V") 11 | abalone.columns = ['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 12 | 'Shucked weight', 'Viscera weight', 'Shell weight', 13 | 'Rings'] 14 | 15 | 16 | print(abalone.head()) 17 | print(abalone.tail()) 18 | 19 | #print summary of data frame 20 | summary = abalone.describe() 21 | print(summary) 22 | 23 | #box plot the real-valued attributes 24 | #convert to array for plot routine 25 | array = abalone.iloc[:,1:9].values 26 | boxplot(array) 27 | plot.xlabel("Attribute Index") 28 | plot.ylabel(("Quartile Ranges")) 29 | show() 30 | 31 | #the last column (rings) is out of scale with the rest 32 | # - remove and replot 33 | array2 = abalone.iloc[:,1:8].values 34 | boxplot(array2) 35 | plot.xlabel("Attribute Index") 36 | plot.ylabel(("Quartile Ranges")) 37 | show() 38 | 39 | #removing is okay but renormalizing the variables generalizes better. 40 | #renormalize columns to zero mean and unit standard deviation 41 | #this is a common normalization and desirable for other operations 42 | # (like k-means clustering or k-nearest neighbors 43 | abaloneNormalized = abalone.iloc[:,1:9] 44 | 45 | 46 | for i in range(8): 47 | mean = summary.iloc[1, i] 48 | sd = summary.iloc[2, i] 49 | abaloneNormalized.iloc[:,i:(i + 1)] = ( 50 | abaloneNormalized.iloc[:,i:(i + 1)] - mean) / sd 51 | 52 | array3 = abaloneNormalized.values 53 | boxplot(array3) 54 | plot.xlabel("Attribute Index") 55 | plot.ylabel(("Quartile Ranges - Normalized ")) 56 | show() -------------------------------------------------------------------------------- /02/abaloneSummaryOutput.txt: -------------------------------------------------------------------------------- 1 | Sex Length Diameter Height Whole wt Shucked wt Viscera wt 2 | 0 M 0.455 0.365 0.095 0.5140 0.2245 0.1010 3 | 1 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 4 | 2 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 5 | 3 M 0.440 0.365 0.125 0.5160 0.2155 0.1140 6 | 4 I 0.330 0.255 0.080 0.2050 0.0895 0.0395 7 | 8 | Shell weight Rings 9 | 0 0.150 15 10 | 1 0.070 7 11 | 2 0.210 9 12 | 3 0.155 10 13 | 4 0.055 7 14 | Sex Length Diameter Height Whole weight Shucked weight 15 | 4172 F 0.565 0.450 0.165 0.8870 0.3700 16 | 4173 M 0.590 0.440 0.135 0.9660 0.4390 17 | 4174 M 0.600 0.475 0.205 1.1760 0.5255 18 | 4175 F 0.625 0.485 0.150 1.0945 0.5310 19 | 4176 M 0.710 0.555 0.195 1.9485 0.9455 20 | 21 | Viscera weight Shell weight Rings 22 | 4172 0.2390 0.2490 11 23 | 4173 0.2145 0.2605 10 24 | 4174 0.2875 0.3080 9 25 | 4175 0.2610 0.2960 10 26 | 4176 0.3765 0.4950 12 27 | Length Diameter Height Whole wt Shucked wt 28 | count 4177.000000 4177.000000 4177.000000 4177.000000 4177.000000 29 | mean 0.523992 0.407881 0.139516 0.828742 0.359367 30 | std 0.120093 0.099240 0.041827 0.490389 0.221963 31 | min 0.075000 0.055000 0.000000 0.002000 0.001000 32 | 25% 0.450000 0.350000 
0.115000 0.441500 0.186000 33 | 50% 0.545000 0.425000 0.140000 0.799500 0.336000 34 | 75% 0.615000 0.480000 0.165000 1.153000 0.502000 35 | max 0.815000 0.650000 1.130000 2.825500 1.488000 36 | 37 | Viscera weight Shell weight Rings 38 | count 4177.000000 4177.000000 4177.000000 39 | mean 0.180594 0.238831 9.933684 40 | std 0.109614 0.139203 3.224169 41 | min 0.000500 0.001500 1.000000 42 | 25% 0.093500 0.130000 8.000000 43 | 50% 0.171000 0.234000 9.000000 44 | 75% 0.253000 0.329000 11.000000 45 | max 0.760000 1.005000 29.000000 46 | -------------------------------------------------------------------------------- /02/chapter02.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/derekhe/machine-learning-in-python-essential-techniques-for-predictive-analysis-source/d91f60cc29fdbaad32819058f9b2742e955e586a/02/chapter02.zip -------------------------------------------------------------------------------- /02/corrCalc.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | from math import sqrt 5 | import sys 6 | target_url = ("https://archive.ics.uci.edu/ml/machine-learning-" 7 | "databases/undocumented/connectionist-bench/sonar/sonar.all-data") 8 | 9 | #read rocks versus mines data into pandas data frame 10 | rocksVMines = pd.read_csv(target_url,header=None, prefix="V") 11 | 12 | #calculate correlations between real-valued attributes 13 | dataRow2 = rocksVMines.iloc[1,0:60] 14 | dataRow3 = rocksVMines.iloc[2,0:60] 15 | dataRow21 = rocksVMines.iloc[20,0:60] 16 | 17 | mean2 = 0.0; mean3 = 0.0; mean21 = 0.0 18 | numElt = len(dataRow2) 19 | for i in range(numElt): 20 | mean2 += dataRow2[i]/numElt 21 | mean3 += dataRow3[i]/numElt 22 | mean21 += dataRow21[i]/numElt 23 | 24 | var2 = 0.0; var3 = 0.0; var21 = 0.0 25 | for i in range(numElt): 26 | var2 += (dataRow2[i] - mean2) * (dataRow2[i] - mean2)/numElt 27 | var3 += (dataRow3[i] - mean3) * (dataRow3[i] - mean3)/numElt 28 | var21 += (dataRow21[i] - mean21) * (dataRow21[i] - mean21)/numElt 29 | 30 | corr23 = 0.0; corr221 = 0.0 31 | for i in range(numElt): 32 | corr23 += (dataRow2[i] - mean2) * \ 33 | (dataRow3[i] - mean3) / (sqrt(var2*var3) * numElt) 34 | corr221 += (dataRow2[i] - mean2) * \ 35 | (dataRow21[i] - mean21) / (sqrt(var2*var21) * numElt) 36 | 37 | sys.stdout.write("Correlation between attribute 2 and 3 \n") 38 | print(corr23) 39 | sys.stdout.write(" \n") 40 | 41 | sys.stdout.write("Correlation between attribute 2 and 21 \n") 42 | print(corr221) 43 | sys.stdout.write(" \n") 44 | 45 | 46 | # Output: 47 | # Correlation between attribute 2 and 3 48 | # 0.770938121191 49 | # 50 | # Correlation between attribute 2 and 21 51 | # 0.466548080789 -------------------------------------------------------------------------------- /02/corrPlot.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | import matplotlib.pyplot as plot 5 | target_url = ("https://archive.ics.uci.edu/ml/machine-learning-" 6 | "databases/undocumented/connectionist-bench/sonar/sonar.all-data") 7 | 8 | #read rocks versus mines data into pandas data frame 9 | rocksVMines = pd.read_csv(target_url,header=None, prefix="V") 10 | 11 | #calculate correlations between real-valued attributes 12 | dataRow2 = rocksVMines.iloc[1,0:60] 13 | dataRow3 = rocksVMines.iloc[2,0:60] 14 | 15 | 
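# --- editor's aside (added comment; not in the original script) ---
# corrCalc.py above computes the Pearson correlation by hand:
#     corr(x, y) = sum((x_i - mean_x)*(y_i - mean_y)) / (n * sd_x * sd_y)
# numpy gives the same number directly, e.g. (hedged sketch):
#     import numpy as np
#     np.corrcoef(dataRow2, dataRow3)[0, 1]   # ~0.771, matching corrCalc.py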
plot.scatter(dataRow2, dataRow3) 16 | 17 | 18 | plot.xlabel("2nd Attribute") 19 | plot.ylabel(("3rd Attribute")) 20 | plot.show() 21 | 22 | dataRow21 = rocksVMines.iloc[20,0:60] 23 | 24 | plot.scatter(dataRow2, dataRow21) 25 | 26 | 27 | plot.xlabel("2nd Attribute") 28 | plot.ylabel(("21st Attribute")) 29 | plot.show() -------------------------------------------------------------------------------- /02/glassCorrHeatMap.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | from pylab import * 5 | import matplotlib.pyplot as plot 6 | from math import exp 7 | 8 | target_url = ("https://archive.ics.uci.edu/ml/machine-" 9 | "learning-databases/glass/glass.data") 10 | glass = pd.read_csv(target_url,header=None, prefix="V") 11 | glass.columns = ['Id', 'RI', 'Na', 'Mg', 'Al', 'Si', 12 | 'K', 'Ca', 'Ba', 'Fe', 'Type'] 13 | ncols = len(glass.columns) 14 | 15 | #calculate correlation matrix 16 | corMat = DataFrame(glass.iloc[:, 1:(ncols - 1)].corr()) 17 | 18 | #visualize correlations using heatmap 19 | plot.pcolor(corMat) 20 | plot.show() -------------------------------------------------------------------------------- /02/glassParallelPlot.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | from pylab import * 5 | import matplotlib.pyplot as plot 6 | 7 | target_url = ("https://archive.ics.uci.edu/ml/machine-" 8 | "learning-databases/glass/glass.data") 9 | glass = pd.read_csv(target_url,header=None, prefix="V") 10 | glass.columns = ['Id', 'RI', 'Na', 'Mg', 'Al', 'Si', 11 | 'K', 'Ca', 'Ba', 'Fe', 'Type'] 12 | 13 | 14 | glassNormalized = glass 15 | ncols = len(glassNormalized.columns) 16 | nrows = len(glassNormalized.index) 17 | summary = glassNormalized.describe() 18 | nDataCol = ncols - 1 19 | 20 | #normalize except for labels 21 | for i in range(ncols - 1): 22 | mean = summary.iloc[1, i] 23 | sd = summary.iloc[2, i] 24 | glassNormalized.iloc[:,i:(i + 1)] = \ 25 | (glassNormalized.iloc[:,i:(i + 1)] - mean) / sd 26 | 27 | #Plot Parallel Coordinate Graph with normalized values 28 | for i in range(nrows): 29 | #plot rows of data as if they were series data 30 | dataRow = glassNormalized.iloc[i,1:nDataCol] 31 | labelColor = glassNormalized.iloc[i,nDataCol]/7.0 32 | dataRow.plot(color=plot.cm.RdYlBu(labelColor), alpha=0.5) 33 | 34 | plot.xlabel("Attribute Index") 35 | plot.ylabel(("Attribute Values")) 36 | plot.show() 37 | -------------------------------------------------------------------------------- /02/glassSummary.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | from pylab import * 5 | import matplotlib.pyplot as plot 6 | 7 | target_url = ("https://archive.ics.uci.edu/ml/machine-" 8 | "learning-databases/glass/glass.data") 9 | 10 | glass = pd.read_csv(target_url,header=None, prefix="V") 11 | glass.columns = ['Id', 'RI', 'Na', 'Mg', 'Al', 'Si', 12 | 'K', 'Ca', 'Ba', 'Fe', 'Type'] 13 | 14 | print(glass.head()) 15 | 16 | #generate statistical summaries 17 | summary = glass.describe() 18 | print(summary) 19 | ncol1 = len(glass.columns) 20 | 21 | glassNormalized = glass.iloc[:, 1:ncol1] 22 | ncol2 = len(glassNormalized.columns) 23 | summary2 = glassNormalized.describe() 24 | 25 | for i in range(ncol2): 26 | mean = summary2.iloc[1, i] 27 | 
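# --- editor's note (added comment; not in the original script) ---
# summary2 comes from DataFrame.describe(), whose rows are ordered
# count, mean, std, min, 25%, 50%, 75%, max; so iloc[1, i] above is the
# column mean and iloc[2, i] below is its standard deviation, and the
# loop standardizes each column as z = (x - mean) / sd.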
sd = summary2.iloc[2, i] 28 | glassNormalized.iloc[:,i:(i + 1)] = \ 29 | (glassNormalized.iloc[:,i:(i + 1)] - mean) / sd 30 | 31 | array = glassNormalized.values 32 | boxplot(array) 33 | plot.xlabel("Attribute Index") 34 | plot.ylabel(("Quartile Ranges - Normalized ")) 35 | show() -------------------------------------------------------------------------------- /02/glassSummary.txt: -------------------------------------------------------------------------------- 1 | print(glass.head()) 2 | 3 | Id RI Na Mg Al Si K Ca Ba Fe Type 4 | 0 1 1.52101 13.64 4.49 1.10 71.78 0.06 8.75 0 0 1 5 | 1 2 1.51761 13.89 3.60 1.36 72.73 0.48 7.83 0 0 1 6 | 2 3 1.51618 13.53 3.55 1.54 72.99 0.39 7.78 0 0 1 7 | 3 4 1.51766 13.21 3.69 1.29 72.61 0.57 8.22 0 0 1 8 | 4 5 1.51742 13.27 3.62 1.24 73.08 0.55 8.07 0 0 1 9 | 10 | 11 | print(summary) - Abridged 12 | Id RI Na Mg Al 13 | count 214.000000 214.000000 214.000000 214.000000 214.000000 14 | mean 107.500000 1.518365 13.407850 2.684533 1.444907 15 | std 61.920648 0.003037 0.816604 1.442408 0.499270 16 | min 1.000000 1.511150 10.730000 0.000000 0.290000 17 | 25% 54.250000 1.516523 12.907500 2.115000 1.190000 18 | 50% 107.500000 1.517680 13.300000 3.480000 1.360000 19 | 75% 160.750000 1.519157 13.825000 3.600000 1.630000 20 | max 214.000000 1.533930 17.380000 4.490000 3.500000 21 | 22 | K Ca Ba Fe Type 23 | count 214.000000 214.000000 214.000000 214.000000 214.000000 24 | mean 0.497056 8.956963 0.175047 0.057009 2.780374 25 | std 0.652192 1.423153 0.497219 0.097439 2.103739 26 | min 0.000000 5.430000 0.000000 0.000000 1.000000 27 | 25% 0.122500 8.240000 0.000000 0.000000 1.000000 28 | 50% 0.555000 8.600000 0.000000 0.000000 2.000000 29 | 75% 0.610000 9.172500 0.000000 0.100000 3.000000 30 | max 6.210000 16.190000 3.150000 0.510000 7.000000 31 | 32 | -------------------------------------------------------------------------------- /02/linePlots.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | import matplotlib.pyplot as plot 5 | target_url = ("https://archive.ics.uci.edu/ml/machine-learning-" 6 | "databases/undocumented/connectionist-bench/sonar/sonar.all-data") 7 | 8 | #read rocks versus mines data into pandas data frame 9 | rocksVMines = pd.read_csv(target_url,header=None, prefix="V") 10 | 11 | for i in range(208): 12 | #assign color based on color based on "M" or "R" labels 13 | if rocksVMines.iat[i,60] == "M": 14 | pcolor = "red" 15 | else: 16 | pcolor = "blue" 17 | 18 | #plot rows of data as if they were series data 19 | dataRow = rocksVMines.iloc[i,0:60] 20 | dataRow.plot(color=pcolor, alpha=0.5) 21 | 22 | plot.xlabel("Attribute Index") 23 | plot.ylabel(("Attribute Values")) 24 | plot.show() 25 | -------------------------------------------------------------------------------- /02/outputRocksVMinesContents.txt: -------------------------------------------------------------------------------- 1 | Col# Number Strings Other 2 | 0 208 0 0 3 | 1 208 0 0 4 | 2 208 0 0 5 | 3 208 0 0 6 | 4 208 0 0 7 | 5 208 0 0 8 | 6 208 0 0 9 | 7 208 0 0 10 | 8 208 0 0 11 | 9 208 0 0 12 | 10 208 0 0 13 | 11 208 0 0 14 | . . . . 15 | . . . . 16 | . . . . 
17 | 54 208 0 0 18 | 55 208 0 0 19 | 56 208 0 0 20 | 57 208 0 0 21 | 58 208 0 0 22 | 59 208 0 0 23 | 60 0 208 0 24 | -------------------------------------------------------------------------------- /02/outputSummaryStats.txt: -------------------------------------------------------------------------------- 1 | Mean = 0.0538923076923 Standard Deviation = 0.0464159832226 2 | 3 | Boundaries for 4 Equal Percentiles 4 | [0.0057999999999999996, 0.024375000000000001, 0.044049999999999999, 0.064500000000000002, 0.4264] 5 | 6 | Boundaries for 10 Equal Percentiles 7 | [0.00579999999999, 0.0141, 0.022740000000, 0.0278699999999, 0.0362200000000, 0.0440499999999, 0.050719999999, 0.0599599999999, 0.0779400000000, 0.10836, 0.4264] 8 | 9 | Unique Label Values 10 | set(['R', 'M']) 11 | 12 | Counts for Each Value of Categorical Label 13 | ['R', 'M'] 14 | [97, 111] 15 | -------------------------------------------------------------------------------- /02/pandasReadSummarize.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | import matplotlib.pyplot as plot 5 | target_url = ("https://archive.ics.uci.edu/ml/machine-learning-" 6 | "databases/undocumented/connectionist-bench/sonar/sonar.all-data") 7 | 8 | #read rocks versus mines data into pandas data frame 9 | rocksVMines = pd.read_csv(target_url,header=None, prefix="V") 10 | 11 | #print head and tail of data frame 12 | print(rocksVMines.head()) 13 | print(rocksVMines.tail()) 14 | 15 | #print summary of data frame 16 | summary = rocksVMines.describe() 17 | print(summary) -------------------------------------------------------------------------------- /02/pandasReadSummarizeOutput.txt: -------------------------------------------------------------------------------- 1 | V0 V1 V2 ... V57 V58 V59 V60 2 | 0 0.0200 0.0371 0.0428 ... 0.0084 0.0090 0.0032 R 3 | 1 0.0453 0.0523 0.0843 ... 0.0049 0.0052 0.0044 R 4 | 2 0.0262 0.0582 0.1099 ... 0.0164 0.0095 0.0078 R 5 | 3 0.0100 0.0171 0.0623 ... 0.0044 0.0040 0.0117 R 6 | 4 0.0762 0.0666 0.0481 ... 0.0048 0.0107 0.0094 R 7 | 8 | [5 rows x 61 columns] 9 | V0 V1 V2 ... V57 V58 V59 V60 10 | 203 0.0187 0.0346 0.0168 ... 0.0115 0.0193 0.0157 M 11 | 204 0.0323 0.0101 0.0298 ... 0.0032 0.0062 0.0067 M 12 | 205 0.0522 0.0437 0.0180 ... 0.0138 0.0077 0.0031 M 13 | 206 0.0303 0.0353 0.0490 ... 0.0079 0.0036 0.0048 M 14 | 207 0.0260 0.0363 0.0136 ... 0.0036 0.0061 0.0115 M 15 | 16 | 17 | V0 V1 ... V58 V59 18 | count 208.000000 208.000000 ... 208.000000 208.000000 19 | mean 0.029164 0.038437 ... 0.007941 0.006507 20 | std 0.022991 0.032960 ... 0.006181 0.005031 21 | min 0.001500 0.000600 ... 0.000100 0.000600 22 | 25% 0.013350 0.016450 ... 0.003675 0.003100 23 | 50% 0.022800 0.030800 ... 0.006400 0.005300 24 | 75% 0.035550 0.047950 ... 0.010325 0.008525 25 | max 0.137100 0.233900 ... 
0.036400 0.043900 -------------------------------------------------------------------------------- /02/qqplotAttribute.py: -------------------------------------------------------------------------------- 1 | __author__ = 'ubuntu' 2 | import numpy as np 3 | import pylab 4 | import scipy.stats as stats 5 | import urllib2 6 | import sys 7 | 8 | target_url = ("https://archive.ics.uci.edu/ml/machine-learning-" 9 | "databases/undocumented/connectionist-bench/sonar/sonar.all-data") 10 | 11 | data = urllib2.urlopen(target_url) 12 | 13 | 14 | #arrange data into list for labels and list of lists for attributes 15 | xList = [] 16 | labels = [] 17 | 18 | for line in data: 19 | #split on comma 20 | row = line.strip().split(",") 21 | xList.append(row) 22 | nrow = len(xList) 23 | ncol = len(xList[1]) 24 | 25 | type = [0]*3 26 | colCounts = [] 27 | 28 | #generate summary statistics for column 3 (e.g.) 29 | col = 3 30 | colData = [] 31 | for row in xList: 32 | colData.append(float(row[col])) 33 | 34 | 35 | stats.probplot(colData, dist="norm", plot=pylab) 36 | pylab.show() -------------------------------------------------------------------------------- /02/rVMSummaryStats.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import urllib2 3 | import sys 4 | import numpy as np 5 | 6 | #read data from uci data repository 7 | target_url = ("https://archive.ics.uci.edu/ml/machine-learning-" 8 | "databases/undocumented/connectionist-bench/sonar/sonar.all-data") 9 | 10 | data = urllib2.urlopen(target_url) 11 | 12 | #arrange data into list for labels and list of lists for attributes 13 | xList = [] 14 | labels = [] 15 | 16 | for line in data: 17 | #split on comma 18 | row = line.strip().split(",") 19 | xList.append(row) 20 | nrow = len(xList) 21 | ncol = len(xList[1]) 22 | 23 | type = [0]*3 24 | colCounts = [] 25 | 26 | #generate summary statistics for column 3 (e.g.) 
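# --- editor's aside (added comment; not in the original script) ---
# This script (and the other urllib2-based scripts) is Python 2 code.
# Under Python 3 the equivalent download is roughly (hedged sketch):
#     import urllib.request
#     data = urllib.request.urlopen(target_url)
#     for line in data:
#         row = line.decode().strip().split(",")
# urlopen returns bytes in Python 3, hence the added .decode().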
27 | col = 3 28 | colData = [] 29 | for row in xList: 30 | colData.append(float(row[col])) 31 | 32 | colArray = np.array(colData) 33 | colMean = np.mean(colArray) 34 | colsd = np.std(colArray) 35 | sys.stdout.write("Mean = " + '\t' + str(colMean) + '\t\t' + 36 | "Standard Deviation = " + '\t ' + str(colsd) + "\n") 37 | 38 | 39 | #calculate quantile boundaries 40 | ntiles = 4 41 | 42 | percentBdry = [] 43 | 44 | for i in range(ntiles+1): 45 | percentBdry.append(np.percentile(colArray, i*(100)/ntiles)) 46 | 47 | sys.stdout.write("\nBoundaries for 4 Equal Percentiles \n") 48 | print(percentBdry) 49 | sys.stdout.write(" \n") 50 | 51 | 52 | #run again with 10 equal intervals 53 | ntiles = 10 54 | 55 | percentBdry = [] 56 | 57 | for i in range(ntiles+1): 58 | percentBdry.append(np.percentile(colArray, i*(100)/ntiles)) 59 | 60 | sys.stdout.write("Boundaries for 10 Equal Percentiles \n") 61 | print(percentBdry) 62 | sys.stdout.write(" \n") 63 | 64 | 65 | #The last column contains categorical variables 66 | 67 | col = 60 68 | colData = [] 69 | for row in xList: 70 | colData.append(row[col]) 71 | 72 | unique = set(colData) 73 | sys.stdout.write("Unique Label Values \n") 74 | print(unique) 75 | 76 | #count up the number of elements having each value 77 | 78 | catDict = dict(zip(list(unique),range(len(unique)))) 79 | 80 | catCount = [0]*2 81 | 82 | for elt in colData: 83 | catCount[catDict[elt]] += 1 84 | 85 | sys.stdout.write("\nCounts for Each Value of Categorical Label \n") 86 | print(list(unique)) 87 | print(catCount) 88 | -------------------------------------------------------------------------------- /02/rockVmineContents.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import urllib2 3 | import sys 4 | 5 | #read data from uci data repository 6 | target_url = ("https://archive.ics.uci.edu/ml/machine-learning-" 7 | "databases/undocumented/connectionist-bench/sonar/sonar.all-data") 8 | 9 | data = urllib2.urlopen(target_url) 10 | 11 | 12 | #arrange data into list for labels and list of lists for attributes 13 | xList = [] 14 | labels = [] 15 | 16 | for line in data: 17 | #split on comma 18 | row = line.strip().split(",") 19 | xList.append(row) 20 | nrow = len(xList) 21 | ncol = len(xList[1]) 22 | 23 | type = [0]*3 24 | colCounts = [] 25 | 26 | for col in range(ncol): 27 | for row in xList: 28 | try: 29 | a = float(row[col]) 30 | if isinstance(a, float): 31 | type[0] += 1 32 | except ValueError: 33 | if len(row[col]) > 0: 34 | type[1] += 1 35 | else: 36 | type[2] += 1 37 | 38 | colCounts.append(type) 39 | type = [0]*3 40 | 41 | sys.stdout.write("Col#" + '\t' + "Number" + '\t' + 42 | "Strings" + '\t ' + "Other\n") 43 | iCol = 0 44 | for types in colCounts: 45 | sys.stdout.write(str(iCol) + '\t\t' + str(types[0]) + '\t\t' + 46 | str(types[1]) + '\t\t' + str(types[2]) + "\n") 47 | iCol += 1 -------------------------------------------------------------------------------- /02/rockVmineSummaries.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import urllib2 3 | import sys 4 | 5 | #read data from uci data repository 6 | target_url = ("https://archive.ics.uci.edu/ml/machine-learning-" 7 | "databases/undocumented/connectionist-bench/sonar/sonar.all-data") 8 | 9 | data = urllib2.urlopen(target_url) 10 | 11 | #arrange data into list for labels and list of lists for attributes 12 | xList = [] 13 | labels = [] 14 | for line in data: 15 | #split on comma 16 | row = 
line.strip().split(",") 17 | xList.append(row) 18 | 19 | sys.stdout.write("Number of Rows of Data = " + str(len(xList)) + '\n') 20 | sys.stdout.write("Number of Columns of Data = " + str(len(xList[1]))) 21 | 22 | -------------------------------------------------------------------------------- /02/sampleCorrHeatMap.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | import matplotlib.pyplot as plot 5 | target_url = ("https://archive.ics.uci.edu/ml/machine-learning-" 6 | "databases/undocumented/connectionist-bench/sonar/sonar.all-data") 7 | 8 | #read rocks versus mines data into pandas data frame 9 | rocksVMines = pd.read_csv(target_url,header=None, prefix="V") 10 | 11 | #calculate correlations between real-valued attributes 12 | 13 | corMat = DataFrame(rocksVMines.corr()) 14 | 15 | #visualize correlations using heatmap 16 | plot.pcolor(corMat) 17 | plot.show() 18 | -------------------------------------------------------------------------------- /02/targetCorr.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | import matplotlib.pyplot as plot 5 | from random import uniform 6 | target_url = ("https://archive.ics.uci.edu/ml/machine-learning-" 7 | "databases/undocumented/connectionist-bench/sonar/sonar.all-data") 8 | 9 | #read rocks versus mines data into pandas data frame 10 | rocksVMines = pd.read_csv(target_url,header=None, prefix="V") 11 | 12 | #change the targets to numeric values 13 | target = [] 14 | for i in range(208): 15 | #assign 0 or 1 target value based on "M" or "R" labels 16 | if rocksVMines.iat[i,60] == "M": 17 | target.append(1.0) 18 | else: 19 | target.append(0.0) 20 | 21 | #plot rows of data as if they were series data 22 | dataRow = rocksVMines.iloc[0:208,35] 23 | plot.scatter(dataRow, target) 24 | 25 | plot.xlabel("Attribute Value") 26 | plot.ylabel("Target Value") 27 | plot.show() 28 | 29 | # 30 | #To improve the visualization, this version dithers the points a little 31 | # and makes them somewhat transparent 32 | target = [] 33 | for i in range(208): 34 | #assign 0 or 1 target value based on "M" or "R" labels 35 | # and add some dither 36 | if rocksVMines.iat[i,60] == "M": 37 | target.append(1.0 + uniform(-0.1, 0.1)) 38 | else: 39 | target.append(0.0 + uniform(-0.1, 0.1)) 40 | 41 | #plot rows of data as if they were series data 42 | dataRow = rocksVMines.iloc[0:208,35] 43 | plot.scatter(dataRow, target, alpha=0.5, s=120) 44 | 45 | plot.xlabel("Attribute Value") 46 | plot.ylabel("Target Value") 47 | plot.show() -------------------------------------------------------------------------------- /02/wineCorrHeatMap.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | from pylab import * 5 | import matplotlib.pyplot as plot 6 | from math import exp 7 | 8 | target_url = ("http://archive.ics.uci.edu/ml/machine-" 9 | "learning-databases/wine-quality/winequality-red.csv") 10 | wine = pd.read_csv(target_url,header=0, sep=";") 11 | wineCols = len(wine.columns) 12 | 13 | #calculate correlation matrix 14 | corMat = DataFrame(wine.corr()) 15 | 16 | #visualize correlations using heatmap 17 | plot.pcolor(corMat) 18 | plot.show() -------------------------------------------------------------------------------- 
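The correlation heat maps above (sampleCorrHeatMap.py, glassCorrHeatMap.py, and wineCorrHeatMap.py) draw the correlation matrix with pcolor but show no color scale and no attribute names on the axes. A small variation for the wine data, not part of the repository, that adds a colorbar and labeled ticks:

import pandas as pd
import matplotlib.pyplot as plot

target_url = ("http://archive.ics.uci.edu/ml/machine-"
              "learning-databases/wine-quality/winequality-red.csv")
wine = pd.read_csv(target_url, header=0, sep=";")

#correlation matrix, as in wineCorrHeatMap.py
corMat = wine.corr()

plot.pcolor(corMat)
plot.colorbar()                                   #scale relating color to correlation
ticks = [i + 0.5 for i in range(len(corMat.columns))]
plot.xticks(ticks, corMat.columns, rotation=90)   #center attribute names on cells
plot.yticks(ticks, corMat.columns)
plot.show()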
/02/wineParallelPlot.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | from pylab import * 5 | import matplotlib.pyplot as plot 6 | from math import exp 7 | 8 | target_url = ("http://archive.ics.uci.edu/ml/machine-" 9 | "learning-databases/wine-quality/winequality-red.csv") 10 | 11 | wine = pd.read_csv(target_url,header=0, sep=";") 12 | 13 | #print column names in order to have the full versions 14 | print(wine.columns) 15 | 16 | #change column names to shorter ones to fit graph 17 | wine.columns = ['fixAcid', 'volAcid', 'citAcid', 18 | 'resSugr', 'chlor', 'frSO2', 'totSO2', 19 | 'dens', 'pH', 'sulpha', 'alcohol', 'quality'] 20 | 21 | #generate statistical summaries 22 | summary = wine.describe() 23 | nrows = len(wine.index) 24 | tasteCol = len(summary.columns) 25 | meanTaste = summary.iloc[1,tasteCol - 1] 26 | sdTaste = summary.iloc[2,tasteCol - 1] 27 | nDataCol = len(wine.columns) -1 28 | 29 | for i in range(nrows): 30 | #plot rows of data as if they were series data 31 | dataRow = wine.iloc[i,1:nDataCol] 32 | normTarget = (wine.iloc[i,nDataCol] - meanTaste)/sdTaste 33 | labelColor = 1.0/(1.0 + exp(-normTarget)) 34 | dataRow.plot(color=plot.cm.RdYlBu(labelColor), alpha=0.5) 35 | 36 | plot.xlabel("Attribute Index") 37 | plot.ylabel(("Attribute Values")) 38 | plot.show() 39 | 40 | wineNormalized = wine 41 | ncols = len(wineNormalized.columns) 42 | 43 | for i in range(ncols): 44 | mean = summary.iloc[1, i] 45 | sd = summary.iloc[2, i] 46 | wineNormalized.iloc[:,i:(i + 1)] = \ 47 | (wineNormalized.iloc[:,i:(i + 1)] - mean) / sd 48 | 49 | #Try again with normalized values 50 | for i in range(nrows): 51 | #plot rows of data as if they were series data 52 | dataRow = wineNormalized.iloc[i,1:nDataCol] 53 | normTarget = wineNormalized.iloc[i,nDataCol] 54 | labelColor = 1.0/(1.0 + exp(-normTarget)) 55 | dataRow.plot(color=plot.cm.RdYlBu(labelColor), alpha=0.5) 56 | 57 | plot.xlabel("Attribute Index") 58 | plot.ylabel(("Attribute Values")) 59 | plot.show() -------------------------------------------------------------------------------- /02/wineSummary.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | from pylab import * 5 | import matplotlib.pyplot as plot 6 | 7 | target_url = ("http://archive.ics.uci.edu/ml/machine-" 8 | "learning-databases/wine-quality/winequality-red.csv") 9 | 10 | wine = pd.read_csv(target_url,header=0, sep=";") 11 | 12 | print(wine.head()) 13 | 14 | #generate statistical summaries 15 | summary = wine.describe() 16 | print(summary) 17 | 18 | wineNormalized = wine 19 | ncols = len(wineNormalized.columns) 20 | 21 | for i in range(ncols): 22 | mean = summary.iloc[1, i] 23 | sd = summary.iloc[2, i] 24 | wineNormalized.iloc[:,i:(i + 1)] = \ 25 | (wineNormalized.iloc[:,i:(i + 1)] - mean) / sd 26 | 27 | array = wineNormalized.values 28 | boxplot(array) 29 | plot.xlabel("Attribute Index") 30 | plot.ylabel(("Quartile Ranges - Normalized ")) 31 | show() -------------------------------------------------------------------------------- /02/wineSummary.txt: -------------------------------------------------------------------------------- 1 | fixed acidity vola acidity citric acid resid sugar chlorides 2 | 0 7.4 0.70 0.00 1.9 0.076 3 | 1 7.8 0.88 0.00 2.6 0.098 4 | 2 7.8 0.76 0.04 2.3 0.092 5 | 3 11.2 0.28 0.56 1.9 0.075 6 | 4 7.4 0.70 0.00 1.9 
0.076 7 | 8 | free sulfur dioxide tot sulfur dioxide density pH sulphates 9 | 0 11 34 0.9978 3.51 0.56 10 | 1 25 67 0.9968 3.20 0.68 11 | 2 15 54 0.9970 3.26 0.65 12 | 3 17 60 0.9980 3.16 0.58 13 | 4 11 34 0.9978 3.51 0.56 14 | 15 | alcohol quality 16 | 0 9.4 5 17 | 1 9.8 5 18 | 2 9.8 5 19 | 3 9.8 6 20 | 4 9.4 5 21 | fixed acidity volatile acidity citric acid residual sugar 22 | count 1599.000000 1599.000000 1599.000000 1599.000000 23 | mean 8.319637 0.527821 0.270976 2.538806 24 | std 1.741096 0.179060 0.194801 1.409928 25 | min 4.600000 0.120000 0.000000 0.900000 26 | 25% 7.100000 0.390000 0.090000 1.900000 27 | 50% 7.900000 0.520000 0.260000 2.200000 28 | 75% 9.200000 0.640000 0.420000 2.600000 29 | max 15.900000 1.580000 1.000000 15.500000 30 | 31 | chlorides free sulfur dioxide tot sulfur dioxide density 32 | count 1599.000000 1599.000000 1599.000000 1599.000000 33 | mean 0.087467 15.874922 46.467792 0.996747 34 | std 0.047065 10.460157 32.895324 0.001887 35 | min 0.012000 1.000000 6.000000 0.990070 36 | 25% 0.070000 7.000000 22.000000 0.995600 37 | 50% 0.079000 14.000000 38.000000 0.996750 38 | 75% 0.090000 21.000000 62.000000 0.997835 39 | max 0.611000 72.000000 289.000000 1.003690 40 | 41 | pH sulphates alcohol quality 42 | count 1599.000000 1599.000000 1599.000000 1599.000000 43 | mean 3.311113 0.658149 10.422983 5.636023 44 | std 0.154386 0.169507 1.065668 0.807569 45 | min 2.740000 0.330000 8.400000 3.000000 46 | 25% 3.210000 0.550000 9.500000 5.000000 47 | 50% 3.310000 0.620000 10.200000 6.000000 48 | 75% 3.400000 0.730000 11.100000 6.000000 49 | max 4.010000 2.000000 14.900000 8.000000 50 | -------------------------------------------------------------------------------- /03/chapter03.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/derekhe/machine-learning-in-python-essential-techniques-for-predictive-analysis-source/d91f60cc29fdbaad32819058f9b2742e955e586a/03/chapter03.zip -------------------------------------------------------------------------------- /03/classifierPerformance_RocksVMines.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | #use scikit learn package to perform linear regression 3 | #read in the rocks versus mines data set from uci.edu data repository 4 | import urllib2 5 | import numpy 6 | import random 7 | from sklearn import datasets, linear_model 8 | from sklearn.metrics import roc_curve, auc 9 | import pylab as pl 10 | 11 | 12 | def confusionMatrix(predicted, actual, threshold): 13 | if len(predicted) != len(actual): return -1 14 | tp = 0.0 15 | fp = 0.0 16 | tn = 0.0 17 | fn = 0.0 18 | for i in range(len(actual)): 19 | if actual[i] > 0.5: #labels that are 1.0 (positive examples) 20 | if predicted[i] > threshold: 21 | tp += 1.0 #correctly predicted positive 22 | else: 23 | fn += 1.0 #incorrectly predicted negative 24 | else: #labels that are 0.0 (negative examples) 25 | if predicted[i] < threshold: 26 | tn += 1.0 #correctly predicted negative 27 | else: 28 | fp += 1.0 #incorrectly predicted positive 29 | rtn = [tp, fn, fp, tn] 30 | return rtn 31 | 32 | 33 | #read data from uci data repository 34 | target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data" 35 | data = urllib2.urlopen(target_url) 36 | 37 | #arrange data into list for labels and list of lists for attributes 38 | xList = [] 39 | labels = [] 40 | for line in data: 41 | #split on comma 42 | 
row = line.strip().split(",") 43 | #assign label 1.0 for "M" and 0.0 for "R" 44 | if(row[-1] == 'M'): 45 | labels.append(1.0) 46 | else: 47 | labels.append(0.0) 48 | #remove label from row 49 | row.pop() 50 | #convert row to floats 51 | floatRow = [float(num) for num in row] 52 | xList.append(floatRow) 53 | 54 | #divide attribute matrix and label vector into training(2/3 of data) and test sets (1/3 of data) 55 | indices = range(len(xList)) 56 | xListTest = [xList[i] for i in indices if i%3 == 0 ] 57 | xListTrain = [xList[i] for i in indices if i%3 != 0 ] 58 | labelsTest = [labels[i] for i in indices if i%3 == 0] 59 | labelsTrain = [labels[i] for i in indices if i%3 != 0] 60 | 61 | #form list of list input into numpy arrays to match input class for scikit-learn linear model 62 | xTrain = numpy.array(xListTrain); yTrain = numpy.array(labelsTrain); xTest = numpy.array(xListTest); yTest = numpy.array(labelsTest) 63 | 64 | #check shapes to see what they look like 65 | print("Shape of xTrain array", xTrain.shape) 66 | print("Shape of yTrain array", yTrain.shape) 67 | print("Shape of xTest array", xTest.shape) 68 | print("Shape of yTest array", yTest.shape) 69 | 70 | #train linear regression model 71 | rocksVMinesModel = linear_model.LinearRegression() 72 | rocksVMinesModel.fit(xTrain,yTrain) 73 | 74 | #generate predictions on in-sample error 75 | trainingPredictions = rocksVMinesModel.predict(xTrain) 76 | print("Some values predicted by model", trainingPredictions[0:5], trainingPredictions[-6:-1]) 77 | 78 | #generate confusion matrix for predictions on training set (in-sample 79 | confusionMatTrain = confusionMatrix(trainingPredictions, yTrain, 0.5) 80 | #pick threshold value and generate confusion matrix entries 81 | tp = confusionMatTrain[0]; fn = confusionMatTrain[1]; fp = confusionMatTrain[2]; tn = confusionMatTrain[3] 82 | 83 | print("tp = " + str(tp) + "\tfn = " + str(fn) + "\n" + "fp = " + str(fp) + "\ttn = " + str(tn) + '\n') 84 | 85 | #generate predictions on out-of-sample data 86 | testPredictions = rocksVMinesModel.predict(xTest) 87 | 88 | #generate confusion matrix from predictions on out-of-sample data 89 | conMatTest = confusionMatrix(testPredictions, yTest, 0.5) 90 | #pick threshold value and generate confusion matrix entries 91 | tp = conMatTest[0]; fn = conMatTest[1]; fp = conMatTest[2]; tn = conMatTest[3] 92 | print("tp = " + str(tp) + "\tfn = " + str(fn) + "\n" + "fp = " + str(fp) + "\ttn = " + str(tn) + '\n') 93 | 94 | #generate ROC curve for in-sample 95 | 96 | fpr, tpr, thresholds = roc_curve(yTrain,trainingPredictions) 97 | roc_auc = auc(fpr, tpr) 98 | print( 'AUC for in-sample ROC curve: %f' % roc_auc) 99 | 100 | # Plot ROC curve 101 | pl.clf() 102 | pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc) 103 | pl.plot([0, 1], [0, 1], 'k--') 104 | pl.xlim([0.0, 1.0]) 105 | pl.ylim([0.0, 1.0]) 106 | pl.xlabel('False Positive Rate') 107 | pl.ylabel('True Positive Rate') 108 | pl.title('In sample ROC rocks versus mines') 109 | pl.legend(loc="lower right") 110 | pl.show() 111 | 112 | #generate ROC curve for out-of-sample 113 | fpr, tpr, thresholds = roc_curve(yTest,testPredictions) 114 | roc_auc = auc(fpr, tpr) 115 | print( 'AUC for out-of-sample ROC curve: %f' % roc_auc) 116 | 117 | # Plot ROC curve 118 | pl.clf() 119 | pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc) 120 | pl.plot([0, 1], [0, 1], 'k--') 121 | pl.xlim([0.0, 1.0]) 122 | pl.ylim([0.0, 1.0]) 123 | pl.xlabel('False Positive Rate') 124 | pl.ylabel('True Positive Rate') 125 | 
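# --- editor's note (added comment; not in the original script) ---
# AUC is the probability that a randomly chosen positive example (a mine)
# receives a higher score than a randomly chosen negative one (a rock);
# 0.5 is chance-level ranking and 1.0 is perfect.  The drop from the
# in-sample AUC (~0.98) to the out-of-sample AUC (~0.85) printed by this
# script shows how much the linear model overfits the training data.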
pl.title('Out-of-sample ROC rocks versus mines') 126 | pl.legend(loc="lower right") 127 | pl.show() -------------------------------------------------------------------------------- /03/classifierPerformance_RocksVMinesOutput.txt: -------------------------------------------------------------------------------- 1 | ('Shape of xTrain array', (138, 60)) 2 | ('Shape of yTrain array', (138,)) 3 | ('Shape of xTest array', (70, 60)) 4 | ('Shape of yTest array', (70,)) 5 | ('Some values predicted by model', array([-0.10240253, 0.42090698, 0.38593034, 0.36094537, 0.31520494]), array([ 1.11094176, 1.12242751, 0.77626699, 1.02016858, 0.66338081])) 6 | tp = 68.0 fn = 6.0 7 | fp = 7.0 tn = 57.0 8 | 9 | tp = 28.0 fn = 9.0 10 | fp = 9.0 tn = 24.0 11 | 12 | AUC for in-sample ROC curve: 0.979519 13 | AUC for out-of-sample ROC curve: 0.848485 14 | 15 | -------------------------------------------------------------------------------- /03/classifierRidgeRocksVMines.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | import urllib2 3 | import numpy 4 | from sklearn import datasets, linear_model 5 | from sklearn.metrics import roc_curve, auc 6 | import pylab as plt 7 | 8 | #read data from uci data repository 9 | target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data" 10 | data = urllib2.urlopen(target_url) 11 | 12 | #arrange data into list for labels and list of lists for attributes 13 | xList = [] 14 | labels = [] 15 | for line in data: 16 | #split on comma 17 | row = line.strip().split(",") 18 | #assign label 1.0 for "M" and 0.0 for "R" 19 | if(row[-1] == 'M'): 20 | labels.append(1.0) 21 | else: 22 | labels.append(0.0) 23 | #remove lable from row 24 | row.pop() 25 | #convert row to floats 26 | floatRow = [float(num) for num in row] 27 | xList.append(floatRow) 28 | 29 | #divide attribute matrix and label vector into training(2/3 of data) and test sets (1/3 of data) 30 | indices = range(len(xList)) 31 | xListTest = [xList[i] for i in indices if i%3 == 0 ] 32 | xListTrain = [xList[i] for i in indices if i%3 != 0 ] 33 | labelsTest = [labels[i] for i in indices if i%3 == 0] 34 | labelsTrain = [labels[i] for i in indices if i%3 != 0] 35 | 36 | #form list of list input into numpy arrays to match input class for scikit-learn linear model 37 | xTrain = numpy.array(xListTrain); yTrain = numpy.array(labelsTrain); xTest = numpy.array(xListTest); yTest = numpy.array(labelsTest) 38 | 39 | alphaList = [0.1**i for i in [-3, -2, -1, 0,1, 2, 3, 4, 5]] 40 | 41 | aucList = [] 42 | for alph in alphaList: 43 | rocksVMinesRidgeModel = linear_model.Ridge(alpha=alph) 44 | rocksVMinesRidgeModel.fit(xTrain, yTrain) 45 | fpr, tpr, thresholds = roc_curve(yTest,rocksVMinesRidgeModel.predict(xTest)) 46 | roc_auc = auc(fpr, tpr) 47 | aucList.append(roc_auc) 48 | 49 | 50 | print("AUC alpha") 51 | for i in range(len(aucList)): 52 | print(aucList[i], alphaList[i]) 53 | 54 | #plot auc values versus alpha values 55 | x = [-3, -2, -1, 0,1, 2, 3, 4, 5] 56 | plt.plot(x, aucList) 57 | plt.xlabel('-log(alpha)') 58 | plt.ylabel('AUC') 59 | plt.show() 60 | 61 | #visualize the performance of the best classifier 62 | indexBest = aucList.index(max(aucList)) 63 | alph = alphaList[indexBest] 64 | rocksVMinesRidgeModel = linear_model.Ridge(alpha=alph) 65 | rocksVMinesRidgeModel.fit(xTrain, yTrain) 66 | 67 | #scatter plot of actual vs predicted 68 | plt.scatter(rocksVMinesRidgeModel.predict(xTest), yTest, s=100, alpha=0.25) 
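# --- editor's note (added comment; not in the original script) ---
# Ridge regression minimizes ||y - Xb||^2 + alpha*||b||^2, which on centered
# data has the closed-form solution b = (X'X + alpha*I)^(-1) X'y, so larger
# alpha shrinks the coefficients toward zero.  In the sweep above,
# out-of-sample AUC peaks at alpha = 1.0 (see the accompanying output file),
# and that best model is the one plotted here.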
69 | plt.xlabel("Predicted Value") 70 | plt.ylabel("Actual Value") 71 | plt.show() -------------------------------------------------------------------------------- /03/classifierRidgeRocksVMinesOutput.txt: -------------------------------------------------------------------------------- 1 | AUC alpha 2 | (0.84111384111384113, 999.9999999999999) 3 | (0.86404586404586403, 99.99999999999999) 4 | (0.9074529074529073, 10.0) 5 | (0.91809991809991809, 1.0) 6 | (0.88288288288288286, 0.1) 7 | (0.8615888615888615, 0.010000000000000002) 8 | (0.85176085176085159, 0.0010000000000000002) 9 | (0.85094185094185093, 0.00010000000000000002) 10 | (0.84930384930384917, 1.0000000000000003e-05) 11 | -------------------------------------------------------------------------------- /03/fwdStepwiseWine.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | import urllib2 3 | import numpy 4 | from sklearn import datasets, linear_model 5 | from math import sqrt 6 | import matplotlib.pyplot as plt 7 | 8 | def xattrSelect(x, idxSet): 9 | #takes X matrix as list of list and returns subset containing columns in idxSet 10 | xOut = [] 11 | for row in x: 12 | xOut.append([row[i] for i in idxSet]) 13 | return(xOut) 14 | 15 | #read data into iterable 16 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 17 | data = urllib2.urlopen(target_url) 18 | xList = [] 19 | labels = [] 20 | names = [] 21 | firstLine = True 22 | for line in data: 23 | if firstLine: 24 | names = line.strip().split(";") 25 | firstLine = False 26 | else: 27 | #split on semi-colon 28 | row = line.strip().split(";") 29 | #put labels in separate array 30 | labels.append(float(row[-1])) 31 | #remove label from row 32 | row.pop() 33 | #convert row to floats 34 | floatRow = [float(num) for num in row] 35 | xList.append(floatRow) 36 | 37 | #divide attributes and labels into training and test sets 38 | indices = range(len(xList)) 39 | xListTest = [xList[i] for i in indices if i%3 == 0 ] 40 | xListTrain = [xList[i] for i in indices if i%3 != 0 ] 41 | labelsTest = [labels[i] for i in indices if i%3 == 0] 42 | labelsTrain = [labels[i] for i in indices if i%3 != 0] 43 | 44 | #build list of attributes one-at-a-time - starting with empty 45 | attributeList = [] 46 | index = range(len(xList[1])) 47 | indexSet = set(index) 48 | indexSeq = [] 49 | oosError = [] 50 | 51 | for i in index: 52 | attSet = set(attributeList) 53 | #attributes not in list already 54 | attTrySet = indexSet - attSet 55 | #form into list 56 | attTry = [ii for ii in attTrySet] 57 | errorList = [] 58 | attTemp = [] 59 | #try each attribute not in set to see which one gives least oos error 60 | for iTry in attTry: 61 | attTemp = [] + attributeList 62 | attTemp.append(iTry) 63 | #use attTemp to form training and testing sub matrices as list of lists 64 | xTrainTemp = xattrSelect(xListTrain, attTemp) 65 | xTestTemp = xattrSelect(xListTest, attTemp) 66 | #form into numpy arrays 67 | xTrain = numpy.array(xTrainTemp); yTrain = numpy.array(labelsTrain); xTest = numpy.array(xTestTemp); yTest = numpy.array(labelsTest) 68 | #use sci-kit learn linear regression 69 | wineQModel = linear_model.LinearRegression() 70 | wineQModel.fit(xTrain,yTrain) 71 | #use trained model to generate prediction and calculate rmsError 72 | rmsError = numpy.linalg.norm((yTest-wineQModel.predict(xTest)), 2)/sqrt(len(yTest)) 73 | errorList.append(rmsError) 74 | attTemp = [] 75 | 76 | iBest = numpy.argmin(errorList) 77 | 
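# --- editor's note (added comment; not in the original script) ---
# This is greedy forward stepwise selection: each outer pass tries every
# attribute not yet chosen, measures out-of-sample RMSE with it added, and
# the single attribute giving the lowest error (attTry[iBest]) is committed
# to attributeList below before the next pass begins.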
attributeList.append(attTry[iBest]) 78 | oosError.append(errorList[iBest]) 79 | 80 | print("Out of sample error versus attribute set size" ) 81 | print(oosError) 82 | print("\n" + "Best attribute indices") 83 | print(attributeList) 84 | namesList = [names[i] for i in attributeList] 85 | print("\n" + "Best attribute names") 86 | print(namesList) 87 | 88 | #Plot error versus number of attributes 89 | x = range(len(oosError)) 90 | plt.plot(x, oosError, 'k') 91 | plt.xlabel('Number of Attributes') 92 | plt.ylabel('Error (RMS)') 93 | plt.show() 94 | 95 | #Plot histogram of out of sample errors for best number of attributes 96 | #Identify index corresponding to min value, retrain with the corresponding attributes 97 | #Use resulting model to predict against out of sample data. Plot errors (aka residuals) 98 | indexBest = oosError.index(min(oosError)) 99 | attributesBest = attributeList[1:(indexBest+1)] 100 | 101 | #Define column-wise subsets of xListTrain and xListTest and convert to numpy 102 | xTrainTemp = xattrSelect(xListTrain, attributesBest) 103 | xTestTemp = xattrSelect(xListTest, attributesBest) 104 | xTrain = numpy.array(xTrainTemp); xTest = numpy.array(xTestTemp) 105 | 106 | #train and plot error histogram 107 | wineQModel = linear_model.LinearRegression() 108 | wineQModel.fit(xTrain,yTrain) 109 | errorVector = yTest-wineQModel.predict(xTest) 110 | plt.hist(errorVector) 111 | plt.xlabel("Bin Boundaries") 112 | plt.ylabel("Counts") 113 | plt.show() 114 | 115 | #scatter plot of actual versus predicted 116 | plt.scatter(wineQModel.predict(xTest), yTest, s=100, alpha=0.10) 117 | plt.xlabel('Predicted Taste Score') 118 | plt.ylabel('Actual Taste Score') 119 | plt.show() -------------------------------------------------------------------------------- /03/fwdStepwiseWineOutput.txt: -------------------------------------------------------------------------------- 1 | Out of sample error versus attribute set size 2 | [0.7234259255116281, 0.68609931528371915, 0.67343650334202809, 0.66770332138977984, 0.66225585685222743, 0.65900047541546247, 0.65727172061430772, 0.65709058062076986, 0.65699930964461406, 0.65758189400434675, 0.65739098690113373] 3 | 4 | Best attribute indices 5 | [10, 1, 9, 4, 6, 8, 5, 3, 2, 7, 0] 6 | 7 | Best attribute names 8 | ['"alcohol"', '"volatile acidity"', '"sulphates"', '"chlorides"', '"total sulfur dioxide"', '"pH"', '"free sulfur dioxide"', '"residual sugar"', '"citric acid"', '"density"', '"fixed acidity"'] 9 | 10 | -------------------------------------------------------------------------------- /03/regressionErrorMeasures.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | 4 | #here are some made-up numbers to start with 5 | target = [1.5, 2.1, 3.3, -4.7, -2.3, 0.75] 6 | prediction = [0.5, 1.5, 2.1, -2.2, 0.1, -0.5] 7 | 8 | error = [] 9 | for i in range(len(target)): 10 | error.append(target[i] - prediction[i]) 11 | 12 | #print the errors 13 | print("Errors ",) 14 | print(error) 15 | #ans: [1.0, 0.60000000000000009, 1.1999999999999997, -2.5, -2.3999999999999999, 1.25] 16 | 17 | 18 | 19 | #calculate the squared errors and absolute value of errors 20 | squaredError = [] 21 | absError = [] 22 | for val in error: 23 | squaredError.append(val*val) 24 | absError.append(abs(val)) 25 | 26 | 27 | #print squared errors and absolute value of errors 28 | print("Squared Error") 29 | print(squaredError) 30 | #ans: [1.0, 0.3600000000000001, 1.4399999999999993, 6.25, 5.7599999999999998, 1.5625] 31 | 
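# --- editor's note (added comment; not in the original script) ---
# For n predictions the measures in this script are:
#     MSE  = (1/n) * sum((target_i - prediction_i)^2)
#     RMSE = sqrt(MSE)   (same units as the target)
#     MAE  = (1/n) * sum(|target_i - prediction_i|)
# Comparing RMSE with the target's own standard deviation, done at the end
# of the script, shows whether the model improves on simply predicting the
# mean of the targets.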
print("Absolute Value of Error") 32 | print(absError) 33 | #ans: [1.0, 0.60000000000000009, 1.1999999999999997, 2.5, 2.3999999999999999, 1.25] 34 | 35 | 36 | #calculate and print mean squared error MSE 37 | print("MSE = ", sum(squaredError)/len(squaredError)) 38 | #ans: 2.72875 39 | 40 | 41 | from math import sqrt 42 | #calculate and print square root of MSE (RMSE) 43 | print("RMSE = ", sqrt(sum(squaredError)/len(squaredError))) 44 | #ans: 1.65189285367 45 | 46 | 47 | #calculate and print mean absolute error MAE 48 | print("MAE = ", sum(absError)/len(absError)) 49 | #ans: 1.49166666667 50 | 51 | 52 | #compare MSE to target variance 53 | targetDeviation = [] 54 | targetMean = sum(target)/len(target) 55 | for val in target: 56 | targetDeviation.append((val - targetMean)*(val - targetMean)) 57 | 58 | #print the target variance 59 | print("Target Variance = ", sum(targetDeviation)/len(targetDeviation)) 60 | #ans: 7.5703472222222219 61 | 62 | #print the the target standard deviation (square root of variance) 63 | print("Target Standard Deviation = ", sqrt(sum(targetDeviation)/len(targetDeviation))) 64 | #ans: 2.7514263977475797 -------------------------------------------------------------------------------- /03/ridgeWine.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | from sklearn import datasets, linear_model 6 | from math import sqrt 7 | import matplotlib.pyplot as plt 8 | 9 | #read data into iterable 10 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 11 | data = urllib2.urlopen(target_url) 12 | 13 | xList = [] 14 | labels = [] 15 | names = [] 16 | firstLine = True 17 | for line in data: 18 | if firstLine: 19 | names = line.strip().split(";") 20 | firstLine = False 21 | else: 22 | #split on semi-colon 23 | row = line.strip().split(";") 24 | #put labels in separate array 25 | labels.append(float(row[-1])) 26 | #remove label from row 27 | row.pop() 28 | #convert row to floats 29 | floatRow = [float(num) for num in row] 30 | xList.append(floatRow) 31 | 32 | #divide attributes and labels into training and test sets 33 | indices = range(len(xList)) 34 | xListTest = [xList[i] for i in indices if i%3 == 0 ] 35 | xListTrain = [xList[i] for i in indices if i%3 != 0 ] 36 | labelsTest = [labels[i] for i in indices if i%3 == 0] 37 | labelsTrain = [labels[i] for i in indices if i%3 != 0] 38 | 39 | xTrain = numpy.array(xListTrain); yTrain = numpy.array(labelsTrain); xTest = numpy.array(xListTest); yTest = numpy.array(labelsTest) 40 | 41 | alphaList = [0.1**i for i in [0,1, 2, 3, 4, 5, 6]] 42 | 43 | rmsError = [] 44 | for alph in alphaList: 45 | wineRidgeModel = linear_model.Ridge(alpha=alph) 46 | wineRidgeModel.fit(xTrain, yTrain) 47 | rmsError.append(numpy.linalg.norm((yTest-wineRidgeModel.predict(xTest)), 2)/sqrt(len(yTest))) 48 | 49 | print("RMS Error alpha") 50 | for i in range(len(rmsError)): 51 | print(rmsError[i], alphaList[i]) 52 | 53 | #plot curve of out-of-sample error versus alpha 54 | x = range(len(rmsError)) 55 | plt.plot(x, rmsError, 'k') 56 | plt.xlabel('-log(alpha)') 57 | plt.ylabel('Error (RMS)') 58 | plt.show() 59 | 60 | #Plot histogram of out of sample errors for best alpha value and scatter plot of actual versus predicted 61 | #Identify index corresponding to min value, retrain with the corresponding value of alpha 62 | #Use resulting model to predict against out of sample data. 
Plot errors (aka residuals) 63 | indexBest = rmsError.index(min(rmsError)) 64 | alph = alphaList[indexBest] 65 | wineRidgeModel = linear_model.Ridge(alpha=alph) 66 | wineRidgeModel.fit(xTrain, yTrain) 67 | errorVector = yTest-wineRidgeModel.predict(xTest) 68 | plt.hist(errorVector) 69 | plt.xlabel("Bin Boundaries") 70 | plt.ylabel("Counts") 71 | plt.show() 72 | 73 | plt.scatter(wineRidgeModel.predict(xTest), yTest, s=100, alpha=0.10) 74 | plt.xlabel('Predicted Taste Score') 75 | plt.ylabel('Actual Taste Score') 76 | plt.show() 77 | 78 | -------------------------------------------------------------------------------- /03/ridgeWineOutput.txt: -------------------------------------------------------------------------------- 1 | RMS Error alpha 2 | (0.65957881763424564, 1.0) 3 | (0.65786109188085928, 0.1) 4 | (0.65761721446402455, 0.010000000000000002) 5 | (0.65752164826417536, 0.0010000000000000002) 6 | (0.65741906801092931, 0.00010000000000000002) 7 | (0.65739416288512531, 1.0000000000000003e-05) 8 | (0.65739130871558593, 1.0000000000000004e-06) 9 | -------------------------------------------------------------------------------- /04/chapter04.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/derekhe/machine-learning-in-python-essential-techniques-for-predictive-analysis-source/d91f60cc29fdbaad32819058f9b2742e955e586a/04/chapter04.zip -------------------------------------------------------------------------------- /04/cvCurveDetails.txt: -------------------------------------------------------------------------------- 1 | Output: 2 | ('Minimum Mean Square Error', 0.5873018933136459) 3 | ('Index of Minimum Mean Square Error', 311) 4 | -------------------------------------------------------------------------------- /04/glmnetOrderedNamesList.txt: -------------------------------------------------------------------------------- 1 | ['"alcohol"', '"volatile acidity"', '"sulphates"', '"total sulfur dioxide"', '"chlorides"', '"fixed acidity"', '"pH"'] 2 | -------------------------------------------------------------------------------- /04/glmnetWine.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | from sklearn import datasets, linear_model 6 | from math import sqrt 7 | import matplotlib.pyplot as plot 8 | def S(z, gamma): 9 | if gamma >= abs(z): 10 | return 0.0 11 | return (z/abs(z))*(abs(z) - gamma) 12 | 13 | #read data into iterable 14 | target_url = ("http://archive.ics.uci.edu/ml/machine-learning-" 15 | "databases/wine-quality/winequality-red.csv") 16 | data = urllib2.urlopen(target_url) 17 | 18 | xList = [] 19 | labels = [] 20 | names = [] 21 | firstLine = True 22 | for line in data: 23 | if firstLine: 24 | names = line.strip().split(";") 25 | firstLine = False 26 | else: 27 | #split on semi-colon 28 | row = line.strip().split(";") 29 | #put labels in separate array 30 | labels.append(float(row[-1])) 31 | #remove label from row 32 | row.pop() 33 | #convert row to floats 34 | floatRow = [float(num) for num in row] 35 | xList.append(floatRow) 36 | 37 | #Normalize columns in x and labels 38 | 39 | nrows = len(xList) 40 | ncols = len(xList[0]) 41 | 42 | #calculate means and variances 43 | xMeans = [] 44 | xSD = [] 45 | for i in range(ncols): 46 | col = [xList[j][i] for j in range(nrows)] 47 | mean = sum(col)/nrows 48 | xMeans.append(mean) 49 | colDiff = [(xList[j][i] - mean) for j in range(nrows)] 50 | sumSq = 
sum([colDiff[i] * colDiff[i] for i in range(nrows)]) 51 | stdDev = sqrt(sumSq/nrows) 52 | xSD.append(stdDev) 53 | 54 | #use calculate mean and standard deviation to normalize xList 55 | xNormalized = [] 56 | for i in range(nrows): 57 | rowNormalized = [(xList[i][j] - xMeans[j])/xSD[j] 58 | for j in range(ncols)] 59 | xNormalized.append(rowNormalized) 60 | 61 | #Normalize labels 62 | meanLabel = sum(labels)/nrows 63 | sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - 64 | meanLabel) for i in range(nrows)])/nrows) 65 | 66 | labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrows)] 67 | 68 | #select value for alpha parameter 69 | 70 | alpha = 1.0 71 | 72 | #make a pass through the data to determine value of lambda that 73 | # just suppresses all coefficients. 74 | #start with betas all equal to zero. 75 | 76 | 77 | xy = [0.0]*ncols 78 | for i in range(nrows): 79 | for j in range(ncols): 80 | xy[j] += xNormalized[i][j] * labelNormalized[i] 81 | 82 | maxXY = 0.0 83 | for i in range(ncols): 84 | val = abs(xy[i])/nrows 85 | if val > maxXY: 86 | maxXY = val 87 | 88 | #calculate starting value for lambda 89 | lam = maxXY/alpha 90 | 91 | #this value of lambda corresponds to beta = list of 0's 92 | #initialize a vector of coefficients beta 93 | beta = [0.0] * ncols 94 | 95 | #initialize matrix of betas at each step 96 | betaMat = [] 97 | betaMat.append(list(beta)) 98 | 99 | #begin iteration 100 | nSteps = 100 101 | lamMult = 0.93 #100 steps gives reduction by factor of 1000 in 102 | # lambda (recommended by authors) 103 | nzList = [] 104 | 105 | for iStep in range(nSteps): 106 | #make lambda smaller so that some coefficient becomes non-zero 107 | lam = lam * lamMult 108 | 109 | deltaBeta = 100.0 110 | eps = 0.01 111 | iterStep = 0 112 | betaInner = list(beta) 113 | while deltaBeta > eps: 114 | iterStep += 1 115 | if iterStep > 100: break 116 | 117 | #cycle through attributes and update one-at-a-time 118 | #record starting value for comparison 119 | betaStart = list(betaInner) 120 | for iCol in range(ncols): 121 | 122 | xyj = 0.0 123 | for i in range(nrows): 124 | #calculate residual with current value of beta 125 | labelHat = sum([xNormalized[i][k]*betaInner[k] 126 | for k in range(ncols)]) 127 | residual = labelNormalized[i] - labelHat 128 | 129 | xyj += xNormalized[i][iCol] * residual 130 | 131 | uncBeta = xyj/nrows + betaInner[iCol] 132 | betaInner[iCol] = S(uncBeta, lam * alpha) / (1 + 133 | lam * (1 - alpha)) 134 | 135 | sumDiff = sum([abs(betaInner[n] - betaStart[n]) 136 | for n in range(ncols)]) 137 | sumBeta = sum([abs(betaInner[n]) for n in range(ncols)]) 138 | deltaBeta = sumDiff/sumBeta 139 | print(iStep, iterStep) 140 | beta = betaInner 141 | 142 | #add newly determined beta to list 143 | betaMat.append(beta) 144 | 145 | #keep track of the order in which the betas become non-zero 146 | nzBeta = [index for index in range(ncols) if beta[index] != 0.0] 147 | for q in nzBeta: 148 | if (q in nzList) == False: 149 | nzList.append(q) 150 | 151 | #print out the ordered list of betas 152 | nameList = [names[nzList[i]] for i in range(len(nzList))] 153 | print(nameList) 154 | 155 | nPts = len(betaMat) 156 | for i in range(ncols): 157 | #plot range of beta values for each attribute 158 | coefCurve = [betaMat[k][i] for k in range(nPts)] 159 | xaxis = range(nPts) 160 | plot.plot(xaxis, coefCurve) 161 | 162 | plot.xlabel("Steps Taken") 163 | plot.ylabel(("Coefficient Values")) 164 | plot.show() 165 | 166 | 
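The hand-rolled coordinate descent above traces the same kind of coefficient path that scikit-learn's enet_path produces (the Chapter 5 scripts later in this repository call enet_path directly). The snippet below is a minimal sketch of that comparison and is not part of the original repository: it assumes pandas is available for downloading the wine CSV (the urllib2 calls in these scripts are Python 2 only) and uses only the l1_ratio, eps, and n_alphas arguments of enet_path. With l1_ratio=1.0 it matches the alpha = 1.0 (pure lasso) setting used above; sklearn's alpha plays the role of lambda in the code above.

import numpy
import pandas as pd
import matplotlib.pyplot as plot
from sklearn.linear_model import enet_path

#read the red wine data and normalize attributes and labels, as in glmnetWine.py
target_url = ("http://archive.ics.uci.edu/ml/machine-learning-"
              "databases/wine-quality/winequality-red.csv")
wine = pd.read_csv(target_url, sep=";")
X = wine.values[:, :-1]
y = wine.values[:, -1]
X = (X - X.mean(axis=0)) / X.std(axis=0)
y = (y - y.mean()) / y.std()

#compute the lasso coefficient path over a geometric grid of penalty values
alphas, coefs, _ = enet_path(X, y, l1_ratio=1.0, eps=1e-3, n_alphas=100)

#plot coefficient trajectories; invert the x-axis so the heaviest penalty (all coefficients zero)
#is on the left, matching the step-wise curves produced by the script above
plot.plot(alphas, coefs.T)
plot.semilogx()
plot.gca().invert_xaxis()
plot.xlabel("alpha")
plot.ylabel("Coefficient Values")
plot.show()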
-------------------------------------------------------------------------------- /04/larsAbalone.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | 3 | import urllib2 4 | from pylab import * 5 | import matplotlib.pyplot as plot 6 | 7 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data" 8 | #read abalone data 9 | data = urllib2.urlopen(target_url) 10 | 11 | xList = [] 12 | labels = [] 13 | 14 | for line in data: 15 | #split on semi-colon 16 | row = line.strip().split(",") 17 | 18 | #put labels in separate array and remove label from row 19 | labels.append(float(row.pop())) 20 | 21 | #form list of list of attributes (all strings) 22 | xList.append(row) 23 | 24 | names = ['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings'] 25 | 26 | #code three-valued sex attribute as numeric 27 | xCoded = [] 28 | for row in xList: 29 | #first code the three-valued sex variable 30 | codedSex = [0.0, 0.0] 31 | if row[0] == 'M': codedSex[0] = 1.0 32 | if row[0] == 'F': codedSex[1] = 1.0 33 | 34 | numRow = [float(row[i]) for i in range(1,len(row))] 35 | rowCoded = list(codedSex) + numRow 36 | xCoded.append(rowCoded) 37 | 38 | namesCoded = ['Sex1', 'Sex2', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings'] 39 | 40 | nrows = len(xCoded) 41 | ncols = len(xCoded[1]) 42 | 43 | xMeans = [] 44 | xSD = [] 45 | for i in range(ncols): 46 | col = [xCoded[j][i] for j in range(nrows)] 47 | mean = sum(col)/nrows 48 | xMeans.append(mean) 49 | colDiff = [(xCoded[j][i] - mean) for j in range(nrows)] 50 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrows)]) 51 | stdDev = sqrt(sumSq/nrows) 52 | xSD.append(stdDev) 53 | 54 | #use calculate mean and standard deviation to normalize xCoded 55 | xNormalized = [] 56 | for i in range(nrows): 57 | rowNormalized = [(xCoded[i][j] - xMeans[j])/xSD[j] for j in range(ncols)] 58 | xNormalized.append(rowNormalized) 59 | 60 | #Normalize labels 61 | meanLabel = sum(labels)/nrows 62 | sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrows)])/nrows) 63 | 64 | labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrows)] 65 | 66 | #initialize a vector of coefficients beta 67 | beta = [0.0] * ncols 68 | 69 | #initialize matrix of betas at each step 70 | betaMat = [] 71 | betaMat.append(list(beta)) 72 | 73 | 74 | #number of steps to take 75 | nSteps = 350 76 | stepSize = 0.004 77 | nzList = [] 78 | 79 | for i in range(nSteps): 80 | #calculate residuals 81 | residuals = [0.0] * nrows 82 | for j in range(nrows): 83 | labelsHat = sum([xNormalized[j][k] * beta[k] for k in range(ncols)]) 84 | residuals[j] = labelNormalized[j] - labelsHat 85 | 86 | #calculate correlation between attribute columns from normalized wine and residual 87 | corr = [0.0] * ncols 88 | 89 | for j in range(ncols): 90 | corr[j] = sum([xNormalized[k][j] * residuals[k] for k in range(nrows)]) / nrows 91 | 92 | iStar = 0 93 | corrStar = corr[0] 94 | 95 | for j in range(1, (ncols)): 96 | if abs(corrStar) < abs(corr[j]): 97 | iStar = j; corrStar = corr[j] 98 | 99 | beta[iStar] += stepSize * corrStar / abs(corrStar) 100 | betaMat.append(list(beta)) 101 | 102 | 103 | nzBeta = [index for index in range(ncols) if beta[index] != 0.0] 104 | for q in nzBeta: 105 | if (q in nzList) == False: 106 | nzList.append(q) 107 | 108 | nameList = [namesCoded[nzList[i]] for i 
in range(len(nzList))] 109 | 110 | print(nameList) 111 | for i in range(ncols): 112 | #plot range of beta values for each attribute 113 | coefCurve = [betaMat[k][i] for k in range(nSteps)] 114 | xaxis = range(nSteps) 115 | plot.plot(xaxis, coefCurve) 116 | 117 | plot.xlabel("Steps Taken") 118 | plot.ylabel(("Coefficient Values")) 119 | plot.show() 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /04/larsAbaloneOutput.txt: -------------------------------------------------------------------------------- 1 | ['Shell weight', 'Height', 'Sex2', 'Shucked weight', 'Diameter', 'Sex1'] 2 | -------------------------------------------------------------------------------- /04/larsRocksVMines.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import urllib2 3 | import sys 4 | from math import sqrt 5 | import matplotlib.pyplot as plot 6 | 7 | #read data from uci data repository 8 | target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data" 9 | data = urllib2.urlopen(target_url) 10 | 11 | 12 | #arrange data into list for labels and list of lists for attributes 13 | xList = [] 14 | 15 | 16 | for line in data: 17 | #split on comma 18 | row = line.strip().split(",") 19 | xList.append(row) 20 | 21 | #separate labels from attributes, convert from attributes from string to numeric and convert "M" to 1 and "R" to 0 22 | 23 | xNum = [] 24 | labels = [] 25 | 26 | for row in xList: 27 | lastCol = row.pop() 28 | if lastCol == "M": 29 | labels.append(1.0) 30 | else: 31 | labels.append(0.0) 32 | attrRow = [float(elt) for elt in row] 33 | xNum.append(attrRow) 34 | 35 | #number of rows and columns in x matrix 36 | nrow = len(xNum) 37 | ncol = len(xNum[1]) 38 | 39 | 40 | 41 | #calculate means and variances 42 | xMeans = [] 43 | xSD = [] 44 | for i in range(ncol): 45 | col = [xNum[j][i] for j in range(nrow)] 46 | mean = sum(col)/nrow 47 | xMeans.append(mean) 48 | colDiff = [(xNum[j][i] - mean) for j in range(nrow)] 49 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrow)]) 50 | stdDev = sqrt(sumSq/nrow) 51 | xSD.append(stdDev) 52 | 53 | #use calculate mean and standard deviation to normalize xNum 54 | xNormalized = [] 55 | for i in range(nrow): 56 | rowNormalized = [(xNum[i][j] - xMeans[j])/xSD[j] for j in range(ncol)] 57 | xNormalized.append(rowNormalized) 58 | 59 | #Normalize labels 60 | meanLabel = sum(labels)/nrow 61 | sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrow)])/nrow) 62 | 63 | labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrow)] 64 | 65 | #initialize a vector of coefficients beta 66 | beta = [0.0] * ncol 67 | 68 | #initialize matrix of betas at each step 69 | betaMat = [] 70 | betaMat.append(list(beta)) 71 | 72 | 73 | #number of steps to take 74 | nSteps = 350 75 | stepSize = 0.004 76 | nzList = [] 77 | 78 | for i in range(nSteps): 79 | #calculate residuals 80 | residuals = [0.0] * nrow 81 | for j in range(nrow): 82 | labelsHat = sum([xNormalized[j][k] * beta[k] for k in range(ncol)]) 83 | residuals[j] = labelNormalized[j] - labelsHat 84 | 85 | #calculate correlation between attribute columns from normalized wine and residual 86 | corr = [0.0] * ncol 87 | 88 | for j in range(ncol): 89 | corr[j] = sum([xNormalized[k][j] * residuals[k] for k in range(nrow)]) / nrow 90 | 91 | iStar = 0 92 | corrStar = corr[0] 93 | 94 | for j in range(1, (ncol)): 95 | 
if abs(corrStar) < abs(corr[j]): 96 | iStar = j; corrStar = corr[j] 97 | 98 | beta[iStar] += stepSize * corrStar / abs(corrStar) 99 | betaMat.append(list(beta)) 100 | 101 | 102 | nzBeta = [index for index in range(ncol) if beta[index] != 0.0] 103 | for q in nzBeta: 104 | if (q in nzList) == False: 105 | nzList.append(q) 106 | 107 | #make up names for columns of xNum 108 | names = ['V' + str(i) for i in range(ncol)] 109 | nameList = [names[nzList[i]] for i in range(len(nzList))] 110 | 111 | print(nameList) 112 | for i in range(ncol): 113 | #plot range of beta values for each attribute 114 | coefCurve = [betaMat[k][i] for k in range(nSteps)] 115 | xaxis = range(nSteps) 116 | plot.plot(xaxis, coefCurve) 117 | 118 | plot.xlabel("Steps Taken") 119 | plot.ylabel(("Coefficient Values")) 120 | plot.show() -------------------------------------------------------------------------------- /04/larsWine.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | from pylab import * 5 | import matplotlib.pyplot as plot 6 | from math import fabs 7 | 8 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 9 | wine = pd.read_csv(target_url,header=0, sep=";") 10 | 11 | #normalize the wine data 12 | summary = wine.describe() 13 | print(summary) 14 | 15 | wineNormalized = wine 16 | ncols = len(wineNormalized.columns) 17 | nrows = len(wineNormalized) 18 | 19 | for i in range(ncols): 20 | mean = summary.iloc[1, i] 21 | sd = summary.iloc[2, i] 22 | wineNormalized.iloc[:,i:(i + 1)] = (wineNormalized.iloc[:,i:(i + 1)] - mean) / sd 23 | 24 | #initialize a vector of coefficients beta 25 | beta = [0.0] * (ncols - 1) 26 | #initialize matrix of betas at each step 27 | betaMat = [] 28 | betaMat.append(list(beta)) 29 | #initialize residuals list 30 | residuals = [0.0] * nrows 31 | 32 | #number of steps to take 33 | nSteps = 100 34 | stepSize = 0.1 35 | 36 | for i in range(nSteps): 37 | #calculate residuals 38 | for j in range(nrows): 39 | residuals[j] = wineNormalized.iloc[j, (ncols - 1)] 40 | for k in range(ncols - 1): 41 | residuals[j] += - wineNormalized.iloc[j, k] * beta[k] 42 | 43 | #calculate correlation between attribute columns from normalized wine and residual 44 | corr = [0.0] * (ncols - 1) 45 | 46 | for j in range(ncols - 1): 47 | for k in range(nrows): 48 | corr[j] += wineNormalized.iloc[k,j] * residuals[k] / nrows 49 | 50 | iStar = 0 51 | corrStar = corr[0] 52 | 53 | for j in range(1, (ncols - 1)): 54 | if abs(corrStar) < abs(corr[j]): 55 | iStar = j; corrStar = corr[j] 56 | 57 | beta[iStar] += stepSize * corrStar / abs(corrStar) 58 | betaMat.append(list(beta)) 59 | 60 | 61 | for i in range(ncols - 1): 62 | #plot range of beta values for each attribute 63 | coefCurve = betaMat[0:nSteps][i] 64 | coefCurve.plot() 65 | 66 | plot.xlabel("Attribute Index") 67 | plot.ylabel(("Attribute Values")) 68 | plot.show() 69 | -------------------------------------------------------------------------------- /04/larsWine2.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | from sklearn import datasets, linear_model 6 | from math import sqrt 7 | import matplotlib.pyplot as plot 8 | 9 | #read data into iterable 10 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 11 | data = 
urllib2.urlopen(target_url) 12 | 13 | xList = [] 14 | labels = [] 15 | names = [] 16 | firstLine = True 17 | for line in data: 18 | if firstLine: 19 | names = line.strip().split(";") 20 | firstLine = False 21 | else: 22 | #split on semi-colon 23 | row = line.strip().split(";") 24 | #put labels in separate array 25 | labels.append(float(row[-1])) 26 | #remove label from row 27 | row.pop() 28 | #convert row to floats 29 | floatRow = [float(num) for num in row] 30 | xList.append(floatRow) 31 | 32 | #Normalize columns in x and labels 33 | 34 | nrows = len(xList) 35 | ncols = len(xList[0]) 36 | 37 | #calculate means and variances 38 | xMeans = [] 39 | xSD = [] 40 | for i in range(ncols): 41 | col = [xList[j][i] for j in range(nrows)] 42 | mean = sum(col)/nrows 43 | xMeans.append(mean) 44 | colDiff = [(xList[j][i] - mean) for j in range(nrows)] 45 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrows)]) 46 | stdDev = sqrt(sumSq/nrows) 47 | xSD.append(stdDev) 48 | 49 | #use calculate mean and standard deviation to normalize xList 50 | xNormalized = [] 51 | for i in range(nrows): 52 | rowNormalized = [(xList[i][j] - xMeans[j])/xSD[j] for j in range(ncols)] 53 | xNormalized.append(rowNormalized) 54 | 55 | #Normalize labels 56 | meanLabel = sum(labels)/nrows 57 | sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrows)])/nrows) 58 | 59 | labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrows)] 60 | 61 | #initialize a vector of coefficients beta 62 | beta = [0.0] * ncols 63 | 64 | #initialize matrix of betas at each step 65 | betaMat = [] 66 | betaMat.append(list(beta)) 67 | 68 | 69 | #number of steps to take 70 | nSteps = 350 71 | stepSize = 0.004 72 | nzList = [] 73 | 74 | for i in range(nSteps): 75 | #calculate residuals 76 | residuals = [0.0] * nrows 77 | for j in range(nrows): 78 | labelsHat = sum([xNormalized[j][k] * beta[k] for k in range(ncols)]) 79 | residuals[j] = labelNormalized[j] - labelsHat 80 | 81 | #calculate correlation between attribute columns from normalized wine and residual 82 | corr = [0.0] * ncols 83 | 84 | for j in range(ncols): 85 | corr[j] = sum([xNormalized[k][j] * residuals[k] for k in range(nrows)]) / nrows 86 | 87 | iStar = 0 88 | corrStar = corr[0] 89 | 90 | for j in range(1, (ncols)): 91 | if abs(corrStar) < abs(corr[j]): 92 | iStar = j; corrStar = corr[j] 93 | 94 | beta[iStar] += stepSize * corrStar / abs(corrStar) 95 | betaMat.append(list(beta)) 96 | 97 | 98 | nzBeta = [index for index in range(ncols) if beta[index] != 0.0] 99 | for q in nzBeta: 100 | if (q in nzList) == False: 101 | nzList.append(q) 102 | 103 | nameList = [names[nzList[i]] for i in range(len(nzList))] 104 | 105 | print(nameList) 106 | for i in range(ncols): 107 | #plot range of beta values for each attribute 108 | coefCurve = [betaMat[k][i] for k in range(nSteps)] 109 | xaxis = range(nSteps) 110 | plot.plot(xaxis, coefCurve) 111 | 112 | plot.xlabel("Steps Taken") 113 | plot.ylabel(("Coefficient Values")) 114 | plot.show() -------------------------------------------------------------------------------- /04/larsWineCV.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | from sklearn import datasets, linear_model 6 | from math import sqrt 7 | import matplotlib.pyplot as plot 8 | 9 | 10 | #read data into iterable 11 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 12 | data = 
urllib2.urlopen(target_url) 13 | 14 | xList = [] 15 | labels = [] 16 | names = [] 17 | firstLine = True 18 | for line in data: 19 | if firstLine: 20 | names = line.strip().split(";") 21 | firstLine = False 22 | else: 23 | #split on semi-colon 24 | row = line.strip().split(";") 25 | #put labels in separate array 26 | labels.append(float(row[-1])) 27 | #remove label from row 28 | row.pop() 29 | #convert row to floats 30 | floatRow = [float(num) for num in row] 31 | xList.append(floatRow) 32 | 33 | #Normalize columns in x and labels 34 | 35 | nrows = len(xList) 36 | ncols = len(xList[0]) 37 | 38 | #calculate means and variances 39 | xMeans = [] 40 | xSD = [] 41 | for i in range(ncols): 42 | col = [xList[j][i] for j in range(nrows)] 43 | mean = sum(col)/nrows 44 | xMeans.append(mean) 45 | colDiff = [(xList[j][i] - mean) for j in range(nrows)] 46 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrows)]) 47 | stdDev = sqrt(sumSq/nrows) 48 | xSD.append(stdDev) 49 | 50 | #use calculated mean and standard deviation to normalize xList 51 | xNormalized = [] 52 | for i in range(nrows): 53 | rowNormalized = [(xList[i][j] - xMeans[j])/xSD[j] for j in range(ncols)] 54 | xNormalized.append(rowNormalized) 55 | 56 | #Normalize labels 57 | meanLabel = sum(labels)/nrows 58 | sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrows)])/nrows) 59 | 60 | labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrows)] 61 | 62 | #Build cross-validation loop to determine best coefficient values. 63 | 64 | #number of cross validation folds 65 | nxval = 10 66 | 67 | #number of steps and step size 68 | nSteps = 350 69 | stepSize = 0.004 70 | 71 | #initialize list for storing errors. 72 | errors = [] 73 | for i in range(nSteps): 74 | b = [] 75 | errors.append(b) 76 | 77 | 78 | for ixval in range(nxval): 79 | #Define test and training index sets 80 | idxTest = [a for a in range(nrows) if a%nxval == ixval*nxval] 81 | idxTrain = [a for a in range(nrows) if a%nxval != ixval*nxval] 82 | 83 | #Define test and training attribute and label sets 84 | xTrain = [xNormalized[r] for r in idxTrain] 85 | xTest = [xNormalized[r] for r in idxTest] 86 | labelTrain = [labelNormalized[r] for r in idxTrain] 87 | labelTest = [labelNormalized[r] for r in idxTest] 88 | 89 | #Train LARS regression on Training Data 90 | nrowsTrain = len(idxTrain) 91 | nrowsTest = len(idxTest) 92 | 93 | #initialize a vector of coefficients beta 94 | beta = [0.0] * ncols 95 | 96 | #initialize matrix of betas at each step 97 | betaMat = [] 98 | betaMat.append(list(beta)) 99 | 100 | for iStep in range(nSteps): 101 | #calculate residuals 102 | residuals = [0.0] * nrows 103 | for j in range(nrowsTrain): 104 | labelsHat = sum([xTrain[j][k] * beta[k] for k in range(ncols)]) 105 | residuals[j] = labelTrain[j] - labelsHat 106 | 107 | #calculate correlation between attribute columns from normalized wine and residual 108 | corr = [0.0] * ncols 109 | 110 | for j in range(ncols): 111 | corr[j] = sum([xTrain[k][j] * residuals[k] for k in range(nrowsTrain)]) / nrowsTrain 112 | 113 | iStar = 0 114 | corrStar = corr[0] 115 | 116 | for j in range(1, (ncols)): 117 | if abs(corrStar) < abs(corr[j]): 118 | iStar = j; corrStar = corr[j] 119 | 120 | beta[iStar] += stepSize * corrStar / abs(corrStar) 121 | betaMat.append(list(beta)) 122 | 123 | #Use beta just calculated to predict and accumulate out of sample error - not being used in the calc of beta 124 | for j in range(nrowsTest): 125 | labelsHat = sum([xTest[j][k] * beta[k] for k in 
range(ncols)]) 126 | err = labelTest[j] - labelsHat 127 | errors[iStep].append(err) 128 | 129 | cvCurve = [] 130 | for errVect in errors: 131 | mse = sum([x*x for x in errVect])/len(errVect) 132 | cvCurve.append(mse) 133 | 134 | minMse = min(cvCurve) 135 | minPt = [i for i in range(len(cvCurve)) if cvCurve[i] == minMse ][0] 136 | print("Minimum Mean Square Error", minMse) 137 | print("Index of Minimum Mean Square Error", minPt) 138 | 139 | xaxis = range(len(cvCurve)) 140 | plot.plot(xaxis, cvCurve) 141 | 142 | plot.xlabel("Steps Taken") 143 | plot.ylabel(("Mean Square Error")) 144 | plot.show() 145 | -------------------------------------------------------------------------------- /04/orderedNamesList.txt: -------------------------------------------------------------------------------- 1 | ['"alcohol"', '"volatile acidity"', '"sulphates"', '"total sulfur dioxide"', '"chlorides"', '"fixed acidity"', '"pH"', '"free sulfur dioxide"', '"citric acid"', '"residual sugar"', '"density"'] 2 | -------------------------------------------------------------------------------- /04/rocksVMinesCoefOrder.txt: -------------------------------------------------------------------------------- 1 | ['V10', 'V48', 'V44', 'V11', 'V35', 'V51', 'V20', 'V3', 'V21', 'V15', 'V43', 'V0', 'V22', 'V45', 'V53', 'V27', 'V30', 'V50', 'V58', 'V46', 'V56', 'V28', 'V39'] 2 | 3 | -------------------------------------------------------------------------------- /04/wineBasisExpand.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import matplotlib.pyplot as plot 5 | from math import sqrt, cos, log 6 | 7 | #read data into iterable 8 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 9 | data = urllib2.urlopen(target_url) 10 | 11 | xList = [] 12 | labels = [] 13 | names = [] 14 | firstLine = True 15 | for line in data: 16 | if firstLine: 17 | names = line.strip().split(";") 18 | firstLine = False 19 | else: 20 | #split on semi-colon 21 | row = line.strip().split(";") 22 | #put labels in separate array 23 | labels.append(float(row[-1])) 24 | #remove label from row 25 | row.pop() 26 | #convert row to floats 27 | floatRow = [float(num) for num in row] 28 | xList.append(floatRow) 29 | 30 | 31 | #extend the alcohol variable (the last column in that attribute matrix 32 | xExtended = [] 33 | alchCol = len(xList[1]) 34 | 35 | 36 | for row in xList: 37 | newRow = list(row) 38 | alch = row[alchCol - 1] 39 | newRow.append((alch - 7) * (alch - 7)/10) 40 | newRow.append(5 * log(alch - 7)) 41 | newRow.append(cos(alch)) 42 | xExtended.append(newRow) 43 | 44 | nrow = len(xList) 45 | v1 = [xExtended[j][alchCol - 1] for j in range(nrow)] 46 | 47 | for i in range(4): 48 | v2 = [xExtended[j][alchCol - 1 + i] for j in range(nrow)] 49 | plot.scatter(v1,v2) 50 | 51 | plot.xlabel("Alcohol") 52 | plot.ylabel(("Extension Functions of Alcohol")) 53 | plot.show() 54 | -------------------------------------------------------------------------------- /05/chapter05.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/derekhe/machine-learning-in-python-essential-techniques-for-predictive-analysis-source/d91f60cc29fdbaad32819058f9b2742e955e586a/05/chapter05.zip -------------------------------------------------------------------------------- /05/glass/glassENetRegCV.py: -------------------------------------------------------------------------------- 1 | 
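#glassENetRegCV.py -- multiclass classification of the UCI glass data with penalized linear regression.
#The glass type is recoded as one-versus-all indicator columns, attributes and indicators are normalized,
#and a 10-fold cross-validation loop fits one lasso path (enet_path with l1_ratio=1.0) per class.
#At each penalty step the class with the largest un-normalized prediction is taken as the predicted
#label, and the resulting misclassification rate is plotted against the penalty parameter steps.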
import urllib2 2 | from math import sqrt, fabs, exp 3 | import matplotlib.pyplot as plot 4 | from sklearn.linear_model import enet_path 5 | from sklearn.metrics import roc_auc_score, roc_curve 6 | import numpy 7 | 8 | target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data" 9 | data = urllib2.urlopen(target_url) 10 | 11 | #arrange data into list for labels and list of lists for attributes 12 | xList = [] 13 | for line in data: 14 | #split on comma 15 | row = line.strip().split(",") 16 | xList.append(row) 17 | 18 | names = ['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type'] 19 | 20 | #Separate attributes and labels 21 | xNum = [] 22 | labels = [] 23 | 24 | for row in xList: 25 | labels.append(row.pop()) 26 | l = len(row) 27 | #eliminate ID 28 | attrRow = [float(row[i]) for i in range(1, l)] 29 | xNum.append(attrRow) 30 | 31 | #number of rows and columns in x matrix 32 | nrow = len(xNum) 33 | ncol = len(xNum[1]) 34 | 35 | #creat one versus all label vectors 36 | #get distinct glass types and assign index to each 37 | yOneVAll = [] 38 | labelSet = set(labels) 39 | labelList = list(labelSet) 40 | labelList.sort() 41 | nlabels = len(labelList) 42 | for i in range(nrow): 43 | yRow = [0.0]*nlabels 44 | index = labelList.index(labels[i]) 45 | yRow[index] = 1.0 46 | yOneVAll.append(yRow) 47 | 48 | #calculate means and variances 49 | xMeans = [] 50 | xSD = [] 51 | for i in range(ncol): 52 | col = [xNum[j][i] for j in range(nrow)] 53 | mean = sum(col)/nrow 54 | xMeans.append(mean) 55 | colDiff = [(xNum[j][i] - mean) for j in range(nrow)] 56 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrow)]) 57 | stdDev = sqrt(sumSq/nrow) 58 | xSD.append(stdDev) 59 | 60 | #use calculate mean and standard deviation to normalize xNum 61 | xNormalized = [] 62 | for i in range(nrow): 63 | rowNormalized = [(xNum[i][j] - xMeans[j])/xSD[j] for j in range(ncol)] 64 | xNormalized.append(rowNormalized) 65 | 66 | #normalize y's to center 67 | yMeans = [] 68 | ySD = [] 69 | for i in range(nlabels): 70 | col = [yOneVAll[j][i] for j in range(nrow)] 71 | mean = sum(col)/nrow 72 | yMeans.append(mean) 73 | colDiff = [(yOneVAll[j][i] - mean) for j in range(nrow)] 74 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrow)]) 75 | stdDev = sqrt(sumSq/nrow) 76 | ySD.append(stdDev) 77 | 78 | yNormalized = [] 79 | for i in range(nrow): 80 | rowNormalized = [(yOneVAll[i][j] - yMeans[j])/ySD[j] for j in range(nlabels)] 81 | yNormalized.append(rowNormalized) 82 | 83 | 84 | #number of cross validation folds 85 | nxval = 10 86 | nAlphas=100 87 | misClass = [0.0] * nAlphas 88 | 89 | for ixval in range(nxval): 90 | #Define test and training index sets 91 | idxTest = [a for a in range(nrow) if a%nxval == ixval%nxval] 92 | idxTrain = [a for a in range(nrow) if a%nxval != ixval%nxval] 93 | 94 | #Define test and training attribute and label sets 95 | xTrain = numpy.array([xNormalized[r] for r in idxTrain]) 96 | xTest = numpy.array([xNormalized[r] for r in idxTest]) 97 | yTrain = [yNormalized[r] for r in idxTrain] 98 | yTest = [yNormalized[r] for r in idxTest] 99 | labelsTest = [labels[r] for r in idxTest] 100 | 101 | #build model for each column in yTrain 102 | models = [] 103 | lenTrain = len(yTrain) 104 | lenTest = nrow - lenTrain 105 | for iModel in range(nlabels): 106 | yTemp = numpy.array([yTrain[j][iModel] for j in range(lenTrain)]) 107 | models.append(enet_path(xTrain, yTemp,l1_ratio=1.0, fit_intercept=False, eps=0.5e-3, n_alphas=nAlphas , return_models=False)) 108 | 109 | for iStep in 
range(1,nAlphas): 110 | #Assemble the predictions for all the models, find largest prediction and calc error 111 | allPredictions = [] 112 | for iModel in range(nlabels): 113 | _, coefs, _ = models[iModel] 114 | predTemp = list(numpy.dot(xTest, coefs[:,iStep])) 115 | #un-normalize the prediction for comparison 116 | predUnNorm = [(predTemp[j]*ySD[iModel] + yMeans[iModel]) for j in range(len(predTemp))] 117 | allPredictions.append(predUnNorm) 118 | 119 | predictions = [] 120 | for i in range(lenTest): 121 | listOfPredictions = [allPredictions[j][i] for j in range(nlabels) ] 122 | idxMax = listOfPredictions.index(max(listOfPredictions)) 123 | if labelList[idxMax] != labelsTest[i]: 124 | misClass[iStep] += 1.0 125 | 126 | misClassPlot = [misClass[i]/nrow for i in range(1, nAlphas)] 127 | 128 | plot.plot(misClassPlot) 129 | 130 | plot.xlabel("Penalty Parameter Steps") 131 | plot.ylabel(("Misclassification Error Rate")) 132 | plot.show() -------------------------------------------------------------------------------- /05/rocksVMines/rocksVMinesCoefCurves.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import urllib2 3 | from math import sqrt, fabs, exp 4 | import matplotlib.pyplot as plot 5 | from sklearn.linear_model import enet_path 6 | from sklearn.metrics import roc_auc_score, roc_curve 7 | import numpy 8 | 9 | #read data from uci data repository 10 | target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data" 11 | data = urllib2.urlopen(target_url) 12 | 13 | 14 | #arrange data into list for labels and list of lists for attributes 15 | xList = [] 16 | 17 | 18 | for line in data: 19 | #split on comma 20 | row = line.strip().split(",") 21 | xList.append(row) 22 | 23 | #separate labels from attributes, convert from attributes from string to numeric and convert "M" to 1 and "R" to 0 24 | 25 | xNum = [] 26 | labels = [] 27 | 28 | for row in xList: 29 | lastCol = row.pop() 30 | if lastCol == "M": 31 | labels.append(1.0) 32 | else: 33 | labels.append(0.0) 34 | attrRow = [float(elt) for elt in row] 35 | xNum.append(attrRow) 36 | 37 | #number of rows and columns in x matrix 38 | nrow = len(xNum) 39 | ncol = len(xNum[1]) 40 | 41 | alpha = 1.0 42 | 43 | #calculate means and variances 44 | xMeans = [] 45 | xSD = [] 46 | for i in range(ncol): 47 | col = [xNum[j][i] for j in range(nrow)] 48 | mean = sum(col)/nrow 49 | xMeans.append(mean) 50 | colDiff = [(xNum[j][i] - mean) for j in range(nrow)] 51 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrow)]) 52 | stdDev = sqrt(sumSq/nrow) 53 | xSD.append(stdDev) 54 | 55 | #use calculate mean and standard deviation to normalize xNum 56 | xNormalized = [] 57 | for i in range(nrow): 58 | rowNormalized = [(xNum[i][j] - xMeans[j])/xSD[j] for j in range(ncol)] 59 | xNormalized.append(rowNormalized) 60 | 61 | #normalize labels to center 62 | 63 | meanLabel = sum(labels)/nrow 64 | sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrow)])/nrow) 65 | 66 | labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrow)] 67 | 68 | #Convert normalized labels to numpy array 69 | Y = numpy.array(labelNormalized) 70 | 71 | #Convert normalized attributes to numpy array 72 | X = numpy.array(xNormalized) 73 | 74 | alphas, coefs, _ = enet_path(X, Y,l1_ratio=0.8, fit_intercept=False, return_models=False) 75 | 76 | plot.plot(alphas,coefs.T) 77 | 78 | plot.xlabel('alpha') 79 | 
plot.ylabel('Coefficients') 80 | plot.axis('tight') 81 | plot.semilogx() 82 | ax = plot.gca() 83 | ax.invert_xaxis() 84 | plot.show() 85 | 86 | nattr, nalpha = coefs.shape 87 | 88 | #find coefficient ordering 89 | nzList = [] 90 | for iAlpha in range(1,nalpha): 91 | coefList = list(coefs[: ,iAlpha]) 92 | nzCoef = [index for index in range(nattr) if coefList[index] != 0.0] 93 | for q in nzCoef: 94 | if not(q in nzList): 95 | nzList.append(q) 96 | 97 | #make up names for columns of X 98 | names = ['V' + str(i) for i in range(ncol)] 99 | nameList = [names[nzList[i]] for i in range(len(nzList))] 100 | print("Attributes Ordered by How Early They Enter the Model") 101 | print(nameList) 102 | print('') 103 | #find coefficients corresponding to best alpha value. alpha value corresponding to 104 | #normalized X and normalized Y is 0.020334883589342503 105 | 106 | alphaStar = 0.020334883589342503 107 | indexLTalphaStar = [index for index in range(100) if alphas[index] > alphaStar] 108 | indexStar = max(indexLTalphaStar) 109 | 110 | #here's the set of coefficients to deploy 111 | coefStar = list(coefs[:,indexStar]) 112 | print("Best Coefficient Values ") 113 | print(coefStar) 114 | print('') 115 | #The coefficients on normalized attributes give another slightly different ordering 116 | 117 | absCoef = [abs(a) for a in coefStar] 118 | 119 | #sort by magnitude 120 | coefSorted = sorted(absCoef, reverse=True) 121 | 122 | idxCoefSize = [absCoef.index(a) for a in coefSorted if not(a == 0.0)] 123 | 124 | namesList2 = [names[idxCoefSize[i]] for i in range(len(idxCoefSize))] 125 | 126 | print("Attributes Ordered by Coef Size at Optimum alpha") 127 | print(namesList2) -------------------------------------------------------------------------------- /05/rocksVMines/rocksVMinesCoefCurvesPrintedOutput.txt: -------------------------------------------------------------------------------- 1 | Attributes Ordered by How Early They Enter the Model 2 | ['V10', 'V48', 'V11', 'V44', 'V35', 'V51', 'V20', 'V3', 'V21', 'V45', 'V43', 'V15', 'V0', 'V22', 'V27', 'V50', 'V53', 'V30', 'V58', 'V56', 'V28', 'V39', 'V46', 'V19', 'V54', 'V29', 'V57', 'V6', 'V8', 'V7', 'V49', 'V2', 'V23', 'V37', 'V55', 'V4', 'V13', 'V36', 'V38', 'V26', 'V31', 'V1', 'V34', 'V33', 'V24', 'V16', 'V17', 'V5', 'V52', 'V41', 'V40', 'V59', 'V12', 'V9', 'V18', 'V14', 'V47', 'V42'] 3 | 4 | Best Coefficient Values 5 | [0.082258256813766639, 0.0020619887220043702, -0.11828642590855878, 0.16633956932499627, 0.0042854388193718004, -0.0, -0.04366252474594004, -0.07751510487942842, 0.10000054356323497, 0.0, 0.090617207036282038, 0.21210870399915693, -0.0, -0.010655386149821946, -0.0, -0.13328659558143779, -0.0, 0.0, 0.0, 0.052814854501417867, 0.038531154796719078, 0.0035515348181877982, 0.090854714680378215, 0.030316113904025031, -0.0, 0.0, 0.0086195542357481014, 0.0, 0.0, 0.17497679257272536, -0.2215687804617206, 0.012614243827937584, 0.0, -0.0, 0.0, -0.17160601809439849, -0.080450013824209077, 0.078096790041518344, 0.022035287616766441, -0.072184409273692227, 0.0, -0.0, 0.0, 0.057018816876250704, 0.096478265685721556, 0.039917367637236176, 0.049158231541622875, 0.0, 0.22671917920123755, -0.096272735479951091, 0.0, 0.078886784332226484, 0.0, 0.062312821755756878, -0.082785510713295471, 0.014466967172068596, -0.074326527525632721, 0.068096475974257331, 0.070488864435477847, 0.0] 6 | 7 | Attributes Ordered by Coef Size at Optimum alpha 8 | ['V48', 'V30', 'V11', 'V29', 'V35', 'V3', 'V15', 'V2', 'V8', 'V44', 'V49', 'V22', 'V10', 'V54', 'V0', 'V36', 'V51', 'V37', 'V7', 
'V56', 'V39', 'V58', 'V57', 'V53', 'V43', 'V19', 'V46', 'V6', 'V45', 'V20', 'V23', 'V38', 'V55', 'V31', 'V13', 'V26', 'V4', 'V21', 'V1'] 9 | 10 | -------------------------------------------------------------------------------- /05/rocksVMines/rocksVMinesENetRegCV.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import urllib2 3 | from math import sqrt, fabs, exp 4 | import matplotlib.pyplot as plot 5 | from sklearn.linear_model import enet_path 6 | from sklearn.metrics import roc_auc_score, roc_curve 7 | import numpy 8 | 9 | #read data from uci data repository 10 | target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data" 11 | data = urllib2.urlopen(target_url) 12 | 13 | 14 | #arrange data into list for labels and list of lists for attributes 15 | xList = [] 16 | 17 | 18 | for line in data: 19 | #split on comma 20 | row = line.strip().split(",") 21 | xList.append(row) 22 | 23 | #separate labels from attributes, convert from attributes from string to numeric and convert "M" to 1 and "R" to 0 24 | 25 | xNum = [] 26 | labels = [] 27 | 28 | for row in xList: 29 | lastCol = row.pop() 30 | if lastCol == "M": 31 | labels.append(1.0) 32 | else: 33 | labels.append(0.0) 34 | attrRow = [float(elt) for elt in row] 35 | xNum.append(attrRow) 36 | 37 | #number of rows and columns in x matrix 38 | nrow = len(xNum) 39 | ncol = len(xNum[1]) 40 | 41 | alpha = 1.0 42 | 43 | #calculate means and variances 44 | xMeans = [] 45 | xSD = [] 46 | for i in range(ncol): 47 | col = [xNum[j][i] for j in range(nrow)] 48 | mean = sum(col)/nrow 49 | xMeans.append(mean) 50 | colDiff = [(xNum[j][i] - mean) for j in range(nrow)] 51 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrow)]) 52 | stdDev = sqrt(sumSq/nrow) 53 | xSD.append(stdDev) 54 | 55 | #use calculate mean and standard deviation to normalize xNum 56 | xNormalized = [] 57 | for i in range(nrow): 58 | rowNormalized = [(xNum[i][j] - xMeans[j])/xSD[j] for j in range(ncol)] 59 | xNormalized.append(rowNormalized) 60 | 61 | #normalize labels to center 62 | #Normalize labels 63 | meanLabel = sum(labels)/nrow 64 | sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrow)])/nrow) 65 | 66 | labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrow)] 67 | 68 | 69 | #number of cross validation folds 70 | nxval = 10 71 | 72 | 73 | for ixval in range(nxval): 74 | #Define test and training index sets 75 | idxTest = [a for a in range(nrow) if a%nxval == ixval%nxval] 76 | idxTrain = [a for a in range(nrow) if a%nxval != ixval%nxval] 77 | 78 | #Define test and training attribute and label sets 79 | xTrain = numpy.array([xNormalized[r] for r in idxTrain]) 80 | xTest = numpy.array([xNormalized[r] for r in idxTest]) 81 | labelTrain = numpy.array([labelNormalized[r] for r in idxTrain]) 82 | labelTest = numpy.array([labelNormalized[r] for r in idxTest]) 83 | alphas, coefs, _ = enet_path(xTrain, labelTrain,l1_ratio=0.8, fit_intercept=False, return_models=False) 84 | 85 | #apply coefs to test data to produce predictions and accumulate 86 | if ixval == 0: 87 | pred = numpy.dot(xTest, coefs) 88 | yOut = labelTest 89 | else: 90 | #accumulate predictions 91 | yTemp = numpy.array(yOut) 92 | yOut = numpy.concatenate((yTemp, labelTest), axis=0) 93 | 94 | #accumulate predictions 95 | predTemp = numpy.array(pred) 96 | pred = numpy.concatenate((predTemp, numpy.dot(xTest, coefs)), axis = 0) 97 | 98 | 99 | 
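#Note: the labels were normalized to zero mean, so mines ("M") map to positive values and rocks ("R")
#to negative values; a held-out prediction is counted as an error below when it falls on the opposite
#side of 0.0 from its label. Each column of pred corresponds to one alpha value along the enet path.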
#calculate miss classification error 100 | misClassRate = [] 101 | _,nPred = pred.shape 102 | for iPred in range(1, nPred): 103 | predList = list(pred[:, iPred]) 104 | errCnt = 0.0 105 | for irow in range(nrow): 106 | if (predList[irow] < 0.0) and (yOut[irow] >= 0.0): 107 | errCnt += 1.0 108 | elif (predList[irow] >= 0.0) and (yOut[irow] < 0.0): 109 | errCnt += 1.0 110 | misClassRate.append(errCnt/nrow) 111 | 112 | #find minimum point for plot and for print 113 | minError = min(misClassRate) 114 | idxMin = misClassRate.index(minError) 115 | plotAlphas = list(alphas[1:len(alphas)]) 116 | 117 | plot.figure() 118 | plot.plot(plotAlphas, misClassRate, label='Misclassification Error Across Folds', linewidth=2) 119 | plot.axvline(plotAlphas[idxMin], linestyle='--', 120 | label='CV Estimate of Best alpha') 121 | plot.legend() 122 | plot.semilogx() 123 | ax = plot.gca() 124 | ax.invert_xaxis() 125 | plot.xlabel('alpha') 126 | plot.ylabel('Misclassification Error') 127 | plot.axis('tight') 128 | plot.show() 129 | 130 | 131 | 132 | #calculate AUC. 133 | idxPos = [i for i in range(nrow) if yOut[i] > 0.0] 134 | yOutBin = [0] * nrow 135 | for i in idxPos: yOutBin[i] = 1 136 | 137 | auc = [] 138 | for iPred in range(1, nPred): 139 | predList = list(pred[:, iPred]) 140 | aucCalc = roc_auc_score(yOutBin, predList) 141 | auc.append(aucCalc) 142 | 143 | maxAUC = max(auc) 144 | idxMax = auc.index(maxAUC) 145 | 146 | plot.figure() 147 | plot.plot(plotAlphas, auc, label='AUC Across Folds', linewidth=2) 148 | plot.axvline(plotAlphas[idxMax], linestyle='--', 149 | label='CV Estimate of Best alpha') 150 | plot.legend() 151 | plot.semilogx() 152 | ax = plot.gca() 153 | ax.invert_xaxis() 154 | plot.xlabel('alpha') 155 | plot.ylabel('Area Under the ROC Curve') 156 | plot.axis('tight') 157 | plot.show() 158 | 159 | 160 | #plot best version of ROC curve 161 | fpr, tpr, thresh = roc_curve(yOutBin, list(pred[:, idxMax])) 162 | ctClass = [i*0.01 for i in range(101)] 163 | 164 | plot.plot(fpr, tpr, linewidth=2) 165 | plot.plot(ctClass, ctClass, linestyle=':') 166 | plot.xlabel('False Positive Rate') 167 | plot.ylabel('True Positive Rate') 168 | plot.show() 169 | 170 | print('Best Value of Misclassification Error = ', misClassRate[idxMin]) 171 | print('Best alpha for Misclassification Error = ', plotAlphas[idxMin]) 172 | print('') 173 | print('Best Value for AUC = ', auc[idxMax]) 174 | print('Best alpha for AUC = ', plotAlphas[idxMax]) 175 | 176 | print('') 177 | print('Confusion Matrices for Different Threshold Values') 178 | 179 | #pick some points along the curve to print. There are 208 points. The extremes aren't useful 180 | #Sample at 52, 104 and 156. Use the calculated values of tpr and fpr along with definitions and 181 | #threshold values. 182 | #Some nomenclature (e.g. 
see wikkipedia "receiver operating curve") 183 | 184 | 185 | #P = Positive cases 186 | P = len(idxPos) 187 | #N = Negative cases 188 | N = nrow - P 189 | #TP = True positives = tpr * P 190 | TP = tpr[52] * P 191 | #FN = False negatives = P - TP 192 | FN = P - TP 193 | #FP = False positives = fpr * N 194 | FP = fpr[52] * N 195 | #TN = True negatives = N - FP 196 | TN = N - FP 197 | 198 | print('Threshold Value = ', thresh[52]) 199 | print('TP = ', TP, 'FP = ', FP) 200 | print('FN = ', FN, 'TN = ', TN) 201 | 202 | TP = tpr[104] * P; FN = P - TP; FP = fpr[104] * N; TN = N - FP 203 | 204 | print('Threshold Value = ', thresh[104]) 205 | print('TP = ', TP, 'FP = ', FP) 206 | print('FN = ', FN, 'TN = ', TN) 207 | 208 | TP = tpr[156] * P; FN = P - TP; FP = fpr[156] * N; TN = N - FP 209 | 210 | print('Threshold Value = ', thresh[156]) 211 | print('TP = ', TP, 'FP = ', FP) 212 | print('FN = ', FN, 'TN = ', TN) -------------------------------------------------------------------------------- /05/rocksVMines/rocksVMinesENetRegCVPrintedOutput.txt: -------------------------------------------------------------------------------- 1 | ('Best Value of Misclassification Error = ', 0.22115384615384615) 2 | ('Best alpha for Misclassification Error = ', 0.017686244720179375) 3 | 4 | ('Best Value for AUC = ', 0.86867279650784812) 5 | ('Best alpha for AUC = ', 0.020334883589342503) 6 | 7 | Confusion Matrices for Different Threshold Values 8 | ('Threshold Value = ', 0.37952298245219962) 9 | ('TP = ', 48.0, 'FP = ', 5.0) 10 | ('FN = ', 63.0, 'TN = ', 92.0) 11 | ('Threshold Value = ', -0.045503481125357965) 12 | ('TP = ', 85.0, 'FP = ', 20.0) 13 | ('FN = ', 26.0, 'TN = ', 77.0) 14 | ('Threshold Value = ', -0.4272522354395466) 15 | ('TP = ', 107.0, 'FP = ', 49.999999999999993) 16 | ('FN = ', 4.0, 'TN = ', 47.000000000000007) 17 | 18 | -------------------------------------------------------------------------------- /05/rocksVMines/rocksVMinesGlmnet.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import urllib2 3 | import sys 4 | from math import sqrt, fabs, exp 5 | import matplotlib.pyplot as plot 6 | 7 | def S(z,gamma): 8 | if gamma >= fabs(z): 9 | return 0.0 10 | if z > 0.0: 11 | return z - gamma 12 | else: 13 | return z + gamma 14 | 15 | def Pr(b0,b,x): 16 | n = len(x) 17 | sum = b0 18 | for i in range(n): 19 | sum += b[i]*x[i] 20 | if sum < -100: sum = -100 21 | return 1.0/(1.0 + exp(-sum)) 22 | 23 | 24 | #read data from uci data repository 25 | target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data" 26 | data = urllib2.urlopen(target_url) 27 | 28 | 29 | #arrange data into list for labels and list of lists for attributes 30 | xList = [] 31 | 32 | 33 | for line in data: 34 | #split on comma 35 | row = line.strip().split(",") 36 | xList.append(row) 37 | 38 | #separate labels from attributes, convert from attributes from string to numeric and convert "M" to 1 and "R" to 0 39 | 40 | xNum = [] 41 | labels = [] 42 | 43 | for row in xList: 44 | lastCol = row.pop() 45 | if lastCol == "M": 46 | labels.append(1.0) 47 | else: 48 | labels.append(0.0) 49 | attrRow = [float(elt) for elt in row] 50 | xNum.append(attrRow) 51 | 52 | #number of rows and columns in x matrix 53 | nrow = len(xNum) 54 | ncol = len(xNum[1]) 55 | 56 | alpha = 0.8 57 | #calculate means and variances 58 | xMeans = [] 59 | xSD = [] 60 | for i in range(ncol): 61 | col = [xNum[j][i] for j in range(nrow)] 62 | mean = 
sum(col)/nrow 63 | xMeans.append(mean) 64 | colDiff = [(xNum[j][i] - mean) for j in range(nrow)] 65 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrow)]) 66 | stdDev = sqrt(sumSq/nrow) 67 | xSD.append(stdDev) 68 | 69 | #use calculate mean and standard deviation to normalize xNum 70 | xNormalized = [] 71 | for i in range(nrow): 72 | rowNormalized = [(xNum[i][j] - xMeans[j])/xSD[j] for j in range(ncol)] 73 | xNormalized.append(rowNormalized) 74 | 75 | #Do Not Normalize labels but do calculate averages 76 | meanLabel = sum(labels)/nrow 77 | sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrow)])/nrow) 78 | 79 | #initialize probabilities and weights 80 | sumWxr = [0.0] * ncol 81 | sumWxx = [0.0] * ncol 82 | sumWr = 0.0 83 | sumW = 0.0 84 | 85 | #calculate starting points for betas 86 | for iRow in range(nrow): 87 | p = meanLabel 88 | w = p * (1.0 - p) 89 | #residual for logistic 90 | r = (labels[iRow] - p) / w 91 | x = xNormalized[iRow] 92 | sumWxr = [sumWxr[i] + w * x[i] * r for i in range(ncol)] 93 | sumWxx = [sumWxx[i] + w * x[i] * x[i] for i in range(ncol)] 94 | sumWr = sumWr + w * r 95 | sumW = sumW + w 96 | 97 | avgWxr = [sumWxr[i]/nrow for i in range(ncol)] 98 | avgWxx = [sumWxx[i]/nrow for i in range(ncol)] 99 | 100 | maxWxr = 0.0 101 | for i in range(ncol): 102 | val = abs(avgWxr[i]) 103 | if val > maxWxr: 104 | maxWxr = val 105 | 106 | #calculate starting value for lambda 107 | lam = maxWxr/alpha 108 | 109 | #this value of lambda corresponds to beta = list of 0's 110 | #initialize a vector of coefficients beta 111 | beta = [0.0] * ncol 112 | beta0 = sumWr/sumW 113 | 114 | #initialize matrix of betas at each step 115 | betaMat = [] 116 | betaMat.append(list(beta)) 117 | 118 | beta0List = [] 119 | beta0List.append(beta0) 120 | 121 | #begin iteration 122 | nSteps = 100 123 | lamMult = 0.93 #100 steps gives reduction by factor of 1000 in lambda (recommended by authors) 124 | nzList = [] 125 | for iStep in range(nSteps): 126 | #decrease lambda 127 | lam = lam * lamMult 128 | 129 | 130 | #Use incremental change in betas to control inner iteration 131 | 132 | 133 | #set middle loop values for betas = to outer values 134 | # values are used for calculating weights and probabilities 135 | #inner values are used for calculating penalized regression updates 136 | 137 | #take pass through data to calculate averages over data require for iteration 138 | #initilize accumulators 139 | 140 | betaIRLS = list(beta) 141 | beta0IRLS = beta0 142 | distIRLS = 100.0 143 | #Middle loop to calculate new betas with fixed IRLS weights and probabilities 144 | iterIRLS = 0 145 | while distIRLS > 0.01: 146 | iterIRLS += 1 147 | iterInner = 0.0 148 | 149 | betaInner = list(betaIRLS) 150 | beta0Inner = beta0IRLS 151 | distInner = 100.0 152 | while distInner > 0.01: 153 | iterInner += 1 154 | if iterInner > 100: break 155 | 156 | #cycle through attributes and update one-at-a-time 157 | #record starting value for comparison 158 | betaStart = list(betaInner) 159 | for iCol in range(ncol): 160 | 161 | sumWxr = 0.0 162 | sumWxx = 0.0 163 | sumWr = 0.0 164 | sumW = 0.0 165 | 166 | for iRow in range(nrow): 167 | x = list(xNormalized[iRow]) 168 | y = labels[iRow] 169 | p = Pr(beta0IRLS, betaIRLS, x) 170 | if abs(p) < 1e-5: 171 | p = 0.0 172 | w = 1e-5 173 | elif abs(1.0 - p) < 1e-5: 174 | p = 1.0 175 | w = 1e-5 176 | else: 177 | w = p * (1.0 - p) 178 | 179 | z = (y - p) / w + beta0IRLS + sum([x[i] * betaIRLS[i] for i in range(ncol)]) 180 | r = z - beta0Inner - sum([x[i] * 
betaInner[i] for i in range(ncol)]) 181 | sumWxr += w * x[iCol] * r 182 | sumWxx += w * x[iCol] * x[iCol] 183 | sumWr += w * r 184 | sumW += w 185 | 186 | avgWxr = sumWxr / nrow 187 | avgWxx = sumWxx / nrow 188 | 189 | beta0Inner = beta0Inner + sumWr / sumW 190 | uncBeta = avgWxr + avgWxx * betaInner[iCol] 191 | betaInner[iCol] = S(uncBeta, lam * alpha) / (avgWxx + lam * (1.0 - alpha)) 192 | 193 | sumDiff = sum([abs(betaInner[n] - betaStart[n]) for n in range(ncol)]) 194 | sumBeta = sum([abs(betaInner[n]) for n in range(ncol)]) 195 | distInner = sumDiff/sumBeta 196 | #print number of steps for inner and middle loop convergence to monitor behavior 197 | #print(iStep, iterIRLS, iterInner) 198 | 199 | #if exit inner while loop, then set betaMiddle = betaMiddle and run through middle loop again. 200 | 201 | #Check change in betaMiddle to see if IRLS is converged 202 | a = sum([abs(betaIRLS[i] - betaInner[i]) for i in range(ncol)]) 203 | b = sum([abs(betaIRLS[i]) for i in range(ncol)]) 204 | distIRLS = a / (b + 0.0001) 205 | dBeta = [betaInner[i] - betaIRLS[i] for i in range(ncol)] 206 | gradStep = 1.0 207 | temp = [betaIRLS[i] + gradStep * dBeta[i] for i in range(ncol)] 208 | betaIRLS = list(temp) 209 | 210 | beta = list(betaIRLS) 211 | beta0 = beta0IRLS 212 | betaMat.append(list(beta)) 213 | beta0List.append(beta0) 214 | 215 | nzBeta = [index for index in range(ncol) if beta[index] != 0.0] 216 | for q in nzBeta: 217 | if not(q in nzList): 218 | nzList.append(q) 219 | 220 | #make up names for columns of xNum 221 | names = ['V' + str(i) for i in range(ncol)] 222 | nameList = [names[nzList[i]] for i in range(len(nzList))] 223 | 224 | print("Attributes Ordered by How Early They Enter the Model") 225 | print(nameList) 226 | for i in range(ncol): 227 | #plot range of beta values for each attribute 228 | coefCurve = [betaMat[k][i] for k in range(nSteps)] 229 | xaxis = range(nSteps) 230 | plot.plot(xaxis, coefCurve) 231 | 232 | plot.xlabel("Steps Taken") 233 | plot.ylabel("Coefficient Values") 234 | plot.show() -------------------------------------------------------------------------------- /05/rocksVMines/rocksVMinesGlmnetPrintedOutput.txt: -------------------------------------------------------------------------------- 1 | Attributes Ordered by How Early They Enter the Model 2 | ['V10', 'V48', 'V11', 'V44', 'V35', 'V51', 'V20', 'V3', 'V50', 'V21', 'V43', 'V47', 'V15', 'V27', 'V0', 'V22', 'V36', 'V30', 'V53', 'V56', 'V58', 'V6', 'V19', 'V28', 'V39', 'V49', 'V7', 'V23', 'V54', 'V8', 'V14', 'V2', 'V29', 'V38', 'V57', 'V45', 'V13', 'V32', 'V31', 'V42', 'V16', 'V37', 'V59', 'V52', 'V25', 'V18', 'V1', 'V33', 'V4', 'V55', 'V17', 'V46', 'V26', 'V12', 'V40', 'V34', 'V5', 'V24', 'V41', 'V9'] 3 | 4 | -------------------------------------------------------------------------------- /05/wineCS/wineExpandedLassoCV.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | from sklearn import datasets, linear_model 6 | from sklearn.linear_model import LassoCV 7 | from math import sqrt 8 | import matplotlib.pyplot as plot 9 | 10 | #read data into iterable 11 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 12 | data = urllib2.urlopen(target_url) 13 | 14 | xList = [] 15 | labels = [] 16 | names = [] 17 | firstLine = True 18 | for line in data: 19 | if firstLine: 20 | names = line.strip().split(";") 21 | firstLine = False 22 | else: 23 | #split on semi-colon 24 
| row = line.strip().split(";") 25 | #put labels in separate array 26 | labels.append(float(row[-1])) 27 | #remove label from row 28 | row.pop() 29 | #convert row to floats 30 | floatRow = [float(num) for num in row] 31 | xList.append(floatRow) 32 | 33 | #append square of last term (alcohol) 34 | 35 | for i in range(len(xList)): 36 | alcElt = xList[i][-1] 37 | volAcid = xList[i][1] 38 | temp = list(xList[i]) 39 | temp.append(alcElt*alcElt) 40 | temp.append(alcElt*volAcid) 41 | xList[i] = list(temp) 42 | 43 | #add new name to variable list 44 | names[-1] = "alco^2" 45 | names.append("alco*volAcid") 46 | 47 | #Normalize columns in x and labels 48 | #Note: be careful about normalization. Some penalized regression packages include it 49 | #and some don't. 50 | 51 | nrows = len(xList) 52 | ncols = len(xList[0]) 53 | 54 | #calculate means and variances 55 | xMeans = [] 56 | xSD = [] 57 | for i in range(ncols): 58 | col = [xList[j][i] for j in range(nrows)] 59 | mean = sum(col)/nrows 60 | xMeans.append(mean) 61 | colDiff = [(xList[j][i] - mean) for j in range(nrows)] 62 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrows)]) 63 | stdDev = sqrt(sumSq/nrows) 64 | xSD.append(stdDev) 65 | 66 | #use calculate mean and standard deviation to normalize xList 67 | xNormalized = [] 68 | for i in range(nrows): 69 | rowNormalized = [(xList[i][j] - xMeans[j])/xSD[j] for j in range(ncols)] 70 | xNormalized.append(rowNormalized) 71 | 72 | #Normalize labels 73 | meanLabel = sum(labels)/nrows 74 | sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrows)])/nrows) 75 | 76 | labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrows)] 77 | 78 | #Convert list of list to np array for input to sklearn packages 79 | 80 | #Unnormalized labels 81 | Y = numpy.array(labels) 82 | 83 | #normalized lables 84 | #Y = numpy.array(labelNormalized) 85 | 86 | #Unnormalized X's 87 | X = numpy.array(xList) 88 | 89 | #Normlized Xss 90 | X = numpy.array(xNormalized) 91 | 92 | #Call LassoCV from sklearn.linear_model 93 | wineModel = LassoCV(cv=10).fit(X, Y) 94 | 95 | # Display results 96 | 97 | 98 | plot.figure() 99 | plot.plot(wineModel.alphas_, wineModel.mse_path_, ':') 100 | plot.plot(wineModel.alphas_, wineModel.mse_path_.mean(axis=-1), 101 | label='Average MSE Across Folds', linewidth=2) 102 | plot.axvline(wineModel.alpha_, linestyle='--', 103 | label='CV Estimate of Best alpha') 104 | plot.semilogx() 105 | plot.legend() 106 | ax = plot.gca() 107 | ax.invert_xaxis() 108 | plot.xlabel('alpha') 109 | plot.ylabel('Mean Square Error') 110 | plot.axis('tight') 111 | plot.show() 112 | 113 | #print out the value of alpha that minimizes the Cv-error 114 | print("alpha Value that Minimizes CV Error ",wineModel.alpha_) 115 | print("Minimum MSE ", min(wineModel.mse_path_.mean(axis=-1))) 116 | -------------------------------------------------------------------------------- /05/wineCS/wineLassoCV.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | from sklearn import datasets, linear_model 6 | from sklearn.linear_model import LassoCV 7 | from math import sqrt 8 | import matplotlib.pyplot as plot 9 | 10 | #read data into iterable 11 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 12 | data = urllib2.urlopen(target_url) 13 | 14 | xList = [] 15 | labels = [] 16 | names = [] 17 | firstLine = True 18 | for line in data: 19 | if 
firstLine: 20 | names = line.strip().split(";") 21 | firstLine = False 22 | else: 23 | #split on semi-colon 24 | row = line.strip().split(";") 25 | #put labels in separate array 26 | labels.append(float(row[-1])) 27 | #remove label from row 28 | row.pop() 29 | #convert row to floats 30 | floatRow = [float(num) for num in row] 31 | xList.append(floatRow) 32 | 33 | #Normalize columns in x and labels 34 | #Note: be careful about normalization. Some penalized regression packages include it 35 | #and some don't. 36 | 37 | nrows = len(xList) 38 | ncols = len(xList[0]) 39 | 40 | #calculate means and variances 41 | xMeans = [] 42 | xSD = [] 43 | for i in range(ncols): 44 | col = [xList[j][i] for j in range(nrows)] 45 | mean = sum(col)/nrows 46 | xMeans.append(mean) 47 | colDiff = [(xList[j][i] - mean) for j in range(nrows)] 48 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrows)]) 49 | stdDev = sqrt(sumSq/nrows) 50 | xSD.append(stdDev) 51 | 52 | #use calculate mean and standard deviation to normalize xList 53 | xNormalized = [] 54 | for i in range(nrows): 55 | rowNormalized = [(xList[i][j] - xMeans[j])/xSD[j] for j in range(ncols)] 56 | xNormalized.append(rowNormalized) 57 | 58 | #Normalize labels 59 | meanLabel = sum(labels)/nrows 60 | sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrows)])/nrows) 61 | 62 | labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrows)] 63 | 64 | #Convert list of list to np array for input to sklearn packages 65 | 66 | #Unnormalized labels 67 | Y = numpy.array(labels) 68 | 69 | #normalized lables 70 | Y = numpy.array(labelNormalized) 71 | 72 | #Unnormalized X's 73 | X = numpy.array(xList) 74 | 75 | #Normlized Xss 76 | X = numpy.array(xNormalized) 77 | 78 | #Call LassoCV from sklearn.linear_model 79 | wineModel = LassoCV(cv=10).fit(X, Y) 80 | 81 | # Display results 82 | 83 | 84 | plot.figure() 85 | plot.plot(wineModel.alphas_, wineModel.mse_path_, ':') 86 | plot.plot(wineModel.alphas_, wineModel.mse_path_.mean(axis=-1), 87 | label='Average MSE Across Folds', linewidth=2) 88 | plot.axvline(wineModel.alpha_, linestyle='--', 89 | label='CV Estimate of Best alpha') 90 | plot.semilogx() 91 | plot.legend() 92 | ax = plot.gca() 93 | ax.invert_xaxis() 94 | plot.xlabel('alpha') 95 | plot.ylabel('Mean Square Error') 96 | plot.axis('tight') 97 | plot.show() 98 | 99 | #print out the value of alpha that minimizes the Cv-error 100 | print("alpha Value that Minimizes CV Error ",wineModel.alpha_) 101 | print("Minimum MSE ", min(wineModel.mse_path_.mean(axis=-1))) -------------------------------------------------------------------------------- /05/wineCS/wineLassoCVPrintedOutputNormalizedX.txt: -------------------------------------------------------------------------------- 1 | ('alpha Value that Minimizes CV Error ', 0.010948337166040082) 2 | ('Minimum MSE ', 0.433801987153697) 3 | -------------------------------------------------------------------------------- /05/wineCS/wineLassoCVPrintedOutputNormalizedXandY.txt: -------------------------------------------------------------------------------- 1 | ('alpha Value that Minimizes CV Error ', 0.013561387700964642) 2 | ('Minimum MSE ', 0.66558492060028562) 3 | -------------------------------------------------------------------------------- /05/wineCS/wineLassoCVPrintedOutputUn-NormalizedX.txt: -------------------------------------------------------------------------------- 1 | ('alpha Value that Minimizes CV Error ', 0.0052692947038249062) 2 | ('Minimum MSE ', 
0.43936035436777832) 3 | -------------------------------------------------------------------------------- /05/wineCS/wineLassoCoefCurves.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | from sklearn import datasets, linear_model 6 | from sklearn.linear_model import LassoCV 7 | from math import sqrt 8 | import matplotlib.pyplot as plot 9 | 10 | #read data into iterable 11 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 12 | data = urllib2.urlopen(target_url) 13 | 14 | xList = [] 15 | labels = [] 16 | names = [] 17 | firstLine = True 18 | for line in data: 19 | if firstLine: 20 | names = line.strip().split(";") 21 | firstLine = False 22 | else: 23 | #split on semi-colon 24 | row = line.strip().split(";") 25 | #put labels in separate array 26 | labels.append(float(row[-1])) 27 | #remove label from row 28 | row.pop() 29 | #convert row to floats 30 | floatRow = [float(num) for num in row] 31 | xList.append(floatRow) 32 | 33 | #Normalize columns in x and labels 34 | #Note: be careful about normalization. Some penalized regression packages include it 35 | #and some don't. 36 | 37 | nrows = len(xList) 38 | ncols = len(xList[0]) 39 | 40 | #calculate means and variances 41 | xMeans = [] 42 | xSD = [] 43 | for i in range(ncols): 44 | col = [xList[j][i] for j in range(nrows)] 45 | mean = sum(col)/nrows 46 | xMeans.append(mean) 47 | colDiff = [(xList[j][i] - mean) for j in range(nrows)] 48 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrows)]) 49 | stdDev = sqrt(sumSq/nrows) 50 | xSD.append(stdDev) 51 | 52 | #use calculate mean and standard deviation to normalize xList 53 | xNormalized = [] 54 | for i in range(nrows): 55 | rowNormalized = [(xList[i][j] - xMeans[j])/xSD[j] for j in range(ncols)] 56 | xNormalized.append(rowNormalized) 57 | 58 | #Normalize labels 59 | meanLabel = sum(labels)/nrows 60 | sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrows)])/nrows) 61 | 62 | labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrows)] 63 | 64 | #Convert list of list to np array for input to sklearn packages 65 | 66 | #Unnormalized labels 67 | Y = numpy.array(labels) 68 | 69 | #normalized lables 70 | Y = numpy.array(labelNormalized) 71 | 72 | #Unnormalized X's 73 | X = numpy.array(xList) 74 | 75 | #Normlized Xss 76 | X = numpy.array(xNormalized) 77 | 78 | alphas, coefs, _ = linear_model.lasso_path(X, Y, return_models=False) 79 | 80 | 81 | plot.plot(alphas,coefs.T) 82 | 83 | plot.xlabel('alpha') 84 | plot.ylabel('Coefficients') 85 | plot.axis('tight') 86 | plot.semilogx() 87 | ax = plot.gca() 88 | ax.invert_xaxis() 89 | plot.show() 90 | 91 | nattr, nalpha = coefs.shape 92 | 93 | #find coefficient ordering 94 | nzList = [] 95 | for iAlpha in range(1,nalpha): 96 | coefList = list(coefs[: ,iAlpha]) 97 | nzCoef = [index for index in range(nattr) if coefList[index] != 0.0] 98 | for q in nzCoef: 99 | if not(q in nzList): 100 | nzList.append(q) 101 | 102 | nameList = [names[nzList[i]] for i in range(len(nzList))] 103 | print("Attributes Ordered by How Early They Enter the Model", nameList) 104 | 105 | #find coefficients corresponding to best alpha value. 
alpha value corresponding to 106 | #normalized X and normalized Y is 0.013561387700964642 107 | 108 | alphaStar = 0.013561387700964642 109 | indexLTalphaStar = [index for index in range(100) if alphas[index] > alphaStar] 110 | indexStar = max(indexLTalphaStar) 111 | 112 | #here's the set of coefficients to deploy 113 | coefStar = list(coefs[:,indexStar]) 114 | print("Best Coefficient Values ", coefStar) 115 | 116 | #The coefficients on normalized attributes give another slightly different ordering 117 | 118 | absCoef = [abs(a) for a in coefStar] 119 | 120 | #sort by magnitude 121 | coefSorted = sorted(absCoef, reverse=True) 122 | 123 | idxCoefSize = [absCoef.index(a) for a in coefSorted if not(a == 0.0)] 124 | 125 | namesList2 = [names[idxCoefSize[i]] for i in range(len(idxCoefSize))] 126 | 127 | print("Attributes Ordered by Coef Size at Optimum alpha", namesList2) -------------------------------------------------------------------------------- /05/wineCS/wineLassoCoefCurvesPrintedOutput.txt: -------------------------------------------------------------------------------- 1 | ('Attributes Ordered by How Early They Enter the Model', ['"alcohol"', '"volatile acidity"', '"sulphates"', '"total sulfur dioxide"', '"chlorides"', '"fixed acidity"', '"pH"', '"free sulfur dioxide"', '"residual sugar"', '"citric acid"', '"density"']) 2 | 3 | ('Best Coefficient Values ', [0.0, -0.22773815784738916, -0.0, 0.0, -0.094239023363375404, 0.022151948563542922, -0.099036391332770576, -0.0, -0.067873612822590218, 0.16804102141830754, 0.37509573430881538]) 4 | 5 | ('Attributes Ordered by Coef Size at Optimum alpha', ['"alcohol"', '"volatile acidity"', '"sulphates"', '"total sulfur dioxide"', '"chlorides"', '"pH"', '"free sulfur dioxide"']) 6 | 7 | 8 | Values with Un-normalized X: 9 | ('Attributes Ordered by How Early They Enter the Model', ['"total sulfur dioxide"', '"free sulfur dioxide"', '"alcohol"', '"fixed acidity"', '"volatile acidity"', '"sulphates"']) 10 | 11 | ('Best Coefficient Values ', [0.044339055570034182, -1.0154179864549988, 0.0, 0.0, -0.0, 0.0064112885435006822, -0.0038622920281433199, -0.0, -0.0, 0.41982634135945091, 0.37812720947996975]) 12 | 13 | ('Attributes Ordered by Coef Size at Optimum alpha', ['"volatile acidity"', '"sulphates"', '"alcohol"', '"fixed acidity"', '"free sulfur dioxide"', '"total sulfur dioxide"']) 14 | 15 | 16 | -------------------------------------------------------------------------------- /05/wineCS/wineLassoExpandedCVPrintedOutput.txt: -------------------------------------------------------------------------------- 1 | ('alpha Value that Minimizes CV Error ', 0.016640498998569835) 2 | ('Minimum MSE ', 0.43452874043020256) 3 | -------------------------------------------------------------------------------- /06/chapter06.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/derekhe/machine-learning-in-python-essential-techniques-for-predictive-analysis-source/d91f60cc29fdbaad32819058f9b2742e955e586a/06/chapter06.zip -------------------------------------------------------------------------------- /06/simpleBagging.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import numpy 4 | import matplotlib.pyplot as plot 5 | from sklearn import tree 6 | from sklearn.tree import DecisionTreeRegressor 7 | from math import floor 8 | import random 9 | 10 | 11 | #Build a simple data set with y = x + random 12 | nPoints = 1000 13 | 14 | #x values 
for plotting 15 | xPlot = [(float(i)/float(nPoints) - 0.5) for i in range(nPoints + 1)] 16 | 17 | #x needs to be list of lists. 18 | x = [[s] for s in xPlot] 19 | 20 | #y (labels) has random noise added to x-value 21 | #set seed 22 | random.seed(1) 23 | y = [s + numpy.random.normal(scale=0.1) for s in xPlot] 24 | 25 | #take fixed test set 30% of sample 26 | nSample = int(nPoints * 0.30) 27 | idxTest = random.sample(range(nPoints), nSample) 28 | idxTest.sort() 29 | idxTrain = [idx for idx in range(nPoints) if not(idx in idxTest)] 30 | 31 | #Define test and training attribute and label sets 32 | xTrain = [x[r] for r in idxTrain] 33 | xTest = [x[r] for r in idxTest] 34 | yTrain = [y[r] for r in idxTrain] 35 | yTest = [y[r] for r in idxTest] 36 | 37 | #train a series of models on random subsets of the training data 38 | #collect the models in a list and check error of composite as list grows 39 | 40 | #maximum number of models to generate 41 | numTreesMax = 20 42 | 43 | #tree depth - typically at the high end 44 | treeDepth = 1 45 | 46 | #initialize a list to hold models 47 | modelList = [] 48 | predList = [] 49 | 50 | #number of samples to draw for stochastic bagging 51 | nBagSamples = int(len(xTrain) * 0.5) 52 | 53 | for iTrees in range(numTreesMax): 54 | idxBag = [] 55 | for i in range(nBagSamples): 56 | idxBag.append(random.choice(range(len(xTrain)))) 57 | xTrainBag = [xTrain[i] for i in idxBag] 58 | yTrainBag = [yTrain[i] for i in idxBag] 59 | 60 | modelList.append(DecisionTreeRegressor(max_depth=treeDepth)) 61 | modelList[-1].fit(xTrainBag, yTrainBag) 62 | 63 | #make prediction with latest model and add to list of predictions 64 | latestPrediction = modelList[-1].predict(xTest) 65 | predList.append(list(latestPrediction)) 66 | 67 | 68 | #build cumulative prediction from first "n" models 69 | mse = [] 70 | allPredictions = [] 71 | for iModels in range(len(modelList)): 72 | 73 | #average first "iModels" of the predictions 74 | prediction = [] 75 | for iPred in range(len(xTest)): 76 | prediction.append(sum([predList[i][iPred] for i in range(iModels + 1)])/(iModels + 1)) 77 | 78 | allPredictions.append(prediction) 79 | errors = [(yTest[i] - prediction[i]) for i in range(len(yTest))] 80 | mse.append(sum([e * e for e in errors]) / len(yTest)) 81 | 82 | 83 | nModels = [i + 1 for i in range(len(modelList))] 84 | 85 | plot.plot(nModels,mse) 86 | plot.axis('tight') 87 | plot.xlabel('Number of Models in Ensemble') 88 | plot.ylabel('Mean Squared Error') 89 | plot.ylim((0.0, max(mse))) 90 | plot.show() 91 | 92 | plotList = [0, 9, 19] 93 | for iPlot in plotList: 94 | plot.plot(xTest, allPredictions[iPlot]) 95 | plot.plot(xTest, yTest, linestyle="--") 96 | plot.axis('tight') 97 | plot.xlabel('x value') 98 | plot.ylabel('Predictions') 99 | plot.show() 100 | 101 | print('Minimum MSE') 102 | print(min(mse)) 103 | 104 | 105 | #With treeDepth = 1 106 | #Minimum MSE 107 | #0.0242960117899 108 | 109 | 110 | 111 | #With treeDepth = 5 112 | #Minimum MSE 113 | #0.0118893503384 -------------------------------------------------------------------------------- /06/simpleGBM.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import numpy 4 | import matplotlib.pyplot as plot 5 | from sklearn import tree 6 | from sklearn.tree import DecisionTreeRegressor 7 | from math import floor 8 | import random 9 | 10 | #Build a simple data set with y = x + random 11 | nPoints = 1000 12 | 13 | #x values for plotting 14 | xPlot = [(float(i)/float(nPoints) - 
0.5) for i in range(nPoints + 1)] 15 | 16 | #x needs to be list of lists. 17 | x = [[s] for s in xPlot] 18 | 19 | #y (labels) has random noise added to x-value 20 | #set seed 21 | numpy.random.seed(1) 22 | y = [s + numpy.random.normal(scale=0.1) for s in xPlot] 23 | 24 | #take fixed test set 30% of sample 25 | nSample = int(nPoints * 0.30) 26 | idxTest = random.sample(range(nPoints), nSample) 27 | idxTest.sort() 28 | idxTrain = [idx for idx in range(nPoints) if not(idx in idxTest)] 29 | 30 | #Define test and training attribute and label sets 31 | xTrain = [x[r] for r in idxTrain] 32 | xTest = [x[r] for r in idxTest] 33 | yTrain = [y[r] for r in idxTrain] 34 | yTest = [y[r] for r in idxTest] 35 | 36 | #train a series of models on random subsets of the training data 37 | #collect the models in a list and check error of composite as list grows 38 | 39 | #maximum number of models to generate 40 | numTreesMax = 30 41 | 42 | #tree depth - typically at the high end 43 | treeDepth = 5 44 | 45 | #initialize a list to hold models 46 | modelList = [] 47 | predList = [] 48 | eps = 0.3 49 | 50 | #initialize residuals to be the labels y 51 | residuals = list(yTrain) 52 | 53 | for iTrees in range(numTreesMax): 54 | 55 | modelList.append(DecisionTreeRegressor(max_depth=treeDepth)) 56 | modelList[-1].fit(xTrain, residuals) 57 | 58 | #make prediction with latest model and add to list of predictions 59 | latestInSamplePrediction = modelList[-1].predict(xTrain) 60 | 61 | #use new predictions to update residuals 62 | residuals = [residuals[i] - eps * latestInSamplePrediction[i] for i in range(len(residuals))] 63 | 64 | latestOutSamplePrediction = modelList[-1].predict(xTest) 65 | predList.append(list(latestOutSamplePrediction)) 66 | 67 | 68 | #build cumulative prediction from first "n" models 69 | mse = [] 70 | allPredictions = [] 71 | for iModels in range(len(modelList)): 72 | 73 | #add the first "iModels" of the predictions and multiply by eps 74 | prediction = [] 75 | for iPred in range(len(xTest)): 76 | prediction.append(sum([predList[i][iPred] for i in range(iModels + 1)]) * eps) 77 | 78 | allPredictions.append(prediction) 79 | errors = [(yTest[i] - prediction[i]) for i in range(len(yTest))] 80 | mse.append(sum([e * e for e in errors]) / len(yTest)) 81 | 82 | 83 | nModels = [i + 1 for i in range(len(modelList))] 84 | 85 | plot.plot(nModels,mse) 86 | plot.axis('tight') 87 | plot.xlabel('Number of Models in Ensemble') 88 | plot.ylabel('Mean Squared Error') 89 | plot.ylim((0.0, max(mse))) 90 | plot.show() 91 | 92 | plotList = [0, 14, 29] 93 | lineType = [':', '-.', '--'] 94 | plot.figure() 95 | for i in range(len(plotList)): 96 | iPlot = plotList[i] 97 | textLegend = 'Prediction with ' + str(iPlot) + ' Trees' 98 | plot.plot(xTest, allPredictions[iPlot], label = textLegend, linestyle = lineType[i]) 99 | plot.plot(xTest, yTest, label='True y Value', alpha=0.25) 100 | plot.legend(bbox_to_anchor=(1,0.3)) 101 | plot.axis('tight') 102 | plot.xlabel('x value') 103 | plot.ylabel('Predictions') 104 | plot.show() 105 | 106 | -------------------------------------------------------------------------------- /06/simpleTree.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import numpy 4 | import matplotlib.pyplot as plot 5 | from sklearn import tree 6 | from sklearn.tree import DecisionTreeRegressor 7 | from sklearn.externals.six import StringIO 8 | 9 | #Build a simple data set with y = x + random 10 | nPoints = 100 11 | 12 | #x values for plotting 
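#(this grid runs from -0.5 to +0.5 in steps of 1/nPoints, giving nPoints + 1 evenly spaced x values)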
13 | xPlot = [(float(i)/float(nPoints) - 0.5) for i in range(nPoints + 1)] 14 | 15 | #x needs to be list of lists. 16 | x = [[s] for s in xPlot] 17 | 18 | #y (labels) has random noise added to x-value 19 | #set seed 20 | numpy.random.seed(1) 21 | y = [s + numpy.random.normal(scale=0.1) for s in xPlot] 22 | 23 | plot.plot(xPlot,y) 24 | plot.axis('tight') 25 | plot.xlabel('x') 26 | plot.ylabel('y') 27 | plot.show() 28 | 29 | simpleTree = DecisionTreeRegressor(max_depth=1) 30 | simpleTree.fit(x, y) 31 | 32 | #draw the tree 33 | with open("simpleTree.dot", 'w') as f: 34 | f = tree.export_graphviz(simpleTree, out_file=f) 35 | 36 | #compare prediction from tree with true values 37 | 38 | yHat = simpleTree.predict(x) 39 | 40 | plot.figure() 41 | plot.plot(xPlot, y, label='True y') 42 | plot.plot(xPlot, yHat, label='Tree Prediction ', linestyle='--') 43 | plot.legend(bbox_to_anchor=(1,0.2)) 44 | plot.axis('tight') 45 | plot.xlabel('x') 46 | plot.ylabel('y') 47 | plot.show() 48 | 49 | simpleTree2 = DecisionTreeRegressor(max_depth=2) 50 | simpleTree2.fit(x, y) 51 | 52 | #draw the tree 53 | with open("simpleTree2.dot", 'w') as f: 54 | f = tree.export_graphviz(simpleTree2, out_file=f) 55 | 56 | #compare prediction from tree with true values 57 | 58 | yHat = simpleTree2.predict(x) 59 | 60 | plot.figure() 61 | plot.plot(xPlot, y, label='True y') 62 | plot.plot(xPlot, yHat, label='Tree Prediction ', linestyle='--') 63 | plot.legend(bbox_to_anchor=(1,0.2)) 64 | plot.axis('tight') 65 | plot.xlabel('x') 66 | plot.ylabel('y') 67 | plot.show() 68 | 69 | #split point calculations - try every possible split point to find the best one 70 | sse = [] 71 | xMin = [] 72 | for i in range(1, len(xPlot)): 73 | #divide list into points on left and right of split point 74 | lhList = list(xPlot[0:i]) 75 | rhList = list(xPlot[i:len(xPlot)]) 76 | 77 | #calculate averages on each side 78 | lhAvg = sum(lhList) / len(lhList) 79 | rhAvg = sum(rhList) / len(rhList) 80 | 81 | #calculate sum square error on left, right and total 82 | lhSse = sum([(s - lhAvg) * (s - lhAvg) for s in lhList]) 83 | rhSse = sum([(s - rhAvg) * (s - rhAvg) for s in rhList]) 84 | 85 | #add sum of left and right to list of errors 86 | 87 | sse.append(lhSse + rhSse) 88 | xMin.append(max(lhList)) 89 | 90 | plot.plot(range(1, len(xPlot)), sse) 91 | plot.xlabel('Split Point Index') 92 | plot.ylabel('Sum Squared Error') 93 | plot.show() 94 | 95 | minSse = min(sse) 96 | idxMin = sse.index(minSse) 97 | print(xMin[idxMin]) 98 | 99 | #what happens if the depth is really high? 
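#(a depth-6 binary tree can have up to 2**6 = 64 leaf nodes, while this toy data set
# has only nPoints + 1 = 101 points, so the model below has enough capacity to chase
# the added noise; expect a much more jagged prediction curve than the depth-1 and
# depth-2 trees above, tracking individual noisy points rather than the y = x trend)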
100 | simpleTree6 = DecisionTreeRegressor(max_depth=6) 101 | simpleTree6.fit(x, y) 102 | 103 | #too many nodes to draw the tree 104 | #with open("simpleTree2.dot", 'w') as f: 105 | # f = tree.export_graphviz(simpleTree6, out_file=f) 106 | 107 | #compare prediction from tree with true values 108 | 109 | yHat = simpleTree6.predict(x) 110 | 111 | plot.figure() 112 | plot.plot(xPlot, y, label='True y') 113 | plot.plot(xPlot, yHat, label='Tree Prediction ', linestyle='--') 114 | plot.legend(bbox_to_anchor=(1,0.2)) 115 | plot.axis('tight') 116 | plot.xlabel('x') 117 | plot.ylabel('y') 118 | plot.show() 119 | 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /06/simpleTreeCV.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import numpy 4 | import matplotlib.pyplot as plot 5 | from sklearn import tree 6 | from sklearn.tree import DecisionTreeRegressor 7 | from sklearn.externals.six import StringIO 8 | 9 | #Build a simple data set with y = x + random 10 | nPoints = 100 11 | 12 | #x values for plotting 13 | xPlot = [(float(i)/float(nPoints) - 0.5) for i in range(nPoints + 1)] 14 | 15 | #x needs to be list of lists. 16 | x = [[s] for s in xPlot] 17 | 18 | #y (labels) has random noise added to x-value 19 | #set seed 20 | numpy.random.seed(1) 21 | y = [s + numpy.random.normal(scale=0.1) for s in xPlot] 22 | 23 | nrow = len(x) 24 | 25 | #fit trees with several different values for depth and use x-validation to see which works best. 26 | 27 | depthList = [1, 2, 3, 4, 5, 6, 7] 28 | xvalMSE = [] 29 | nxval = 10 30 | 31 | for iDepth in depthList: 32 | 33 | #build cross-validation loop to fit tree and evaluate on out of sample data 34 | for ixval in range(nxval): 35 | 36 | #Define test and training index sets 37 | idxTest = [a for a in range(nrow) if a%nxval == ixval%nxval] 38 | idxTrain = [a for a in range(nrow) if a%nxval != ixval%nxval] 39 | 40 | #Define test and training attribute and label sets 41 | xTrain = [x[r] for r in idxTrain] 42 | xTest = [x[r] for r in idxTest] 43 | yTrain = [y[r] for r in idxTrain] 44 | yTest = [y[r] for r in idxTest] 45 | 46 | #train tree of appropriate depth and accumulate out of sample (oos) errors 47 | treeModel = DecisionTreeRegressor(max_depth=iDepth) 48 | treeModel.fit(xTrain, yTrain) 49 | 50 | treePrediction = treeModel.predict(xTest) 51 | error = [yTest[r] - treePrediction[r] for r in range(len(yTest))] 52 | 53 | #accumulate squared errors 54 | if ixval == 0: 55 | oosErrors = sum([e * e for e in error]) 56 | else: 57 | #accumulate predictions 58 | oosErrors += sum([e * e for e in error]) 59 | 60 | #average the squared errors and accumulate by tree depth 61 | 62 | mse = oosErrors/nrow 63 | xvalMSE.append(mse) 64 | 65 | plot.plot(depthList, xvalMSE) 66 | plot.axis('tight') 67 | plot.xlabel('Tree Depth') 68 | plot.ylabel('Mean Squared Error') 69 | plot.show() 70 | -------------------------------------------------------------------------------- /06/wineBagging.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | from sklearn import tree 6 | from sklearn.tree import DecisionTreeRegressor 7 | import random 8 | from math import sqrt 9 | import matplotlib.pyplot as plot 10 | 11 | #read data into iterable 12 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 13 | data = 
urllib2.urlopen(target_url) 14 | 15 | xList = [] 16 | labels = [] 17 | names = [] 18 | firstLine = True 19 | for line in data: 20 | if firstLine: 21 | names = line.strip().split(";") 22 | firstLine = False 23 | else: 24 | #split on semi-colon 25 | row = line.strip().split(";") 26 | #put labels in separate array 27 | labels.append(float(row[-1])) 28 | #remove label from row 29 | row.pop() 30 | #convert row to floats 31 | floatRow = [float(num) for num in row] 32 | xList.append(floatRow) 33 | 34 | nrows = len(xList) 35 | ncols = len(xList[0]) 36 | 37 | #take fixed test set 30% of sample 38 | random.seed(1) 39 | nSample = int(nrows * 0.30) 40 | idxTest = random.sample(range(nrows), nSample) 41 | idxTest.sort() 42 | idxTrain = [idx for idx in range(nrows) if not(idx in idxTest)] 43 | 44 | #Define test and training attribute and label sets 45 | xTrain = [xList[r] for r in idxTrain] 46 | xTest = [xList[r] for r in idxTest] 47 | yTrain = [labels[r] for r in idxTrain] 48 | yTest = [labels[r] for r in idxTest] 49 | 50 | #train a series of models on random subsets of the training data 51 | #collect the models in a list and check error of composite as list grows 52 | 53 | #maximum number of models to generate 54 | numTreesMax = 30 55 | 56 | #tree depth - typically at the high end 57 | treeDepth = 1 58 | 59 | #initialize a list to hold models 60 | modelList = [] 61 | predList = [] 62 | 63 | #number of samples to draw for stochastic bagging 64 | nBagSamples = int(len(xTrain) * 0.5) 65 | 66 | for iTrees in range(numTreesMax): 67 | idxBag = [] 68 | for i in range(nBagSamples): 69 | idxBag.append(random.choice(range(len(xTrain)))) 70 | xTrainBag = [xTrain[i] for i in idxBag] 71 | yTrainBag = [yTrain[i] for i in idxBag] 72 | 73 | modelList.append(DecisionTreeRegressor(max_depth=treeDepth)) 74 | modelList[-1].fit(xTrainBag, yTrainBag) 75 | 76 | #make prediction with latest model and add to list of predictions 77 | latestPrediction = modelList[-1].predict(xTest) 78 | predList.append(list(latestPrediction)) 79 | 80 | 81 | #build cumulative prediction from first "n" models 82 | mse = [] 83 | allPredictions = [] 84 | for iModels in range(len(modelList)): 85 | 86 | #average first "iModels" of the predictions 87 | prediction = [] 88 | for iPred in range(len(xTest)): 89 | prediction.append(sum([predList[i][iPred] for i in range(iModels + 1)])/(iModels + 1)) 90 | 91 | allPredictions.append(prediction) 92 | errors = [(yTest[i] - prediction[i]) for i in range(len(yTest))] 93 | mse.append(sum([e * e for e in errors]) / len(yTest)) 94 | 95 | 96 | nModels = [i + 1 for i in range(len(modelList))] 97 | 98 | plot.plot(nModels,mse) 99 | plot.axis('tight') 100 | plot.xlabel('Number of Tree Models in Ensemble') 101 | plot.ylabel('Mean Squared Error') 102 | plot.ylim((0.0, max(mse))) 103 | plot.show() 104 | 105 | print('Minimum MSE') 106 | print(min(mse)) 107 | 108 | #with treeDepth = 1 109 | #Minimum MSE 110 | #0.516236026081 111 | 112 | 113 | #with treeDepth = 5 114 | #Minimum MSE 115 | #0.39815421341 116 | 117 | #with treeDepth = 12 & numTreesMax = 100 118 | #Minimum MSE 119 | #0.350749027669 -------------------------------------------------------------------------------- /06/wineGBM.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | from sklearn import tree 6 | from sklearn.tree import DecisionTreeRegressor 7 | import random 8 | from math import sqrt 9 | import matplotlib.pyplot as plot 10 | 11 | #read data into 
iterable 12 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 13 | data = urllib2.urlopen(target_url) 14 | 15 | xList = [] 16 | labels = [] 17 | names = [] 18 | firstLine = True 19 | for line in data: 20 | if firstLine: 21 | names = line.strip().split(";") 22 | firstLine = False 23 | else: 24 | #split on semi-colon 25 | row = line.strip().split(";") 26 | #put labels in separate array 27 | labels.append(float(row[-1])) 28 | #remove label from row 29 | row.pop() 30 | #convert row to floats 31 | floatRow = [float(num) for num in row] 32 | xList.append(floatRow) 33 | 34 | nrows = len(xList) 35 | ncols = len(xList[0]) 36 | 37 | #take fixed test set 30% of sample 38 | nSample = int(nrows * 0.30) 39 | idxTest = random.sample(range(nrows), nSample) 40 | idxTest.sort() 41 | idxTrain = [idx for idx in range(nrows) if not(idx in idxTest)] 42 | 43 | #Define test and training attribute and label sets 44 | xTrain = [xList[r] for r in idxTrain] 45 | xTest = [xList[r] for r in idxTest] 46 | yTrain = [labels[r] for r in idxTrain] 47 | yTest = [labels[r] for r in idxTest] 48 | 49 | #train a series of models on random subsets of the training data 50 | #collect the models in a list and check error of composite as list grows 51 | 52 | #maximum number of models to generate 53 | numTreesMax = 30 54 | 55 | #tree depth - typically at the high end 56 | treeDepth = 5 57 | 58 | #initialize a list to hold models 59 | modelList = [] 60 | predList = [] 61 | eps = 0.1 62 | 63 | #initialize residuals to be the labels y 64 | residuals = list(yTrain) 65 | 66 | for iTrees in range(numTreesMax): 67 | 68 | modelList.append(DecisionTreeRegressor(max_depth=treeDepth)) 69 | modelList[-1].fit(xTrain, residuals) 70 | 71 | #make prediction with latest model and add to list of predictions 72 | latestInSamplePrediction = modelList[-1].predict(xTrain) 73 | 74 | #use new predictions to update residuals 75 | residuals = [residuals[i] - eps * latestInSamplePrediction[i] for i in range(len(residuals))] 76 | 77 | latestOutSamplePrediction = modelList[-1].predict(xTest) 78 | predList.append(list(latestOutSamplePrediction)) 79 | 80 | 81 | #build cumulative prediction from first "n" models 82 | mse = [] 83 | allPredictions = [] 84 | for iModels in range(len(modelList)): 85 | 86 | #add the first "iModels" of the predictions and multiply by eps 87 | prediction = [] 88 | for iPred in range(len(xTest)): 89 | prediction.append(sum([predList[i][iPred] for i in range(iModels + 1)]) * eps) 90 | 91 | allPredictions.append(prediction) 92 | errors = [(yTest[i] - prediction[i]) for i in range(len(yTest))] 93 | mse.append(sum([e * e for e in errors]) / len(yTest)) 94 | 95 | 96 | nModels = [i + 1 for i in range(len(modelList))] 97 | 98 | plot.plot(nModels,mse) 99 | plot.axis('tight') 100 | plot.xlabel('Number of Trees in Ensemble') 101 | plot.ylabel('Mean Squared Error') 102 | plot.ylim((0.0, max(mse))) 103 | plot.show() 104 | 105 | print('Minimum MSE') 106 | print(min(mse)) 107 | 108 | #printed output 109 | #Minimum MSE 110 | #0.405031864814 -------------------------------------------------------------------------------- /06/wineRF.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | from sklearn import tree 6 | from sklearn.tree import DecisionTreeRegressor 7 | import random 8 | from math import sqrt 9 | import matplotlib.pyplot as plot 10 | 11 | #read data into iterable 12 | target_url = 
"http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 13 | data = urllib2.urlopen(target_url) 14 | 15 | xList = [] 16 | labels = [] 17 | names = [] 18 | firstLine = True 19 | for line in data: 20 | if firstLine: 21 | names = line.strip().split(";") 22 | firstLine = False 23 | else: 24 | #split on semi-colon 25 | row = line.strip().split(";") 26 | #put labels in separate array 27 | labels.append(float(row[-1])) 28 | #remove label from row 29 | row.pop() 30 | #convert row to floats 31 | floatRow = [float(num) for num in row] 32 | xList.append(floatRow) 33 | 34 | nrows = len(xList) 35 | ncols = len(xList[0]) 36 | 37 | #take fixed test set 30% of sample 38 | random.seed(1) #set seed so results are the same each run 39 | nSample = int(nrows * 0.30) 40 | idxTest = random.sample(range(nrows), nSample) 41 | idxTest.sort() 42 | idxTrain = [idx for idx in range(nrows) if not(idx in idxTest)] 43 | 44 | #Define test and training attribute and label sets 45 | xTrain = [xList[r] for r in idxTrain] 46 | xTest = [xList[r] for r in idxTest] 47 | yTrain = [labels[r] for r in idxTrain] 48 | yTest = [labels[r] for r in idxTest] 49 | 50 | #train a series of models on random subsets of the training data 51 | #collect the models in a list and check error of composite as list grows 52 | 53 | #maximum number of models to generate 54 | numTreesMax = 30 55 | 56 | #tree depth - typically at the high end 57 | treeDepth = 12 58 | 59 | #pick how many attributes will be used in each model. 60 | # authors recommend 1/3 for regression problem 61 | nAttr = 4 62 | 63 | #initialize a list to hold models 64 | modelList = [] 65 | indexList = [] 66 | predList = [] 67 | nTrainRows = len(yTrain) 68 | 69 | for iTrees in range(numTreesMax): 70 | 71 | modelList.append(DecisionTreeRegressor(max_depth=treeDepth)) 72 | 73 | #take random sample of attributes 74 | idxAttr = random.sample(range(ncols), nAttr) 75 | idxAttr.sort() 76 | indexList.append(idxAttr) 77 | 78 | #take a random sample of training rows 79 | idxRows = [] 80 | for i in range(int(0.5 * nTrainRows)): 81 | idxRows.append(random.choice(range(len(xTrain)))) 82 | idxRows.sort() 83 | 84 | #build training set 85 | xRfTrain = [] 86 | yRfTrain = [] 87 | 88 | for i in range(len(idxRows)): 89 | temp = [xTrain[idxRows[i]][j] for j in idxAttr] 90 | xRfTrain.append(temp) 91 | yRfTrain.append(yTrain[idxRows[i]]) 92 | 93 | modelList[-1].fit(xRfTrain, yRfTrain) 94 | 95 | #restrict xTest to attributes selected for training 96 | xRfTest = [] 97 | for xx in xTest: 98 | temp = [xx[i] for i in idxAttr] 99 | xRfTest.append(temp) 100 | 101 | latestOutSamplePrediction = modelList[-1].predict(xRfTest) 102 | predList.append(list(latestOutSamplePrediction)) 103 | 104 | 105 | #build cumulative prediction from first "n" models 106 | mse = [] 107 | allPredictions = [] 108 | for iModels in range(len(modelList)): 109 | 110 | #add the first "iModels" of the predictions and multiply by eps 111 | prediction = [] 112 | for iPred in range(len(xTest)): 113 | prediction.append(sum([predList[i][iPred] for i in range(iModels + 1)]) / (iModels + 1)) 114 | 115 | allPredictions.append(prediction) 116 | errors = [(yTest[i] - prediction[i]) for i in range(len(yTest))] 117 | mse.append(sum([e * e for e in errors]) / len(yTest)) 118 | 119 | 120 | nModels = [i + 1 for i in range(len(modelList))] 121 | 122 | plot.plot(nModels,mse) 123 | plot.axis('tight') 124 | plot.xlabel('Number of Trees in Ensemble') 125 | plot.ylabel('Mean Squared Error') 126 | plot.ylim((0.0, max(mse))) 127 | 
plot.show() 128 | 129 | print('Minimum MSE') 130 | print(min(mse)) 131 | 132 | #printed output 133 | 134 | #Depth 1 135 | #Minimum MSE 136 | #0.52666715461 137 | 138 | #Depth 5 139 | #Minimum MSE 140 | #0.426116327584 141 | 142 | #Depth 12 143 | #Minimum MSE 144 | #0.38508387863 -------------------------------------------------------------------------------- /06/wineTree.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | from sklearn import tree 6 | from sklearn.tree import DecisionTreeRegressor 7 | from sklearn.externals.six import StringIO 8 | from math import sqrt 9 | import matplotlib.pyplot as plot 10 | 11 | #read data into iterable 12 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 13 | data = urllib2.urlopen(target_url) 14 | 15 | xList = [] 16 | labels = [] 17 | names = [] 18 | firstLine = True 19 | for line in data: 20 | if firstLine: 21 | names = line.strip().split(";") 22 | firstLine = False 23 | else: 24 | #split on semi-colon 25 | row = line.strip().split(";") 26 | #put labels in separate array 27 | labels.append(float(row[-1])) 28 | #remove label from row 29 | row.pop() 30 | #convert row to floats 31 | floatRow = [float(num) for num in row] 32 | xList.append(floatRow) 33 | 34 | nrows = len(xList) 35 | ncols = len(xList[0]) 36 | 37 | wineTree = DecisionTreeRegressor(max_depth=3) 38 | 39 | wineTree.fit(xList, labels) 40 | 41 | with open("wineTree.dot", 'w') as f: 42 | f = tree.export_graphviz(wineTree, out_file=f) 43 | #Note: The code above exports the trained tree info to a Graphviz "dot" file. 44 | #Drawing the graph requires installing GraphViz and the running the following on the command line 45 | #dot -Tpng wineTree.dot -o wineTree.png 46 | 47 | -------------------------------------------------------------------------------- /07/abaloneGBM.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | 3 | import urllib2 4 | from pylab import * 5 | import matplotlib.pyplot as plot 6 | import numpy 7 | from sklearn.cross_validation import train_test_split 8 | from sklearn import ensemble 9 | from sklearn.metrics import mean_squared_error 10 | 11 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data" 12 | #read abalone data 13 | data = urllib2.urlopen(target_url) 14 | 15 | xList = [] 16 | labels = [] 17 | for line in data: 18 | #split on semi-colon 19 | row = line.strip().split(",") 20 | 21 | #put labels in separate array and remove label from row 22 | labels.append(float(row.pop())) 23 | 24 | #form list of list of attributes (all strings) 25 | xList.append(row) 26 | 27 | #code three-valued sex attribute as numeric 28 | xCoded = [] 29 | for row in xList: 30 | #first code the three-valued sex variable 31 | codedSex = [0.0, 0.0] 32 | if row[0] == 'M': codedSex[0] = 1.0 33 | if row[0] == 'F': codedSex[1] = 1.0 34 | 35 | numRow = [float(row[i]) for i in range(1,len(row))] 36 | rowCoded = list(codedSex) + numRow 37 | xCoded.append(rowCoded) 38 | 39 | #list of names for 40 | abaloneNames = numpy.array(['Sex1', 'Sex2', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 41 | 'Viscera weight', 'Shell weight', 'Rings']) 42 | 43 | #number of rows and columns in x matrix 44 | nrows = len(xCoded) 45 | ncols = len(xCoded[1]) 46 | 47 | #form x and y into numpy arrays and make up column names 48 | X = 
numpy.array(xCoded) 49 | y = numpy.array(labels) 50 | 51 | #break into training and test sets. 52 | xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.30, random_state=531) 53 | 54 | #instantiate model 55 | nEst = 2000 56 | depth = 5 57 | learnRate = 0.005 58 | maxFeatures = 3 59 | subsamp = 0.5 60 | abaloneGBMModel = ensemble.GradientBoostingRegressor(n_estimators=nEst, max_depth=depth, 61 | learning_rate=learnRate, max_features=maxFeatures, 62 | subsample=subsamp, loss='ls') 63 | 64 | #train 65 | abaloneGBMModel.fit(xTrain, yTrain) 66 | 67 | # compute mse on test set 68 | msError = [] 69 | predictions = abaloneGBMModel.staged_decision_function(xTest) 70 | for p in predictions: 71 | msError.append(mean_squared_error(yTest, p)) 72 | 73 | print("MSE" ) 74 | print(min(msError)) 75 | print(msError.index(min(msError))) 76 | 77 | #plot training and test errors vs number of trees in ensemble 78 | plot.figure() 79 | plot.plot(range(1, nEst + 1), abaloneGBMModel.train_score_, label='Training Set MSE', linestyle=":") 80 | plot.plot(range(1, nEst + 1), msError, label='Test Set MSE') 81 | plot.legend(loc='upper right') 82 | plot.xlabel('Number of Trees in Ensemble') 83 | plot.ylabel('Mean Squared Error') 84 | plot.show() 85 | 86 | # Plot feature importance 87 | featureImportance = abaloneGBMModel.feature_importances_ 88 | 89 | # normalize by max importance 90 | featureImportance = featureImportance / featureImportance.max() 91 | idxSorted = numpy.argsort(featureImportance) 92 | barPos = numpy.arange(idxSorted.shape[0]) + .5 93 | plot.barh(barPos, featureImportance[idxSorted], align='center') 94 | plot.yticks(barPos, abaloneNames[idxSorted]) 95 | plot.xlabel('Variable Importance') 96 | plot.subplots_adjust(left=0.2, right=0.9, top=0.9, bottom=0.1) 97 | plot.show() 98 | 99 | # Printed Output: 100 | 101 | # for Gradient Boosting 102 | # nEst = 2000 103 | # depth = 5 104 | # learnRate = 0.003 105 | # maxFeatures = None 106 | # subsamp = 0.5 107 | # 108 | # MSE 109 | # 4.22969363284 110 | # 1736 111 | 112 | #for Gradient Boosting with RF base learners 113 | # nEst = 2000 114 | # depth = 5 115 | # learnRate = 0.005 116 | # maxFeatures = 3 117 | # subsamp = 0.5 118 | # 119 | # MSE 120 | # 4.27564515749 121 | # 1687 122 | -------------------------------------------------------------------------------- /07/abaloneRF.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | 3 | import urllib2 4 | from pylab import * 5 | import matplotlib.pyplot as plot 6 | import numpy 7 | from sklearn.cross_validation import train_test_split 8 | from sklearn import ensemble 9 | from sklearn.metrics import mean_squared_error 10 | 11 | 12 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data" 13 | #read abalone data 14 | data = urllib2.urlopen(target_url) 15 | 16 | xList = [] 17 | labels = [] 18 | for line in data: 19 | #split on semi-colon 20 | row = line.strip().split(",") 21 | 22 | #put labels in separate array and remove label from row 23 | labels.append(float(row.pop())) 24 | 25 | #form list of list of attributes (all strings) 26 | xList.append(row) 27 | 28 | #code three-valued sex attribute as numeric 29 | xCoded = [] 30 | for row in xList: 31 | #first code the three-valued sex variable 32 | codedSex = [0.0, 0.0] 33 | if row[0] == 'M': codedSex[0] = 1.0 34 | if row[0] == 'F': codedSex[1] = 1.0 35 | 36 | numRow = [float(row[i]) for i in range(1,len(row))] 37 | rowCoded = list(codedSex) + numRow 38 | 
xCoded.append(rowCoded) 39 | 40 | #list of names for 41 | abaloneNames = numpy.array(['Sex1', 'Sex2', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 42 | 'Viscera weight', 'Shell weight', 'Rings']) 43 | 44 | #number of rows and columns in x matrix 45 | nrows = len(xCoded) 46 | ncols = len(xCoded[1]) 47 | 48 | #form x and y into numpy arrays and make up column names 49 | X = numpy.array(xCoded) 50 | y = numpy.array(labels) 51 | 52 | #break into training and test sets. 53 | xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.30, random_state=531) 54 | 55 | #train random forest at a range of ensemble sizes in order to see how the mse changes 56 | mseOos = [] 57 | nTreeList = range(50, 500, 10) 58 | for iTrees in nTreeList: 59 | depth = None 60 | maxFeat = 4 #try tweaking 61 | abaloneRFModel = ensemble.RandomForestRegressor(n_estimators=iTrees, max_depth=depth, max_features=maxFeat, 62 | oob_score=False, random_state=531) 63 | 64 | abaloneRFModel.fit(xTrain,yTrain) 65 | 66 | #Accumulate mse on test set 67 | prediction = abaloneRFModel.predict(xTest) 68 | mseOos.append(mean_squared_error(yTest, prediction)) 69 | 70 | 71 | print("MSE" ) 72 | print(mseOos[-1]) 73 | 74 | 75 | #plot training and test errors vs number of trees in ensemble 76 | plot.plot(nTreeList, mseOos) 77 | plot.xlabel('Number of Trees in Ensemble') 78 | plot.ylabel('Mean Squared Error') 79 | #plot.ylim([0.0, 1.1*max(mseOob)]) 80 | plot.show() 81 | 82 | # Plot feature importance 83 | featureImportance = abaloneRFModel.feature_importances_ 84 | 85 | # normalize by max importance 86 | featureImportance = featureImportance / featureImportance.max() 87 | sortedIdx = numpy.argsort(featureImportance) 88 | barPos = numpy.arange(sortedIdx.shape[0]) + .5 89 | plot.barh(barPos, featureImportance[sortedIdx], align='center') 90 | plot.yticks(barPos, abaloneNames[sortedIdx]) 91 | plot.xlabel('Variable Importance') 92 | plot.subplots_adjust(left=0.2, right=0.9, top=0.9, bottom=0.1) 93 | plot.show() 94 | 95 | # Printed Output: 96 | # MSE 97 | # 4.30971555911 -------------------------------------------------------------------------------- /07/glassGbm.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | 3 | import urllib2 4 | from math import sqrt, fabs, exp 5 | import matplotlib.pyplot as plot 6 | from sklearn.linear_model import enet_path 7 | from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix 8 | from sklearn.cross_validation import train_test_split 9 | from sklearn import ensemble 10 | import numpy 11 | 12 | 13 | target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data" 14 | data = urllib2.urlopen(target_url) 15 | 16 | #arrange data into list for labels and list of lists for attributes 17 | xList = [] 18 | for line in data: 19 | #split on comma 20 | row = line.strip().split(",") 21 | xList.append(row) 22 | 23 | glassNames = numpy.array(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type']) 24 | 25 | #Separate attributes and labels 26 | xNum = [] 27 | labels = [] 28 | 29 | for row in xList: 30 | labels.append(row.pop()) 31 | l = len(row) 32 | #eliminate ID 33 | attrRow = [float(row[i]) for i in range(1, l)] 34 | xNum.append(attrRow) 35 | 36 | #number of rows and columns in x matrix 37 | nrows = len(xNum) 38 | ncols = len(xNum[1]) 39 | 40 | #Labels are integers from 1 to 7 with no examples of 4. 
41 | #gb requires consecutive integers starting at 0 42 | newLabels = [] 43 | labelSet = set(labels) 44 | labelList = list(labelSet) 45 | labelList.sort() 46 | nlabels = len(labelList) 47 | for l in labels: 48 | index = labelList.index(l) 49 | newLabels.append(index) 50 | 51 | #Class populations: 52 | #old label new label num of examples 53 | #1 0 70 54 | #2 1 76 55 | #3 2 17 56 | #5 3 13 57 | #6 4 9 58 | #7 5 29 59 | # 60 | #Drawing 30% test sample may not preserve population proportions 61 | 62 | #stratified sampling by labels. 63 | xTemp = [xNum[i] for i in range(nrows) if newLabels[i] == 0] 64 | yTemp = [newLabels[i] for i in range(nrows) if newLabels[i] == 0] 65 | xTrain, xTest, yTrain, yTest = train_test_split(xTemp, yTemp, test_size=0.30, random_state=531) 66 | for iLabel in range(1, len(labelList)): 67 | #segregate x and y according to labels 68 | xTemp = [xNum[i] for i in range(nrows) if newLabels[i] == iLabel] 69 | yTemp = [newLabels[i] for i in range(nrows) if newLabels[i] == iLabel] 70 | 71 | #form train and test sets on segregated subset of examples 72 | xTrainTemp, xTestTemp, yTrainTemp, yTestTemp = train_test_split(xTemp, yTemp, test_size=0.30, random_state=531) 73 | 74 | #accumulate 75 | xTrain = numpy.append(xTrain, xTrainTemp, axis=0); xTest = numpy.append(xTest, xTestTemp, axis=0) 76 | yTrain = numpy.append(yTrain, yTrainTemp, axis=0); yTest = numpy.append(yTest, yTestTemp, axis=0) 77 | 78 | #instantiate model 79 | nEst = 500 80 | depth = 3 81 | learnRate = 0.003 82 | maxFeatures = 3 83 | subSamp = 0.5 84 | glassGBMModel = ensemble.GradientBoostingClassifier(n_estimators=nEst, max_depth=depth, 85 | learning_rate=learnRate, max_features=maxFeatures, 86 | subsample=subSamp) 87 | 88 | #train 89 | glassGBMModel.fit(xTrain, yTrain) 90 | 91 | # compute auc on test set as function of ensemble size 92 | missClassError = [] 93 | missClassBest = 1.0 94 | predictions = glassGBMModel.staged_decision_function(xTest) 95 | for p in predictions: 96 | missClass = 0 97 | for i in range(len(p)): 98 | listP = p[i].tolist() 99 | if listP.index(max(listP)) != yTest[i]: 100 | missClass += 1 101 | missClass = float(missClass)/len(p) 102 | 103 | missClassError.append(missClass) 104 | 105 | #capture best predictions 106 | if missClass < missClassBest: 107 | missClassBest = missClass 108 | pBest = p 109 | 110 | idxBest = missClassError.index(min(missClassError)) 111 | 112 | #print best values 113 | print("Best Missclassification Error" ) 114 | print(missClassBest) 115 | print("Number of Trees for Best Missclassification Error") 116 | print(idxBest) 117 | 118 | #plot training deviance and test auc's vs number of trees in ensemble 119 | missClassError = [100*mce for mce in missClassError] 120 | plot.figure() 121 | plot.plot(range(1, nEst + 1), glassGBMModel.train_score_, label='Training Set Deviance', linestyle=":") 122 | plot.plot(range(1, nEst + 1), missClassError, label='Test Set Error') 123 | plot.legend(loc='upper right') 124 | plot.xlabel('Number of Trees in Ensemble') 125 | plot.ylabel('Deviance / Classification Error') 126 | plot.show() 127 | 128 | # Plot feature importance 129 | featureImportance = glassGBMModel.feature_importances_ 130 | 131 | # normalize by max importance 132 | featureImportance = featureImportance / featureImportance.max() 133 | 134 | #plot variable importance 135 | idxSorted = numpy.argsort(featureImportance) 136 | barPos = numpy.arange(idxSorted.shape[0]) + .5 137 | plot.barh(barPos, featureImportance[idxSorted], align='center') 138 | plot.yticks(barPos, 
glassNames[idxSorted]) 139 | plot.xlabel('Variable Importance') 140 | plot.show() 141 | 142 | #generate confusion matrix for best prediction. 143 | pBestList = pBest.tolist() 144 | bestPrediction = [r.index(max(r)) for r in pBestList] 145 | confusionMat = confusion_matrix(yTest, bestPrediction) 146 | print('') 147 | print("Confusion Matrix") 148 | print(confusionMat) 149 | 150 | 151 | # Printed Output: 152 | # 153 | # nEst = 500 154 | # depth = 3 155 | # learnRate = 0.003 156 | # maxFeatures = None 157 | # subSamp = 0.5 158 | # 159 | # 160 | # Best Missclassification Error 161 | # 0.242424242424 162 | # Number of Trees for Best Missclassification Error 163 | # 113 164 | # 165 | # Confusion Matrix 166 | # [[19 1 0 0 0 1] 167 | # [ 3 19 0 1 0 0] 168 | # [ 4 1 0 0 1 0] 169 | # [ 0 3 0 1 0 0] 170 | # [ 0 0 0 0 3 0] 171 | # [ 0 1 0 1 0 7]] 172 | # 173 | 174 | 175 | 176 | # For gradient boosting using random forest base learners 177 | # nEst = 500 178 | # depth = 3 179 | # learnRate = 0.003 180 | # maxFeatures = 3 181 | # subSamp = 0.5 182 | # 183 | # 184 | # 185 | # Best Missclassification Error 186 | # 0.227272727273 187 | # Number of Trees for Best Missclassification Error 188 | # 267 189 | # 190 | # Confusion Matrix 191 | # [[20 1 0 0 0 0] 192 | # [ 3 20 0 0 0 0] 193 | # [ 3 3 0 0 0 0] 194 | # [ 0 4 0 0 0 0] 195 | # [ 0 0 0 0 3 0] 196 | # [ 0 2 0 0 0 7]] -------------------------------------------------------------------------------- /07/glassRF.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | 3 | import urllib2 4 | from math import sqrt, fabs, exp 5 | import matplotlib.pyplot as plot 6 | from sklearn.linear_model import enet_path 7 | from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve 8 | from sklearn.cross_validation import train_test_split 9 | from sklearn import ensemble 10 | import numpy 11 | 12 | target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data" 13 | data = urllib2.urlopen(target_url) 14 | 15 | #arrange data into list for labels and list of lists for attributes 16 | xList = [] 17 | for line in data: 18 | #split on comma 19 | row = line.strip().split(",") 20 | xList.append(row) 21 | 22 | glassNames = numpy.array(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type']) 23 | 24 | #Separate attributes and labels 25 | xNum = [] 26 | labels = [] 27 | 28 | for row in xList: 29 | labels.append(row.pop()) 30 | l = len(row) 31 | #eliminate ID 32 | attrRow = [float(row[i]) for i in range(1, l)] 33 | xNum.append(attrRow) 34 | 35 | #number of rows and columns in x matrix 36 | nrows = len(xNum) 37 | ncols = len(xNum[1]) 38 | 39 | #Labels are integers from 1 to 7 with no examples of 4. 40 | #gb requires consecutive integers starting at 0 41 | newLabels = [] 42 | labelSet = set(labels) 43 | labelList = list(labelSet) 44 | labelList.sort() 45 | nlabels = len(labelList) 46 | for l in labels: 47 | index = labelList.index(l) 48 | newLabels.append(index) 49 | 50 | #Class populations: 51 | #old label new label num of examples 52 | #1 0 70 53 | #2 1 76 54 | #3 2 17 55 | #5 3 13 56 | #6 4 9 57 | #7 5 29 58 | # 59 | #Drawing 30% test sample may not preserve population proportions 60 | 61 | #stratified sampling by labels. 
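#(each class is split into its own 70/30 train/test subsets and the pieces are then
# concatenated, so even the smallest classes - 9 and 13 examples - land in both sets;
# a single random 30% draw over the full data set could under-represent or even miss them)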
62 | xTemp = [xNum[i] for i in range(nrows) if newLabels[i] == 0] 63 | yTemp = [newLabels[i] for i in range(nrows) if newLabels[i] == 0] 64 | xTrain, xTest, yTrain, yTest = train_test_split(xTemp, yTemp, test_size=0.30, random_state=531) 65 | for iLabel in range(1, len(labelList)): 66 | #segregate x and y according to labels 67 | xTemp = [xNum[i] for i in range(nrows) if newLabels[i] == iLabel] 68 | yTemp = [newLabels[i] for i in range(nrows) if newLabels[i] == iLabel] 69 | 70 | #form train and test sets on segregated subset of examples 71 | xTrainTemp, xTestTemp, yTrainTemp, yTestTemp = train_test_split(xTemp, yTemp, test_size=0.30, random_state=531) 72 | 73 | #accumulate 74 | xTrain = numpy.append(xTrain, xTrainTemp, axis=0); xTest = numpy.append(xTest, xTestTemp, axis=0) 75 | yTrain = numpy.append(yTrain, yTrainTemp, axis=0); yTest = numpy.append(yTest, yTestTemp, axis=0) 76 | 77 | missCLassError = [] 78 | nTreeList = range(50, 2000, 50) 79 | for iTrees in nTreeList: 80 | depth = None 81 | maxFeat = 4 #try tweaking 82 | glassRFModel = ensemble.RandomForestClassifier(n_estimators=iTrees, max_depth=depth, max_features=maxFeat, 83 | oob_score=False, random_state=531) 84 | 85 | glassRFModel.fit(xTrain,yTrain) 86 | 87 | #Accumulate auc on test set 88 | prediction = glassRFModel.predict(xTest) 89 | correct = accuracy_score(yTest, prediction) 90 | 91 | missCLassError.append(1.0 - correct) 92 | 93 | print("Missclassification Error" ) 94 | print(missCLassError[-1]) 95 | 96 | #generate confusion matrix 97 | pList = prediction.tolist() 98 | confusionMat = confusion_matrix(yTest, pList) 99 | print('') 100 | print("Confusion Matrix") 101 | print(confusionMat) 102 | 103 | 104 | 105 | #plot training and test errors vs number of trees in ensemble 106 | plot.plot(nTreeList, missCLassError) 107 | plot.xlabel('Number of Trees in Ensemble') 108 | plot.ylabel('Missclassification Error Rate') 109 | #plot.ylim([0.0, 1.1*max(mseOob)]) 110 | plot.show() 111 | 112 | # Plot feature importance 113 | featureImportance = glassRFModel.feature_importances_ 114 | 115 | # normalize by max importance 116 | featureImportance = featureImportance / featureImportance.max() 117 | 118 | #plot variable importance 119 | idxSorted = numpy.argsort(featureImportance) 120 | barPos = numpy.arange(idxSorted.shape[0]) + .5 121 | plot.barh(barPos, featureImportance[idxSorted], align='center') 122 | plot.yticks(barPos, glassNames[idxSorted]) 123 | plot.xlabel('Variable Importance') 124 | plot.show() 125 | 126 | 127 | # Printed Output: 128 | # Missclassification Error 129 | # 0.227272727273 130 | # 131 | # Confusion Matrix 132 | # [[17 1 2 0 0 1] 133 | # [ 2 18 1 2 0 0] 134 | # [ 3 0 3 0 0 0] 135 | # [ 0 0 0 4 0 0] 136 | # [ 0 1 0 0 2 0] 137 | # [ 0 2 0 0 0 7]] 138 | -------------------------------------------------------------------------------- /07/rocksVMinesGBM.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | 3 | import urllib2 4 | from math import sqrt, fabs, exp 5 | import matplotlib.pyplot as plot 6 | from sklearn.cross_validation import train_test_split 7 | from sklearn import ensemble 8 | from sklearn.metrics import roc_auc_score, roc_curve 9 | import numpy 10 | 11 | #read data from uci data repository 12 | target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data" 13 | data = urllib2.urlopen(target_url) 14 | 15 | 16 | #arrange data into list for labels and list of lists for attributes 17 | 
xList = [] 18 | 19 | 20 | for line in data: 21 | #split on comma 22 | row = line.strip().split(",") 23 | xList.append(row) 24 | 25 | #separate labels from attributes, convert from attributes from string to numeric and convert "M" to 1 and "R" to 0 26 | 27 | xNum = [] 28 | labels = [] 29 | 30 | for row in xList: 31 | lastCol = row.pop() 32 | if lastCol == "M": 33 | labels.append(1) 34 | else: 35 | labels.append(0) 36 | attrRow = [float(elt) for elt in row] 37 | xNum.append(attrRow) 38 | 39 | #number of rows and columns in x matrix 40 | nrows = len(xNum) 41 | ncols = len(xNum[1]) 42 | 43 | #form x and y into numpy arrays and make up column names 44 | X = numpy.array(xNum) 45 | y = numpy.array(labels) 46 | rockVMinesNames = numpy.array(['V' + str(i) for i in range(ncols)]) 47 | 48 | #break into training and test sets. 49 | xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.30, random_state=531) 50 | 51 | #instantiate model 52 | nEst = 2000 53 | depth = 3 54 | learnRate = 0.007 55 | maxFeatures = 20 56 | rockVMinesGBMModel = ensemble.GradientBoostingClassifier(n_estimators=nEst, max_depth=depth, 57 | learning_rate=learnRate, 58 | max_features=maxFeatures) 59 | #train 60 | rockVMinesGBMModel.fit(xTrain, yTrain) 61 | 62 | # compute auc on test set as function of ensemble size 63 | auc = [] 64 | aucBest = 0.0 65 | predictions = rockVMinesGBMModel.staged_decision_function(xTest) 66 | for p in predictions: 67 | aucCalc = roc_auc_score(yTest, p) 68 | auc.append(aucCalc) 69 | 70 | #capture best predictions 71 | if aucCalc > aucBest: 72 | aucBest = aucCalc 73 | pBest = p 74 | 75 | idxBest = auc.index(max(auc)) 76 | 77 | #print best values 78 | print("Best AUC" ) 79 | print(auc[idxBest]) 80 | print("Number of Trees for Best AUC") 81 | print(idxBest) 82 | 83 | #plot training deviance and test auc's vs number of trees in ensemble 84 | plot.figure() 85 | plot.plot(range(1, nEst + 1), rockVMinesGBMModel.train_score_, label='Training Set Deviance', linestyle=":") 86 | plot.plot(range(1, nEst + 1), auc, label='Test Set AUC') 87 | plot.legend(loc='upper right') 88 | plot.xlabel('Number of Trees in Ensemble') 89 | plot.ylabel('Deviance / AUC') 90 | plot.show() 91 | 92 | # Plot feature importance 93 | featureImportance = rockVMinesGBMModel.feature_importances_ 94 | 95 | # normalize by max importance 96 | featureImportance = featureImportance / featureImportance.max() 97 | 98 | #plot importance of top 30 99 | idxSorted = numpy.argsort(featureImportance)[30:60] 100 | 101 | barPos = numpy.arange(idxSorted.shape[0]) + .5 102 | plot.barh(barPos, featureImportance[idxSorted], align='center') 103 | plot.yticks(barPos, rockVMinesNames[idxSorted]) 104 | plot.xlabel('Variable Importance') 105 | plot.show() 106 | 107 | #pick some threshold values and calc confusion matrix for best predictions 108 | #notice that GBM predictions don't fall in range of (0, 1) 109 | 110 | #plot best version of ROC curve 111 | fpr, tpr, thresh = roc_curve(yTest, list(pBest)) 112 | ctClass = [i*0.01 for i in range(101)] 113 | 114 | plot.plot(fpr, tpr, linewidth=2) 115 | plot.plot(ctClass, ctClass, linestyle=':') 116 | plot.xlabel('False Positive Rate') 117 | plot.ylabel('True Positive Rate') 118 | plot.show() 119 | 120 | #pick some threshold values and calc confusion matrix for best predictions 121 | #notice that GBM predictions don't fall in range of (0, 1) 122 | #pick threshold values at 25th, 50th and 75th percentiles 123 | idx25 = int(len(thresh) * 0.25) 124 | idx50 = int(len(thresh) * 0.50) 125 | idx75 = int(len(thresh) * 
0.75) 126 | 127 | #calculate total points, total positives and total negatives 128 | totalPts = len(yTest) 129 | P = sum(yTest) 130 | N = totalPts - P 131 | 132 | print('') 133 | print('Confusion Matrices for Different Threshold Values') 134 | 135 | #25th 136 | TP = tpr[idx25] * P; FN = P - TP; FP = fpr[idx25] * N; TN = N - FP 137 | print('') 138 | print('Threshold Value = ', thresh[idx25]) 139 | print('TP = ', TP/totalPts, 'FP = ', FP/totalPts) 140 | print('FN = ', FN/totalPts, 'TN = ', TN/totalPts) 141 | 142 | #50th 143 | TP = tpr[idx50] * P; FN = P - TP; FP = fpr[idx50] * N; TN = N - FP 144 | print('') 145 | print('Threshold Value = ', thresh[idx50]) 146 | print('TP = ', TP/totalPts, 'FP = ', FP/totalPts) 147 | print('FN = ', FN/totalPts, 'TN = ', TN/totalPts) 148 | 149 | #75th 150 | TP = tpr[idx75] * P; FN = P - TP; FP = fpr[idx75] * N; TN = N - FP 151 | print('') 152 | print('Threshold Value = ', thresh[idx75]) 153 | print('TP = ', TP/totalPts, 'FP = ', FP/totalPts) 154 | print('FN = ', FN/totalPts, 'TN = ', TN/totalPts) 155 | 156 | 157 | # Printed Output: 158 | # 159 | # Best AUC 160 | # 0.936105476673 161 | # Number of Trees for Best AUC 162 | # 1989 163 | # 164 | # Confusion Matrices for Different Threshold Values 165 | # 166 | # ('Threshold Value = ', 6.2941249291909935) 167 | # ('TP = ', 0.23809523809523808, 'FP = ', 0.015873015873015872) 168 | # ('FN = ', 0.30158730158730157, 'TN = ', 0.44444444444444442) 169 | # 170 | # ('Threshold Value = ', 2.2710265370949441) 171 | # ('TP = ', 0.44444444444444442, 'FP = ', 0.063492063492063489) 172 | # ('FN = ', 0.095238095238095233, 'TN = ', 0.3968253968253968) 173 | # 174 | # ('Threshold Value = ', -3.0947902666953317) 175 | # ('TP = ', 0.53968253968253965, 'FP = ', 0.22222222222222221) 176 | # ('FN = ', 0.0, 'TN = ', 0.23809523809523808) 177 | # 178 | # 179 | # Printed Output with max_features = 20 (Random Forest base learners): 180 | # 181 | # Best AUC 182 | # 0.956389452333 183 | # Number of Trees for Best AUC 184 | # 1426 185 | # 186 | # Confusion Matrices for Different Threshold Values 187 | # 188 | # ('Threshold Value = ', 5.8332200248698536) 189 | # ('TP = ', 0.23809523809523808, 'FP = ', 0.015873015873015872) 190 | # ('FN = ', 0.30158730158730157, 'TN = ', 0.44444444444444442) 191 | # 192 | # ('Threshold Value = ', 2.0281780133610567) 193 | # ('TP = ', 0.47619047619047616, 'FP = ', 0.031746031746031744) 194 | # ('FN = ', 0.063492063492063489, 'TN = ', 0.42857142857142855) 195 | # 196 | # ('Threshold Value = ', -1.2965629080181333) 197 | # ('TP = ', 0.53968253968253965, 'FP = ', 0.22222222222222221) 198 | # ('FN = ', 0.0, 'TN = ', 0.23809523809523808) -------------------------------------------------------------------------------- /07/rocksVMinesRF.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | 3 | import urllib2 4 | from math import sqrt, fabs, exp 5 | import matplotlib.pyplot as plot 6 | from sklearn.cross_validation import train_test_split 7 | from sklearn import ensemble 8 | from sklearn.metrics import roc_auc_score, roc_curve 9 | import numpy 10 | 11 | #read data from uci data repository 12 | target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data" 13 | data = urllib2.urlopen(target_url) 14 | 15 | #arrange data into list for labels and list of lists for attributes 16 | xList = [] 17 | 18 | for line in data: 19 | #split on comma 20 | row = line.strip().split(",") 21 | xList.append(row) 22 
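#Aside: this script and rocksVMinesGBM.py both turn a point on the ROC curve into a
#confusion matrix via TP = tpr * P, FN = P - TP, FP = fpr * N, TN = N - FP.
#A minimal helper sketch of that arithmetic (confusionFromROC is an illustrative
#name, not defined in the original scripts):
#    def confusionFromROC(tprVal, fprVal, P, N):
#        TP = tprVal * P; FN = P - TP
#        FP = fprVal * N; TN = N - FP
#        return TP, FP, FN, TN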
| 23 | #separate labels from attributes, convert attributes from string to numeric and convert "M" to 1 and "R" to 0 24 | 25 | xNum = [] 26 | labels = [] 27 | 28 | for row in xList: 29 | lastCol = row.pop() 30 | if lastCol == "M": 31 | labels.append(1) 32 | else: 33 | labels.append(0) 34 | attrRow = [float(elt) for elt in row] 35 | xNum.append(attrRow) 36 | 37 | #number of rows and columns in x matrix 38 | nrows = len(xNum) 39 | ncols = len(xNum[1]) 40 | 41 | #form x and y into numpy arrays and make up column names 42 | X = numpy.array(xNum) 43 | y = numpy.array(labels) 44 | rocksVMinesNames = numpy.array(['V' + str(i) for i in range(ncols)]) 45 | 46 | #break into training and test sets. 47 | xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.30, random_state=531) 48 | 49 | auc = [] 50 | nTreeList = range(50, 2000, 50) 51 | for iTrees in nTreeList: 52 | depth = None 53 | maxFeat = 8 #try tweaking 54 | rocksVMinesRFModel = ensemble.RandomForestClassifier(n_estimators=iTrees, max_depth=depth, max_features=maxFeat, 55 | oob_score=False, random_state=531) 56 | 57 | rocksVMinesRFModel.fit(xTrain,yTrain) 58 | 59 | #Accumulate auc on test set 60 | prediction = rocksVMinesRFModel.predict_proba(xTest) 61 | aucCalc = roc_auc_score(yTest, prediction[:,1:2]) 62 | auc.append(aucCalc) 63 | 64 | print("AUC") 65 | print(auc[-1]) 66 | 67 | 68 | #plot test-set AUC vs number of trees in ensemble 69 | plot.plot(nTreeList, auc) 70 | plot.xlabel('Number of Trees in Ensemble') 71 | plot.ylabel('Area Under ROC Curve - AUC') 72 | #plot.ylim([0.0, 1.1*max(mseOob)]) 73 | plot.show() 74 | 75 | # Plot feature importance 76 | featureImportance = rocksVMinesRFModel.feature_importances_ 77 | 78 | # normalize by max importance 79 | featureImportance = featureImportance / featureImportance.max() 80 | 81 | #plot importance of top 30 82 | idxSorted = numpy.argsort(featureImportance)[30:60] 83 | idxTemp = numpy.argsort(featureImportance)[::-1] 84 | print(idxTemp) 85 | barPos = numpy.arange(idxSorted.shape[0]) + .5 86 | plot.barh(barPos, featureImportance[idxSorted], align='center') 87 | plot.yticks(barPos, rocksVMinesNames[idxSorted]) 88 | plot.xlabel('Variable Importance') 89 | plot.show() 90 | 91 | #plot ROC curve for the final (largest) ensemble 92 | fpr, tpr, thresh = roc_curve(yTest, list(prediction[:,1:2])) 93 | ctClass = [i*0.01 for i in range(101)] 94 | 95 | plot.plot(fpr, tpr, linewidth=2) 96 | plot.plot(ctClass, ctClass, linestyle=':') 97 | plot.xlabel('False Positive Rate') 98 | plot.ylabel('True Positive Rate') 99 | plot.show() 100 | 101 | #pick some threshold values and calc confusion matrix for the probability predictions 102 | #unlike the GBM decision-function scores, predict_proba outputs do fall in the range (0, 1) 103 | #pick threshold values at 25th, 50th and 75th percentiles 104 | idx25 = int(len(thresh) * 0.25) 105 | idx50 = int(len(thresh) * 0.50) 106 | idx75 = int(len(thresh) * 0.75) 107 | 108 | #calculate total points, total positives and total negatives 109 | totalPts = len(yTest) 110 | P = sum(yTest) 111 | N = totalPts - P 112 | 113 | print('') 114 | print('Confusion Matrices for Different Threshold Values') 115 | 116 | #25th 117 | TP = tpr[idx25] * P; FN = P - TP; FP = fpr[idx25] * N; TN = N - FP 118 | print('') 119 | print('Threshold Value = ', thresh[idx25]) 120 | print('TP = ', TP/totalPts, 'FP = ', FP/totalPts) 121 | print('FN = ', FN/totalPts, 'TN = ', TN/totalPts) 122 | 123 | #50th 124 | TP = tpr[idx50] * P; FN = P - TP; FP = fpr[idx50] * N; TN = N - FP 125 | print('') 126 | print('Threshold Value = ', thresh[idx50]) 127
| print('TP = ', TP/totalPts, 'FP = ', FP/totalPts) 128 | print('FN = ', FN/totalPts, 'TN = ', TN/totalPts) 129 | 130 | #75th 131 | TP = tpr[idx75] * P; FN = P - TP; FP = fpr[idx75] * N; TN = N - FP 132 | print('') 133 | print('Threshold Value = ', thresh[idx75]) 134 | print('TP = ', TP/totalPts, 'FP = ', FP/totalPts) 135 | print('FN = ', FN/totalPts, 'TN = ', TN/totalPts) 136 | 137 | 138 | # Printed Output: 139 | # 140 | # AUC 141 | # 0.950304259635 142 | # 143 | # Confusion Matrices for Different Threshold Values 144 | # 145 | # ('Threshold Value = ', 0.76051282051282054) 146 | # ('TP = ', 0.25396825396825395, 'FP = ', 0.0) 147 | # ('FN = ', 0.2857142857142857, 'TN = ', 0.46031746031746029) 148 | # 149 | # ('Threshold Value = ', 0.62461538461538457) 150 | # ('TP = ', 0.46031746031746029, 'FP = ', 0.047619047619047616) 151 | # ('FN = ', 0.079365079365079361, 'TN = ', 0.41269841269841268) 152 | # 153 | # ('Threshold Value = ', 0.46564102564102566) 154 | # ('TP = ', 0.53968253968253965, 'FP = ', 0.22222222222222221) 155 | # ('FN = ', 0.0, 'TN = ', 0.23809523809523808) -------------------------------------------------------------------------------- /07/timingComparisons.txt: -------------------------------------------------------------------------------- 1 | timing and perf comparisons 2 | 3 | dataset algo training time perf perf metric 4 | 5 | glass - RF 2000 trees 0:00:02.354401 0.227272727273 class error 6 | glass - gbm 500 trees 0:00:03.879308 0.227272727273 7 | glass - lasso 0:00:12.296948 0.373831775701 8 | 9 | rvmines rf 2000 trees 0:00:02.760755 0.950304259635 auc 10 | rvmines gbm 2000 trees 0:00:04.201122 0.956389452333 auc 11 | rvmines enet 0:00:00.519870* 0.868672796508 12 | 13 | abalone rf 500 trees 0:00:08.060850 4.30971555911 MSE 14 | abalone gbm 2000 trees 0:00:22.726849 4.20153525438 mse 15 | 16 | wine rf 500 trees 0:00:02.665874 0.314125711509 mse 17 | wine gbm 2000 trees 0:00:13.081342 0.355898056894 mse 18 | wine lasso-expanded 0:00:00.646788* 0.434528740430 19 | 20 | 21 | 22 | *time per cross-validation fold 23 | 24 | 25 | -------------------------------------------------------------------------------- /07/wineBagging.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | import matplotlib.pyplot as plot 6 | from sklearn import tree 7 | from sklearn.tree import DecisionTreeRegressor 8 | from math import floor 9 | import random 10 | 11 | 12 | # Read wine quality data from UCI website 13 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 14 | data = urllib2.urlopen(target_url) 15 | 16 | xList = [] 17 | labels = [] 18 | names = [] 19 | firstLine = True 20 | for line in data: 21 | if firstLine: 22 | names = line.strip().split(";") 23 | firstLine = False 24 | else: 25 | #split on semi-colon 26 | row = line.strip().split(";") 27 | #put labels in separate array 28 | labels.append(float(row[-1])) 29 | #remove label from row 30 | row.pop() 31 | #convert row to floats 32 | floatRow = [float(num) for num in row] 33 | xList.append(floatRow) 34 | 35 | nrows = len(xList) 36 | ncols = len(xList[0]) 37 | 38 | 39 | #take fixed test set 30% of sample 40 | nSample = int(nrows * 0.30) 41 | idxTest = random.sample(range(nrows), nSample) 42 | idxTest.sort() 43 | idxTrain = [idx for idx in range(nrows) if not(idx in idxTest)] 44 | 45 | #Define test and training attribute and label sets 46 | xTrain = [xList[r] for r in idxTrain] 47 | 
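#Aside: the 30% test split above is drawn with random.sample without a fixed seed,
#so the bagging MSE figures reported at the bottom of this file can vary from run
#to run. One way to make the split repeatable is to seed the module random-number
#generator before sampling; 531 is only a suggested value, chosen to match the
#random_state used in the other chapter 07 scripts:
#    random.seed(531)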
xTest = [xList[r] for r in idxTest] 48 | yTrain = [labels[r] for r in idxTrain] 49 | yTest = [labels[r] for r in idxTest] 50 | 51 | #train a series of models on random subsets of the training data 52 | #collect the models in a list and check error of composite as list grows 53 | 54 | #maximum number of models to generate 55 | numTreesMax = 100 56 | 57 | #tree depth - typically at the high end 58 | treeDepth = 5 59 | 60 | #initialize a list to hold models 61 | modelList = [] 62 | predList = [] 63 | 64 | #number of samples to draw for stochastic bagging 65 | bagFract = 0.5 66 | nBagSamples = int(len(xTrain) * bagFract) 67 | 68 | for iTrees in range(numTreesMax): 69 | idxBag = [] 70 | for i in range(nBagSamples): 71 | idxBag.append(random.choice(range(len(xTrain)))) 72 | xTrainBag = [xTrain[i] for i in idxBag] 73 | yTrainBag = [yTrain[i] for i in idxBag] 74 | 75 | modelList.append(DecisionTreeRegressor(max_depth=treeDepth)) 76 | modelList[-1].fit(xTrainBag, yTrainBag) 77 | 78 | #make prediction with latest model and add to list of predictions 79 | latestPrediction = modelList[-1].predict(xTest) 80 | predList.append(list(latestPrediction)) 81 | 82 | 83 | #build cumulative prediction from first "n" models 84 | mse = [] 85 | allPredictions = [] 86 | for iModels in range(len(modelList)): 87 | 88 | #average first "iModels" of the predictions 89 | prediction = [] 90 | for iPred in range(len(xTest)): 91 | prediction.append(sum([predList[i][iPred] for i in range(iModels + 1)])/(iModels + 1)) 92 | 93 | allPredictions.append(prediction) 94 | errors = [(yTest[i] - prediction[i]) for i in range(len(yTest))] 95 | mse.append(sum([e * e for e in errors]) / len(yTest)) 96 | 97 | nModels = [i + 1 for i in range(len(modelList))] 98 | 99 | plot.plot(nModels,mse) 100 | plot.axis('tight') 101 | plot.xlabel('Number of Models in Ensemble') 102 | plot.ylabel('Mean Squared Error') 103 | plot.ylim((0.0, max(mse))) 104 | plot.show() 105 | 106 | print('Minimum MSE') 107 | print(min(mse)) 108 | 109 | 110 | #With treeDepth = 5 111 | # bagFract = 0.5 112 | #Minimum MSE 113 | #0.429310223079 114 | 115 | #With treeDepth = 8 116 | # bagFract = 0.5 117 | #Minimum MSE 118 | #0.395838627928 119 | 120 | #With treeDepth = 10 121 | # bagFract = 1.0 122 | #Minimum MSE 123 | #0.313120547589 -------------------------------------------------------------------------------- /07/wineGBM.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import numpy 3 | from sklearn.cross_validation import train_test_split 4 | from sklearn import ensemble 5 | from sklearn.metrics import mean_squared_error 6 | import pylab as plot 7 | 8 | # Read wine quality data from UCI website 9 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 10 | data = urllib2.urlopen(target_url) 11 | 12 | xList = [] 13 | labels = [] 14 | names = [] 15 | firstLine = True 16 | for line in data: 17 | if firstLine: 18 | names = line.strip().split(";") 19 | firstLine = False 20 | else: 21 | #split on semi-colon 22 | row = line.strip().split(";") 23 | #put labels in separate array 24 | labels.append(float(row[-1])) 25 | #remove label from row 26 | row.pop() 27 | #convert row to floats 28 | floatRow = [float(num) for num in row] 29 | xList.append(floatRow) 30 | 31 | nrows = len(xList) 32 | ncols = len(xList[0]) 33 | 34 | X = numpy.array(xList) 35 | y = numpy.array(labels) 36 | wineNames = numpy.array(names) 37 | 38 | #take fixed holdout set 30% of data rows 39 | xTrain, xTest, 
yTrain, yTest = train_test_split(X, y, test_size=0.30, random_state=531) 40 | 41 | # Train gradient boosting model to minimize mean squared error 42 | nEst = 2000 43 | depth = 7 44 | learnRate = 0.01 45 | subSamp = 0.5 46 | wineGBMModel = ensemble.GradientBoostingRegressor(n_estimators=nEst, 47 | max_depth=depth, 48 | learning_rate=learnRate, 49 | subsample = subSamp, 50 | loss='ls') 51 | 52 | wineGBMModel.fit(xTrain, yTrain) 53 | 54 | # compute mse on test set 55 | msError = [] 56 | predictions = wineGBMModel.staged_predict(xTest) 57 | for p in predictions: 58 | msError.append(mean_squared_error(yTest, p)) 59 | 60 | print("MSE" ) 61 | print(min(msError)) 62 | print(msError.index(min(msError))) 63 | 64 | #plot training and test errors vs number of trees in ensemble 65 | plot.figure() 66 | plot.plot(range(1, nEst + 1), wineGBMModel.train_score_, label='Training Set MSE') 67 | plot.plot(range(1, nEst + 1), msError, label='Test Set MSE') 68 | plot.legend(loc='upper right') 69 | plot.xlabel('Number of Trees in Ensemble') 70 | plot.ylabel('Mean Squared Error') 71 | plot.show() 72 | 73 | # Plot feature importance 74 | featureImportance = wineGBMModel.feature_importances_ 75 | 76 | # normalize by max importance 77 | featureImportance = featureImportance / featureImportance.max() 78 | idxSorted = numpy.argsort(featureImportance) 79 | barPos = numpy.arange(idxSorted.shape[0]) + .5 80 | plot.barh(barPos, featureImportance[idxSorted], align='center') 81 | plot.yticks(barPos, wineNames[idxSorted]) 82 | plot.xlabel('Variable Importance') 83 | plot.subplots_adjust(left=0.2, right=0.9, top=0.9, bottom=0.1) 84 | plot.show() 85 | 86 | 87 | # Printed Output: 88 | # for: 89 | #nEst = 2000 90 | #depth = 7 91 | #learnRate = 0.01 92 | #subSamp = 0.5 93 | # 94 | # MSE 95 | # 0.313361215728 96 | # 840 97 | -------------------------------------------------------------------------------- /07/wineRF.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import numpy 3 | from sklearn.cross_validation import train_test_split 4 | from sklearn import ensemble 5 | from sklearn.metrics import mean_squared_error 6 | import pylab as plot 7 | 8 | 9 | # Read wine quality data from UCI website 10 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 11 | data = urllib2.urlopen(target_url) 12 | 13 | xList = [] 14 | labels = [] 15 | names = [] 16 | firstLine = True 17 | for line in data: 18 | if firstLine: 19 | names = line.strip().split(";") 20 | firstLine = False 21 | else: 22 | #split on semi-colon 23 | row = line.strip().split(";") 24 | #put labels in separate array 25 | labels.append(float(row[-1])) 26 | #remove label from row 27 | row.pop() 28 | #convert row to floats 29 | floatRow = [float(num) for num in row] 30 | xList.append(floatRow) 31 | 32 | nrows = len(xList) 33 | ncols = len(xList[0]) 34 | 35 | X = numpy.array(xList) 36 | y = numpy.array(labels) 37 | wineNames = numpy.array(names) 38 | 39 | #take fixed holdout set 30% of data rows 40 | xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.30, random_state=531) 41 | 42 | #train random forest at a range of ensemble sizes in order to see how the mse changes 43 | mseOos = [] 44 | nTreeList = range(50, 500, 10) 45 | for iTrees in nTreeList: 46 | depth = None 47 | maxFeat = 4 #try tweaking 48 | wineRFModel = ensemble.RandomForestRegressor(n_estimators=iTrees, max_depth=depth, max_features=maxFeat, 49 | oob_score=False, random_state=531) 50 | 51 | 
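#Aside: this loop refits a new forest from scratch for each ensemble size in
#nTreeList. RandomForestRegressor also supports warm_start=True, which keeps the
#trees already grown and only adds new ones when n_estimators is raised. A possible
#alternative sketch (incrementalRF is an illustrative name, not part of the
#original script):
#    incrementalRF = ensemble.RandomForestRegressor(warm_start=True, n_estimators=50,
#                                                   max_features=4, random_state=531)
#    for nTrees in nTreeList:
#        incrementalRF.set_params(n_estimators=nTrees)
#        incrementalRF.fit(xTrain, yTrain)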
wineRFModel.fit(xTrain,yTrain) 52 | 53 | #Accumulate mse on test set 54 | prediction = wineRFModel.predict(xTest) 55 | mseOos.append(mean_squared_error(yTest, prediction)) 56 | 57 | 58 | print("MSE") 59 | print(mseOos[-1]) 60 | 61 | 62 | #plot test-set MSE vs number of trees in ensemble 63 | plot.plot(nTreeList, mseOos) 64 | plot.xlabel('Number of Trees in Ensemble') 65 | plot.ylabel('Mean Squared Error') 66 | #plot.ylim([0.0, 1.1*max(mseOob)]) 67 | plot.show() 68 | 69 | # Plot feature importance 70 | featureImportance = wineRFModel.feature_importances_ 71 | 72 | # normalize by max importance 73 | featureImportance = featureImportance / featureImportance.max() 74 | sorted_idx = numpy.argsort(featureImportance) 75 | barPos = numpy.arange(sorted_idx.shape[0]) + .5 76 | plot.barh(barPos, featureImportance[sorted_idx], align='center') 77 | plot.yticks(barPos, wineNames[sorted_idx]) 78 | plot.xlabel('Variable Importance') 79 | plot.subplots_adjust(left=0.2, right=0.9, top=0.9, bottom=0.1) 80 | plot.show() 81 | 82 | #printed output 83 | #MSE 84 | #0.314125711509 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Machine Learning in Python: Essential Techniques for Predictive Analysis 2 | =================== 3 | This repository is a clone of the source code that accompanies the book. The original source is distributed by the publisher at: http://www.wiley.com/WileyCDA/WileyTitle/productCd-1118961749.html 4 | 5 | In the original source code, all of the data is fetched on the fly through urllib calls to the UCI repository, which is slow. This repository changes that so the data can be loaded from local files. --------------------------------------------------------------------------------
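A note on data loading: the README states that data loading was changed from on-the-fly urllib calls to local files, but the listings above still show the urllib2 pattern. A minimal sketch of the local-file variant for the wine-quality scripts, assuming winequality-red.csv has already been downloaded into the working directory; the helper name readLocalCsv is illustrative and not part of the original code:

def readLocalCsv(path, delimiter=";"):
    #read a delimiter-separated file with a header row of column names;
    #last column is the label, remaining columns are numeric attributes
    xList = []
    labels = []
    with open(path) as f:
        names = f.readline().strip().split(delimiter)
        for line in f:
            row = line.strip().split(delimiter)
            labels.append(float(row[-1]))
            xList.append([float(num) for num in row[:-1]])
    return names, xList, labels

names, xList, labels = readLocalCsv("winequality-red.csv")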
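The training times in 07/timingComparisons.txt are formatted like Python datetime.timedelta values (for example 0:00:02.354401). The scripts above do not show how those timings were collected; a minimal sketch of one way such a measurement could be taken around a model fit (the variable names are illustrative):

from datetime import datetime

startTime = datetime.now()
#fit the model of interest here, e.g. wineRFModel.fit(xTrain, yTrain)
elapsed = datetime.now() - startTime
print(elapsed)    #prints a timedelta such as 0:00:02.354401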