├── .gitignore
├── 01
    └── chapter01.txt
├── 02
    ├── abaloneCorrHeat.py
    ├── abaloneCorrMat.txt
    ├── abaloneParallelPlot.py
    ├── abaloneSummary.py
    ├── abaloneSummaryOutput.txt
    ├── chapter02.zip
    ├── corrCalc.py
    ├── corrPlot.py
    ├── glassCorrHeatMap.py
    ├── glassParallelPlot.py
    ├── glassSummary.py
    ├── glassSummary.txt
    ├── linePlots.py
    ├── outputRocksVMinesContents.txt
    ├── outputSummaryStats.txt
    ├── pandasReadSummarize.py
    ├── pandasReadSummarizeOutput.txt
    ├── qqplotAttribute.py
    ├── rVMSummaryStats.py
    ├── rockVmineContents.py
    ├── rockVmineSummaries.py
    ├── sampleCorrHeatMap.py
    ├── targetCorr.py
    ├── wineCorrHeatMap.py
    ├── wineParallelPlot.py
    ├── wineSummary.py
    └── wineSummary.txt
├── 03
    ├── chapter03.zip
    ├── classifierPerformance_RocksVMines.py
    ├── classifierPerformance_RocksVMinesOutput.txt
    ├── classifierRidgeRocksVMines.py
    ├── classifierRidgeRocksVMinesOutput.txt
    ├── fwdStepwiseWine.py
    ├── fwdStepwiseWineOutput.txt
    ├── regressionErrorMeasures.py
    ├── ridgeWine.py
    └── ridgeWineOutput.txt
├── 04
    ├── chapter04.zip
    ├── cvCurveDetails.txt
    ├── glmnetOrderedNamesList.txt
    ├── glmnetWine.py
    ├── larsAbalone.py
    ├── larsAbaloneOutput.txt
    ├── larsRocksVMines.py
    ├── larsWine.py
    ├── larsWine2.py
    ├── larsWineCV.py
    ├── orderedNamesList.txt
    ├── rocksVMinesCoefOrder.txt
    └── wineBasisExpand.py
├── 05
    ├── chapter05.zip
    ├── glass
    │   └── glassENetRegCV.py
    ├── rocksVMines
    │   ├── rocksVMinesCoefCurves.py
    │   ├── rocksVMinesCoefCurvesPrintedOutput.txt
    │   ├── rocksVMinesENetRegCV.py
    │   ├── rocksVMinesENetRegCVPrintedOutput.txt
    │   ├── rocksVMinesGlmnet.py
    │   └── rocksVMinesGlmnetPrintedOutput.txt
    └── wineCS
    │   ├── wineExpandedLassoCV.py
    │   ├── wineLassoCV.py
    │   ├── wineLassoCVPrintedOutputNormalizedX.txt
    │   ├── wineLassoCVPrintedOutputNormalizedXandY.txt
    │   ├── wineLassoCVPrintedOutputUn-NormalizedX.txt
    │   ├── wineLassoCoefCurves.py
    │   ├── wineLassoCoefCurvesPrintedOutput.txt
    │   └── wineLassoExpandedCVPrintedOutput.txt
├── 06
    ├── chapter06.zip
    ├── simpleBagging.py
    ├── simpleGBM.py
    ├── simpleTree.py
    ├── simpleTreeCV.py
    ├── wineBagging.py
    ├── wineGBM.py
    ├── wineRF.py
    └── wineTree.py
├── 07
    ├── abaloneGBM.py
    ├── abaloneRF.py
    ├── glassGbm.py
    ├── glassRF.py
    ├── rocksVMinesGBM.py
    ├── rocksVMinesRF.py
    ├── timingComparisons.txt
    ├── wineBagging.py
    ├── wineGBM.py
    └── wineRF.py
└── README.md
/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # dotenv 85 | .env 86 | 87 | # virtualenv 88 | .venv 89 | venv/ 90 | ENV/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | ### JetBrains template 98 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 99 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 100 | 101 | # User-specific stuff: 102 | .idea/**/workspace.xml 103 | .idea/**/tasks.xml 104 | .idea/dictionaries 105 | 106 | # Sensitive or high-churn files: 107 | .idea/**/dataSources/ 108 | .idea/**/dataSources.ids 109 | .idea/**/dataSources.xml 110 | .idea/**/dataSources.local.xml 111 | .idea/**/sqlDataSources.xml 112 | .idea/**/dynamic.xml 113 | .idea/**/uiDesigner.xml 114 | 115 | # Gradle: 116 | .idea/**/gradle.xml 117 | .idea/**/libraries 118 | 119 | # Mongo Explorer plugin: 120 | .idea/**/mongoSettings.xml 121 | 122 | ## File-based project format: 123 | *.iws 124 | 125 | ## Plugin-specific files: 126 | 127 | # IntelliJ 128 | /out/ 129 | .idea/* 130 | 131 | # mpeltonen/sbt-idea plugin 132 | .idea_modules/ 133 | 134 | # JIRA plugin 135 | atlassian-ide-plugin.xml 136 | 137 | # Crashlytics plugin (for Android Studio and IntelliJ) 138 | com_crashlytics_export_strings.xml 139 | crashlytics.properties 140 | crashlytics-build.properties 141 | fabric.properties 142 | 143 | -------------------------------------------------------------------------------- /01/chapter01.txt: -------------------------------------------------------------------------------- 1 | Chapter 1 of Machine Learning in Python has no code associated with it. 
2 | -------------------------------------------------------------------------------- /02/abaloneCorrHeat.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | import matplotlib.pyplot as plot 5 | 6 | target_url = ("http://archive.ics.uci.edu/ml/machine-" 7 | "learning-databases/abalone/abalone.data") 8 | #read abalone data 9 | abalone = pd.read_csv(target_url,header=None, prefix="V") 10 | abalone.columns = ['Sex', 'Length', 'Diameter', 'Height', 11 | 'Whole weight', 'Shucked weight', 12 | 'Viscera weight', 'Shell weight', 'Rings'] 13 | 14 | #calculate correlation matrix 15 | corMat = DataFrame(abalone.iloc[:,1:9].corr()) 16 | #print correlation matrix 17 | print(corMat) 18 | 19 | #visualize correlations using heatmap 20 | plot.pcolor(corMat) 21 | plot.show() 22 | 23 | -------------------------------------------------------------------------------- /02/abaloneCorrMat.txt: -------------------------------------------------------------------------------- 1 | Length Diameter Height Whole Wt Shucked Wt 2 | Length 1.000000 0.986812 0.827554 0.925261 0.897914 3 | Diameter 0.986812 1.000000 0.833684 0.925452 0.893162 4 | Height 0.827554 0.833684 1.000000 0.819221 0.774972 5 | Whole weight 0.925261 0.925452 0.819221 1.000000 0.969405 6 | Shucked weight 0.897914 0.893162 0.774972 0.969405 1.000000 7 | Viscera weight 0.903018 0.899724 0.798319 0.966375 0.931961 8 | Shell weight 0.897706 0.905330 0.817338 0.955355 0.882617 9 | Rings 0.556720 0.574660 0.557467 0.540390 0.420884 10 | 11 | Viscera weight Shell weight Rings 12 | Length 0.903018 0.897706 0.556720 13 | Diameter 0.899724 0.905330 0.574660 14 | Height 0.798319 0.817338 0.557467 15 | Whole weight 0.966375 0.955355 0.540390 16 | Shucked weight 0.931961 0.882617 0.420884 17 | Viscera weight 1.000000 0.907656 0.503819 18 | Shell weight 0.907656 1.000000 0.627574 19 | Rings 0.503819 0.627574 1.000000 20 | -------------------------------------------------------------------------------- /02/abaloneParallelPlot.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | import matplotlib.pyplot as plot 5 | from math import exp 6 | target_url = ("http://archive.ics.uci.edu/ml/machine-" 7 | "learning-databases/abalone/abalone.data") 8 | #read abalone data 9 | abalone = pd.read_csv(target_url,header=None, prefix="V") 10 | abalone.columns = ['Sex', 'Length', 'Diameter', 'Height', 11 | 'Whole Wt', 'Shucked Wt', 12 | 'Viscera Wt', 'Shell Wt', 'Rings'] 13 | #get summary to use for scaling 14 | summary = abalone.describe() 15 | minRings = summary.iloc[3,7] 16 | maxRings = summary.iloc[7,7] 17 | nrows = len(abalone.index) 18 | 19 | for i in range(nrows): 20 | #plot rows of data as if they were series data 21 | dataRow = abalone.iloc[i,1:8] 22 | labelColor = (abalone.iloc[i,8] - minRings) / (maxRings - minRings) 23 | dataRow.plot(color=plot.cm.RdYlBu(labelColor), alpha=0.5) 24 | 25 | plot.xlabel("Attribute Index") 26 | plot.ylabel(("Attribute Values")) 27 | plot.show() 28 | 29 | #renormalize using mean and standard variation, then compress 30 | # with logit function 31 | 32 | meanRings = summary.iloc[1,7] 33 | sdRings = summary.iloc[2,7] 34 | 35 | for i in range(nrows): 36 | #plot rows of data as if they were series data 37 | dataRow = abalone.iloc[i,1:8] 38 | normTarget = (abalone.iloc[i,8] - meanRings)/sdRings 39 | 
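# --- editor's note (added comment; not in the original script) ---
# The line above computes a z-score for the ring count and the line below
# squashes it with the logistic function 1.0/(1.0 + exp(-z)), so the color
# value always lands in (0, 1) for the RdYlBu colormap: an average abalone
# maps to 0.5, while unusually young or old ones approach 0 or 1 instead of
# saturating the scale the way the raw min-max scaling above can.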
labelColor = 1.0/(1.0 + exp(-normTarget)) 40 | dataRow.plot(color=plot.cm.RdYlBu(labelColor), alpha=0.5) 41 | 42 | plot.xlabel("Attribute Index") 43 | plot.ylabel(("Attribute Values")) 44 | plot.show() -------------------------------------------------------------------------------- /02/abaloneSummary.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | from pylab import * 5 | import matplotlib.pyplot as plot 6 | 7 | target_url = ("http://archive.ics.uci.edu/ml/machine-" 8 | "learning-databases/abalone/abalone.data") 9 | #read abalone data 10 | abalone = pd.read_csv(target_url,header=None, prefix="V") 11 | abalone.columns = ['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 12 | 'Shucked weight', 'Viscera weight', 'Shell weight', 13 | 'Rings'] 14 | 15 | 16 | print(abalone.head()) 17 | print(abalone.tail()) 18 | 19 | #print summary of data frame 20 | summary = abalone.describe() 21 | print(summary) 22 | 23 | #box plot the real-valued attributes 24 | #convert to array for plot routine 25 | array = abalone.iloc[:,1:9].values 26 | boxplot(array) 27 | plot.xlabel("Attribute Index") 28 | plot.ylabel(("Quartile Ranges")) 29 | show() 30 | 31 | #the last column (rings) is out of scale with the rest 32 | # - remove and replot 33 | array2 = abalone.iloc[:,1:8].values 34 | boxplot(array2) 35 | plot.xlabel("Attribute Index") 36 | plot.ylabel(("Quartile Ranges")) 37 | show() 38 | 39 | #removing is okay but renormalizing the variables generalizes better. 40 | #renormalize columns to zero mean and unit standard deviation 41 | #this is a common normalization and desirable for other operations 42 | # (like k-means clustering or k-nearest neighbors 43 | abaloneNormalized = abalone.iloc[:,1:9] 44 | 45 | 46 | for i in range(8): 47 | mean = summary.iloc[1, i] 48 | sd = summary.iloc[2, i] 49 | abaloneNormalized.iloc[:,i:(i + 1)] = ( 50 | abaloneNormalized.iloc[:,i:(i + 1)] - mean) / sd 51 | 52 | array3 = abaloneNormalized.values 53 | boxplot(array3) 54 | plot.xlabel("Attribute Index") 55 | plot.ylabel(("Quartile Ranges - Normalized ")) 56 | show() -------------------------------------------------------------------------------- /02/abaloneSummaryOutput.txt: -------------------------------------------------------------------------------- 1 | Sex Length Diameter Height Whole wt Shucked wt Viscera wt 2 | 0 M 0.455 0.365 0.095 0.5140 0.2245 0.1010 3 | 1 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 4 | 2 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 5 | 3 M 0.440 0.365 0.125 0.5160 0.2155 0.1140 6 | 4 I 0.330 0.255 0.080 0.2050 0.0895 0.0395 7 | 8 | Shell weight Rings 9 | 0 0.150 15 10 | 1 0.070 7 11 | 2 0.210 9 12 | 3 0.155 10 13 | 4 0.055 7 14 | Sex Length Diameter Height Whole weight Shucked weight 15 | 4172 F 0.565 0.450 0.165 0.8870 0.3700 16 | 4173 M 0.590 0.440 0.135 0.9660 0.4390 17 | 4174 M 0.600 0.475 0.205 1.1760 0.5255 18 | 4175 F 0.625 0.485 0.150 1.0945 0.5310 19 | 4176 M 0.710 0.555 0.195 1.9485 0.9455 20 | 21 | Viscera weight Shell weight Rings 22 | 4172 0.2390 0.2490 11 23 | 4173 0.2145 0.2605 10 24 | 4174 0.2875 0.3080 9 25 | 4175 0.2610 0.2960 10 26 | 4176 0.3765 0.4950 12 27 | Length Diameter Height Whole wt Shucked wt 28 | count 4177.000000 4177.000000 4177.000000 4177.000000 4177.000000 29 | mean 0.523992 0.407881 0.139516 0.828742 0.359367 30 | std 0.120093 0.099240 0.041827 0.490389 0.221963 31 | min 0.075000 0.055000 0.000000 0.002000 0.001000 32 | 25% 0.450000 0.350000 
0.115000 0.441500 0.186000 33 | 50% 0.545000 0.425000 0.140000 0.799500 0.336000 34 | 75% 0.615000 0.480000 0.165000 1.153000 0.502000 35 | max 0.815000 0.650000 1.130000 2.825500 1.488000 36 | 37 | Viscera weight Shell weight Rings 38 | count 4177.000000 4177.000000 4177.000000 39 | mean 0.180594 0.238831 9.933684 40 | std 0.109614 0.139203 3.224169 41 | min 0.000500 0.001500 1.000000 42 | 25% 0.093500 0.130000 8.000000 43 | 50% 0.171000 0.234000 9.000000 44 | 75% 0.253000 0.329000 11.000000 45 | max 0.760000 1.005000 29.000000 46 | -------------------------------------------------------------------------------- /02/chapter02.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/derekhe/machine-learning-in-python-essential-techniques-for-predictive-analysis-source/d91f60cc29fdbaad32819058f9b2742e955e586a/02/chapter02.zip -------------------------------------------------------------------------------- /02/corrCalc.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | from math import sqrt 5 | import sys 6 | target_url = ("https://archive.ics.uci.edu/ml/machine-learning-" 7 | "databases/undocumented/connectionist-bench/sonar/sonar.all-data") 8 | 9 | #read rocks versus mines data into pandas data frame 10 | rocksVMines = pd.read_csv(target_url,header=None, prefix="V") 11 | 12 | #calculate correlations between real-valued attributes 13 | dataRow2 = rocksVMines.iloc[1,0:60] 14 | dataRow3 = rocksVMines.iloc[2,0:60] 15 | dataRow21 = rocksVMines.iloc[20,0:60] 16 | 17 | mean2 = 0.0; mean3 = 0.0; mean21 = 0.0 18 | numElt = len(dataRow2) 19 | for i in range(numElt): 20 | mean2 += dataRow2[i]/numElt 21 | mean3 += dataRow3[i]/numElt 22 | mean21 += dataRow21[i]/numElt 23 | 24 | var2 = 0.0; var3 = 0.0; var21 = 0.0 25 | for i in range(numElt): 26 | var2 += (dataRow2[i] - mean2) * (dataRow2[i] - mean2)/numElt 27 | var3 += (dataRow3[i] - mean3) * (dataRow3[i] - mean3)/numElt 28 | var21 += (dataRow21[i] - mean21) * (dataRow21[i] - mean21)/numElt 29 | 30 | corr23 = 0.0; corr221 = 0.0 31 | for i in range(numElt): 32 | corr23 += (dataRow2[i] - mean2) * \ 33 | (dataRow3[i] - mean3) / (sqrt(var2*var3) * numElt) 34 | corr221 += (dataRow2[i] - mean2) * \ 35 | (dataRow21[i] - mean21) / (sqrt(var2*var21) * numElt) 36 | 37 | sys.stdout.write("Correlation between attribute 2 and 3 \n") 38 | print(corr23) 39 | sys.stdout.write(" \n") 40 | 41 | sys.stdout.write("Correlation between attribute 2 and 21 \n") 42 | print(corr221) 43 | sys.stdout.write(" \n") 44 | 45 | 46 | # Output: 47 | # Correlation between attribute 2 and 3 48 | # 0.770938121191 49 | # 50 | # Correlation between attribute 2 and 21 51 | # 0.466548080789 -------------------------------------------------------------------------------- /02/corrPlot.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | import matplotlib.pyplot as plot 5 | target_url = ("https://archive.ics.uci.edu/ml/machine-learning-" 6 | "databases/undocumented/connectionist-bench/sonar/sonar.all-data") 7 | 8 | #read rocks versus mines data into pandas data frame 9 | rocksVMines = pd.read_csv(target_url,header=None, prefix="V") 10 | 11 | #calculate correlations between real-valued attributes 12 | dataRow2 = rocksVMines.iloc[1,0:60] 13 | dataRow3 = rocksVMines.iloc[2,0:60] 14 | 15 | 
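# --- editor's aside (added comment; not in the original script) ---
# corrCalc.py above computes the Pearson correlation by hand:
#     corr(x, y) = sum((x_i - mean_x)*(y_i - mean_y)) / (n * sd_x * sd_y)
# numpy gives the same number directly, e.g. (hedged sketch):
#     import numpy as np
#     np.corrcoef(dataRow2, dataRow3)[0, 1]   # ~0.771, matching corrCalc.py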
plot.scatter(dataRow2, dataRow3) 16 | 17 | 18 | plot.xlabel("2nd Attribute") 19 | plot.ylabel(("3rd Attribute")) 20 | plot.show() 21 | 22 | dataRow21 = rocksVMines.iloc[20,0:60] 23 | 24 | plot.scatter(dataRow2, dataRow21) 25 | 26 | 27 | plot.xlabel("2nd Attribute") 28 | plot.ylabel(("21st Attribute")) 29 | plot.show() -------------------------------------------------------------------------------- /02/glassCorrHeatMap.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | from pylab import * 5 | import matplotlib.pyplot as plot 6 | from math import exp 7 | 8 | target_url = ("https://archive.ics.uci.edu/ml/machine-" 9 | "learning-databases/glass/glass.data") 10 | glass = pd.read_csv(target_url,header=None, prefix="V") 11 | glass.columns = ['Id', 'RI', 'Na', 'Mg', 'Al', 'Si', 12 | 'K', 'Ca', 'Ba', 'Fe', 'Type'] 13 | ncols = len(glass.columns) 14 | 15 | #calculate correlation matrix 16 | corMat = DataFrame(glass.iloc[:, 1:(ncols - 1)].corr()) 17 | 18 | #visualize correlations using heatmap 19 | plot.pcolor(corMat) 20 | plot.show() -------------------------------------------------------------------------------- /02/glassParallelPlot.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | from pylab import * 5 | import matplotlib.pyplot as plot 6 | 7 | target_url = ("https://archive.ics.uci.edu/ml/machine-" 8 | "learning-databases/glass/glass.data") 9 | glass = pd.read_csv(target_url,header=None, prefix="V") 10 | glass.columns = ['Id', 'RI', 'Na', 'Mg', 'Al', 'Si', 11 | 'K', 'Ca', 'Ba', 'Fe', 'Type'] 12 | 13 | 14 | glassNormalized = glass 15 | ncols = len(glassNormalized.columns) 16 | nrows = len(glassNormalized.index) 17 | summary = glassNormalized.describe() 18 | nDataCol = ncols - 1 19 | 20 | #normalize except for labels 21 | for i in range(ncols - 1): 22 | mean = summary.iloc[1, i] 23 | sd = summary.iloc[2, i] 24 | glassNormalized.iloc[:,i:(i + 1)] = \ 25 | (glassNormalized.iloc[:,i:(i + 1)] - mean) / sd 26 | 27 | #Plot Parallel Coordinate Graph with normalized values 28 | for i in range(nrows): 29 | #plot rows of data as if they were series data 30 | dataRow = glassNormalized.iloc[i,1:nDataCol] 31 | labelColor = glassNormalized.iloc[i,nDataCol]/7.0 32 | dataRow.plot(color=plot.cm.RdYlBu(labelColor), alpha=0.5) 33 | 34 | plot.xlabel("Attribute Index") 35 | plot.ylabel(("Attribute Values")) 36 | plot.show() 37 | -------------------------------------------------------------------------------- /02/glassSummary.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | from pylab import * 5 | import matplotlib.pyplot as plot 6 | 7 | target_url = ("https://archive.ics.uci.edu/ml/machine-" 8 | "learning-databases/glass/glass.data") 9 | 10 | glass = pd.read_csv(target_url,header=None, prefix="V") 11 | glass.columns = ['Id', 'RI', 'Na', 'Mg', 'Al', 'Si', 12 | 'K', 'Ca', 'Ba', 'Fe', 'Type'] 13 | 14 | print(glass.head()) 15 | 16 | #generate statistical summaries 17 | summary = glass.describe() 18 | print(summary) 19 | ncol1 = len(glass.columns) 20 | 21 | glassNormalized = glass.iloc[:, 1:ncol1] 22 | ncol2 = len(glassNormalized.columns) 23 | summary2 = glassNormalized.describe() 24 | 25 | for i in range(ncol2): 26 | mean = summary2.iloc[1, i] 27 | 
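# --- editor's note (added comment; not in the original script) ---
# summary2 comes from DataFrame.describe(), whose rows are ordered
# count, mean, std, min, 25%, 50%, 75%, max; so iloc[1, i] above is the
# column mean and iloc[2, i] below is its standard deviation, and the
# loop standardizes each column as z = (x - mean) / sd.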
sd = summary2.iloc[2, i] 28 | glassNormalized.iloc[:,i:(i + 1)] = \ 29 | (glassNormalized.iloc[:,i:(i + 1)] - mean) / sd 30 | 31 | array = glassNormalized.values 32 | boxplot(array) 33 | plot.xlabel("Attribute Index") 34 | plot.ylabel(("Quartile Ranges - Normalized ")) 35 | show() -------------------------------------------------------------------------------- /02/glassSummary.txt: -------------------------------------------------------------------------------- 1 | print(glass.head()) 2 | 3 | Id RI Na Mg Al Si K Ca Ba Fe Type 4 | 0 1 1.52101 13.64 4.49 1.10 71.78 0.06 8.75 0 0 1 5 | 1 2 1.51761 13.89 3.60 1.36 72.73 0.48 7.83 0 0 1 6 | 2 3 1.51618 13.53 3.55 1.54 72.99 0.39 7.78 0 0 1 7 | 3 4 1.51766 13.21 3.69 1.29 72.61 0.57 8.22 0 0 1 8 | 4 5 1.51742 13.27 3.62 1.24 73.08 0.55 8.07 0 0 1 9 | 10 | 11 | print(summary) - Abridged 12 | Id RI Na Mg Al 13 | count 214.000000 214.000000 214.000000 214.000000 214.000000 14 | mean 107.500000 1.518365 13.407850 2.684533 1.444907 15 | std 61.920648 0.003037 0.816604 1.442408 0.499270 16 | min 1.000000 1.511150 10.730000 0.000000 0.290000 17 | 25% 54.250000 1.516523 12.907500 2.115000 1.190000 18 | 50% 107.500000 1.517680 13.300000 3.480000 1.360000 19 | 75% 160.750000 1.519157 13.825000 3.600000 1.630000 20 | max 214.000000 1.533930 17.380000 4.490000 3.500000 21 | 22 | K Ca Ba Fe Type 23 | count 214.000000 214.000000 214.000000 214.000000 214.000000 24 | mean 0.497056 8.956963 0.175047 0.057009 2.780374 25 | std 0.652192 1.423153 0.497219 0.097439 2.103739 26 | min 0.000000 5.430000 0.000000 0.000000 1.000000 27 | 25% 0.122500 8.240000 0.000000 0.000000 1.000000 28 | 50% 0.555000 8.600000 0.000000 0.000000 2.000000 29 | 75% 0.610000 9.172500 0.000000 0.100000 3.000000 30 | max 6.210000 16.190000 3.150000 0.510000 7.000000 31 | 32 | -------------------------------------------------------------------------------- /02/linePlots.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | import matplotlib.pyplot as plot 5 | target_url = ("https://archive.ics.uci.edu/ml/machine-learning-" 6 | "databases/undocumented/connectionist-bench/sonar/sonar.all-data") 7 | 8 | #read rocks versus mines data into pandas data frame 9 | rocksVMines = pd.read_csv(target_url,header=None, prefix="V") 10 | 11 | for i in range(208): 12 | #assign color based on color based on "M" or "R" labels 13 | if rocksVMines.iat[i,60] == "M": 14 | pcolor = "red" 15 | else: 16 | pcolor = "blue" 17 | 18 | #plot rows of data as if they were series data 19 | dataRow = rocksVMines.iloc[i,0:60] 20 | dataRow.plot(color=pcolor, alpha=0.5) 21 | 22 | plot.xlabel("Attribute Index") 23 | plot.ylabel(("Attribute Values")) 24 | plot.show() 25 | -------------------------------------------------------------------------------- /02/outputRocksVMinesContents.txt: -------------------------------------------------------------------------------- 1 | Col# Number Strings Other 2 | 0 208 0 0 3 | 1 208 0 0 4 | 2 208 0 0 5 | 3 208 0 0 6 | 4 208 0 0 7 | 5 208 0 0 8 | 6 208 0 0 9 | 7 208 0 0 10 | 8 208 0 0 11 | 9 208 0 0 12 | 10 208 0 0 13 | 11 208 0 0 14 | . . . . 15 | . . . . 16 | . . . . 
17 | 54 208 0 0 18 | 55 208 0 0 19 | 56 208 0 0 20 | 57 208 0 0 21 | 58 208 0 0 22 | 59 208 0 0 23 | 60 0 208 0 24 | -------------------------------------------------------------------------------- /02/outputSummaryStats.txt: -------------------------------------------------------------------------------- 1 | Mean = 0.0538923076923 Standard Deviation = 0.0464159832226 2 | 3 | Boundaries for 4 Equal Percentiles 4 | [0.0057999999999999996, 0.024375000000000001, 0.044049999999999999, 0.064500000000000002, 0.4264] 5 | 6 | Boundaries for 10 Equal Percentiles 7 | [0.00579999999999, 0.0141, 0.022740000000, 0.0278699999999, 0.0362200000000, 0.0440499999999, 0.050719999999, 0.0599599999999, 0.0779400000000, 0.10836, 0.4264] 8 | 9 | Unique Label Values 10 | set(['R', 'M']) 11 | 12 | Counts for Each Value of Categorical Label 13 | ['R', 'M'] 14 | [97, 111] 15 | -------------------------------------------------------------------------------- /02/pandasReadSummarize.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | import matplotlib.pyplot as plot 5 | target_url = ("https://archive.ics.uci.edu/ml/machine-learning-" 6 | "databases/undocumented/connectionist-bench/sonar/sonar.all-data") 7 | 8 | #read rocks versus mines data into pandas data frame 9 | rocksVMines = pd.read_csv(target_url,header=None, prefix="V") 10 | 11 | #print head and tail of data frame 12 | print(rocksVMines.head()) 13 | print(rocksVMines.tail()) 14 | 15 | #print summary of data frame 16 | summary = rocksVMines.describe() 17 | print(summary) -------------------------------------------------------------------------------- /02/pandasReadSummarizeOutput.txt: -------------------------------------------------------------------------------- 1 | V0 V1 V2 ... V57 V58 V59 V60 2 | 0 0.0200 0.0371 0.0428 ... 0.0084 0.0090 0.0032 R 3 | 1 0.0453 0.0523 0.0843 ... 0.0049 0.0052 0.0044 R 4 | 2 0.0262 0.0582 0.1099 ... 0.0164 0.0095 0.0078 R 5 | 3 0.0100 0.0171 0.0623 ... 0.0044 0.0040 0.0117 R 6 | 4 0.0762 0.0666 0.0481 ... 0.0048 0.0107 0.0094 R 7 | 8 | [5 rows x 61 columns] 9 | V0 V1 V2 ... V57 V58 V59 V60 10 | 203 0.0187 0.0346 0.0168 ... 0.0115 0.0193 0.0157 M 11 | 204 0.0323 0.0101 0.0298 ... 0.0032 0.0062 0.0067 M 12 | 205 0.0522 0.0437 0.0180 ... 0.0138 0.0077 0.0031 M 13 | 206 0.0303 0.0353 0.0490 ... 0.0079 0.0036 0.0048 M 14 | 207 0.0260 0.0363 0.0136 ... 0.0036 0.0061 0.0115 M 15 | 16 | 17 | V0 V1 ... V58 V59 18 | count 208.000000 208.000000 ... 208.000000 208.000000 19 | mean 0.029164 0.038437 ... 0.007941 0.006507 20 | std 0.022991 0.032960 ... 0.006181 0.005031 21 | min 0.001500 0.000600 ... 0.000100 0.000600 22 | 25% 0.013350 0.016450 ... 0.003675 0.003100 23 | 50% 0.022800 0.030800 ... 0.006400 0.005300 24 | 75% 0.035550 0.047950 ... 0.010325 0.008525 25 | max 0.137100 0.233900 ... 
0.036400 0.043900 -------------------------------------------------------------------------------- /02/qqplotAttribute.py: -------------------------------------------------------------------------------- 1 | __author__ = 'ubuntu' 2 | import numpy as np 3 | import pylab 4 | import scipy.stats as stats 5 | import urllib2 6 | import sys 7 | 8 | target_url = ("https://archive.ics.uci.edu/ml/machine-learning-" 9 | "databases/undocumented/connectionist-bench/sonar/sonar.all-data") 10 | 11 | data = urllib2.urlopen(target_url) 12 | 13 | 14 | #arrange data into list for labels and list of lists for attributes 15 | xList = [] 16 | labels = [] 17 | 18 | for line in data: 19 | #split on comma 20 | row = line.strip().split(",") 21 | xList.append(row) 22 | nrow = len(xList) 23 | ncol = len(xList[1]) 24 | 25 | type = [0]*3 26 | colCounts = [] 27 | 28 | #generate summary statistics for column 3 (e.g.) 29 | col = 3 30 | colData = [] 31 | for row in xList: 32 | colData.append(float(row[col])) 33 | 34 | 35 | stats.probplot(colData, dist="norm", plot=pylab) 36 | pylab.show() -------------------------------------------------------------------------------- /02/rVMSummaryStats.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import urllib2 3 | import sys 4 | import numpy as np 5 | 6 | #read data from uci data repository 7 | target_url = ("https://archive.ics.uci.edu/ml/machine-learning-" 8 | "databases/undocumented/connectionist-bench/sonar/sonar.all-data") 9 | 10 | data = urllib2.urlopen(target_url) 11 | 12 | #arrange data into list for labels and list of lists for attributes 13 | xList = [] 14 | labels = [] 15 | 16 | for line in data: 17 | #split on comma 18 | row = line.strip().split(",") 19 | xList.append(row) 20 | nrow = len(xList) 21 | ncol = len(xList[1]) 22 | 23 | type = [0]*3 24 | colCounts = [] 25 | 26 | #generate summary statistics for column 3 (e.g.) 
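# --- editor's aside (added comment; not in the original script) ---
# This script (and the other urllib2-based scripts) is Python 2 code.
# Under Python 3 the equivalent download is roughly (hedged sketch):
#     import urllib.request
#     data = urllib.request.urlopen(target_url)
#     for line in data:
#         row = line.decode().strip().split(",")
# urlopen returns bytes in Python 3, hence the added .decode().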
27 | col = 3 28 | colData = [] 29 | for row in xList: 30 | colData.append(float(row[col])) 31 | 32 | colArray = np.array(colData) 33 | colMean = np.mean(colArray) 34 | colsd = np.std(colArray) 35 | sys.stdout.write("Mean = " + '\t' + str(colMean) + '\t\t' + 36 | "Standard Deviation = " + '\t ' + str(colsd) + "\n") 37 | 38 | 39 | #calculate quantile boundaries 40 | ntiles = 4 41 | 42 | percentBdry = [] 43 | 44 | for i in range(ntiles+1): 45 | percentBdry.append(np.percentile(colArray, i*(100)/ntiles)) 46 | 47 | sys.stdout.write("\nBoundaries for 4 Equal Percentiles \n") 48 | print(percentBdry) 49 | sys.stdout.write(" \n") 50 | 51 | 52 | #run again with 10 equal intervals 53 | ntiles = 10 54 | 55 | percentBdry = [] 56 | 57 | for i in range(ntiles+1): 58 | percentBdry.append(np.percentile(colArray, i*(100)/ntiles)) 59 | 60 | sys.stdout.write("Boundaries for 10 Equal Percentiles \n") 61 | print(percentBdry) 62 | sys.stdout.write(" \n") 63 | 64 | 65 | #The last column contains categorical variables 66 | 67 | col = 60 68 | colData = [] 69 | for row in xList: 70 | colData.append(row[col]) 71 | 72 | unique = set(colData) 73 | sys.stdout.write("Unique Label Values \n") 74 | print(unique) 75 | 76 | #count up the number of elements having each value 77 | 78 | catDict = dict(zip(list(unique),range(len(unique)))) 79 | 80 | catCount = [0]*2 81 | 82 | for elt in colData: 83 | catCount[catDict[elt]] += 1 84 | 85 | sys.stdout.write("\nCounts for Each Value of Categorical Label \n") 86 | print(list(unique)) 87 | print(catCount) 88 | -------------------------------------------------------------------------------- /02/rockVmineContents.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import urllib2 3 | import sys 4 | 5 | #read data from uci data repository 6 | target_url = ("https://archive.ics.uci.edu/ml/machine-learning-" 7 | "databases/undocumented/connectionist-bench/sonar/sonar.all-data") 8 | 9 | data = urllib2.urlopen(target_url) 10 | 11 | 12 | #arrange data into list for labels and list of lists for attributes 13 | xList = [] 14 | labels = [] 15 | 16 | for line in data: 17 | #split on comma 18 | row = line.strip().split(",") 19 | xList.append(row) 20 | nrow = len(xList) 21 | ncol = len(xList[1]) 22 | 23 | type = [0]*3 24 | colCounts = [] 25 | 26 | for col in range(ncol): 27 | for row in xList: 28 | try: 29 | a = float(row[col]) 30 | if isinstance(a, float): 31 | type[0] += 1 32 | except ValueError: 33 | if len(row[col]) > 0: 34 | type[1] += 1 35 | else: 36 | type[2] += 1 37 | 38 | colCounts.append(type) 39 | type = [0]*3 40 | 41 | sys.stdout.write("Col#" + '\t' + "Number" + '\t' + 42 | "Strings" + '\t ' + "Other\n") 43 | iCol = 0 44 | for types in colCounts: 45 | sys.stdout.write(str(iCol) + '\t\t' + str(types[0]) + '\t\t' + 46 | str(types[1]) + '\t\t' + str(types[2]) + "\n") 47 | iCol += 1 -------------------------------------------------------------------------------- /02/rockVmineSummaries.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import urllib2 3 | import sys 4 | 5 | #read data from uci data repository 6 | target_url = ("https://archive.ics.uci.edu/ml/machine-learning-" 7 | "databases/undocumented/connectionist-bench/sonar/sonar.all-data") 8 | 9 | data = urllib2.urlopen(target_url) 10 | 11 | #arrange data into list for labels and list of lists for attributes 12 | xList = [] 13 | labels = [] 14 | for line in data: 15 | #split on comma 16 | row = 
line.strip().split(",") 17 | xList.append(row) 18 | 19 | sys.stdout.write("Number of Rows of Data = " + str(len(xList)) + '\n') 20 | sys.stdout.write("Number of Columns of Data = " + str(len(xList[1]))) 21 | 22 | -------------------------------------------------------------------------------- /02/sampleCorrHeatMap.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | import matplotlib.pyplot as plot 5 | target_url = ("https://archive.ics.uci.edu/ml/machine-learning-" 6 | "databases/undocumented/connectionist-bench/sonar/sonar.all-data") 7 | 8 | #read rocks versus mines data into pandas data frame 9 | rocksVMines = pd.read_csv(target_url,header=None, prefix="V") 10 | 11 | #calculate correlations between real-valued attributes 12 | 13 | corMat = DataFrame(rocksVMines.corr()) 14 | 15 | #visualize correlations using heatmap 16 | plot.pcolor(corMat) 17 | plot.show() 18 | -------------------------------------------------------------------------------- /02/targetCorr.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | import matplotlib.pyplot as plot 5 | from random import uniform 6 | target_url = ("https://archive.ics.uci.edu/ml/machine-learning-" 7 | "databases/undocumented/connectionist-bench/sonar/sonar.all-data") 8 | 9 | #read rocks versus mines data into pandas data frame 10 | rocksVMines = pd.read_csv(target_url,header=None, prefix="V") 11 | 12 | #change the targets to numeric values 13 | target = [] 14 | for i in range(208): 15 | #assign 0 or 1 target value based on "M" or "R" labels 16 | if rocksVMines.iat[i,60] == "M": 17 | target.append(1.0) 18 | else: 19 | target.append(0.0) 20 | 21 | #plot rows of data as if they were series data 22 | dataRow = rocksVMines.iloc[0:208,35] 23 | plot.scatter(dataRow, target) 24 | 25 | plot.xlabel("Attribute Value") 26 | plot.ylabel("Target Value") 27 | plot.show() 28 | 29 | # 30 | #To improve the visualization, this version dithers the points a little 31 | # and makes them somewhat transparent 32 | target = [] 33 | for i in range(208): 34 | #assign 0 or 1 target value based on "M" or "R" labels 35 | # and add some dither 36 | if rocksVMines.iat[i,60] == "M": 37 | target.append(1.0 + uniform(-0.1, 0.1)) 38 | else: 39 | target.append(0.0 + uniform(-0.1, 0.1)) 40 | 41 | #plot rows of data as if they were series data 42 | dataRow = rocksVMines.iloc[0:208,35] 43 | plot.scatter(dataRow, target, alpha=0.5, s=120) 44 | 45 | plot.xlabel("Attribute Value") 46 | plot.ylabel("Target Value") 47 | plot.show() -------------------------------------------------------------------------------- /02/wineCorrHeatMap.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | from pylab import * 5 | import matplotlib.pyplot as plot 6 | from math import exp 7 | 8 | target_url = ("http://archive.ics.uci.edu/ml/machine-" 9 | "learning-databases/wine-quality/winequality-red.csv") 10 | wine = pd.read_csv(target_url,header=0, sep=";") 11 | wineCols = len(wine.columns) 12 | 13 | #calculate correlation matrix 14 | corMat = DataFrame(wine.corr()) 15 | 16 | #visualize correlations using heatmap 17 | plot.pcolor(corMat) 18 | plot.show() -------------------------------------------------------------------------------- 
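The correlation heat maps above (sampleCorrHeatMap.py, glassCorrHeatMap.py, and wineCorrHeatMap.py) draw the correlation matrix with pcolor but show no color scale and no attribute names on the axes. A small variation for the wine data, not part of the repository, that adds a colorbar and labeled ticks:

import pandas as pd
import matplotlib.pyplot as plot

target_url = ("http://archive.ics.uci.edu/ml/machine-"
              "learning-databases/wine-quality/winequality-red.csv")
wine = pd.read_csv(target_url, header=0, sep=";")

#correlation matrix, as in wineCorrHeatMap.py
corMat = wine.corr()

plot.pcolor(corMat)
plot.colorbar()                                   #scale relating color to correlation
ticks = [i + 0.5 for i in range(len(corMat.columns))]
plot.xticks(ticks, corMat.columns, rotation=90)   #center attribute names on cells
plot.yticks(ticks, corMat.columns)
plot.show()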
/02/wineParallelPlot.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | from pylab import * 5 | import matplotlib.pyplot as plot 6 | from math import exp 7 | 8 | target_url = ("http://archive.ics.uci.edu/ml/machine-" 9 | "learning-databases/wine-quality/winequality-red.csv") 10 | 11 | wine = pd.read_csv(target_url,header=0, sep=";") 12 | 13 | #print column names in order to have the full versions 14 | print(wine.columns) 15 | 16 | #change column names to shorter ones to fit graph 17 | wine.columns = ['fixAcid', 'volAcid', 'citAcid', 18 | 'resSugr', 'chlor', 'frSO2', 'totSO2', 19 | 'dens', 'pH', 'sulpha', 'alcohol', 'quality'] 20 | 21 | #generate statistical summaries 22 | summary = wine.describe() 23 | nrows = len(wine.index) 24 | tasteCol = len(summary.columns) 25 | meanTaste = summary.iloc[1,tasteCol - 1] 26 | sdTaste = summary.iloc[2,tasteCol - 1] 27 | nDataCol = len(wine.columns) -1 28 | 29 | for i in range(nrows): 30 | #plot rows of data as if they were series data 31 | dataRow = wine.iloc[i,1:nDataCol] 32 | normTarget = (wine.iloc[i,nDataCol] - meanTaste)/sdTaste 33 | labelColor = 1.0/(1.0 + exp(-normTarget)) 34 | dataRow.plot(color=plot.cm.RdYlBu(labelColor), alpha=0.5) 35 | 36 | plot.xlabel("Attribute Index") 37 | plot.ylabel(("Attribute Values")) 38 | plot.show() 39 | 40 | wineNormalized = wine 41 | ncols = len(wineNormalized.columns) 42 | 43 | for i in range(ncols): 44 | mean = summary.iloc[1, i] 45 | sd = summary.iloc[2, i] 46 | wineNormalized.iloc[:,i:(i + 1)] = \ 47 | (wineNormalized.iloc[:,i:(i + 1)] - mean) / sd 48 | 49 | #Try again with normalized values 50 | for i in range(nrows): 51 | #plot rows of data as if they were series data 52 | dataRow = wineNormalized.iloc[i,1:nDataCol] 53 | normTarget = wineNormalized.iloc[i,nDataCol] 54 | labelColor = 1.0/(1.0 + exp(-normTarget)) 55 | dataRow.plot(color=plot.cm.RdYlBu(labelColor), alpha=0.5) 56 | 57 | plot.xlabel("Attribute Index") 58 | plot.ylabel(("Attribute Values")) 59 | plot.show() -------------------------------------------------------------------------------- /02/wineSummary.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | from pylab import * 5 | import matplotlib.pyplot as plot 6 | 7 | target_url = ("http://archive.ics.uci.edu/ml/machine-" 8 | "learning-databases/wine-quality/winequality-red.csv") 9 | 10 | wine = pd.read_csv(target_url,header=0, sep=";") 11 | 12 | print(wine.head()) 13 | 14 | #generate statistical summaries 15 | summary = wine.describe() 16 | print(summary) 17 | 18 | wineNormalized = wine 19 | ncols = len(wineNormalized.columns) 20 | 21 | for i in range(ncols): 22 | mean = summary.iloc[1, i] 23 | sd = summary.iloc[2, i] 24 | wineNormalized.iloc[:,i:(i + 1)] = \ 25 | (wineNormalized.iloc[:,i:(i + 1)] - mean) / sd 26 | 27 | array = wineNormalized.values 28 | boxplot(array) 29 | plot.xlabel("Attribute Index") 30 | plot.ylabel(("Quartile Ranges - Normalized ")) 31 | show() -------------------------------------------------------------------------------- /02/wineSummary.txt: -------------------------------------------------------------------------------- 1 | fixed acidity vola acidity citric acid resid sugar chlorides 2 | 0 7.4 0.70 0.00 1.9 0.076 3 | 1 7.8 0.88 0.00 2.6 0.098 4 | 2 7.8 0.76 0.04 2.3 0.092 5 | 3 11.2 0.28 0.56 1.9 0.075 6 | 4 7.4 0.70 0.00 1.9 
0.076 7 | 8 | free sulfur dioxide tot sulfur dioxide density pH sulphates 9 | 0 11 34 0.9978 3.51 0.56 10 | 1 25 67 0.9968 3.20 0.68 11 | 2 15 54 0.9970 3.26 0.65 12 | 3 17 60 0.9980 3.16 0.58 13 | 4 11 34 0.9978 3.51 0.56 14 | 15 | alcohol quality 16 | 0 9.4 5 17 | 1 9.8 5 18 | 2 9.8 5 19 | 3 9.8 6 20 | 4 9.4 5 21 | fixed acidity volatile acidity citric acid residual sugar 22 | count 1599.000000 1599.000000 1599.000000 1599.000000 23 | mean 8.319637 0.527821 0.270976 2.538806 24 | std 1.741096 0.179060 0.194801 1.409928 25 | min 4.600000 0.120000 0.000000 0.900000 26 | 25% 7.100000 0.390000 0.090000 1.900000 27 | 50% 7.900000 0.520000 0.260000 2.200000 28 | 75% 9.200000 0.640000 0.420000 2.600000 29 | max 15.900000 1.580000 1.000000 15.500000 30 | 31 | chlorides free sulfur dioxide tot sulfur dioxide density 32 | count 1599.000000 1599.000000 1599.000000 1599.000000 33 | mean 0.087467 15.874922 46.467792 0.996747 34 | std 0.047065 10.460157 32.895324 0.001887 35 | min 0.012000 1.000000 6.000000 0.990070 36 | 25% 0.070000 7.000000 22.000000 0.995600 37 | 50% 0.079000 14.000000 38.000000 0.996750 38 | 75% 0.090000 21.000000 62.000000 0.997835 39 | max 0.611000 72.000000 289.000000 1.003690 40 | 41 | pH sulphates alcohol quality 42 | count 1599.000000 1599.000000 1599.000000 1599.000000 43 | mean 3.311113 0.658149 10.422983 5.636023 44 | std 0.154386 0.169507 1.065668 0.807569 45 | min 2.740000 0.330000 8.400000 3.000000 46 | 25% 3.210000 0.550000 9.500000 5.000000 47 | 50% 3.310000 0.620000 10.200000 6.000000 48 | 75% 3.400000 0.730000 11.100000 6.000000 49 | max 4.010000 2.000000 14.900000 8.000000 50 | -------------------------------------------------------------------------------- /03/chapter03.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/derekhe/machine-learning-in-python-essential-techniques-for-predictive-analysis-source/d91f60cc29fdbaad32819058f9b2742e955e586a/03/chapter03.zip -------------------------------------------------------------------------------- /03/classifierPerformance_RocksVMines.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | #use scikit learn package to perform linear regression 3 | #read in the rocks versus mines data set from uci.edu data repository 4 | import urllib2 5 | import numpy 6 | import random 7 | from sklearn import datasets, linear_model 8 | from sklearn.metrics import roc_curve, auc 9 | import pylab as pl 10 | 11 | 12 | def confusionMatrix(predicted, actual, threshold): 13 | if len(predicted) != len(actual): return -1 14 | tp = 0.0 15 | fp = 0.0 16 | tn = 0.0 17 | fn = 0.0 18 | for i in range(len(actual)): 19 | if actual[i] > 0.5: #labels that are 1.0 (positive examples) 20 | if predicted[i] > threshold: 21 | tp += 1.0 #correctly predicted positive 22 | else: 23 | fn += 1.0 #incorrectly predicted negative 24 | else: #labels that are 0.0 (negative examples) 25 | if predicted[i] < threshold: 26 | tn += 1.0 #correctly predicted negative 27 | else: 28 | fp += 1.0 #incorrectly predicted positive 29 | rtn = [tp, fn, fp, tn] 30 | return rtn 31 | 32 | 33 | #read data from uci data repository 34 | target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data" 35 | data = urllib2.urlopen(target_url) 36 | 37 | #arrange data into list for labels and list of lists for attributes 38 | xList = [] 39 | labels = [] 40 | for line in data: 41 | #split on comma 42 | 
row = line.strip().split(",") 43 | #assign label 1.0 for "M" and 0.0 for "R" 44 | if(row[-1] == 'M'): 45 | labels.append(1.0) 46 | else: 47 | labels.append(0.0) 48 | #remove label from row 49 | row.pop() 50 | #convert row to floats 51 | floatRow = [float(num) for num in row] 52 | xList.append(floatRow) 53 | 54 | #divide attribute matrix and label vector into training(2/3 of data) and test sets (1/3 of data) 55 | indices = range(len(xList)) 56 | xListTest = [xList[i] for i in indices if i%3 == 0 ] 57 | xListTrain = [xList[i] for i in indices if i%3 != 0 ] 58 | labelsTest = [labels[i] for i in indices if i%3 == 0] 59 | labelsTrain = [labels[i] for i in indices if i%3 != 0] 60 | 61 | #form list of list input into numpy arrays to match input class for scikit-learn linear model 62 | xTrain = numpy.array(xListTrain); yTrain = numpy.array(labelsTrain); xTest = numpy.array(xListTest); yTest = numpy.array(labelsTest) 63 | 64 | #check shapes to see what they look like 65 | print("Shape of xTrain array", xTrain.shape) 66 | print("Shape of yTrain array", yTrain.shape) 67 | print("Shape of xTest array", xTest.shape) 68 | print("Shape of yTest array", yTest.shape) 69 | 70 | #train linear regression model 71 | rocksVMinesModel = linear_model.LinearRegression() 72 | rocksVMinesModel.fit(xTrain,yTrain) 73 | 74 | #generate predictions on in-sample error 75 | trainingPredictions = rocksVMinesModel.predict(xTrain) 76 | print("Some values predicted by model", trainingPredictions[0:5], trainingPredictions[-6:-1]) 77 | 78 | #generate confusion matrix for predictions on training set (in-sample 79 | confusionMatTrain = confusionMatrix(trainingPredictions, yTrain, 0.5) 80 | #pick threshold value and generate confusion matrix entries 81 | tp = confusionMatTrain[0]; fn = confusionMatTrain[1]; fp = confusionMatTrain[2]; tn = confusionMatTrain[3] 82 | 83 | print("tp = " + str(tp) + "\tfn = " + str(fn) + "\n" + "fp = " + str(fp) + "\ttn = " + str(tn) + '\n') 84 | 85 | #generate predictions on out-of-sample data 86 | testPredictions = rocksVMinesModel.predict(xTest) 87 | 88 | #generate confusion matrix from predictions on out-of-sample data 89 | conMatTest = confusionMatrix(testPredictions, yTest, 0.5) 90 | #pick threshold value and generate confusion matrix entries 91 | tp = conMatTest[0]; fn = conMatTest[1]; fp = conMatTest[2]; tn = conMatTest[3] 92 | print("tp = " + str(tp) + "\tfn = " + str(fn) + "\n" + "fp = " + str(fp) + "\ttn = " + str(tn) + '\n') 93 | 94 | #generate ROC curve for in-sample 95 | 96 | fpr, tpr, thresholds = roc_curve(yTrain,trainingPredictions) 97 | roc_auc = auc(fpr, tpr) 98 | print( 'AUC for in-sample ROC curve: %f' % roc_auc) 99 | 100 | # Plot ROC curve 101 | pl.clf() 102 | pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc) 103 | pl.plot([0, 1], [0, 1], 'k--') 104 | pl.xlim([0.0, 1.0]) 105 | pl.ylim([0.0, 1.0]) 106 | pl.xlabel('False Positive Rate') 107 | pl.ylabel('True Positive Rate') 108 | pl.title('In sample ROC rocks versus mines') 109 | pl.legend(loc="lower right") 110 | pl.show() 111 | 112 | #generate ROC curve for out-of-sample 113 | fpr, tpr, thresholds = roc_curve(yTest,testPredictions) 114 | roc_auc = auc(fpr, tpr) 115 | print( 'AUC for out-of-sample ROC curve: %f' % roc_auc) 116 | 117 | # Plot ROC curve 118 | pl.clf() 119 | pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc) 120 | pl.plot([0, 1], [0, 1], 'k--') 121 | pl.xlim([0.0, 1.0]) 122 | pl.ylim([0.0, 1.0]) 123 | pl.xlabel('False Positive Rate') 124 | pl.ylabel('True Positive Rate') 125 | 
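# --- editor's note (added comment; not in the original script) ---
# AUC is the probability that a randomly chosen positive example (a mine)
# receives a higher score than a randomly chosen negative one (a rock);
# 0.5 is chance-level ranking and 1.0 is perfect.  The drop from the
# in-sample AUC (~0.98) to the out-of-sample AUC (~0.85) printed by this
# script shows how much the linear model overfits the training data.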
pl.title('Out-of-sample ROC rocks versus mines') 126 | pl.legend(loc="lower right") 127 | pl.show() -------------------------------------------------------------------------------- /03/classifierPerformance_RocksVMinesOutput.txt: -------------------------------------------------------------------------------- 1 | ('Shape of xTrain array', (138, 60)) 2 | ('Shape of yTrain array', (138,)) 3 | ('Shape of xTest array', (70, 60)) 4 | ('Shape of yTest array', (70,)) 5 | ('Some values predicted by model', array([-0.10240253, 0.42090698, 0.38593034, 0.36094537, 0.31520494]), array([ 1.11094176, 1.12242751, 0.77626699, 1.02016858, 0.66338081])) 6 | tp = 68.0 fn = 6.0 7 | fp = 7.0 tn = 57.0 8 | 9 | tp = 28.0 fn = 9.0 10 | fp = 9.0 tn = 24.0 11 | 12 | AUC for in-sample ROC curve: 0.979519 13 | AUC for out-of-sample ROC curve: 0.848485 14 | 15 | -------------------------------------------------------------------------------- /03/classifierRidgeRocksVMines.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | import urllib2 3 | import numpy 4 | from sklearn import datasets, linear_model 5 | from sklearn.metrics import roc_curve, auc 6 | import pylab as plt 7 | 8 | #read data from uci data repository 9 | target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data" 10 | data = urllib2.urlopen(target_url) 11 | 12 | #arrange data into list for labels and list of lists for attributes 13 | xList = [] 14 | labels = [] 15 | for line in data: 16 | #split on comma 17 | row = line.strip().split(",") 18 | #assign label 1.0 for "M" and 0.0 for "R" 19 | if(row[-1] == 'M'): 20 | labels.append(1.0) 21 | else: 22 | labels.append(0.0) 23 | #remove lable from row 24 | row.pop() 25 | #convert row to floats 26 | floatRow = [float(num) for num in row] 27 | xList.append(floatRow) 28 | 29 | #divide attribute matrix and label vector into training(2/3 of data) and test sets (1/3 of data) 30 | indices = range(len(xList)) 31 | xListTest = [xList[i] for i in indices if i%3 == 0 ] 32 | xListTrain = [xList[i] for i in indices if i%3 != 0 ] 33 | labelsTest = [labels[i] for i in indices if i%3 == 0] 34 | labelsTrain = [labels[i] for i in indices if i%3 != 0] 35 | 36 | #form list of list input into numpy arrays to match input class for scikit-learn linear model 37 | xTrain = numpy.array(xListTrain); yTrain = numpy.array(labelsTrain); xTest = numpy.array(xListTest); yTest = numpy.array(labelsTest) 38 | 39 | alphaList = [0.1**i for i in [-3, -2, -1, 0,1, 2, 3, 4, 5]] 40 | 41 | aucList = [] 42 | for alph in alphaList: 43 | rocksVMinesRidgeModel = linear_model.Ridge(alpha=alph) 44 | rocksVMinesRidgeModel.fit(xTrain, yTrain) 45 | fpr, tpr, thresholds = roc_curve(yTest,rocksVMinesRidgeModel.predict(xTest)) 46 | roc_auc = auc(fpr, tpr) 47 | aucList.append(roc_auc) 48 | 49 | 50 | print("AUC alpha") 51 | for i in range(len(aucList)): 52 | print(aucList[i], alphaList[i]) 53 | 54 | #plot auc values versus alpha values 55 | x = [-3, -2, -1, 0,1, 2, 3, 4, 5] 56 | plt.plot(x, aucList) 57 | plt.xlabel('-log(alpha)') 58 | plt.ylabel('AUC') 59 | plt.show() 60 | 61 | #visualize the performance of the best classifier 62 | indexBest = aucList.index(max(aucList)) 63 | alph = alphaList[indexBest] 64 | rocksVMinesRidgeModel = linear_model.Ridge(alpha=alph) 65 | rocksVMinesRidgeModel.fit(xTrain, yTrain) 66 | 67 | #scatter plot of actual vs predicted 68 | plt.scatter(rocksVMinesRidgeModel.predict(xTest), yTest, s=100, alpha=0.25) 
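# --- editor's note (added comment; not in the original script) ---
# Ridge regression minimizes ||y - Xb||^2 + alpha*||b||^2, which on centered
# data has the closed-form solution b = (X'X + alpha*I)^(-1) X'y, so larger
# alpha shrinks the coefficients toward zero.  In the sweep above,
# out-of-sample AUC peaks at alpha = 1.0 (see the accompanying output file),
# and that best model is the one plotted here.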
69 | plt.xlabel("Predicted Value") 70 | plt.ylabel("Actual Value") 71 | plt.show() -------------------------------------------------------------------------------- /03/classifierRidgeRocksVMinesOutput.txt: -------------------------------------------------------------------------------- 1 | AUC alpha 2 | (0.84111384111384113, 999.9999999999999) 3 | (0.86404586404586403, 99.99999999999999) 4 | (0.9074529074529073, 10.0) 5 | (0.91809991809991809, 1.0) 6 | (0.88288288288288286, 0.1) 7 | (0.8615888615888615, 0.010000000000000002) 8 | (0.85176085176085159, 0.0010000000000000002) 9 | (0.85094185094185093, 0.00010000000000000002) 10 | (0.84930384930384917, 1.0000000000000003e-05) 11 | -------------------------------------------------------------------------------- /03/fwdStepwiseWine.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | import urllib2 3 | import numpy 4 | from sklearn import datasets, linear_model 5 | from math import sqrt 6 | import matplotlib.pyplot as plt 7 | 8 | def xattrSelect(x, idxSet): 9 | #takes X matrix as list of list and returns subset containing columns in idxSet 10 | xOut = [] 11 | for row in x: 12 | xOut.append([row[i] for i in idxSet]) 13 | return(xOut) 14 | 15 | #read data into iterable 16 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 17 | data = urllib2.urlopen(target_url) 18 | xList = [] 19 | labels = [] 20 | names = [] 21 | firstLine = True 22 | for line in data: 23 | if firstLine: 24 | names = line.strip().split(";") 25 | firstLine = False 26 | else: 27 | #split on semi-colon 28 | row = line.strip().split(";") 29 | #put labels in separate array 30 | labels.append(float(row[-1])) 31 | #remove label from row 32 | row.pop() 33 | #convert row to floats 34 | floatRow = [float(num) for num in row] 35 | xList.append(floatRow) 36 | 37 | #divide attributes and labels into training and test sets 38 | indices = range(len(xList)) 39 | xListTest = [xList[i] for i in indices if i%3 == 0 ] 40 | xListTrain = [xList[i] for i in indices if i%3 != 0 ] 41 | labelsTest = [labels[i] for i in indices if i%3 == 0] 42 | labelsTrain = [labels[i] for i in indices if i%3 != 0] 43 | 44 | #build list of attributes one-at-a-time - starting with empty 45 | attributeList = [] 46 | index = range(len(xList[1])) 47 | indexSet = set(index) 48 | indexSeq = [] 49 | oosError = [] 50 | 51 | for i in index: 52 | attSet = set(attributeList) 53 | #attributes not in list already 54 | attTrySet = indexSet - attSet 55 | #form into list 56 | attTry = [ii for ii in attTrySet] 57 | errorList = [] 58 | attTemp = [] 59 | #try each attribute not in set to see which one gives least oos error 60 | for iTry in attTry: 61 | attTemp = [] + attributeList 62 | attTemp.append(iTry) 63 | #use attTemp to form training and testing sub matrices as list of lists 64 | xTrainTemp = xattrSelect(xListTrain, attTemp) 65 | xTestTemp = xattrSelect(xListTest, attTemp) 66 | #form into numpy arrays 67 | xTrain = numpy.array(xTrainTemp); yTrain = numpy.array(labelsTrain); xTest = numpy.array(xTestTemp); yTest = numpy.array(labelsTest) 68 | #use sci-kit learn linear regression 69 | wineQModel = linear_model.LinearRegression() 70 | wineQModel.fit(xTrain,yTrain) 71 | #use trained model to generate prediction and calculate rmsError 72 | rmsError = numpy.linalg.norm((yTest-wineQModel.predict(xTest)), 2)/sqrt(len(yTest)) 73 | errorList.append(rmsError) 74 | attTemp = [] 75 | 76 | iBest = numpy.argmin(errorList) 77 | 
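# --- editor's note (added comment; not in the original script) ---
# This is greedy forward stepwise selection: each outer pass tries every
# attribute not yet chosen, measures out-of-sample RMSE with it added, and
# the single attribute giving the lowest error (attTry[iBest]) is committed
# to attributeList below before the next pass begins.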
attributeList.append(attTry[iBest]) 78 | oosError.append(errorList[iBest]) 79 | 80 | print("Out of sample error versus attribute set size" ) 81 | print(oosError) 82 | print("\n" + "Best attribute indices") 83 | print(attributeList) 84 | namesList = [names[i] for i in attributeList] 85 | print("\n" + "Best attribute names") 86 | print(namesList) 87 | 88 | #Plot error versus number of attributes 89 | x = range(len(oosError)) 90 | plt.plot(x, oosError, 'k') 91 | plt.xlabel('Number of Attributes') 92 | plt.ylabel('Error (RMS)') 93 | plt.show() 94 | 95 | #Plot histogram of out of sample errors for best number of attributes 96 | #Identify index corresponding to min value, retrain with the corresponding attributes 97 | #Use resulting model to predict against out of sample data. Plot errors (aka residuals) 98 | indexBest = oosError.index(min(oosError)) 99 | attributesBest = attributeList[1:(indexBest+1)] 100 | 101 | #Define column-wise subsets of xListTrain and xListTest and convert to numpy 102 | xTrainTemp = xattrSelect(xListTrain, attributesBest) 103 | xTestTemp = xattrSelect(xListTest, attributesBest) 104 | xTrain = numpy.array(xTrainTemp); xTest = numpy.array(xTestTemp) 105 | 106 | #train and plot error histogram 107 | wineQModel = linear_model.LinearRegression() 108 | wineQModel.fit(xTrain,yTrain) 109 | errorVector = yTest-wineQModel.predict(xTest) 110 | plt.hist(errorVector) 111 | plt.xlabel("Bin Boundaries") 112 | plt.ylabel("Counts") 113 | plt.show() 114 | 115 | #scatter plot of actual versus predicted 116 | plt.scatter(wineQModel.predict(xTest), yTest, s=100, alpha=0.10) 117 | plt.xlabel('Predicted Taste Score') 118 | plt.ylabel('Actual Taste Score') 119 | plt.show() -------------------------------------------------------------------------------- /03/fwdStepwiseWineOutput.txt: -------------------------------------------------------------------------------- 1 | Out of sample error versus attribute set size 2 | [0.7234259255116281, 0.68609931528371915, 0.67343650334202809, 0.66770332138977984, 0.66225585685222743, 0.65900047541546247, 0.65727172061430772, 0.65709058062076986, 0.65699930964461406, 0.65758189400434675, 0.65739098690113373] 3 | 4 | Best attribute indices 5 | [10, 1, 9, 4, 6, 8, 5, 3, 2, 7, 0] 6 | 7 | Best attribute names 8 | ['"alcohol"', '"volatile acidity"', '"sulphates"', '"chlorides"', '"total sulfur dioxide"', '"pH"', '"free sulfur dioxide"', '"residual sugar"', '"citric acid"', '"density"', '"fixed acidity"'] 9 | 10 | -------------------------------------------------------------------------------- /03/regressionErrorMeasures.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | 4 | #here are some made-up numbers to start with 5 | target = [1.5, 2.1, 3.3, -4.7, -2.3, 0.75] 6 | prediction = [0.5, 1.5, 2.1, -2.2, 0.1, -0.5] 7 | 8 | error = [] 9 | for i in range(len(target)): 10 | error.append(target[i] - prediction[i]) 11 | 12 | #print the errors 13 | print("Errors ",) 14 | print(error) 15 | #ans: [1.0, 0.60000000000000009, 1.1999999999999997, -2.5, -2.3999999999999999, 1.25] 16 | 17 | 18 | 19 | #calculate the squared errors and absolute value of errors 20 | squaredError = [] 21 | absError = [] 22 | for val in error: 23 | squaredError.append(val*val) 24 | absError.append(abs(val)) 25 | 26 | 27 | #print squared errors and absolute value of errors 28 | print("Squared Error") 29 | print(squaredError) 30 | #ans: [1.0, 0.3600000000000001, 1.4399999999999993, 6.25, 5.7599999999999998, 1.5625] 31 | 
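# --- editor's note (added comment; not in the original script) ---
# For n predictions the measures in this script are:
#     MSE  = (1/n) * sum((target_i - prediction_i)^2)
#     RMSE = sqrt(MSE)   (same units as the target)
#     MAE  = (1/n) * sum(|target_i - prediction_i|)
# Comparing RMSE with the target's own standard deviation, done at the end
# of the script, shows whether the model improves on simply predicting the
# mean of the targets.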
print("Absolute Value of Error") 32 | print(absError) 33 | #ans: [1.0, 0.60000000000000009, 1.1999999999999997, 2.5, 2.3999999999999999, 1.25] 34 | 35 | 36 | #calculate and print mean squared error MSE 37 | print("MSE = ", sum(squaredError)/len(squaredError)) 38 | #ans: 2.72875 39 | 40 | 41 | from math import sqrt 42 | #calculate and print square root of MSE (RMSE) 43 | print("RMSE = ", sqrt(sum(squaredError)/len(squaredError))) 44 | #ans: 1.65189285367 45 | 46 | 47 | #calculate and print mean absolute error MAE 48 | print("MAE = ", sum(absError)/len(absError)) 49 | #ans: 1.49166666667 50 | 51 | 52 | #compare MSE to target variance 53 | targetDeviation = [] 54 | targetMean = sum(target)/len(target) 55 | for val in target: 56 | targetDeviation.append((val - targetMean)*(val - targetMean)) 57 | 58 | #print the target variance 59 | print("Target Variance = ", sum(targetDeviation)/len(targetDeviation)) 60 | #ans: 7.5703472222222219 61 | 62 | #print the the target standard deviation (square root of variance) 63 | print("Target Standard Deviation = ", sqrt(sum(targetDeviation)/len(targetDeviation))) 64 | #ans: 2.7514263977475797 -------------------------------------------------------------------------------- /03/ridgeWine.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | from sklearn import datasets, linear_model 6 | from math import sqrt 7 | import matplotlib.pyplot as plt 8 | 9 | #read data into iterable 10 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 11 | data = urllib2.urlopen(target_url) 12 | 13 | xList = [] 14 | labels = [] 15 | names = [] 16 | firstLine = True 17 | for line in data: 18 | if firstLine: 19 | names = line.strip().split(";") 20 | firstLine = False 21 | else: 22 | #split on semi-colon 23 | row = line.strip().split(";") 24 | #put labels in separate array 25 | labels.append(float(row[-1])) 26 | #remove label from row 27 | row.pop() 28 | #convert row to floats 29 | floatRow = [float(num) for num in row] 30 | xList.append(floatRow) 31 | 32 | #divide attributes and labels into training and test sets 33 | indices = range(len(xList)) 34 | xListTest = [xList[i] for i in indices if i%3 == 0 ] 35 | xListTrain = [xList[i] for i in indices if i%3 != 0 ] 36 | labelsTest = [labels[i] for i in indices if i%3 == 0] 37 | labelsTrain = [labels[i] for i in indices if i%3 != 0] 38 | 39 | xTrain = numpy.array(xListTrain); yTrain = numpy.array(labelsTrain); xTest = numpy.array(xListTest); yTest = numpy.array(labelsTest) 40 | 41 | alphaList = [0.1**i for i in [0,1, 2, 3, 4, 5, 6]] 42 | 43 | rmsError = [] 44 | for alph in alphaList: 45 | wineRidgeModel = linear_model.Ridge(alpha=alph) 46 | wineRidgeModel.fit(xTrain, yTrain) 47 | rmsError.append(numpy.linalg.norm((yTest-wineRidgeModel.predict(xTest)), 2)/sqrt(len(yTest))) 48 | 49 | print("RMS Error alpha") 50 | for i in range(len(rmsError)): 51 | print(rmsError[i], alphaList[i]) 52 | 53 | #plot curve of out-of-sample error versus alpha 54 | x = range(len(rmsError)) 55 | plt.plot(x, rmsError, 'k') 56 | plt.xlabel('-log(alpha)') 57 | plt.ylabel('Error (RMS)') 58 | plt.show() 59 | 60 | #Plot histogram of out of sample errors for best alpha value and scatter plot of actual versus predicted 61 | #Identify index corresponding to min value, retrain with the corresponding value of alpha 62 | #Use resulting model to predict against out of sample data. 
Plot errors (aka residuals) 63 | indexBest = rmsError.index(min(rmsError)) 64 | alph = alphaList[indexBest] 65 | wineRidgeModel = linear_model.Ridge(alpha=alph) 66 | wineRidgeModel.fit(xTrain, yTrain) 67 | errorVector = yTest-wineRidgeModel.predict(xTest) 68 | plt.hist(errorVector) 69 | plt.xlabel("Bin Boundaries") 70 | plt.ylabel("Counts") 71 | plt.show() 72 | 73 | plt.scatter(wineRidgeModel.predict(xTest), yTest, s=100, alpha=0.10) 74 | plt.xlabel('Predicted Taste Score') 75 | plt.ylabel('Actual Taste Score') 76 | plt.show() 77 | 78 | -------------------------------------------------------------------------------- /03/ridgeWineOutput.txt: -------------------------------------------------------------------------------- 1 | RMS Error alpha 2 | (0.65957881763424564, 1.0) 3 | (0.65786109188085928, 0.1) 4 | (0.65761721446402455, 0.010000000000000002) 5 | (0.65752164826417536, 0.0010000000000000002) 6 | (0.65741906801092931, 0.00010000000000000002) 7 | (0.65739416288512531, 1.0000000000000003e-05) 8 | (0.65739130871558593, 1.0000000000000004e-06) 9 | -------------------------------------------------------------------------------- /04/chapter04.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/derekhe/machine-learning-in-python-essential-techniques-for-predictive-analysis-source/d91f60cc29fdbaad32819058f9b2742e955e586a/04/chapter04.zip -------------------------------------------------------------------------------- /04/cvCurveDetails.txt: -------------------------------------------------------------------------------- 1 | Output: 2 | ('Minimum Mean Square Error', 0.5873018933136459) 3 | ('Index of Minimum Mean Square Error', 311) 4 | -------------------------------------------------------------------------------- /04/glmnetOrderedNamesList.txt: -------------------------------------------------------------------------------- 1 | ['"alcohol"', '"volatile acidity"', '"sulphates"', '"total sulfur dioxide"', '"chlorides"', '"fixed acidity"', '"pH"'] 2 | -------------------------------------------------------------------------------- /04/glmnetWine.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | from sklearn import datasets, linear_model 6 | from math import sqrt 7 | import matplotlib.pyplot as plot 8 | def S(z, gamma): 9 | if gamma >= abs(z): 10 | return 0.0 11 | return (z/abs(z))*(abs(z) - gamma) 12 | 13 | #read data into iterable 14 | target_url = ("http://archive.ics.uci.edu/ml/machine-learning-" 15 | "databases/wine-quality/winequality-red.csv") 16 | data = urllib2.urlopen(target_url) 17 | 18 | xList = [] 19 | labels = [] 20 | names = [] 21 | firstLine = True 22 | for line in data: 23 | if firstLine: 24 | names = line.strip().split(";") 25 | firstLine = False 26 | else: 27 | #split on semi-colon 28 | row = line.strip().split(";") 29 | #put labels in separate array 30 | labels.append(float(row[-1])) 31 | #remove label from row 32 | row.pop() 33 | #convert row to floats 34 | floatRow = [float(num) for num in row] 35 | xList.append(floatRow) 36 | 37 | #Normalize columns in x and labels 38 | 39 | nrows = len(xList) 40 | ncols = len(xList[0]) 41 | 42 | #calculate means and variances 43 | xMeans = [] 44 | xSD = [] 45 | for i in range(ncols): 46 | col = [xList[j][i] for j in range(nrows)] 47 | mean = sum(col)/nrows 48 | xMeans.append(mean) 49 | colDiff = [(xList[j][i] - mean) for j in range(nrows)] 50 | sumSq = 
sum([colDiff[i] * colDiff[i] for i in range(nrows)]) 51 | stdDev = sqrt(sumSq/nrows) 52 | xSD.append(stdDev) 53 | 54 | #use calculate mean and standard deviation to normalize xList 55 | xNormalized = [] 56 | for i in range(nrows): 57 | rowNormalized = [(xList[i][j] - xMeans[j])/xSD[j] 58 | for j in range(ncols)] 59 | xNormalized.append(rowNormalized) 60 | 61 | #Normalize labels 62 | meanLabel = sum(labels)/nrows 63 | sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - 64 | meanLabel) for i in range(nrows)])/nrows) 65 | 66 | labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrows)] 67 | 68 | #select value for alpha parameter 69 | 70 | alpha = 1.0 71 | 72 | #make a pass through the data to determine value of lambda that 73 | # just suppresses all coefficients. 74 | #start with betas all equal to zero. 75 | 76 | 77 | xy = [0.0]*ncols 78 | for i in range(nrows): 79 | for j in range(ncols): 80 | xy[j] += xNormalized[i][j] * labelNormalized[i] 81 | 82 | maxXY = 0.0 83 | for i in range(ncols): 84 | val = abs(xy[i])/nrows 85 | if val > maxXY: 86 | maxXY = val 87 | 88 | #calculate starting value for lambda 89 | lam = maxXY/alpha 90 | 91 | #this value of lambda corresponds to beta = list of 0's 92 | #initialize a vector of coefficients beta 93 | beta = [0.0] * ncols 94 | 95 | #initialize matrix of betas at each step 96 | betaMat = [] 97 | betaMat.append(list(beta)) 98 | 99 | #begin iteration 100 | nSteps = 100 101 | lamMult = 0.93 #100 steps gives reduction by factor of 1000 in 102 | # lambda (recommended by authors) 103 | nzList = [] 104 | 105 | for iStep in range(nSteps): 106 | #make lambda smaller so that some coefficient becomes non-zero 107 | lam = lam * lamMult 108 | 109 | deltaBeta = 100.0 110 | eps = 0.01 111 | iterStep = 0 112 | betaInner = list(beta) 113 | while deltaBeta > eps: 114 | iterStep += 1 115 | if iterStep > 100: break 116 | 117 | #cycle through attributes and update one-at-a-time 118 | #record starting value for comparison 119 | betaStart = list(betaInner) 120 | for iCol in range(ncols): 121 | 122 | xyj = 0.0 123 | for i in range(nrows): 124 | #calculate residual with current value of beta 125 | labelHat = sum([xNormalized[i][k]*betaInner[k] 126 | for k in range(ncols)]) 127 | residual = labelNormalized[i] - labelHat 128 | 129 | xyj += xNormalized[i][iCol] * residual 130 | 131 | uncBeta = xyj/nrows + betaInner[iCol] 132 | betaInner[iCol] = S(uncBeta, lam * alpha) / (1 + 133 | lam * (1 - alpha)) 134 | 135 | sumDiff = sum([abs(betaInner[n] - betaStart[n]) 136 | for n in range(ncols)]) 137 | sumBeta = sum([abs(betaInner[n]) for n in range(ncols)]) 138 | deltaBeta = sumDiff/sumBeta 139 | print(iStep, iterStep) 140 | beta = betaInner 141 | 142 | #add newly determined beta to list 143 | betaMat.append(beta) 144 | 145 | #keep track of the order in which the betas become non-zero 146 | nzBeta = [index for index in range(ncols) if beta[index] != 0.0] 147 | for q in nzBeta: 148 | if (q in nzList) == False: 149 | nzList.append(q) 150 | 151 | #print out the ordered list of betas 152 | nameList = [names[nzList[i]] for i in range(len(nzList))] 153 | print(nameList) 154 | 155 | nPts = len(betaMat) 156 | for i in range(ncols): 157 | #plot range of beta values for each attribute 158 | coefCurve = [betaMat[k][i] for k in range(nPts)] 159 | xaxis = range(nPts) 160 | plot.plot(xaxis, coefCurve) 161 | 162 | plot.xlabel("Steps Taken") 163 | plot.ylabel(("Coefficient Values")) 164 | plot.show() 165 | 166 | 
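The hand-rolled coordinate descent above traces the same kind of coefficient path that scikit-learn's enet_path produces (the Chapter 5 scripts later in this repository call enet_path directly). The snippet below is a minimal sketch of that comparison and is not part of the original repository: it assumes pandas is available for downloading the wine CSV (the urllib2 calls in these scripts are Python 2 only) and uses only the l1_ratio, eps, and n_alphas arguments of enet_path. With l1_ratio=1.0 it matches the alpha = 1.0 (pure lasso) setting used above; sklearn's alpha plays the role of lambda in the code above.

import numpy
import pandas as pd
import matplotlib.pyplot as plot
from sklearn.linear_model import enet_path

#read the red wine data and normalize attributes and labels, as in glmnetWine.py
target_url = ("http://archive.ics.uci.edu/ml/machine-learning-"
              "databases/wine-quality/winequality-red.csv")
wine = pd.read_csv(target_url, sep=";")
X = wine.values[:, :-1]
y = wine.values[:, -1]
X = (X - X.mean(axis=0)) / X.std(axis=0)
y = (y - y.mean()) / y.std()

#compute the lasso coefficient path over a geometric grid of penalty values
alphas, coefs, _ = enet_path(X, y, l1_ratio=1.0, eps=1e-3, n_alphas=100)

#plot coefficient trajectories; invert the x-axis so the heaviest penalty (all coefficients zero)
#is on the left, matching the step-wise curves produced by the script above
plot.plot(alphas, coefs.T)
plot.semilogx()
plot.gca().invert_xaxis()
plot.xlabel("alpha")
plot.ylabel("Coefficient Values")
plot.show()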
-------------------------------------------------------------------------------- /04/larsAbalone.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | 3 | import urllib2 4 | from pylab import * 5 | import matplotlib.pyplot as plot 6 | 7 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data" 8 | #read abalone data 9 | data = urllib2.urlopen(target_url) 10 | 11 | xList = [] 12 | labels = [] 13 | 14 | for line in data: 15 | #split on semi-colon 16 | row = line.strip().split(",") 17 | 18 | #put labels in separate array and remove label from row 19 | labels.append(float(row.pop())) 20 | 21 | #form list of list of attributes (all strings) 22 | xList.append(row) 23 | 24 | names = ['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings'] 25 | 26 | #code three-valued sex attribute as numeric 27 | xCoded = [] 28 | for row in xList: 29 | #first code the three-valued sex variable 30 | codedSex = [0.0, 0.0] 31 | if row[0] == 'M': codedSex[0] = 1.0 32 | if row[0] == 'F': codedSex[1] = 1.0 33 | 34 | numRow = [float(row[i]) for i in range(1,len(row))] 35 | rowCoded = list(codedSex) + numRow 36 | xCoded.append(rowCoded) 37 | 38 | namesCoded = ['Sex1', 'Sex2', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings'] 39 | 40 | nrows = len(xCoded) 41 | ncols = len(xCoded[1]) 42 | 43 | xMeans = [] 44 | xSD = [] 45 | for i in range(ncols): 46 | col = [xCoded[j][i] for j in range(nrows)] 47 | mean = sum(col)/nrows 48 | xMeans.append(mean) 49 | colDiff = [(xCoded[j][i] - mean) for j in range(nrows)] 50 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrows)]) 51 | stdDev = sqrt(sumSq/nrows) 52 | xSD.append(stdDev) 53 | 54 | #use calculate mean and standard deviation to normalize xCoded 55 | xNormalized = [] 56 | for i in range(nrows): 57 | rowNormalized = [(xCoded[i][j] - xMeans[j])/xSD[j] for j in range(ncols)] 58 | xNormalized.append(rowNormalized) 59 | 60 | #Normalize labels 61 | meanLabel = sum(labels)/nrows 62 | sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrows)])/nrows) 63 | 64 | labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrows)] 65 | 66 | #initialize a vector of coefficients beta 67 | beta = [0.0] * ncols 68 | 69 | #initialize matrix of betas at each step 70 | betaMat = [] 71 | betaMat.append(list(beta)) 72 | 73 | 74 | #number of steps to take 75 | nSteps = 350 76 | stepSize = 0.004 77 | nzList = [] 78 | 79 | for i in range(nSteps): 80 | #calculate residuals 81 | residuals = [0.0] * nrows 82 | for j in range(nrows): 83 | labelsHat = sum([xNormalized[j][k] * beta[k] for k in range(ncols)]) 84 | residuals[j] = labelNormalized[j] - labelsHat 85 | 86 | #calculate correlation between attribute columns from normalized wine and residual 87 | corr = [0.0] * ncols 88 | 89 | for j in range(ncols): 90 | corr[j] = sum([xNormalized[k][j] * residuals[k] for k in range(nrows)]) / nrows 91 | 92 | iStar = 0 93 | corrStar = corr[0] 94 | 95 | for j in range(1, (ncols)): 96 | if abs(corrStar) < abs(corr[j]): 97 | iStar = j; corrStar = corr[j] 98 | 99 | beta[iStar] += stepSize * corrStar / abs(corrStar) 100 | betaMat.append(list(beta)) 101 | 102 | 103 | nzBeta = [index for index in range(ncols) if beta[index] != 0.0] 104 | for q in nzBeta: 105 | if (q in nzList) == False: 106 | nzList.append(q) 107 | 108 | nameList = [namesCoded[nzList[i]] for i 
in range(len(nzList))] 109 | 110 | print(nameList) 111 | for i in range(ncols): 112 | #plot range of beta values for each attribute 113 | coefCurve = [betaMat[k][i] for k in range(nSteps)] 114 | xaxis = range(nSteps) 115 | plot.plot(xaxis, coefCurve) 116 | 117 | plot.xlabel("Steps Taken") 118 | plot.ylabel(("Coefficient Values")) 119 | plot.show() 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /04/larsAbaloneOutput.txt: -------------------------------------------------------------------------------- 1 | ['Shell weight', 'Height', 'Sex2', 'Shucked weight', 'Diameter', 'Sex1'] 2 | -------------------------------------------------------------------------------- /04/larsRocksVMines.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import urllib2 3 | import sys 4 | from math import sqrt 5 | import matplotlib.pyplot as plot 6 | 7 | #read data from uci data repository 8 | target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data" 9 | data = urllib2.urlopen(target_url) 10 | 11 | 12 | #arrange data into list for labels and list of lists for attributes 13 | xList = [] 14 | 15 | 16 | for line in data: 17 | #split on comma 18 | row = line.strip().split(",") 19 | xList.append(row) 20 | 21 | #separate labels from attributes, convert from attributes from string to numeric and convert "M" to 1 and "R" to 0 22 | 23 | xNum = [] 24 | labels = [] 25 | 26 | for row in xList: 27 | lastCol = row.pop() 28 | if lastCol == "M": 29 | labels.append(1.0) 30 | else: 31 | labels.append(0.0) 32 | attrRow = [float(elt) for elt in row] 33 | xNum.append(attrRow) 34 | 35 | #number of rows and columns in x matrix 36 | nrow = len(xNum) 37 | ncol = len(xNum[1]) 38 | 39 | 40 | 41 | #calculate means and variances 42 | xMeans = [] 43 | xSD = [] 44 | for i in range(ncol): 45 | col = [xNum[j][i] for j in range(nrow)] 46 | mean = sum(col)/nrow 47 | xMeans.append(mean) 48 | colDiff = [(xNum[j][i] - mean) for j in range(nrow)] 49 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrow)]) 50 | stdDev = sqrt(sumSq/nrow) 51 | xSD.append(stdDev) 52 | 53 | #use calculate mean and standard deviation to normalize xNum 54 | xNormalized = [] 55 | for i in range(nrow): 56 | rowNormalized = [(xNum[i][j] - xMeans[j])/xSD[j] for j in range(ncol)] 57 | xNormalized.append(rowNormalized) 58 | 59 | #Normalize labels 60 | meanLabel = sum(labels)/nrow 61 | sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrow)])/nrow) 62 | 63 | labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrow)] 64 | 65 | #initialize a vector of coefficients beta 66 | beta = [0.0] * ncol 67 | 68 | #initialize matrix of betas at each step 69 | betaMat = [] 70 | betaMat.append(list(beta)) 71 | 72 | 73 | #number of steps to take 74 | nSteps = 350 75 | stepSize = 0.004 76 | nzList = [] 77 | 78 | for i in range(nSteps): 79 | #calculate residuals 80 | residuals = [0.0] * nrow 81 | for j in range(nrow): 82 | labelsHat = sum([xNormalized[j][k] * beta[k] for k in range(ncol)]) 83 | residuals[j] = labelNormalized[j] - labelsHat 84 | 85 | #calculate correlation between attribute columns from normalized wine and residual 86 | corr = [0.0] * ncol 87 | 88 | for j in range(ncol): 89 | corr[j] = sum([xNormalized[k][j] * residuals[k] for k in range(nrow)]) / nrow 90 | 91 | iStar = 0 92 | corrStar = corr[0] 93 | 94 | for j in range(1, (ncol)): 95 | 
if abs(corrStar) < abs(corr[j]): 96 | iStar = j; corrStar = corr[j] 97 | 98 | beta[iStar] += stepSize * corrStar / abs(corrStar) 99 | betaMat.append(list(beta)) 100 | 101 | 102 | nzBeta = [index for index in range(ncol) if beta[index] != 0.0] 103 | for q in nzBeta: 104 | if (q in nzList) == False: 105 | nzList.append(q) 106 | 107 | #make up names for columns of xNum 108 | names = ['V' + str(i) for i in range(ncol)] 109 | nameList = [names[nzList[i]] for i in range(len(nzList))] 110 | 111 | print(nameList) 112 | for i in range(ncol): 113 | #plot range of beta values for each attribute 114 | coefCurve = [betaMat[k][i] for k in range(nSteps)] 115 | xaxis = range(nSteps) 116 | plot.plot(xaxis, coefCurve) 117 | 118 | plot.xlabel("Steps Taken") 119 | plot.ylabel(("Coefficient Values")) 120 | plot.show() -------------------------------------------------------------------------------- /04/larsWine.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import pandas as pd 3 | from pandas import DataFrame 4 | from pylab import * 5 | import matplotlib.pyplot as plot 6 | from math import fabs 7 | 8 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 9 | wine = pd.read_csv(target_url,header=0, sep=";") 10 | 11 | #normalize the wine data 12 | summary = wine.describe() 13 | print(summary) 14 | 15 | wineNormalized = wine 16 | ncols = len(wineNormalized.columns) 17 | nrows = len(wineNormalized) 18 | 19 | for i in range(ncols): 20 | mean = summary.iloc[1, i] 21 | sd = summary.iloc[2, i] 22 | wineNormalized.iloc[:,i:(i + 1)] = (wineNormalized.iloc[:,i:(i + 1)] - mean) / sd 23 | 24 | #initialize a vector of coefficients beta 25 | beta = [0.0] * (ncols - 1) 26 | #initialize matrix of betas at each step 27 | betaMat = [] 28 | betaMat.append(list(beta)) 29 | #initialize residuals list 30 | residuals = [0.0] * nrows 31 | 32 | #number of steps to take 33 | nSteps = 100 34 | stepSize = 0.1 35 | 36 | for i in range(nSteps): 37 | #calculate residuals 38 | for j in range(nrows): 39 | residuals[j] = wineNormalized.iloc[j, (ncols - 1)] 40 | for k in range(ncols - 1): 41 | residuals[j] += - wineNormalized.iloc[j, k] * beta[k] 42 | 43 | #calculate correlation between attribute columns from normalized wine and residual 44 | corr = [0.0] * (ncols - 1) 45 | 46 | for j in range(ncols - 1): 47 | for k in range(nrows): 48 | corr[j] += wineNormalized.iloc[k,j] * residuals[k] / nrows 49 | 50 | iStar = 0 51 | corrStar = corr[0] 52 | 53 | for j in range(1, (ncols - 1)): 54 | if abs(corrStar) < abs(corr[j]): 55 | iStar = j; corrStar = corr[j] 56 | 57 | beta[iStar] += stepSize * corrStar / abs(corrStar) 58 | betaMat.append(list(beta)) 59 | 60 | 61 | for i in range(ncols - 1): 62 | #plot range of beta values for each attribute 63 | coefCurve = betaMat[0:nSteps][i] 64 | coefCurve.plot() 65 | 66 | plot.xlabel("Attribute Index") 67 | plot.ylabel(("Attribute Values")) 68 | plot.show() 69 | -------------------------------------------------------------------------------- /04/larsWine2.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | from sklearn import datasets, linear_model 6 | from math import sqrt 7 | import matplotlib.pyplot as plot 8 | 9 | #read data into iterable 10 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 11 | data = 
urllib2.urlopen(target_url) 12 | 13 | xList = [] 14 | labels = [] 15 | names = [] 16 | firstLine = True 17 | for line in data: 18 | if firstLine: 19 | names = line.strip().split(";") 20 | firstLine = False 21 | else: 22 | #split on semi-colon 23 | row = line.strip().split(";") 24 | #put labels in separate array 25 | labels.append(float(row[-1])) 26 | #remove label from row 27 | row.pop() 28 | #convert row to floats 29 | floatRow = [float(num) for num in row] 30 | xList.append(floatRow) 31 | 32 | #Normalize columns in x and labels 33 | 34 | nrows = len(xList) 35 | ncols = len(xList[0]) 36 | 37 | #calculate means and variances 38 | xMeans = [] 39 | xSD = [] 40 | for i in range(ncols): 41 | col = [xList[j][i] for j in range(nrows)] 42 | mean = sum(col)/nrows 43 | xMeans.append(mean) 44 | colDiff = [(xList[j][i] - mean) for j in range(nrows)] 45 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrows)]) 46 | stdDev = sqrt(sumSq/nrows) 47 | xSD.append(stdDev) 48 | 49 | #use calculate mean and standard deviation to normalize xList 50 | xNormalized = [] 51 | for i in range(nrows): 52 | rowNormalized = [(xList[i][j] - xMeans[j])/xSD[j] for j in range(ncols)] 53 | xNormalized.append(rowNormalized) 54 | 55 | #Normalize labels 56 | meanLabel = sum(labels)/nrows 57 | sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrows)])/nrows) 58 | 59 | labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrows)] 60 | 61 | #initialize a vector of coefficients beta 62 | beta = [0.0] * ncols 63 | 64 | #initialize matrix of betas at each step 65 | betaMat = [] 66 | betaMat.append(list(beta)) 67 | 68 | 69 | #number of steps to take 70 | nSteps = 350 71 | stepSize = 0.004 72 | nzList = [] 73 | 74 | for i in range(nSteps): 75 | #calculate residuals 76 | residuals = [0.0] * nrows 77 | for j in range(nrows): 78 | labelsHat = sum([xNormalized[j][k] * beta[k] for k in range(ncols)]) 79 | residuals[j] = labelNormalized[j] - labelsHat 80 | 81 | #calculate correlation between attribute columns from normalized wine and residual 82 | corr = [0.0] * ncols 83 | 84 | for j in range(ncols): 85 | corr[j] = sum([xNormalized[k][j] * residuals[k] for k in range(nrows)]) / nrows 86 | 87 | iStar = 0 88 | corrStar = corr[0] 89 | 90 | for j in range(1, (ncols)): 91 | if abs(corrStar) < abs(corr[j]): 92 | iStar = j; corrStar = corr[j] 93 | 94 | beta[iStar] += stepSize * corrStar / abs(corrStar) 95 | betaMat.append(list(beta)) 96 | 97 | 98 | nzBeta = [index for index in range(ncols) if beta[index] != 0.0] 99 | for q in nzBeta: 100 | if (q in nzList) == False: 101 | nzList.append(q) 102 | 103 | nameList = [names[nzList[i]] for i in range(len(nzList))] 104 | 105 | print(nameList) 106 | for i in range(ncols): 107 | #plot range of beta values for each attribute 108 | coefCurve = [betaMat[k][i] for k in range(nSteps)] 109 | xaxis = range(nSteps) 110 | plot.plot(xaxis, coefCurve) 111 | 112 | plot.xlabel("Steps Taken") 113 | plot.ylabel(("Coefficient Values")) 114 | plot.show() -------------------------------------------------------------------------------- /04/larsWineCV.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | from sklearn import datasets, linear_model 6 | from math import sqrt 7 | import matplotlib.pyplot as plot 8 | 9 | 10 | #read data into iterable 11 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 12 | data = 
urllib2.urlopen(target_url) 13 | 14 | xList = [] 15 | labels = [] 16 | names = [] 17 | firstLine = True 18 | for line in data: 19 | if firstLine: 20 | names = line.strip().split(";") 21 | firstLine = False 22 | else: 23 | #split on semi-colon 24 | row = line.strip().split(";") 25 | #put labels in separate array 26 | labels.append(float(row[-1])) 27 | #remove label from row 28 | row.pop() 29 | #convert row to floats 30 | floatRow = [float(num) for num in row] 31 | xList.append(floatRow) 32 | 33 | #Normalize columns in x and labels 34 | 35 | nrows = len(xList) 36 | ncols = len(xList[0]) 37 | 38 | #calculate means and variances 39 | xMeans = [] 40 | xSD = [] 41 | for i in range(ncols): 42 | col = [xList[j][i] for j in range(nrows)] 43 | mean = sum(col)/nrows 44 | xMeans.append(mean) 45 | colDiff = [(xList[j][i] - mean) for j in range(nrows)] 46 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrows)]) 47 | stdDev = sqrt(sumSq/nrows) 48 | xSD.append(stdDev) 49 | 50 | #use calculated mean and standard deviation to normalize xList 51 | xNormalized = [] 52 | for i in range(nrows): 53 | rowNormalized = [(xList[i][j] - xMeans[j])/xSD[j] for j in range(ncols)] 54 | xNormalized.append(rowNormalized) 55 | 56 | #Normalize labels 57 | meanLabel = sum(labels)/nrows 58 | sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrows)])/nrows) 59 | 60 | labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrows)] 61 | 62 | #Build cross-validation loop to determine best coefficient values. 63 | 64 | #number of cross validation folds 65 | nxval = 10 66 | 67 | #number of steps and step size 68 | nSteps = 350 69 | stepSize = 0.004 70 | 71 | #initialize list for storing errors. 72 | errors = [] 73 | for i in range(nSteps): 74 | b = [] 75 | errors.append(b) 76 | 77 | 78 | for ixval in range(nxval): 79 | #Define test and training index sets 80 | idxTest = [a for a in range(nrows) if a%nxval == ixval*nxval] 81 | idxTrain = [a for a in range(nrows) if a%nxval != ixval*nxval] 82 | 83 | #Define test and training attribute and label sets 84 | xTrain = [xNormalized[r] for r in idxTrain] 85 | xTest = [xNormalized[r] for r in idxTest] 86 | labelTrain = [labelNormalized[r] for r in idxTrain] 87 | labelTest = [labelNormalized[r] for r in idxTest] 88 | 89 | #Train LARS regression on Training Data 90 | nrowsTrain = len(idxTrain) 91 | nrowsTest = len(idxTest) 92 | 93 | #initialize a vector of coefficients beta 94 | beta = [0.0] * ncols 95 | 96 | #initialize matrix of betas at each step 97 | betaMat = [] 98 | betaMat.append(list(beta)) 99 | 100 | for iStep in range(nSteps): 101 | #calculate residuals 102 | residuals = [0.0] * nrows 103 | for j in range(nrowsTrain): 104 | labelsHat = sum([xTrain[j][k] * beta[k] for k in range(ncols)]) 105 | residuals[j] = labelTrain[j] - labelsHat 106 | 107 | #calculate correlation between attribute columns from normalized wine and residual 108 | corr = [0.0] * ncols 109 | 110 | for j in range(ncols): 111 | corr[j] = sum([xTrain[k][j] * residuals[k] for k in range(nrowsTrain)]) / nrowsTrain 112 | 113 | iStar = 0 114 | corrStar = corr[0] 115 | 116 | for j in range(1, (ncols)): 117 | if abs(corrStar) < abs(corr[j]): 118 | iStar = j; corrStar = corr[j] 119 | 120 | beta[iStar] += stepSize * corrStar / abs(corrStar) 121 | betaMat.append(list(beta)) 122 | 123 | #Use beta just calculated to predict and accumulate out of sample error - not being used in the calc of beta 124 | for j in range(nrowsTest): 125 | labelsHat = sum([xTest[j][k] * beta[k] for k in 
range(ncols)]) 126 | err = labelTest[j] - labelsHat 127 | errors[iStep].append(err) 128 | 129 | cvCurve = [] 130 | for errVect in errors: 131 | mse = sum([x*x for x in errVect])/len(errVect) 132 | cvCurve.append(mse) 133 | 134 | minMse = min(cvCurve) 135 | minPt = [i for i in range(len(cvCurve)) if cvCurve[i] == minMse ][0] 136 | print("Minimum Mean Square Error", minMse) 137 | print("Index of Minimum Mean Square Error", minPt) 138 | 139 | xaxis = range(len(cvCurve)) 140 | plot.plot(xaxis, cvCurve) 141 | 142 | plot.xlabel("Steps Taken") 143 | plot.ylabel(("Mean Square Error")) 144 | plot.show() 145 | -------------------------------------------------------------------------------- /04/orderedNamesList.txt: -------------------------------------------------------------------------------- 1 | ['"alcohol"', '"volatile acidity"', '"sulphates"', '"total sulfur dioxide"', '"chlorides"', '"fixed acidity"', '"pH"', '"free sulfur dioxide"', '"citric acid"', '"residual sugar"', '"density"'] 2 | -------------------------------------------------------------------------------- /04/rocksVMinesCoefOrder.txt: -------------------------------------------------------------------------------- 1 | ['V10', 'V48', 'V44', 'V11', 'V35', 'V51', 'V20', 'V3', 'V21', 'V15', 'V43', 'V0', 'V22', 'V45', 'V53', 'V27', 'V30', 'V50', 'V58', 'V46', 'V56', 'V28', 'V39'] 2 | 3 | -------------------------------------------------------------------------------- /04/wineBasisExpand.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import matplotlib.pyplot as plot 5 | from math import sqrt, cos, log 6 | 7 | #read data into iterable 8 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 9 | data = urllib2.urlopen(target_url) 10 | 11 | xList = [] 12 | labels = [] 13 | names = [] 14 | firstLine = True 15 | for line in data: 16 | if firstLine: 17 | names = line.strip().split(";") 18 | firstLine = False 19 | else: 20 | #split on semi-colon 21 | row = line.strip().split(";") 22 | #put labels in separate array 23 | labels.append(float(row[-1])) 24 | #remove label from row 25 | row.pop() 26 | #convert row to floats 27 | floatRow = [float(num) for num in row] 28 | xList.append(floatRow) 29 | 30 | 31 | #extend the alcohol variable (the last column in that attribute matrix 32 | xExtended = [] 33 | alchCol = len(xList[1]) 34 | 35 | 36 | for row in xList: 37 | newRow = list(row) 38 | alch = row[alchCol - 1] 39 | newRow.append((alch - 7) * (alch - 7)/10) 40 | newRow.append(5 * log(alch - 7)) 41 | newRow.append(cos(alch)) 42 | xExtended.append(newRow) 43 | 44 | nrow = len(xList) 45 | v1 = [xExtended[j][alchCol - 1] for j in range(nrow)] 46 | 47 | for i in range(4): 48 | v2 = [xExtended[j][alchCol - 1 + i] for j in range(nrow)] 49 | plot.scatter(v1,v2) 50 | 51 | plot.xlabel("Alcohol") 52 | plot.ylabel(("Extension Functions of Alcohol")) 53 | plot.show() 54 | -------------------------------------------------------------------------------- /05/chapter05.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/derekhe/machine-learning-in-python-essential-techniques-for-predictive-analysis-source/d91f60cc29fdbaad32819058f9b2742e955e586a/05/chapter05.zip -------------------------------------------------------------------------------- /05/glass/glassENetRegCV.py: -------------------------------------------------------------------------------- 1 | 
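#glassENetRegCV.py -- multiclass classification of the UCI glass data with penalized linear regression.
#The glass type is recoded as one-versus-all indicator columns, attributes and indicators are normalized,
#and a 10-fold cross-validation loop fits one lasso path (enet_path with l1_ratio=1.0) per class.
#At each penalty step the class with the largest un-normalized prediction is taken as the predicted
#label, and the resulting misclassification rate is plotted against the penalty parameter steps.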
import urllib2 2 | from math import sqrt, fabs, exp 3 | import matplotlib.pyplot as plot 4 | from sklearn.linear_model import enet_path 5 | from sklearn.metrics import roc_auc_score, roc_curve 6 | import numpy 7 | 8 | target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data" 9 | data = urllib2.urlopen(target_url) 10 | 11 | #arrange data into list for labels and list of lists for attributes 12 | xList = [] 13 | for line in data: 14 | #split on comma 15 | row = line.strip().split(",") 16 | xList.append(row) 17 | 18 | names = ['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type'] 19 | 20 | #Separate attributes and labels 21 | xNum = [] 22 | labels = [] 23 | 24 | for row in xList: 25 | labels.append(row.pop()) 26 | l = len(row) 27 | #eliminate ID 28 | attrRow = [float(row[i]) for i in range(1, l)] 29 | xNum.append(attrRow) 30 | 31 | #number of rows and columns in x matrix 32 | nrow = len(xNum) 33 | ncol = len(xNum[1]) 34 | 35 | #creat one versus all label vectors 36 | #get distinct glass types and assign index to each 37 | yOneVAll = [] 38 | labelSet = set(labels) 39 | labelList = list(labelSet) 40 | labelList.sort() 41 | nlabels = len(labelList) 42 | for i in range(nrow): 43 | yRow = [0.0]*nlabels 44 | index = labelList.index(labels[i]) 45 | yRow[index] = 1.0 46 | yOneVAll.append(yRow) 47 | 48 | #calculate means and variances 49 | xMeans = [] 50 | xSD = [] 51 | for i in range(ncol): 52 | col = [xNum[j][i] for j in range(nrow)] 53 | mean = sum(col)/nrow 54 | xMeans.append(mean) 55 | colDiff = [(xNum[j][i] - mean) for j in range(nrow)] 56 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrow)]) 57 | stdDev = sqrt(sumSq/nrow) 58 | xSD.append(stdDev) 59 | 60 | #use calculate mean and standard deviation to normalize xNum 61 | xNormalized = [] 62 | for i in range(nrow): 63 | rowNormalized = [(xNum[i][j] - xMeans[j])/xSD[j] for j in range(ncol)] 64 | xNormalized.append(rowNormalized) 65 | 66 | #normalize y's to center 67 | yMeans = [] 68 | ySD = [] 69 | for i in range(nlabels): 70 | col = [yOneVAll[j][i] for j in range(nrow)] 71 | mean = sum(col)/nrow 72 | yMeans.append(mean) 73 | colDiff = [(yOneVAll[j][i] - mean) for j in range(nrow)] 74 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrow)]) 75 | stdDev = sqrt(sumSq/nrow) 76 | ySD.append(stdDev) 77 | 78 | yNormalized = [] 79 | for i in range(nrow): 80 | rowNormalized = [(yOneVAll[i][j] - yMeans[j])/ySD[j] for j in range(nlabels)] 81 | yNormalized.append(rowNormalized) 82 | 83 | 84 | #number of cross validation folds 85 | nxval = 10 86 | nAlphas=100 87 | misClass = [0.0] * nAlphas 88 | 89 | for ixval in range(nxval): 90 | #Define test and training index sets 91 | idxTest = [a for a in range(nrow) if a%nxval == ixval%nxval] 92 | idxTrain = [a for a in range(nrow) if a%nxval != ixval%nxval] 93 | 94 | #Define test and training attribute and label sets 95 | xTrain = numpy.array([xNormalized[r] for r in idxTrain]) 96 | xTest = numpy.array([xNormalized[r] for r in idxTest]) 97 | yTrain = [yNormalized[r] for r in idxTrain] 98 | yTest = [yNormalized[r] for r in idxTest] 99 | labelsTest = [labels[r] for r in idxTest] 100 | 101 | #build model for each column in yTrain 102 | models = [] 103 | lenTrain = len(yTrain) 104 | lenTest = nrow - lenTrain 105 | for iModel in range(nlabels): 106 | yTemp = numpy.array([yTrain[j][iModel] for j in range(lenTrain)]) 107 | models.append(enet_path(xTrain, yTemp,l1_ratio=1.0, fit_intercept=False, eps=0.5e-3, n_alphas=nAlphas , return_models=False)) 108 | 109 | for iStep in 
range(1,nAlphas): 110 | #Assemble the predictions for all the models, find largest prediction and calc error 111 | allPredictions = [] 112 | for iModel in range(nlabels): 113 | _, coefs, _ = models[iModel] 114 | predTemp = list(numpy.dot(xTest, coefs[:,iStep])) 115 | #un-normalize the prediction for comparison 116 | predUnNorm = [(predTemp[j]*ySD[iModel] + yMeans[iModel]) for j in range(len(predTemp))] 117 | allPredictions.append(predUnNorm) 118 | 119 | predictions = [] 120 | for i in range(lenTest): 121 | listOfPredictions = [allPredictions[j][i] for j in range(nlabels) ] 122 | idxMax = listOfPredictions.index(max(listOfPredictions)) 123 | if labelList[idxMax] != labelsTest[i]: 124 | misClass[iStep] += 1.0 125 | 126 | misClassPlot = [misClass[i]/nrow for i in range(1, nAlphas)] 127 | 128 | plot.plot(misClassPlot) 129 | 130 | plot.xlabel("Penalty Parameter Steps") 131 | plot.ylabel(("Misclassification Error Rate")) 132 | plot.show() -------------------------------------------------------------------------------- /05/rocksVMines/rocksVMinesCoefCurves.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import urllib2 3 | from math import sqrt, fabs, exp 4 | import matplotlib.pyplot as plot 5 | from sklearn.linear_model import enet_path 6 | from sklearn.metrics import roc_auc_score, roc_curve 7 | import numpy 8 | 9 | #read data from uci data repository 10 | target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data" 11 | data = urllib2.urlopen(target_url) 12 | 13 | 14 | #arrange data into list for labels and list of lists for attributes 15 | xList = [] 16 | 17 | 18 | for line in data: 19 | #split on comma 20 | row = line.strip().split(",") 21 | xList.append(row) 22 | 23 | #separate labels from attributes, convert from attributes from string to numeric and convert "M" to 1 and "R" to 0 24 | 25 | xNum = [] 26 | labels = [] 27 | 28 | for row in xList: 29 | lastCol = row.pop() 30 | if lastCol == "M": 31 | labels.append(1.0) 32 | else: 33 | labels.append(0.0) 34 | attrRow = [float(elt) for elt in row] 35 | xNum.append(attrRow) 36 | 37 | #number of rows and columns in x matrix 38 | nrow = len(xNum) 39 | ncol = len(xNum[1]) 40 | 41 | alpha = 1.0 42 | 43 | #calculate means and variances 44 | xMeans = [] 45 | xSD = [] 46 | for i in range(ncol): 47 | col = [xNum[j][i] for j in range(nrow)] 48 | mean = sum(col)/nrow 49 | xMeans.append(mean) 50 | colDiff = [(xNum[j][i] - mean) for j in range(nrow)] 51 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrow)]) 52 | stdDev = sqrt(sumSq/nrow) 53 | xSD.append(stdDev) 54 | 55 | #use calculate mean and standard deviation to normalize xNum 56 | xNormalized = [] 57 | for i in range(nrow): 58 | rowNormalized = [(xNum[i][j] - xMeans[j])/xSD[j] for j in range(ncol)] 59 | xNormalized.append(rowNormalized) 60 | 61 | #normalize labels to center 62 | 63 | meanLabel = sum(labels)/nrow 64 | sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrow)])/nrow) 65 | 66 | labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrow)] 67 | 68 | #Convert normalized labels to numpy array 69 | Y = numpy.array(labelNormalized) 70 | 71 | #Convert normalized attributes to numpy array 72 | X = numpy.array(xNormalized) 73 | 74 | alphas, coefs, _ = enet_path(X, Y,l1_ratio=0.8, fit_intercept=False, return_models=False) 75 | 76 | plot.plot(alphas,coefs.T) 77 | 78 | plot.xlabel('alpha') 79 | 
plot.ylabel('Coefficients') 80 | plot.axis('tight') 81 | plot.semilogx() 82 | ax = plot.gca() 83 | ax.invert_xaxis() 84 | plot.show() 85 | 86 | nattr, nalpha = coefs.shape 87 | 88 | #find coefficient ordering 89 | nzList = [] 90 | for iAlpha in range(1,nalpha): 91 | coefList = list(coefs[: ,iAlpha]) 92 | nzCoef = [index for index in range(nattr) if coefList[index] != 0.0] 93 | for q in nzCoef: 94 | if not(q in nzList): 95 | nzList.append(q) 96 | 97 | #make up names for columns of X 98 | names = ['V' + str(i) for i in range(ncol)] 99 | nameList = [names[nzList[i]] for i in range(len(nzList))] 100 | print("Attributes Ordered by How Early They Enter the Model") 101 | print(nameList) 102 | print('') 103 | #find coefficients corresponding to best alpha value. alpha value corresponding to 104 | #normalized X and normalized Y is 0.020334883589342503 105 | 106 | alphaStar = 0.020334883589342503 107 | indexLTalphaStar = [index for index in range(100) if alphas[index] > alphaStar] 108 | indexStar = max(indexLTalphaStar) 109 | 110 | #here's the set of coefficients to deploy 111 | coefStar = list(coefs[:,indexStar]) 112 | print("Best Coefficient Values ") 113 | print(coefStar) 114 | print('') 115 | #The coefficients on normalized attributes give another slightly different ordering 116 | 117 | absCoef = [abs(a) for a in coefStar] 118 | 119 | #sort by magnitude 120 | coefSorted = sorted(absCoef, reverse=True) 121 | 122 | idxCoefSize = [absCoef.index(a) for a in coefSorted if not(a == 0.0)] 123 | 124 | namesList2 = [names[idxCoefSize[i]] for i in range(len(idxCoefSize))] 125 | 126 | print("Attributes Ordered by Coef Size at Optimum alpha") 127 | print(namesList2) -------------------------------------------------------------------------------- /05/rocksVMines/rocksVMinesCoefCurvesPrintedOutput.txt: -------------------------------------------------------------------------------- 1 | Attributes Ordered by How Early They Enter the Model 2 | ['V10', 'V48', 'V11', 'V44', 'V35', 'V51', 'V20', 'V3', 'V21', 'V45', 'V43', 'V15', 'V0', 'V22', 'V27', 'V50', 'V53', 'V30', 'V58', 'V56', 'V28', 'V39', 'V46', 'V19', 'V54', 'V29', 'V57', 'V6', 'V8', 'V7', 'V49', 'V2', 'V23', 'V37', 'V55', 'V4', 'V13', 'V36', 'V38', 'V26', 'V31', 'V1', 'V34', 'V33', 'V24', 'V16', 'V17', 'V5', 'V52', 'V41', 'V40', 'V59', 'V12', 'V9', 'V18', 'V14', 'V47', 'V42'] 3 | 4 | Best Coefficient Values 5 | [0.082258256813766639, 0.0020619887220043702, -0.11828642590855878, 0.16633956932499627, 0.0042854388193718004, -0.0, -0.04366252474594004, -0.07751510487942842, 0.10000054356323497, 0.0, 0.090617207036282038, 0.21210870399915693, -0.0, -0.010655386149821946, -0.0, -0.13328659558143779, -0.0, 0.0, 0.0, 0.052814854501417867, 0.038531154796719078, 0.0035515348181877982, 0.090854714680378215, 0.030316113904025031, -0.0, 0.0, 0.0086195542357481014, 0.0, 0.0, 0.17497679257272536, -0.2215687804617206, 0.012614243827937584, 0.0, -0.0, 0.0, -0.17160601809439849, -0.080450013824209077, 0.078096790041518344, 0.022035287616766441, -0.072184409273692227, 0.0, -0.0, 0.0, 0.057018816876250704, 0.096478265685721556, 0.039917367637236176, 0.049158231541622875, 0.0, 0.22671917920123755, -0.096272735479951091, 0.0, 0.078886784332226484, 0.0, 0.062312821755756878, -0.082785510713295471, 0.014466967172068596, -0.074326527525632721, 0.068096475974257331, 0.070488864435477847, 0.0] 6 | 7 | Attributes Ordered by Coef Size at Optimum alpha 8 | ['V48', 'V30', 'V11', 'V29', 'V35', 'V3', 'V15', 'V2', 'V8', 'V44', 'V49', 'V22', 'V10', 'V54', 'V0', 'V36', 'V51', 'V37', 'V7', 
'V56', 'V39', 'V58', 'V57', 'V53', 'V43', 'V19', 'V46', 'V6', 'V45', 'V20', 'V23', 'V38', 'V55', 'V31', 'V13', 'V26', 'V4', 'V21', 'V1'] 9 | 10 | -------------------------------------------------------------------------------- /05/rocksVMines/rocksVMinesENetRegCV.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import urllib2 3 | from math import sqrt, fabs, exp 4 | import matplotlib.pyplot as plot 5 | from sklearn.linear_model import enet_path 6 | from sklearn.metrics import roc_auc_score, roc_curve 7 | import numpy 8 | 9 | #read data from uci data repository 10 | target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data" 11 | data = urllib2.urlopen(target_url) 12 | 13 | 14 | #arrange data into list for labels and list of lists for attributes 15 | xList = [] 16 | 17 | 18 | for line in data: 19 | #split on comma 20 | row = line.strip().split(",") 21 | xList.append(row) 22 | 23 | #separate labels from attributes, convert from attributes from string to numeric and convert "M" to 1 and "R" to 0 24 | 25 | xNum = [] 26 | labels = [] 27 | 28 | for row in xList: 29 | lastCol = row.pop() 30 | if lastCol == "M": 31 | labels.append(1.0) 32 | else: 33 | labels.append(0.0) 34 | attrRow = [float(elt) for elt in row] 35 | xNum.append(attrRow) 36 | 37 | #number of rows and columns in x matrix 38 | nrow = len(xNum) 39 | ncol = len(xNum[1]) 40 | 41 | alpha = 1.0 42 | 43 | #calculate means and variances 44 | xMeans = [] 45 | xSD = [] 46 | for i in range(ncol): 47 | col = [xNum[j][i] for j in range(nrow)] 48 | mean = sum(col)/nrow 49 | xMeans.append(mean) 50 | colDiff = [(xNum[j][i] - mean) for j in range(nrow)] 51 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrow)]) 52 | stdDev = sqrt(sumSq/nrow) 53 | xSD.append(stdDev) 54 | 55 | #use calculate mean and standard deviation to normalize xNum 56 | xNormalized = [] 57 | for i in range(nrow): 58 | rowNormalized = [(xNum[i][j] - xMeans[j])/xSD[j] for j in range(ncol)] 59 | xNormalized.append(rowNormalized) 60 | 61 | #normalize labels to center 62 | #Normalize labels 63 | meanLabel = sum(labels)/nrow 64 | sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrow)])/nrow) 65 | 66 | labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrow)] 67 | 68 | 69 | #number of cross validation folds 70 | nxval = 10 71 | 72 | 73 | for ixval in range(nxval): 74 | #Define test and training index sets 75 | idxTest = [a for a in range(nrow) if a%nxval == ixval%nxval] 76 | idxTrain = [a for a in range(nrow) if a%nxval != ixval%nxval] 77 | 78 | #Define test and training attribute and label sets 79 | xTrain = numpy.array([xNormalized[r] for r in idxTrain]) 80 | xTest = numpy.array([xNormalized[r] for r in idxTest]) 81 | labelTrain = numpy.array([labelNormalized[r] for r in idxTrain]) 82 | labelTest = numpy.array([labelNormalized[r] for r in idxTest]) 83 | alphas, coefs, _ = enet_path(xTrain, labelTrain,l1_ratio=0.8, fit_intercept=False, return_models=False) 84 | 85 | #apply coefs to test data to produce predictions and accumulate 86 | if ixval == 0: 87 | pred = numpy.dot(xTest, coefs) 88 | yOut = labelTest 89 | else: 90 | #accumulate predictions 91 | yTemp = numpy.array(yOut) 92 | yOut = numpy.concatenate((yTemp, labelTest), axis=0) 93 | 94 | #accumulate predictions 95 | predTemp = numpy.array(pred) 96 | pred = numpy.concatenate((predTemp, numpy.dot(xTest, coefs)), axis = 0) 97 | 98 | 99 | 
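#Note: the labels were normalized to zero mean, so mines ("M") map to positive values and rocks ("R")
#to negative values; a held-out prediction is counted as an error below when it falls on the opposite
#side of 0.0 from its label. Each column of pred corresponds to one alpha value along the enet path.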
#calculate miss classification error 100 | misClassRate = [] 101 | _,nPred = pred.shape 102 | for iPred in range(1, nPred): 103 | predList = list(pred[:, iPred]) 104 | errCnt = 0.0 105 | for irow in range(nrow): 106 | if (predList[irow] < 0.0) and (yOut[irow] >= 0.0): 107 | errCnt += 1.0 108 | elif (predList[irow] >= 0.0) and (yOut[irow] < 0.0): 109 | errCnt += 1.0 110 | misClassRate.append(errCnt/nrow) 111 | 112 | #find minimum point for plot and for print 113 | minError = min(misClassRate) 114 | idxMin = misClassRate.index(minError) 115 | plotAlphas = list(alphas[1:len(alphas)]) 116 | 117 | plot.figure() 118 | plot.plot(plotAlphas, misClassRate, label='Misclassification Error Across Folds', linewidth=2) 119 | plot.axvline(plotAlphas[idxMin], linestyle='--', 120 | label='CV Estimate of Best alpha') 121 | plot.legend() 122 | plot.semilogx() 123 | ax = plot.gca() 124 | ax.invert_xaxis() 125 | plot.xlabel('alpha') 126 | plot.ylabel('Misclassification Error') 127 | plot.axis('tight') 128 | plot.show() 129 | 130 | 131 | 132 | #calculate AUC. 133 | idxPos = [i for i in range(nrow) if yOut[i] > 0.0] 134 | yOutBin = [0] * nrow 135 | for i in idxPos: yOutBin[i] = 1 136 | 137 | auc = [] 138 | for iPred in range(1, nPred): 139 | predList = list(pred[:, iPred]) 140 | aucCalc = roc_auc_score(yOutBin, predList) 141 | auc.append(aucCalc) 142 | 143 | maxAUC = max(auc) 144 | idxMax = auc.index(maxAUC) 145 | 146 | plot.figure() 147 | plot.plot(plotAlphas, auc, label='AUC Across Folds', linewidth=2) 148 | plot.axvline(plotAlphas[idxMax], linestyle='--', 149 | label='CV Estimate of Best alpha') 150 | plot.legend() 151 | plot.semilogx() 152 | ax = plot.gca() 153 | ax.invert_xaxis() 154 | plot.xlabel('alpha') 155 | plot.ylabel('Area Under the ROC Curve') 156 | plot.axis('tight') 157 | plot.show() 158 | 159 | 160 | #plot best version of ROC curve 161 | fpr, tpr, thresh = roc_curve(yOutBin, list(pred[:, idxMax])) 162 | ctClass = [i*0.01 for i in range(101)] 163 | 164 | plot.plot(fpr, tpr, linewidth=2) 165 | plot.plot(ctClass, ctClass, linestyle=':') 166 | plot.xlabel('False Positive Rate') 167 | plot.ylabel('True Positive Rate') 168 | plot.show() 169 | 170 | print('Best Value of Misclassification Error = ', misClassRate[idxMin]) 171 | print('Best alpha for Misclassification Error = ', plotAlphas[idxMin]) 172 | print('') 173 | print('Best Value for AUC = ', auc[idxMax]) 174 | print('Best alpha for AUC = ', plotAlphas[idxMax]) 175 | 176 | print('') 177 | print('Confusion Matrices for Different Threshold Values') 178 | 179 | #pick some points along the curve to print. There are 208 points. The extremes aren't useful 180 | #Sample at 52, 104 and 156. Use the calculated values of tpr and fpr along with definitions and 181 | #threshold values. 182 | #Some nomenclature (e.g. 
see wikkipedia "receiver operating curve") 183 | 184 | 185 | #P = Positive cases 186 | P = len(idxPos) 187 | #N = Negative cases 188 | N = nrow - P 189 | #TP = True positives = tpr * P 190 | TP = tpr[52] * P 191 | #FN = False negatives = P - TP 192 | FN = P - TP 193 | #FP = False positives = fpr * N 194 | FP = fpr[52] * N 195 | #TN = True negatives = N - FP 196 | TN = N - FP 197 | 198 | print('Threshold Value = ', thresh[52]) 199 | print('TP = ', TP, 'FP = ', FP) 200 | print('FN = ', FN, 'TN = ', TN) 201 | 202 | TP = tpr[104] * P; FN = P - TP; FP = fpr[104] * N; TN = N - FP 203 | 204 | print('Threshold Value = ', thresh[104]) 205 | print('TP = ', TP, 'FP = ', FP) 206 | print('FN = ', FN, 'TN = ', TN) 207 | 208 | TP = tpr[156] * P; FN = P - TP; FP = fpr[156] * N; TN = N - FP 209 | 210 | print('Threshold Value = ', thresh[156]) 211 | print('TP = ', TP, 'FP = ', FP) 212 | print('FN = ', FN, 'TN = ', TN) -------------------------------------------------------------------------------- /05/rocksVMines/rocksVMinesENetRegCVPrintedOutput.txt: -------------------------------------------------------------------------------- 1 | ('Best Value of Misclassification Error = ', 0.22115384615384615) 2 | ('Best alpha for Misclassification Error = ', 0.017686244720179375) 3 | 4 | ('Best Value for AUC = ', 0.86867279650784812) 5 | ('Best alpha for AUC = ', 0.020334883589342503) 6 | 7 | Confusion Matrices for Different Threshold Values 8 | ('Threshold Value = ', 0.37952298245219962) 9 | ('TP = ', 48.0, 'FP = ', 5.0) 10 | ('FN = ', 63.0, 'TN = ', 92.0) 11 | ('Threshold Value = ', -0.045503481125357965) 12 | ('TP = ', 85.0, 'FP = ', 20.0) 13 | ('FN = ', 26.0, 'TN = ', 77.0) 14 | ('Threshold Value = ', -0.4272522354395466) 15 | ('TP = ', 107.0, 'FP = ', 49.999999999999993) 16 | ('FN = ', 4.0, 'TN = ', 47.000000000000007) 17 | 18 | -------------------------------------------------------------------------------- /05/rocksVMines/rocksVMinesGlmnet.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | import urllib2 3 | import sys 4 | from math import sqrt, fabs, exp 5 | import matplotlib.pyplot as plot 6 | 7 | def S(z,gamma): 8 | if gamma >= fabs(z): 9 | return 0.0 10 | if z > 0.0: 11 | return z - gamma 12 | else: 13 | return z + gamma 14 | 15 | def Pr(b0,b,x): 16 | n = len(x) 17 | sum = b0 18 | for i in range(n): 19 | sum += b[i]*x[i] 20 | if sum < -100: sum = -100 21 | return 1.0/(1.0 + exp(-sum)) 22 | 23 | 24 | #read data from uci data repository 25 | target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data" 26 | data = urllib2.urlopen(target_url) 27 | 28 | 29 | #arrange data into list for labels and list of lists for attributes 30 | xList = [] 31 | 32 | 33 | for line in data: 34 | #split on comma 35 | row = line.strip().split(",") 36 | xList.append(row) 37 | 38 | #separate labels from attributes, convert from attributes from string to numeric and convert "M" to 1 and "R" to 0 39 | 40 | xNum = [] 41 | labels = [] 42 | 43 | for row in xList: 44 | lastCol = row.pop() 45 | if lastCol == "M": 46 | labels.append(1.0) 47 | else: 48 | labels.append(0.0) 49 | attrRow = [float(elt) for elt in row] 50 | xNum.append(attrRow) 51 | 52 | #number of rows and columns in x matrix 53 | nrow = len(xNum) 54 | ncol = len(xNum[1]) 55 | 56 | alpha = 0.8 57 | #calculate means and variances 58 | xMeans = [] 59 | xSD = [] 60 | for i in range(ncol): 61 | col = [xNum[j][i] for j in range(nrow)] 62 | mean = 
sum(col)/nrow 63 | xMeans.append(mean) 64 | colDiff = [(xNum[j][i] - mean) for j in range(nrow)] 65 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrow)]) 66 | stdDev = sqrt(sumSq/nrow) 67 | xSD.append(stdDev) 68 | 69 | #use calculate mean and standard deviation to normalize xNum 70 | xNormalized = [] 71 | for i in range(nrow): 72 | rowNormalized = [(xNum[i][j] - xMeans[j])/xSD[j] for j in range(ncol)] 73 | xNormalized.append(rowNormalized) 74 | 75 | #Do Not Normalize labels but do calculate averages 76 | meanLabel = sum(labels)/nrow 77 | sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrow)])/nrow) 78 | 79 | #initialize probabilities and weights 80 | sumWxr = [0.0] * ncol 81 | sumWxx = [0.0] * ncol 82 | sumWr = 0.0 83 | sumW = 0.0 84 | 85 | #calculate starting points for betas 86 | for iRow in range(nrow): 87 | p = meanLabel 88 | w = p * (1.0 - p) 89 | #residual for logistic 90 | r = (labels[iRow] - p) / w 91 | x = xNormalized[iRow] 92 | sumWxr = [sumWxr[i] + w * x[i] * r for i in range(ncol)] 93 | sumWxx = [sumWxx[i] + w * x[i] * x[i] for i in range(ncol)] 94 | sumWr = sumWr + w * r 95 | sumW = sumW + w 96 | 97 | avgWxr = [sumWxr[i]/nrow for i in range(ncol)] 98 | avgWxx = [sumWxx[i]/nrow for i in range(ncol)] 99 | 100 | maxWxr = 0.0 101 | for i in range(ncol): 102 | val = abs(avgWxr[i]) 103 | if val > maxWxr: 104 | maxWxr = val 105 | 106 | #calculate starting value for lambda 107 | lam = maxWxr/alpha 108 | 109 | #this value of lambda corresponds to beta = list of 0's 110 | #initialize a vector of coefficients beta 111 | beta = [0.0] * ncol 112 | beta0 = sumWr/sumW 113 | 114 | #initialize matrix of betas at each step 115 | betaMat = [] 116 | betaMat.append(list(beta)) 117 | 118 | beta0List = [] 119 | beta0List.append(beta0) 120 | 121 | #begin iteration 122 | nSteps = 100 123 | lamMult = 0.93 #100 steps gives reduction by factor of 1000 in lambda (recommended by authors) 124 | nzList = [] 125 | for iStep in range(nSteps): 126 | #decrease lambda 127 | lam = lam * lamMult 128 | 129 | 130 | #Use incremental change in betas to control inner iteration 131 | 132 | 133 | #set middle loop values for betas = to outer values 134 | # values are used for calculating weights and probabilities 135 | #inner values are used for calculating penalized regression updates 136 | 137 | #take pass through data to calculate averages over data require for iteration 138 | #initilize accumulators 139 | 140 | betaIRLS = list(beta) 141 | beta0IRLS = beta0 142 | distIRLS = 100.0 143 | #Middle loop to calculate new betas with fixed IRLS weights and probabilities 144 | iterIRLS = 0 145 | while distIRLS > 0.01: 146 | iterIRLS += 1 147 | iterInner = 0.0 148 | 149 | betaInner = list(betaIRLS) 150 | beta0Inner = beta0IRLS 151 | distInner = 100.0 152 | while distInner > 0.01: 153 | iterInner += 1 154 | if iterInner > 100: break 155 | 156 | #cycle through attributes and update one-at-a-time 157 | #record starting value for comparison 158 | betaStart = list(betaInner) 159 | for iCol in range(ncol): 160 | 161 | sumWxr = 0.0 162 | sumWxx = 0.0 163 | sumWr = 0.0 164 | sumW = 0.0 165 | 166 | for iRow in range(nrow): 167 | x = list(xNormalized[iRow]) 168 | y = labels[iRow] 169 | p = Pr(beta0IRLS, betaIRLS, x) 170 | if abs(p) < 1e-5: 171 | p = 0.0 172 | w = 1e-5 173 | elif abs(1.0 - p) < 1e-5: 174 | p = 1.0 175 | w = 1e-5 176 | else: 177 | w = p * (1.0 - p) 178 | 179 | z = (y - p) / w + beta0IRLS + sum([x[i] * betaIRLS[i] for i in range(ncol)]) 180 | r = z - beta0Inner - sum([x[i] * 
betaInner[i] for i in range(ncol)]) 181 | sumWxr += w * x[iCol] * r 182 | sumWxx += w * x[iCol] * x[iCol] 183 | sumWr += w * r 184 | sumW += w 185 | 186 | avgWxr = sumWxr / nrow 187 | avgWxx = sumWxx / nrow 188 | 189 | beta0Inner = beta0Inner + sumWr / sumW 190 | uncBeta = avgWxr + avgWxx * betaInner[iCol] 191 | betaInner[iCol] = S(uncBeta, lam * alpha) / (avgWxx + lam * (1.0 - alpha)) 192 | 193 | sumDiff = sum([abs(betaInner[n] - betaStart[n]) for n in range(ncol)]) 194 | sumBeta = sum([abs(betaInner[n]) for n in range(ncol)]) 195 | distInner = sumDiff/sumBeta 196 | #print number of steps for inner and middle loop convergence to monitor behavior 197 | #print(iStep, iterIRLS, iterInner) 198 | 199 | #if exit inner while loop, then set betaMiddle = betaMiddle and run through middle loop again. 200 | 201 | #Check change in betaMiddle to see if IRLS is converged 202 | a = sum([abs(betaIRLS[i] - betaInner[i]) for i in range(ncol)]) 203 | b = sum([abs(betaIRLS[i]) for i in range(ncol)]) 204 | distIRLS = a / (b + 0.0001) 205 | dBeta = [betaInner[i] - betaIRLS[i] for i in range(ncol)] 206 | gradStep = 1.0 207 | temp = [betaIRLS[i] + gradStep * dBeta[i] for i in range(ncol)] 208 | betaIRLS = list(temp) 209 | 210 | beta = list(betaIRLS) 211 | beta0 = beta0IRLS 212 | betaMat.append(list(beta)) 213 | beta0List.append(beta0) 214 | 215 | nzBeta = [index for index in range(ncol) if beta[index] != 0.0] 216 | for q in nzBeta: 217 | if not(q in nzList): 218 | nzList.append(q) 219 | 220 | #make up names for columns of xNum 221 | names = ['V' + str(i) for i in range(ncol)] 222 | nameList = [names[nzList[i]] for i in range(len(nzList))] 223 | 224 | print("Attributes Ordered by How Early They Enter the Model") 225 | print(nameList) 226 | for i in range(ncol): 227 | #plot range of beta values for each attribute 228 | coefCurve = [betaMat[k][i] for k in range(nSteps)] 229 | xaxis = range(nSteps) 230 | plot.plot(xaxis, coefCurve) 231 | 232 | plot.xlabel("Steps Taken") 233 | plot.ylabel("Coefficient Values") 234 | plot.show() -------------------------------------------------------------------------------- /05/rocksVMines/rocksVMinesGlmnetPrintedOutput.txt: -------------------------------------------------------------------------------- 1 | Attributes Ordered by How Early They Enter the Model 2 | ['V10', 'V48', 'V11', 'V44', 'V35', 'V51', 'V20', 'V3', 'V50', 'V21', 'V43', 'V47', 'V15', 'V27', 'V0', 'V22', 'V36', 'V30', 'V53', 'V56', 'V58', 'V6', 'V19', 'V28', 'V39', 'V49', 'V7', 'V23', 'V54', 'V8', 'V14', 'V2', 'V29', 'V38', 'V57', 'V45', 'V13', 'V32', 'V31', 'V42', 'V16', 'V37', 'V59', 'V52', 'V25', 'V18', 'V1', 'V33', 'V4', 'V55', 'V17', 'V46', 'V26', 'V12', 'V40', 'V34', 'V5', 'V24', 'V41', 'V9'] 3 | 4 | -------------------------------------------------------------------------------- /05/wineCS/wineExpandedLassoCV.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | from sklearn import datasets, linear_model 6 | from sklearn.linear_model import LassoCV 7 | from math import sqrt 8 | import matplotlib.pyplot as plot 9 | 10 | #read data into iterable 11 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 12 | data = urllib2.urlopen(target_url) 13 | 14 | xList = [] 15 | labels = [] 16 | names = [] 17 | firstLine = True 18 | for line in data: 19 | if firstLine: 20 | names = line.strip().split(";") 21 | firstLine = False 22 | else: 23 | #split on semi-colon 24 
| row = line.strip().split(";") 25 | #put labels in separate array 26 | labels.append(float(row[-1])) 27 | #remove label from row 28 | row.pop() 29 | #convert row to floats 30 | floatRow = [float(num) for num in row] 31 | xList.append(floatRow) 32 | 33 | #append square of last term (alcohol) 34 | 35 | for i in range(len(xList)): 36 | alcElt = xList[i][-1] 37 | volAcid = xList[i][1] 38 | temp = list(xList[i]) 39 | temp.append(alcElt*alcElt) 40 | temp.append(alcElt*volAcid) 41 | xList[i] = list(temp) 42 | 43 | #add new name to variable list 44 | names[-1] = "alco^2" 45 | names.append("alco*volAcid") 46 | 47 | #Normalize columns in x and labels 48 | #Note: be careful about normalization. Some penalized regression packages include it 49 | #and some don't. 50 | 51 | nrows = len(xList) 52 | ncols = len(xList[0]) 53 | 54 | #calculate means and variances 55 | xMeans = [] 56 | xSD = [] 57 | for i in range(ncols): 58 | col = [xList[j][i] for j in range(nrows)] 59 | mean = sum(col)/nrows 60 | xMeans.append(mean) 61 | colDiff = [(xList[j][i] - mean) for j in range(nrows)] 62 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrows)]) 63 | stdDev = sqrt(sumSq/nrows) 64 | xSD.append(stdDev) 65 | 66 | #use calculate mean and standard deviation to normalize xList 67 | xNormalized = [] 68 | for i in range(nrows): 69 | rowNormalized = [(xList[i][j] - xMeans[j])/xSD[j] for j in range(ncols)] 70 | xNormalized.append(rowNormalized) 71 | 72 | #Normalize labels 73 | meanLabel = sum(labels)/nrows 74 | sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrows)])/nrows) 75 | 76 | labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrows)] 77 | 78 | #Convert list of list to np array for input to sklearn packages 79 | 80 | #Unnormalized labels 81 | Y = numpy.array(labels) 82 | 83 | #normalized lables 84 | #Y = numpy.array(labelNormalized) 85 | 86 | #Unnormalized X's 87 | X = numpy.array(xList) 88 | 89 | #Normlized Xss 90 | X = numpy.array(xNormalized) 91 | 92 | #Call LassoCV from sklearn.linear_model 93 | wineModel = LassoCV(cv=10).fit(X, Y) 94 | 95 | # Display results 96 | 97 | 98 | plot.figure() 99 | plot.plot(wineModel.alphas_, wineModel.mse_path_, ':') 100 | plot.plot(wineModel.alphas_, wineModel.mse_path_.mean(axis=-1), 101 | label='Average MSE Across Folds', linewidth=2) 102 | plot.axvline(wineModel.alpha_, linestyle='--', 103 | label='CV Estimate of Best alpha') 104 | plot.semilogx() 105 | plot.legend() 106 | ax = plot.gca() 107 | ax.invert_xaxis() 108 | plot.xlabel('alpha') 109 | plot.ylabel('Mean Square Error') 110 | plot.axis('tight') 111 | plot.show() 112 | 113 | #print out the value of alpha that minimizes the Cv-error 114 | print("alpha Value that Minimizes CV Error ",wineModel.alpha_) 115 | print("Minimum MSE ", min(wineModel.mse_path_.mean(axis=-1))) 116 | -------------------------------------------------------------------------------- /05/wineCS/wineLassoCV.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | from sklearn import datasets, linear_model 6 | from sklearn.linear_model import LassoCV 7 | from math import sqrt 8 | import matplotlib.pyplot as plot 9 | 10 | #read data into iterable 11 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 12 | data = urllib2.urlopen(target_url) 13 | 14 | xList = [] 15 | labels = [] 16 | names = [] 17 | firstLine = True 18 | for line in data: 19 | if 
firstLine: 20 | names = line.strip().split(";") 21 | firstLine = False 22 | else: 23 | #split on semi-colon 24 | row = line.strip().split(";") 25 | #put labels in separate array 26 | labels.append(float(row[-1])) 27 | #remove label from row 28 | row.pop() 29 | #convert row to floats 30 | floatRow = [float(num) for num in row] 31 | xList.append(floatRow) 32 | 33 | #Normalize columns in x and labels 34 | #Note: be careful about normalization. Some penalized regression packages include it 35 | #and some don't. 36 | 37 | nrows = len(xList) 38 | ncols = len(xList[0]) 39 | 40 | #calculate means and variances 41 | xMeans = [] 42 | xSD = [] 43 | for i in range(ncols): 44 | col = [xList[j][i] for j in range(nrows)] 45 | mean = sum(col)/nrows 46 | xMeans.append(mean) 47 | colDiff = [(xList[j][i] - mean) for j in range(nrows)] 48 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrows)]) 49 | stdDev = sqrt(sumSq/nrows) 50 | xSD.append(stdDev) 51 | 52 | #use calculate mean and standard deviation to normalize xList 53 | xNormalized = [] 54 | for i in range(nrows): 55 | rowNormalized = [(xList[i][j] - xMeans[j])/xSD[j] for j in range(ncols)] 56 | xNormalized.append(rowNormalized) 57 | 58 | #Normalize labels 59 | meanLabel = sum(labels)/nrows 60 | sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrows)])/nrows) 61 | 62 | labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrows)] 63 | 64 | #Convert list of list to np array for input to sklearn packages 65 | 66 | #Unnormalized labels 67 | Y = numpy.array(labels) 68 | 69 | #normalized lables 70 | Y = numpy.array(labelNormalized) 71 | 72 | #Unnormalized X's 73 | X = numpy.array(xList) 74 | 75 | #Normlized Xss 76 | X = numpy.array(xNormalized) 77 | 78 | #Call LassoCV from sklearn.linear_model 79 | wineModel = LassoCV(cv=10).fit(X, Y) 80 | 81 | # Display results 82 | 83 | 84 | plot.figure() 85 | plot.plot(wineModel.alphas_, wineModel.mse_path_, ':') 86 | plot.plot(wineModel.alphas_, wineModel.mse_path_.mean(axis=-1), 87 | label='Average MSE Across Folds', linewidth=2) 88 | plot.axvline(wineModel.alpha_, linestyle='--', 89 | label='CV Estimate of Best alpha') 90 | plot.semilogx() 91 | plot.legend() 92 | ax = plot.gca() 93 | ax.invert_xaxis() 94 | plot.xlabel('alpha') 95 | plot.ylabel('Mean Square Error') 96 | plot.axis('tight') 97 | plot.show() 98 | 99 | #print out the value of alpha that minimizes the Cv-error 100 | print("alpha Value that Minimizes CV Error ",wineModel.alpha_) 101 | print("Minimum MSE ", min(wineModel.mse_path_.mean(axis=-1))) -------------------------------------------------------------------------------- /05/wineCS/wineLassoCVPrintedOutputNormalizedX.txt: -------------------------------------------------------------------------------- 1 | ('alpha Value that Minimizes CV Error ', 0.010948337166040082) 2 | ('Minimum MSE ', 0.433801987153697) 3 | -------------------------------------------------------------------------------- /05/wineCS/wineLassoCVPrintedOutputNormalizedXandY.txt: -------------------------------------------------------------------------------- 1 | ('alpha Value that Minimizes CV Error ', 0.013561387700964642) 2 | ('Minimum MSE ', 0.66558492060028562) 3 | -------------------------------------------------------------------------------- /05/wineCS/wineLassoCVPrintedOutputUn-NormalizedX.txt: -------------------------------------------------------------------------------- 1 | ('alpha Value that Minimizes CV Error ', 0.0052692947038249062) 2 | ('Minimum MSE ', 
0.43936035436777832) 3 | -------------------------------------------------------------------------------- /05/wineCS/wineLassoCoefCurves.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | from sklearn import datasets, linear_model 6 | from sklearn.linear_model import LassoCV 7 | from math import sqrt 8 | import matplotlib.pyplot as plot 9 | 10 | #read data into iterable 11 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 12 | data = urllib2.urlopen(target_url) 13 | 14 | xList = [] 15 | labels = [] 16 | names = [] 17 | firstLine = True 18 | for line in data: 19 | if firstLine: 20 | names = line.strip().split(";") 21 | firstLine = False 22 | else: 23 | #split on semi-colon 24 | row = line.strip().split(";") 25 | #put labels in separate array 26 | labels.append(float(row[-1])) 27 | #remove label from row 28 | row.pop() 29 | #convert row to floats 30 | floatRow = [float(num) for num in row] 31 | xList.append(floatRow) 32 | 33 | #Normalize columns in x and labels 34 | #Note: be careful about normalization. Some penalized regression packages include it 35 | #and some don't. 36 | 37 | nrows = len(xList) 38 | ncols = len(xList[0]) 39 | 40 | #calculate means and variances 41 | xMeans = [] 42 | xSD = [] 43 | for i in range(ncols): 44 | col = [xList[j][i] for j in range(nrows)] 45 | mean = sum(col)/nrows 46 | xMeans.append(mean) 47 | colDiff = [(xList[j][i] - mean) for j in range(nrows)] 48 | sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrows)]) 49 | stdDev = sqrt(sumSq/nrows) 50 | xSD.append(stdDev) 51 | 52 | #use calculate mean and standard deviation to normalize xList 53 | xNormalized = [] 54 | for i in range(nrows): 55 | rowNormalized = [(xList[i][j] - xMeans[j])/xSD[j] for j in range(ncols)] 56 | xNormalized.append(rowNormalized) 57 | 58 | #Normalize labels 59 | meanLabel = sum(labels)/nrows 60 | sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrows)])/nrows) 61 | 62 | labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrows)] 63 | 64 | #Convert list of list to np array for input to sklearn packages 65 | 66 | #Unnormalized labels 67 | Y = numpy.array(labels) 68 | 69 | #normalized lables 70 | Y = numpy.array(labelNormalized) 71 | 72 | #Unnormalized X's 73 | X = numpy.array(xList) 74 | 75 | #Normlized Xss 76 | X = numpy.array(xNormalized) 77 | 78 | alphas, coefs, _ = linear_model.lasso_path(X, Y, return_models=False) 79 | 80 | 81 | plot.plot(alphas,coefs.T) 82 | 83 | plot.xlabel('alpha') 84 | plot.ylabel('Coefficients') 85 | plot.axis('tight') 86 | plot.semilogx() 87 | ax = plot.gca() 88 | ax.invert_xaxis() 89 | plot.show() 90 | 91 | nattr, nalpha = coefs.shape 92 | 93 | #find coefficient ordering 94 | nzList = [] 95 | for iAlpha in range(1,nalpha): 96 | coefList = list(coefs[: ,iAlpha]) 97 | nzCoef = [index for index in range(nattr) if coefList[index] != 0.0] 98 | for q in nzCoef: 99 | if not(q in nzList): 100 | nzList.append(q) 101 | 102 | nameList = [names[nzList[i]] for i in range(len(nzList))] 103 | print("Attributes Ordered by How Early They Enter the Model", nameList) 104 | 105 | #find coefficients corresponding to best alpha value. 
alpha value corresponding to 106 | #normalized X and normalized Y is 0.013561387700964642 107 | 108 | alphaStar = 0.013561387700964642 109 | indexLTalphaStar = [index for index in range(100) if alphas[index] > alphaStar] 110 | indexStar = max(indexLTalphaStar) 111 | 112 | #here's the set of coefficients to deploy 113 | coefStar = list(coefs[:,indexStar]) 114 | print("Best Coefficient Values ", coefStar) 115 | 116 | #The coefficients on normalized attributes give another slightly different ordering 117 | 118 | absCoef = [abs(a) for a in coefStar] 119 | 120 | #sort by magnitude 121 | coefSorted = sorted(absCoef, reverse=True) 122 | 123 | idxCoefSize = [absCoef.index(a) for a in coefSorted if not(a == 0.0)] 124 | 125 | namesList2 = [names[idxCoefSize[i]] for i in range(len(idxCoefSize))] 126 | 127 | print("Attributes Ordered by Coef Size at Optimum alpha", namesList2) -------------------------------------------------------------------------------- /05/wineCS/wineLassoCoefCurvesPrintedOutput.txt: -------------------------------------------------------------------------------- 1 | ('Attributes Ordered by How Early They Enter the Model', ['"alcohol"', '"volatile acidity"', '"sulphates"', '"total sulfur dioxide"', '"chlorides"', '"fixed acidity"', '"pH"', '"free sulfur dioxide"', '"residual sugar"', '"citric acid"', '"density"']) 2 | 3 | ('Best Coefficient Values ', [0.0, -0.22773815784738916, -0.0, 0.0, -0.094239023363375404, 0.022151948563542922, -0.099036391332770576, -0.0, -0.067873612822590218, 0.16804102141830754, 0.37509573430881538]) 4 | 5 | ('Attributes Ordered by Coef Size at Optimum alpha', ['"alcohol"', '"volatile acidity"', '"sulphates"', '"total sulfur dioxide"', '"chlorides"', '"pH"', '"free sulfur dioxide"']) 6 | 7 | 8 | Values with Un-normalized X: 9 | ('Attributes Ordered by How Early They Enter the Model', ['"total sulfur dioxide"', '"free sulfur dioxide"', '"alcohol"', '"fixed acidity"', '"volatile acidity"', '"sulphates"']) 10 | 11 | ('Best Coefficient Values ', [0.044339055570034182, -1.0154179864549988, 0.0, 0.0, -0.0, 0.0064112885435006822, -0.0038622920281433199, -0.0, -0.0, 0.41982634135945091, 0.37812720947996975]) 12 | 13 | ('Attributes Ordered by Coef Size at Optimum alpha', ['"volatile acidity"', '"sulphates"', '"alcohol"', '"fixed acidity"', '"free sulfur dioxide"', '"total sulfur dioxide"']) 14 | 15 | 16 | -------------------------------------------------------------------------------- /05/wineCS/wineLassoExpandedCVPrintedOutput.txt: -------------------------------------------------------------------------------- 1 | ('alpha Value that Minimizes CV Error ', 0.016640498998569835) 2 | ('Minimum MSE ', 0.43452874043020256) 3 | -------------------------------------------------------------------------------- /06/chapter06.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/derekhe/machine-learning-in-python-essential-techniques-for-predictive-analysis-source/d91f60cc29fdbaad32819058f9b2742e955e586a/06/chapter06.zip -------------------------------------------------------------------------------- /06/simpleBagging.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import numpy 4 | import matplotlib.pyplot as plot 5 | from sklearn import tree 6 | from sklearn.tree import DecisionTreeRegressor 7 | from math import floor 8 | import random 9 | 10 | 11 | #Build a simple data set with y = x + random 12 | nPoints = 1000 13 | 14 | #x values 
for plotting 15 | xPlot = [(float(i)/float(nPoints) - 0.5) for i in range(nPoints + 1)] 16 | 17 | #x needs to be list of lists. 18 | x = [[s] for s in xPlot] 19 | 20 | #y (labels) has random noise added to x-value 21 | #set seed 22 | random.seed(1) 23 | y = [s + numpy.random.normal(scale=0.1) for s in xPlot] 24 | 25 | #take fixed test set 30% of sample 26 | nSample = int(nPoints * 0.30) 27 | idxTest = random.sample(range(nPoints), nSample) 28 | idxTest.sort() 29 | idxTrain = [idx for idx in range(nPoints) if not(idx in idxTest)] 30 | 31 | #Define test and training attribute and label sets 32 | xTrain = [x[r] for r in idxTrain] 33 | xTest = [x[r] for r in idxTest] 34 | yTrain = [y[r] for r in idxTrain] 35 | yTest = [y[r] for r in idxTest] 36 | 37 | #train a series of models on random subsets of the training data 38 | #collect the models in a list and check error of composite as list grows 39 | 40 | #maximum number of models to generate 41 | numTreesMax = 20 42 | 43 | #tree depth - typically at the high end 44 | treeDepth = 1 45 | 46 | #initialize a list to hold models 47 | modelList = [] 48 | predList = [] 49 | 50 | #number of samples to draw for stochastic bagging 51 | nBagSamples = int(len(xTrain) * 0.5) 52 | 53 | for iTrees in range(numTreesMax): 54 | idxBag = [] 55 | for i in range(nBagSamples): 56 | idxBag.append(random.choice(range(len(xTrain)))) 57 | xTrainBag = [xTrain[i] for i in idxBag] 58 | yTrainBag = [yTrain[i] for i in idxBag] 59 | 60 | modelList.append(DecisionTreeRegressor(max_depth=treeDepth)) 61 | modelList[-1].fit(xTrainBag, yTrainBag) 62 | 63 | #make prediction with latest model and add to list of predictions 64 | latestPrediction = modelList[-1].predict(xTest) 65 | predList.append(list(latestPrediction)) 66 | 67 | 68 | #build cumulative prediction from first "n" models 69 | mse = [] 70 | allPredictions = [] 71 | for iModels in range(len(modelList)): 72 | 73 | #average first "iModels" of the predictions 74 | prediction = [] 75 | for iPred in range(len(xTest)): 76 | prediction.append(sum([predList[i][iPred] for i in range(iModels + 1)])/(iModels + 1)) 77 | 78 | allPredictions.append(prediction) 79 | errors = [(yTest[i] - prediction[i]) for i in range(len(yTest))] 80 | mse.append(sum([e * e for e in errors]) / len(yTest)) 81 | 82 | 83 | nModels = [i + 1 for i in range(len(modelList))] 84 | 85 | plot.plot(nModels,mse) 86 | plot.axis('tight') 87 | plot.xlabel('Number of Models in Ensemble') 88 | plot.ylabel('Mean Squared Error') 89 | plot.ylim((0.0, max(mse))) 90 | plot.show() 91 | 92 | plotList = [0, 9, 19] 93 | for iPlot in plotList: 94 | plot.plot(xTest, allPredictions[iPlot]) 95 | plot.plot(xTest, yTest, linestyle="--") 96 | plot.axis('tight') 97 | plot.xlabel('x value') 98 | plot.ylabel('Predictions') 99 | plot.show() 100 | 101 | print('Minimum MSE') 102 | print(min(mse)) 103 | 104 | 105 | #With treeDepth = 1 106 | #Minimum MSE 107 | #0.0242960117899 108 | 109 | 110 | 111 | #With treeDepth = 5 112 | #Minimum MSE 113 | #0.0118893503384 -------------------------------------------------------------------------------- /06/simpleGBM.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import numpy 4 | import matplotlib.pyplot as plot 5 | from sklearn import tree 6 | from sklearn.tree import DecisionTreeRegressor 7 | from math import floor 8 | import random 9 | 10 | #Build a simple data set with y = x + random 11 | nPoints = 1000 12 | 13 | #x values for plotting 14 | xPlot = [(float(i)/float(nPoints) - 
0.5) for i in range(nPoints + 1)] 15 | 16 | #x needs to be list of lists. 17 | x = [[s] for s in xPlot] 18 | 19 | #y (labels) has random noise added to x-value 20 | #set seed 21 | numpy.random.seed(1) 22 | y = [s + numpy.random.normal(scale=0.1) for s in xPlot] 23 | 24 | #take fixed test set 30% of sample 25 | nSample = int(nPoints * 0.30) 26 | idxTest = random.sample(range(nPoints), nSample) 27 | idxTest.sort() 28 | idxTrain = [idx for idx in range(nPoints) if not(idx in idxTest)] 29 | 30 | #Define test and training attribute and label sets 31 | xTrain = [x[r] for r in idxTrain] 32 | xTest = [x[r] for r in idxTest] 33 | yTrain = [y[r] for r in idxTrain] 34 | yTest = [y[r] for r in idxTest] 35 | 36 | #train a series of models on random subsets of the training data 37 | #collect the models in a list and check error of composite as list grows 38 | 39 | #maximum number of models to generate 40 | numTreesMax = 30 41 | 42 | #tree depth - typically at the high end 43 | treeDepth = 5 44 | 45 | #initialize a list to hold models 46 | modelList = [] 47 | predList = [] 48 | eps = 0.3 49 | 50 | #initialize residuals to be the labels y 51 | residuals = list(yTrain) 52 | 53 | for iTrees in range(numTreesMax): 54 | 55 | modelList.append(DecisionTreeRegressor(max_depth=treeDepth)) 56 | modelList[-1].fit(xTrain, residuals) 57 | 58 | #make prediction with latest model and add to list of predictions 59 | latestInSamplePrediction = modelList[-1].predict(xTrain) 60 | 61 | #use new predictions to update residuals 62 | residuals = [residuals[i] - eps * latestInSamplePrediction[i] for i in range(len(residuals))] 63 | 64 | latestOutSamplePrediction = modelList[-1].predict(xTest) 65 | predList.append(list(latestOutSamplePrediction)) 66 | 67 | 68 | #build cumulative prediction from first "n" models 69 | mse = [] 70 | allPredictions = [] 71 | for iModels in range(len(modelList)): 72 | 73 | #add the first "iModels" of the predictions and multiply by eps 74 | prediction = [] 75 | for iPred in range(len(xTest)): 76 | prediction.append(sum([predList[i][iPred] for i in range(iModels + 1)]) * eps) 77 | 78 | allPredictions.append(prediction) 79 | errors = [(yTest[i] - prediction[i]) for i in range(len(yTest))] 80 | mse.append(sum([e * e for e in errors]) / len(yTest)) 81 | 82 | 83 | nModels = [i + 1 for i in range(len(modelList))] 84 | 85 | plot.plot(nModels,mse) 86 | plot.axis('tight') 87 | plot.xlabel('Number of Models in Ensemble') 88 | plot.ylabel('Mean Squared Error') 89 | plot.ylim((0.0, max(mse))) 90 | plot.show() 91 | 92 | plotList = [0, 14, 29] 93 | lineType = [':', '-.', '--'] 94 | plot.figure() 95 | for i in range(len(plotList)): 96 | iPlot = plotList[i] 97 | textLegend = 'Prediction with ' + str(iPlot) + ' Trees' 98 | plot.plot(xTest, allPredictions[iPlot], label = textLegend, linestyle = lineType[i]) 99 | plot.plot(xTest, yTest, label='True y Value', alpha=0.25) 100 | plot.legend(bbox_to_anchor=(1,0.3)) 101 | plot.axis('tight') 102 | plot.xlabel('x value') 103 | plot.ylabel('Predictions') 104 | plot.show() 105 | 106 | -------------------------------------------------------------------------------- /06/simpleTree.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import numpy 4 | import matplotlib.pyplot as plot 5 | from sklearn import tree 6 | from sklearn.tree import DecisionTreeRegressor 7 | from sklearn.externals.six import StringIO 8 | 9 | #Build a simple data set with y = x + random 10 | nPoints = 100 11 | 12 | #x values for plotting 
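#(this grid runs from -0.5 to +0.5 in steps of 1/nPoints, giving nPoints + 1 evenly spaced x values)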
13 | xPlot = [(float(i)/float(nPoints) - 0.5) for i in range(nPoints + 1)] 14 | 15 | #x needs to be list of lists. 16 | x = [[s] for s in xPlot] 17 | 18 | #y (labels) has random noise added to x-value 19 | #set seed 20 | numpy.random.seed(1) 21 | y = [s + numpy.random.normal(scale=0.1) for s in xPlot] 22 | 23 | plot.plot(xPlot,y) 24 | plot.axis('tight') 25 | plot.xlabel('x') 26 | plot.ylabel('y') 27 | plot.show() 28 | 29 | simpleTree = DecisionTreeRegressor(max_depth=1) 30 | simpleTree.fit(x, y) 31 | 32 | #draw the tree 33 | with open("simpleTree.dot", 'w') as f: 34 | f = tree.export_graphviz(simpleTree, out_file=f) 35 | 36 | #compare prediction from tree with true values 37 | 38 | yHat = simpleTree.predict(x) 39 | 40 | plot.figure() 41 | plot.plot(xPlot, y, label='True y') 42 | plot.plot(xPlot, yHat, label='Tree Prediction ', linestyle='--') 43 | plot.legend(bbox_to_anchor=(1,0.2)) 44 | plot.axis('tight') 45 | plot.xlabel('x') 46 | plot.ylabel('y') 47 | plot.show() 48 | 49 | simpleTree2 = DecisionTreeRegressor(max_depth=2) 50 | simpleTree2.fit(x, y) 51 | 52 | #draw the tree 53 | with open("simpleTree2.dot", 'w') as f: 54 | f = tree.export_graphviz(simpleTree2, out_file=f) 55 | 56 | #compare prediction from tree with true values 57 | 58 | yHat = simpleTree2.predict(x) 59 | 60 | plot.figure() 61 | plot.plot(xPlot, y, label='True y') 62 | plot.plot(xPlot, yHat, label='Tree Prediction ', linestyle='--') 63 | plot.legend(bbox_to_anchor=(1,0.2)) 64 | plot.axis('tight') 65 | plot.xlabel('x') 66 | plot.ylabel('y') 67 | plot.show() 68 | 69 | #split point calculations - try every possible split point to find the best one 70 | sse = [] 71 | xMin = [] 72 | for i in range(1, len(xPlot)): 73 | #divide list into points on left and right of split point 74 | lhList = list(xPlot[0:i]) 75 | rhList = list(xPlot[i:len(xPlot)]) 76 | 77 | #calculate averages on each side 78 | lhAvg = sum(lhList) / len(lhList) 79 | rhAvg = sum(rhList) / len(rhList) 80 | 81 | #calculate sum square error on left, right and total 82 | lhSse = sum([(s - lhAvg) * (s - lhAvg) for s in lhList]) 83 | rhSse = sum([(s - rhAvg) * (s - rhAvg) for s in rhList]) 84 | 85 | #add sum of left and right to list of errors 86 | 87 | sse.append(lhSse + rhSse) 88 | xMin.append(max(lhList)) 89 | 90 | plot.plot(range(1, len(xPlot)), sse) 91 | plot.xlabel('Split Point Index') 92 | plot.ylabel('Sum Squared Error') 93 | plot.show() 94 | 95 | minSse = min(sse) 96 | idxMin = sse.index(minSse) 97 | print(xMin[idxMin]) 98 | 99 | #what happens if the depth is really high? 
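#(a depth-6 binary tree can have up to 2**6 = 64 leaf nodes, while this toy data set
# has only nPoints + 1 = 101 points, so the model below has enough capacity to chase
# the added noise; expect a much more jagged prediction curve than the depth-1 and
# depth-2 trees above, tracking individual noisy points rather than the y = x trend)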
100 | simpleTree6 = DecisionTreeRegressor(max_depth=6) 101 | simpleTree6.fit(x, y) 102 | 103 | #too many nodes to draw the tree 104 | #with open("simpleTree2.dot", 'w') as f: 105 | # f = tree.export_graphviz(simpleTree6, out_file=f) 106 | 107 | #compare prediction from tree with true values 108 | 109 | yHat = simpleTree6.predict(x) 110 | 111 | plot.figure() 112 | plot.plot(xPlot, y, label='True y') 113 | plot.plot(xPlot, yHat, label='Tree Prediction ', linestyle='--') 114 | plot.legend(bbox_to_anchor=(1,0.2)) 115 | plot.axis('tight') 116 | plot.xlabel('x') 117 | plot.ylabel('y') 118 | plot.show() 119 | 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /06/simpleTreeCV.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import numpy 4 | import matplotlib.pyplot as plot 5 | from sklearn import tree 6 | from sklearn.tree import DecisionTreeRegressor 7 | from sklearn.externals.six import StringIO 8 | 9 | #Build a simple data set with y = x + random 10 | nPoints = 100 11 | 12 | #x values for plotting 13 | xPlot = [(float(i)/float(nPoints) - 0.5) for i in range(nPoints + 1)] 14 | 15 | #x needs to be list of lists. 16 | x = [[s] for s in xPlot] 17 | 18 | #y (labels) has random noise added to x-value 19 | #set seed 20 | numpy.random.seed(1) 21 | y = [s + numpy.random.normal(scale=0.1) for s in xPlot] 22 | 23 | nrow = len(x) 24 | 25 | #fit trees with several different values for depth and use x-validation to see which works best. 26 | 27 | depthList = [1, 2, 3, 4, 5, 6, 7] 28 | xvalMSE = [] 29 | nxval = 10 30 | 31 | for iDepth in depthList: 32 | 33 | #build cross-validation loop to fit tree and evaluate on out of sample data 34 | for ixval in range(nxval): 35 | 36 | #Define test and training index sets 37 | idxTest = [a for a in range(nrow) if a%nxval == ixval%nxval] 38 | idxTrain = [a for a in range(nrow) if a%nxval != ixval%nxval] 39 | 40 | #Define test and training attribute and label sets 41 | xTrain = [x[r] for r in idxTrain] 42 | xTest = [x[r] for r in idxTest] 43 | yTrain = [y[r] for r in idxTrain] 44 | yTest = [y[r] for r in idxTest] 45 | 46 | #train tree of appropriate depth and accumulate out of sample (oos) errors 47 | treeModel = DecisionTreeRegressor(max_depth=iDepth) 48 | treeModel.fit(xTrain, yTrain) 49 | 50 | treePrediction = treeModel.predict(xTest) 51 | error = [yTest[r] - treePrediction[r] for r in range(len(yTest))] 52 | 53 | #accumulate squared errors 54 | if ixval == 0: 55 | oosErrors = sum([e * e for e in error]) 56 | else: 57 | #accumulate predictions 58 | oosErrors += sum([e * e for e in error]) 59 | 60 | #average the squared errors and accumulate by tree depth 61 | 62 | mse = oosErrors/nrow 63 | xvalMSE.append(mse) 64 | 65 | plot.plot(depthList, xvalMSE) 66 | plot.axis('tight') 67 | plot.xlabel('Tree Depth') 68 | plot.ylabel('Mean Squared Error') 69 | plot.show() 70 | -------------------------------------------------------------------------------- /06/wineBagging.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | from sklearn import tree 6 | from sklearn.tree import DecisionTreeRegressor 7 | import random 8 | from math import sqrt 9 | import matplotlib.pyplot as plot 10 | 11 | #read data into iterable 12 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 13 | data = 
urllib2.urlopen(target_url) 14 | 15 | xList = [] 16 | labels = [] 17 | names = [] 18 | firstLine = True 19 | for line in data: 20 | if firstLine: 21 | names = line.strip().split(";") 22 | firstLine = False 23 | else: 24 | #split on semi-colon 25 | row = line.strip().split(";") 26 | #put labels in separate array 27 | labels.append(float(row[-1])) 28 | #remove label from row 29 | row.pop() 30 | #convert row to floats 31 | floatRow = [float(num) for num in row] 32 | xList.append(floatRow) 33 | 34 | nrows = len(xList) 35 | ncols = len(xList[0]) 36 | 37 | #take fixed test set 30% of sample 38 | random.seed(1) 39 | nSample = int(nrows * 0.30) 40 | idxTest = random.sample(range(nrows), nSample) 41 | idxTest.sort() 42 | idxTrain = [idx for idx in range(nrows) if not(idx in idxTest)] 43 | 44 | #Define test and training attribute and label sets 45 | xTrain = [xList[r] for r in idxTrain] 46 | xTest = [xList[r] for r in idxTest] 47 | yTrain = [labels[r] for r in idxTrain] 48 | yTest = [labels[r] for r in idxTest] 49 | 50 | #train a series of models on random subsets of the training data 51 | #collect the models in a list and check error of composite as list grows 52 | 53 | #maximum number of models to generate 54 | numTreesMax = 30 55 | 56 | #tree depth - typically at the high end 57 | treeDepth = 1 58 | 59 | #initialize a list to hold models 60 | modelList = [] 61 | predList = [] 62 | 63 | #number of samples to draw for stochastic bagging 64 | nBagSamples = int(len(xTrain) * 0.5) 65 | 66 | for iTrees in range(numTreesMax): 67 | idxBag = [] 68 | for i in range(nBagSamples): 69 | idxBag.append(random.choice(range(len(xTrain)))) 70 | xTrainBag = [xTrain[i] for i in idxBag] 71 | yTrainBag = [yTrain[i] for i in idxBag] 72 | 73 | modelList.append(DecisionTreeRegressor(max_depth=treeDepth)) 74 | modelList[-1].fit(xTrainBag, yTrainBag) 75 | 76 | #make prediction with latest model and add to list of predictions 77 | latestPrediction = modelList[-1].predict(xTest) 78 | predList.append(list(latestPrediction)) 79 | 80 | 81 | #build cumulative prediction from first "n" models 82 | mse = [] 83 | allPredictions = [] 84 | for iModels in range(len(modelList)): 85 | 86 | #average first "iModels" of the predictions 87 | prediction = [] 88 | for iPred in range(len(xTest)): 89 | prediction.append(sum([predList[i][iPred] for i in range(iModels + 1)])/(iModels + 1)) 90 | 91 | allPredictions.append(prediction) 92 | errors = [(yTest[i] - prediction[i]) for i in range(len(yTest))] 93 | mse.append(sum([e * e for e in errors]) / len(yTest)) 94 | 95 | 96 | nModels = [i + 1 for i in range(len(modelList))] 97 | 98 | plot.plot(nModels,mse) 99 | plot.axis('tight') 100 | plot.xlabel('Number of Tree Models in Ensemble') 101 | plot.ylabel('Mean Squared Error') 102 | plot.ylim((0.0, max(mse))) 103 | plot.show() 104 | 105 | print('Minimum MSE') 106 | print(min(mse)) 107 | 108 | #with treeDepth = 1 109 | #Minimum MSE 110 | #0.516236026081 111 | 112 | 113 | #with treeDepth = 5 114 | #Minimum MSE 115 | #0.39815421341 116 | 117 | #with treeDepth = 12 & numTreesMax = 100 118 | #Minimum MSE 119 | #0.350749027669 -------------------------------------------------------------------------------- /06/wineGBM.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | from sklearn import tree 6 | from sklearn.tree import DecisionTreeRegressor 7 | import random 8 | from math import sqrt 9 | import matplotlib.pyplot as plot 10 | 11 | #read data into 
iterable 12 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 13 | data = urllib2.urlopen(target_url) 14 | 15 | xList = [] 16 | labels = [] 17 | names = [] 18 | firstLine = True 19 | for line in data: 20 | if firstLine: 21 | names = line.strip().split(";") 22 | firstLine = False 23 | else: 24 | #split on semi-colon 25 | row = line.strip().split(";") 26 | #put labels in separate array 27 | labels.append(float(row[-1])) 28 | #remove label from row 29 | row.pop() 30 | #convert row to floats 31 | floatRow = [float(num) for num in row] 32 | xList.append(floatRow) 33 | 34 | nrows = len(xList) 35 | ncols = len(xList[0]) 36 | 37 | #take fixed test set 30% of sample 38 | nSample = int(nrows * 0.30) 39 | idxTest = random.sample(range(nrows), nSample) 40 | idxTest.sort() 41 | idxTrain = [idx for idx in range(nrows) if not(idx in idxTest)] 42 | 43 | #Define test and training attribute and label sets 44 | xTrain = [xList[r] for r in idxTrain] 45 | xTest = [xList[r] for r in idxTest] 46 | yTrain = [labels[r] for r in idxTrain] 47 | yTest = [labels[r] for r in idxTest] 48 | 49 | #train a series of models on random subsets of the training data 50 | #collect the models in a list and check error of composite as list grows 51 | 52 | #maximum number of models to generate 53 | numTreesMax = 30 54 | 55 | #tree depth - typically at the high end 56 | treeDepth = 5 57 | 58 | #initialize a list to hold models 59 | modelList = [] 60 | predList = [] 61 | eps = 0.1 62 | 63 | #initialize residuals to be the labels y 64 | residuals = list(yTrain) 65 | 66 | for iTrees in range(numTreesMax): 67 | 68 | modelList.append(DecisionTreeRegressor(max_depth=treeDepth)) 69 | modelList[-1].fit(xTrain, residuals) 70 | 71 | #make prediction with latest model and add to list of predictions 72 | latestInSamplePrediction = modelList[-1].predict(xTrain) 73 | 74 | #use new predictions to update residuals 75 | residuals = [residuals[i] - eps * latestInSamplePrediction[i] for i in range(len(residuals))] 76 | 77 | latestOutSamplePrediction = modelList[-1].predict(xTest) 78 | predList.append(list(latestOutSamplePrediction)) 79 | 80 | 81 | #build cumulative prediction from first "n" models 82 | mse = [] 83 | allPredictions = [] 84 | for iModels in range(len(modelList)): 85 | 86 | #add the first "iModels" of the predictions and multiply by eps 87 | prediction = [] 88 | for iPred in range(len(xTest)): 89 | prediction.append(sum([predList[i][iPred] for i in range(iModels + 1)]) * eps) 90 | 91 | allPredictions.append(prediction) 92 | errors = [(yTest[i] - prediction[i]) for i in range(len(yTest))] 93 | mse.append(sum([e * e for e in errors]) / len(yTest)) 94 | 95 | 96 | nModels = [i + 1 for i in range(len(modelList))] 97 | 98 | plot.plot(nModels,mse) 99 | plot.axis('tight') 100 | plot.xlabel('Number of Trees in Ensemble') 101 | plot.ylabel('Mean Squared Error') 102 | plot.ylim((0.0, max(mse))) 103 | plot.show() 104 | 105 | print('Minimum MSE') 106 | print(min(mse)) 107 | 108 | #printed output 109 | #Minimum MSE 110 | #0.405031864814 -------------------------------------------------------------------------------- /06/wineRF.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | from sklearn import tree 6 | from sklearn.tree import DecisionTreeRegressor 7 | import random 8 | from math import sqrt 9 | import matplotlib.pyplot as plot 10 | 11 | #read data into iterable 12 | target_url = 
"http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 13 | data = urllib2.urlopen(target_url) 14 | 15 | xList = [] 16 | labels = [] 17 | names = [] 18 | firstLine = True 19 | for line in data: 20 | if firstLine: 21 | names = line.strip().split(";") 22 | firstLine = False 23 | else: 24 | #split on semi-colon 25 | row = line.strip().split(";") 26 | #put labels in separate array 27 | labels.append(float(row[-1])) 28 | #remove label from row 29 | row.pop() 30 | #convert row to floats 31 | floatRow = [float(num) for num in row] 32 | xList.append(floatRow) 33 | 34 | nrows = len(xList) 35 | ncols = len(xList[0]) 36 | 37 | #take fixed test set 30% of sample 38 | random.seed(1) #set seed so results are the same each run 39 | nSample = int(nrows * 0.30) 40 | idxTest = random.sample(range(nrows), nSample) 41 | idxTest.sort() 42 | idxTrain = [idx for idx in range(nrows) if not(idx in idxTest)] 43 | 44 | #Define test and training attribute and label sets 45 | xTrain = [xList[r] for r in idxTrain] 46 | xTest = [xList[r] for r in idxTest] 47 | yTrain = [labels[r] for r in idxTrain] 48 | yTest = [labels[r] for r in idxTest] 49 | 50 | #train a series of models on random subsets of the training data 51 | #collect the models in a list and check error of composite as list grows 52 | 53 | #maximum number of models to generate 54 | numTreesMax = 30 55 | 56 | #tree depth - typically at the high end 57 | treeDepth = 12 58 | 59 | #pick how many attributes will be used in each model. 60 | # authors recommend 1/3 for regression problem 61 | nAttr = 4 62 | 63 | #initialize a list to hold models 64 | modelList = [] 65 | indexList = [] 66 | predList = [] 67 | nTrainRows = len(yTrain) 68 | 69 | for iTrees in range(numTreesMax): 70 | 71 | modelList.append(DecisionTreeRegressor(max_depth=treeDepth)) 72 | 73 | #take random sample of attributes 74 | idxAttr = random.sample(range(ncols), nAttr) 75 | idxAttr.sort() 76 | indexList.append(idxAttr) 77 | 78 | #take a random sample of training rows 79 | idxRows = [] 80 | for i in range(int(0.5 * nTrainRows)): 81 | idxRows.append(random.choice(range(len(xTrain)))) 82 | idxRows.sort() 83 | 84 | #build training set 85 | xRfTrain = [] 86 | yRfTrain = [] 87 | 88 | for i in range(len(idxRows)): 89 | temp = [xTrain[idxRows[i]][j] for j in idxAttr] 90 | xRfTrain.append(temp) 91 | yRfTrain.append(yTrain[idxRows[i]]) 92 | 93 | modelList[-1].fit(xRfTrain, yRfTrain) 94 | 95 | #restrict xTest to attributes selected for training 96 | xRfTest = [] 97 | for xx in xTest: 98 | temp = [xx[i] for i in idxAttr] 99 | xRfTest.append(temp) 100 | 101 | latestOutSamplePrediction = modelList[-1].predict(xRfTest) 102 | predList.append(list(latestOutSamplePrediction)) 103 | 104 | 105 | #build cumulative prediction from first "n" models 106 | mse = [] 107 | allPredictions = [] 108 | for iModels in range(len(modelList)): 109 | 110 | #add the first "iModels" of the predictions and multiply by eps 111 | prediction = [] 112 | for iPred in range(len(xTest)): 113 | prediction.append(sum([predList[i][iPred] for i in range(iModels + 1)]) / (iModels + 1)) 114 | 115 | allPredictions.append(prediction) 116 | errors = [(yTest[i] - prediction[i]) for i in range(len(yTest))] 117 | mse.append(sum([e * e for e in errors]) / len(yTest)) 118 | 119 | 120 | nModels = [i + 1 for i in range(len(modelList))] 121 | 122 | plot.plot(nModels,mse) 123 | plot.axis('tight') 124 | plot.xlabel('Number of Trees in Ensemble') 125 | plot.ylabel('Mean Squared Error') 126 | plot.ylim((0.0, max(mse))) 127 | 
plot.show() 128 | 129 | print('Minimum MSE') 130 | print(min(mse)) 131 | 132 | #printed output 133 | 134 | #Depth 1 135 | #Minimum MSE 136 | #0.52666715461 137 | 138 | #Depth 5 139 | #Minimum MSE 140 | #0.426116327584 141 | 142 | #Depth 12 143 | #Minimum MSE 144 | #0.38508387863 -------------------------------------------------------------------------------- /06/wineTree.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | from sklearn import tree 6 | from sklearn.tree import DecisionTreeRegressor 7 | from sklearn.externals.six import StringIO 8 | from math import sqrt 9 | import matplotlib.pyplot as plot 10 | 11 | #read data into iterable 12 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 13 | data = urllib2.urlopen(target_url) 14 | 15 | xList = [] 16 | labels = [] 17 | names = [] 18 | firstLine = True 19 | for line in data: 20 | if firstLine: 21 | names = line.strip().split(";") 22 | firstLine = False 23 | else: 24 | #split on semi-colon 25 | row = line.strip().split(";") 26 | #put labels in separate array 27 | labels.append(float(row[-1])) 28 | #remove label from row 29 | row.pop() 30 | #convert row to floats 31 | floatRow = [float(num) for num in row] 32 | xList.append(floatRow) 33 | 34 | nrows = len(xList) 35 | ncols = len(xList[0]) 36 | 37 | wineTree = DecisionTreeRegressor(max_depth=3) 38 | 39 | wineTree.fit(xList, labels) 40 | 41 | with open("wineTree.dot", 'w') as f: 42 | f = tree.export_graphviz(wineTree, out_file=f) 43 | #Note: The code above exports the trained tree info to a Graphviz "dot" file. 44 | #Drawing the graph requires installing GraphViz and the running the following on the command line 45 | #dot -Tpng wineTree.dot -o wineTree.png 46 | 47 | -------------------------------------------------------------------------------- /07/abaloneGBM.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | 3 | import urllib2 4 | from pylab import * 5 | import matplotlib.pyplot as plot 6 | import numpy 7 | from sklearn.cross_validation import train_test_split 8 | from sklearn import ensemble 9 | from sklearn.metrics import mean_squared_error 10 | 11 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data" 12 | #read abalone data 13 | data = urllib2.urlopen(target_url) 14 | 15 | xList = [] 16 | labels = [] 17 | for line in data: 18 | #split on semi-colon 19 | row = line.strip().split(",") 20 | 21 | #put labels in separate array and remove label from row 22 | labels.append(float(row.pop())) 23 | 24 | #form list of list of attributes (all strings) 25 | xList.append(row) 26 | 27 | #code three-valued sex attribute as numeric 28 | xCoded = [] 29 | for row in xList: 30 | #first code the three-valued sex variable 31 | codedSex = [0.0, 0.0] 32 | if row[0] == 'M': codedSex[0] = 1.0 33 | if row[0] == 'F': codedSex[1] = 1.0 34 | 35 | numRow = [float(row[i]) for i in range(1,len(row))] 36 | rowCoded = list(codedSex) + numRow 37 | xCoded.append(rowCoded) 38 | 39 | #list of names for 40 | abaloneNames = numpy.array(['Sex1', 'Sex2', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 41 | 'Viscera weight', 'Shell weight', 'Rings']) 42 | 43 | #number of rows and columns in x matrix 44 | nrows = len(xCoded) 45 | ncols = len(xCoded[1]) 46 | 47 | #form x and y into numpy arrays and make up column names 48 | X = 
numpy.array(xCoded) 49 | y = numpy.array(labels) 50 | 51 | #break into training and test sets. 52 | xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.30, random_state=531) 53 | 54 | #instantiate model 55 | nEst = 2000 56 | depth = 5 57 | learnRate = 0.005 58 | maxFeatures = 3 59 | subsamp = 0.5 60 | abaloneGBMModel = ensemble.GradientBoostingRegressor(n_estimators=nEst, max_depth=depth, 61 | learning_rate=learnRate, max_features=maxFeatures, 62 | subsample=subsamp, loss='ls') 63 | 64 | #train 65 | abaloneGBMModel.fit(xTrain, yTrain) 66 | 67 | # compute mse on test set 68 | msError = [] 69 | predictions = abaloneGBMModel.staged_decision_function(xTest) 70 | for p in predictions: 71 | msError.append(mean_squared_error(yTest, p)) 72 | 73 | print("MSE" ) 74 | print(min(msError)) 75 | print(msError.index(min(msError))) 76 | 77 | #plot training and test errors vs number of trees in ensemble 78 | plot.figure() 79 | plot.plot(range(1, nEst + 1), abaloneGBMModel.train_score_, label='Training Set MSE', linestyle=":") 80 | plot.plot(range(1, nEst + 1), msError, label='Test Set MSE') 81 | plot.legend(loc='upper right') 82 | plot.xlabel('Number of Trees in Ensemble') 83 | plot.ylabel('Mean Squared Error') 84 | plot.show() 85 | 86 | # Plot feature importance 87 | featureImportance = abaloneGBMModel.feature_importances_ 88 | 89 | # normalize by max importance 90 | featureImportance = featureImportance / featureImportance.max() 91 | idxSorted = numpy.argsort(featureImportance) 92 | barPos = numpy.arange(idxSorted.shape[0]) + .5 93 | plot.barh(barPos, featureImportance[idxSorted], align='center') 94 | plot.yticks(barPos, abaloneNames[idxSorted]) 95 | plot.xlabel('Variable Importance') 96 | plot.subplots_adjust(left=0.2, right=0.9, top=0.9, bottom=0.1) 97 | plot.show() 98 | 99 | # Printed Output: 100 | 101 | # for Gradient Boosting 102 | # nEst = 2000 103 | # depth = 5 104 | # learnRate = 0.003 105 | # maxFeatures = None 106 | # subsamp = 0.5 107 | # 108 | # MSE 109 | # 4.22969363284 110 | # 1736 111 | 112 | #for Gradient Boosting with RF base learners 113 | # nEst = 2000 114 | # depth = 5 115 | # learnRate = 0.005 116 | # maxFeatures = 3 117 | # subsamp = 0.5 118 | # 119 | # MSE 120 | # 4.27564515749 121 | # 1687 122 | -------------------------------------------------------------------------------- /07/abaloneRF.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | 3 | import urllib2 4 | from pylab import * 5 | import matplotlib.pyplot as plot 6 | import numpy 7 | from sklearn.cross_validation import train_test_split 8 | from sklearn import ensemble 9 | from sklearn.metrics import mean_squared_error 10 | 11 | 12 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data" 13 | #read abalone data 14 | data = urllib2.urlopen(target_url) 15 | 16 | xList = [] 17 | labels = [] 18 | for line in data: 19 | #split on semi-colon 20 | row = line.strip().split(",") 21 | 22 | #put labels in separate array and remove label from row 23 | labels.append(float(row.pop())) 24 | 25 | #form list of list of attributes (all strings) 26 | xList.append(row) 27 | 28 | #code three-valued sex attribute as numeric 29 | xCoded = [] 30 | for row in xList: 31 | #first code the three-valued sex variable 32 | codedSex = [0.0, 0.0] 33 | if row[0] == 'M': codedSex[0] = 1.0 34 | if row[0] == 'F': codedSex[1] = 1.0 35 | 36 | numRow = [float(row[i]) for i in range(1,len(row))] 37 | rowCoded = list(codedSex) + numRow 38 | 
xCoded.append(rowCoded) 39 | 40 | #list of names for 41 | abaloneNames = numpy.array(['Sex1', 'Sex2', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 42 | 'Viscera weight', 'Shell weight', 'Rings']) 43 | 44 | #number of rows and columns in x matrix 45 | nrows = len(xCoded) 46 | ncols = len(xCoded[1]) 47 | 48 | #form x and y into numpy arrays and make up column names 49 | X = numpy.array(xCoded) 50 | y = numpy.array(labels) 51 | 52 | #break into training and test sets. 53 | xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.30, random_state=531) 54 | 55 | #train random forest at a range of ensemble sizes in order to see how the mse changes 56 | mseOos = [] 57 | nTreeList = range(50, 500, 10) 58 | for iTrees in nTreeList: 59 | depth = None 60 | maxFeat = 4 #try tweaking 61 | abaloneRFModel = ensemble.RandomForestRegressor(n_estimators=iTrees, max_depth=depth, max_features=maxFeat, 62 | oob_score=False, random_state=531) 63 | 64 | abaloneRFModel.fit(xTrain,yTrain) 65 | 66 | #Accumulate mse on test set 67 | prediction = abaloneRFModel.predict(xTest) 68 | mseOos.append(mean_squared_error(yTest, prediction)) 69 | 70 | 71 | print("MSE" ) 72 | print(mseOos[-1]) 73 | 74 | 75 | #plot training and test errors vs number of trees in ensemble 76 | plot.plot(nTreeList, mseOos) 77 | plot.xlabel('Number of Trees in Ensemble') 78 | plot.ylabel('Mean Squared Error') 79 | #plot.ylim([0.0, 1.1*max(mseOob)]) 80 | plot.show() 81 | 82 | # Plot feature importance 83 | featureImportance = abaloneRFModel.feature_importances_ 84 | 85 | # normalize by max importance 86 | featureImportance = featureImportance / featureImportance.max() 87 | sortedIdx = numpy.argsort(featureImportance) 88 | barPos = numpy.arange(sortedIdx.shape[0]) + .5 89 | plot.barh(barPos, featureImportance[sortedIdx], align='center') 90 | plot.yticks(barPos, abaloneNames[sortedIdx]) 91 | plot.xlabel('Variable Importance') 92 | plot.subplots_adjust(left=0.2, right=0.9, top=0.9, bottom=0.1) 93 | plot.show() 94 | 95 | # Printed Output: 96 | # MSE 97 | # 4.30971555911 -------------------------------------------------------------------------------- /07/glassGbm.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | 3 | import urllib2 4 | from math import sqrt, fabs, exp 5 | import matplotlib.pyplot as plot 6 | from sklearn.linear_model import enet_path 7 | from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix 8 | from sklearn.cross_validation import train_test_split 9 | from sklearn import ensemble 10 | import numpy 11 | 12 | 13 | target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data" 14 | data = urllib2.urlopen(target_url) 15 | 16 | #arrange data into list for labels and list of lists for attributes 17 | xList = [] 18 | for line in data: 19 | #split on comma 20 | row = line.strip().split(",") 21 | xList.append(row) 22 | 23 | glassNames = numpy.array(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type']) 24 | 25 | #Separate attributes and labels 26 | xNum = [] 27 | labels = [] 28 | 29 | for row in xList: 30 | labels.append(row.pop()) 31 | l = len(row) 32 | #eliminate ID 33 | attrRow = [float(row[i]) for i in range(1, l)] 34 | xNum.append(attrRow) 35 | 36 | #number of rows and columns in x matrix 37 | nrows = len(xNum) 38 | ncols = len(xNum[1]) 39 | 40 | #Labels are integers from 1 to 7 with no examples of 4. 
41 | #gb requires consecutive integers starting at 0 42 | newLabels = [] 43 | labelSet = set(labels) 44 | labelList = list(labelSet) 45 | labelList.sort() 46 | nlabels = len(labelList) 47 | for l in labels: 48 | index = labelList.index(l) 49 | newLabels.append(index) 50 | 51 | #Class populations: 52 | #old label new label num of examples 53 | #1 0 70 54 | #2 1 76 55 | #3 2 17 56 | #5 3 13 57 | #6 4 9 58 | #7 5 29 59 | # 60 | #Drawing 30% test sample may not preserve population proportions 61 | 62 | #stratified sampling by labels. 63 | xTemp = [xNum[i] for i in range(nrows) if newLabels[i] == 0] 64 | yTemp = [newLabels[i] for i in range(nrows) if newLabels[i] == 0] 65 | xTrain, xTest, yTrain, yTest = train_test_split(xTemp, yTemp, test_size=0.30, random_state=531) 66 | for iLabel in range(1, len(labelList)): 67 | #segregate x and y according to labels 68 | xTemp = [xNum[i] for i in range(nrows) if newLabels[i] == iLabel] 69 | yTemp = [newLabels[i] for i in range(nrows) if newLabels[i] == iLabel] 70 | 71 | #form train and test sets on segregated subset of examples 72 | xTrainTemp, xTestTemp, yTrainTemp, yTestTemp = train_test_split(xTemp, yTemp, test_size=0.30, random_state=531) 73 | 74 | #accumulate 75 | xTrain = numpy.append(xTrain, xTrainTemp, axis=0); xTest = numpy.append(xTest, xTestTemp, axis=0) 76 | yTrain = numpy.append(yTrain, yTrainTemp, axis=0); yTest = numpy.append(yTest, yTestTemp, axis=0) 77 | 78 | #instantiate model 79 | nEst = 500 80 | depth = 3 81 | learnRate = 0.003 82 | maxFeatures = 3 83 | subSamp = 0.5 84 | glassGBMModel = ensemble.GradientBoostingClassifier(n_estimators=nEst, max_depth=depth, 85 | learning_rate=learnRate, max_features=maxFeatures, 86 | subsample=subSamp) 87 | 88 | #train 89 | glassGBMModel.fit(xTrain, yTrain) 90 | 91 | # compute auc on test set as function of ensemble size 92 | missClassError = [] 93 | missClassBest = 1.0 94 | predictions = glassGBMModel.staged_decision_function(xTest) 95 | for p in predictions: 96 | missClass = 0 97 | for i in range(len(p)): 98 | listP = p[i].tolist() 99 | if listP.index(max(listP)) != yTest[i]: 100 | missClass += 1 101 | missClass = float(missClass)/len(p) 102 | 103 | missClassError.append(missClass) 104 | 105 | #capture best predictions 106 | if missClass < missClassBest: 107 | missClassBest = missClass 108 | pBest = p 109 | 110 | idxBest = missClassError.index(min(missClassError)) 111 | 112 | #print best values 113 | print("Best Missclassification Error" ) 114 | print(missClassBest) 115 | print("Number of Trees for Best Missclassification Error") 116 | print(idxBest) 117 | 118 | #plot training deviance and test auc's vs number of trees in ensemble 119 | missClassError = [100*mce for mce in missClassError] 120 | plot.figure() 121 | plot.plot(range(1, nEst + 1), glassGBMModel.train_score_, label='Training Set Deviance', linestyle=":") 122 | plot.plot(range(1, nEst + 1), missClassError, label='Test Set Error') 123 | plot.legend(loc='upper right') 124 | plot.xlabel('Number of Trees in Ensemble') 125 | plot.ylabel('Deviance / Classification Error') 126 | plot.show() 127 | 128 | # Plot feature importance 129 | featureImportance = glassGBMModel.feature_importances_ 130 | 131 | # normalize by max importance 132 | featureImportance = featureImportance / featureImportance.max() 133 | 134 | #plot variable importance 135 | idxSorted = numpy.argsort(featureImportance) 136 | barPos = numpy.arange(idxSorted.shape[0]) + .5 137 | plot.barh(barPos, featureImportance[idxSorted], align='center') 138 | plot.yticks(barPos, 
glassNames[idxSorted]) 139 | plot.xlabel('Variable Importance') 140 | plot.show() 141 | 142 | #generate confusion matrix for best prediction. 143 | pBestList = pBest.tolist() 144 | bestPrediction = [r.index(max(r)) for r in pBestList] 145 | confusionMat = confusion_matrix(yTest, bestPrediction) 146 | print('') 147 | print("Confusion Matrix") 148 | print(confusionMat) 149 | 150 | 151 | # Printed Output: 152 | # 153 | # nEst = 500 154 | # depth = 3 155 | # learnRate = 0.003 156 | # maxFeatures = None 157 | # subSamp = 0.5 158 | # 159 | # 160 | # Best Missclassification Error 161 | # 0.242424242424 162 | # Number of Trees for Best Missclassification Error 163 | # 113 164 | # 165 | # Confusion Matrix 166 | # [[19 1 0 0 0 1] 167 | # [ 3 19 0 1 0 0] 168 | # [ 4 1 0 0 1 0] 169 | # [ 0 3 0 1 0 0] 170 | # [ 0 0 0 0 3 0] 171 | # [ 0 1 0 1 0 7]] 172 | # 173 | 174 | 175 | 176 | # For gradient boosting using random forest base learners 177 | # nEst = 500 178 | # depth = 3 179 | # learnRate = 0.003 180 | # maxFeatures = 3 181 | # subSamp = 0.5 182 | # 183 | # 184 | # 185 | # Best Missclassification Error 186 | # 0.227272727273 187 | # Number of Trees for Best Missclassification Error 188 | # 267 189 | # 190 | # Confusion Matrix 191 | # [[20 1 0 0 0 0] 192 | # [ 3 20 0 0 0 0] 193 | # [ 3 3 0 0 0 0] 194 | # [ 0 4 0 0 0 0] 195 | # [ 0 0 0 0 3 0] 196 | # [ 0 2 0 0 0 7]] -------------------------------------------------------------------------------- /07/glassRF.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | 3 | import urllib2 4 | from math import sqrt, fabs, exp 5 | import matplotlib.pyplot as plot 6 | from sklearn.linear_model import enet_path 7 | from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve 8 | from sklearn.cross_validation import train_test_split 9 | from sklearn import ensemble 10 | import numpy 11 | 12 | target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data" 13 | data = urllib2.urlopen(target_url) 14 | 15 | #arrange data into list for labels and list of lists for attributes 16 | xList = [] 17 | for line in data: 18 | #split on comma 19 | row = line.strip().split(",") 20 | xList.append(row) 21 | 22 | glassNames = numpy.array(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type']) 23 | 24 | #Separate attributes and labels 25 | xNum = [] 26 | labels = [] 27 | 28 | for row in xList: 29 | labels.append(row.pop()) 30 | l = len(row) 31 | #eliminate ID 32 | attrRow = [float(row[i]) for i in range(1, l)] 33 | xNum.append(attrRow) 34 | 35 | #number of rows and columns in x matrix 36 | nrows = len(xNum) 37 | ncols = len(xNum[1]) 38 | 39 | #Labels are integers from 1 to 7 with no examples of 4. 40 | #gb requires consecutive integers starting at 0 41 | newLabels = [] 42 | labelSet = set(labels) 43 | labelList = list(labelSet) 44 | labelList.sort() 45 | nlabels = len(labelList) 46 | for l in labels: 47 | index = labelList.index(l) 48 | newLabels.append(index) 49 | 50 | #Class populations: 51 | #old label new label num of examples 52 | #1 0 70 53 | #2 1 76 54 | #3 2 17 55 | #5 3 13 56 | #6 4 9 57 | #7 5 29 58 | # 59 | #Drawing 30% test sample may not preserve population proportions 60 | 61 | #stratified sampling by labels. 
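#(each class is split into its own 70/30 train/test subsets and the pieces are then
# concatenated, so even the smallest classes - 9 and 13 examples - land in both sets;
# a single random 30% draw over the full data set could under-represent or even miss them)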
62 | xTemp = [xNum[i] for i in range(nrows) if newLabels[i] == 0] 63 | yTemp = [newLabels[i] for i in range(nrows) if newLabels[i] == 0] 64 | xTrain, xTest, yTrain, yTest = train_test_split(xTemp, yTemp, test_size=0.30, random_state=531) 65 | for iLabel in range(1, len(labelList)): 66 | #segregate x and y according to labels 67 | xTemp = [xNum[i] for i in range(nrows) if newLabels[i] == iLabel] 68 | yTemp = [newLabels[i] for i in range(nrows) if newLabels[i] == iLabel] 69 | 70 | #form train and test sets on segregated subset of examples 71 | xTrainTemp, xTestTemp, yTrainTemp, yTestTemp = train_test_split(xTemp, yTemp, test_size=0.30, random_state=531) 72 | 73 | #accumulate 74 | xTrain = numpy.append(xTrain, xTrainTemp, axis=0); xTest = numpy.append(xTest, xTestTemp, axis=0) 75 | yTrain = numpy.append(yTrain, yTrainTemp, axis=0); yTest = numpy.append(yTest, yTestTemp, axis=0) 76 | 77 | missCLassError = [] 78 | nTreeList = range(50, 2000, 50) 79 | for iTrees in nTreeList: 80 | depth = None 81 | maxFeat = 4 #try tweaking 82 | glassRFModel = ensemble.RandomForestClassifier(n_estimators=iTrees, max_depth=depth, max_features=maxFeat, 83 | oob_score=False, random_state=531) 84 | 85 | glassRFModel.fit(xTrain,yTrain) 86 | 87 | #Accumulate auc on test set 88 | prediction = glassRFModel.predict(xTest) 89 | correct = accuracy_score(yTest, prediction) 90 | 91 | missCLassError.append(1.0 - correct) 92 | 93 | print("Missclassification Error" ) 94 | print(missCLassError[-1]) 95 | 96 | #generate confusion matrix 97 | pList = prediction.tolist() 98 | confusionMat = confusion_matrix(yTest, pList) 99 | print('') 100 | print("Confusion Matrix") 101 | print(confusionMat) 102 | 103 | 104 | 105 | #plot training and test errors vs number of trees in ensemble 106 | plot.plot(nTreeList, missCLassError) 107 | plot.xlabel('Number of Trees in Ensemble') 108 | plot.ylabel('Missclassification Error Rate') 109 | #plot.ylim([0.0, 1.1*max(mseOob)]) 110 | plot.show() 111 | 112 | # Plot feature importance 113 | featureImportance = glassRFModel.feature_importances_ 114 | 115 | # normalize by max importance 116 | featureImportance = featureImportance / featureImportance.max() 117 | 118 | #plot variable importance 119 | idxSorted = numpy.argsort(featureImportance) 120 | barPos = numpy.arange(idxSorted.shape[0]) + .5 121 | plot.barh(barPos, featureImportance[idxSorted], align='center') 122 | plot.yticks(barPos, glassNames[idxSorted]) 123 | plot.xlabel('Variable Importance') 124 | plot.show() 125 | 126 | 127 | # Printed Output: 128 | # Missclassification Error 129 | # 0.227272727273 130 | # 131 | # Confusion Matrix 132 | # [[17 1 2 0 0 1] 133 | # [ 2 18 1 2 0 0] 134 | # [ 3 0 3 0 0 0] 135 | # [ 0 0 0 4 0 0] 136 | # [ 0 1 0 0 2 0] 137 | # [ 0 2 0 0 0 7]] 138 | -------------------------------------------------------------------------------- /07/rocksVMinesGBM.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | 3 | import urllib2 4 | from math import sqrt, fabs, exp 5 | import matplotlib.pyplot as plot 6 | from sklearn.cross_validation import train_test_split 7 | from sklearn import ensemble 8 | from sklearn.metrics import roc_auc_score, roc_curve 9 | import numpy 10 | 11 | #read data from uci data repository 12 | target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data" 13 | data = urllib2.urlopen(target_url) 14 | 15 | 16 | #arrange data into list for labels and list of lists for attributes 17 | 
xList = [] 18 | 19 | 20 | for line in data: 21 | #split on comma 22 | row = line.strip().split(",") 23 | xList.append(row) 24 | 25 | #separate labels from attributes, convert from attributes from string to numeric and convert "M" to 1 and "R" to 0 26 | 27 | xNum = [] 28 | labels = [] 29 | 30 | for row in xList: 31 | lastCol = row.pop() 32 | if lastCol == "M": 33 | labels.append(1) 34 | else: 35 | labels.append(0) 36 | attrRow = [float(elt) for elt in row] 37 | xNum.append(attrRow) 38 | 39 | #number of rows and columns in x matrix 40 | nrows = len(xNum) 41 | ncols = len(xNum[1]) 42 | 43 | #form x and y into numpy arrays and make up column names 44 | X = numpy.array(xNum) 45 | y = numpy.array(labels) 46 | rockVMinesNames = numpy.array(['V' + str(i) for i in range(ncols)]) 47 | 48 | #break into training and test sets. 49 | xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.30, random_state=531) 50 | 51 | #instantiate model 52 | nEst = 2000 53 | depth = 3 54 | learnRate = 0.007 55 | maxFeatures = 20 56 | rockVMinesGBMModel = ensemble.GradientBoostingClassifier(n_estimators=nEst, max_depth=depth, 57 | learning_rate=learnRate, 58 | max_features=maxFeatures) 59 | #train 60 | rockVMinesGBMModel.fit(xTrain, yTrain) 61 | 62 | # compute auc on test set as function of ensemble size 63 | auc = [] 64 | aucBest = 0.0 65 | predictions = rockVMinesGBMModel.staged_decision_function(xTest) 66 | for p in predictions: 67 | aucCalc = roc_auc_score(yTest, p) 68 | auc.append(aucCalc) 69 | 70 | #capture best predictions 71 | if aucCalc > aucBest: 72 | aucBest = aucCalc 73 | pBest = p 74 | 75 | idxBest = auc.index(max(auc)) 76 | 77 | #print best values 78 | print("Best AUC" ) 79 | print(auc[idxBest]) 80 | print("Number of Trees for Best AUC") 81 | print(idxBest) 82 | 83 | #plot training deviance and test auc's vs number of trees in ensemble 84 | plot.figure() 85 | plot.plot(range(1, nEst + 1), rockVMinesGBMModel.train_score_, label='Training Set Deviance', linestyle=":") 86 | plot.plot(range(1, nEst + 1), auc, label='Test Set AUC') 87 | plot.legend(loc='upper right') 88 | plot.xlabel('Number of Trees in Ensemble') 89 | plot.ylabel('Deviance / AUC') 90 | plot.show() 91 | 92 | # Plot feature importance 93 | featureImportance = rockVMinesGBMModel.feature_importances_ 94 | 95 | # normalize by max importance 96 | featureImportance = featureImportance / featureImportance.max() 97 | 98 | #plot importance of top 30 99 | idxSorted = numpy.argsort(featureImportance)[30:60] 100 | 101 | barPos = numpy.arange(idxSorted.shape[0]) + .5 102 | plot.barh(barPos, featureImportance[idxSorted], align='center') 103 | plot.yticks(barPos, rockVMinesNames[idxSorted]) 104 | plot.xlabel('Variable Importance') 105 | plot.show() 106 | 107 | #pick some threshold values and calc confusion matrix for best predictions 108 | #notice that GBM predictions don't fall in range of (0, 1) 109 | 110 | #plot best version of ROC curve 111 | fpr, tpr, thresh = roc_curve(yTest, list(pBest)) 112 | ctClass = [i*0.01 for i in range(101)] 113 | 114 | plot.plot(fpr, tpr, linewidth=2) 115 | plot.plot(ctClass, ctClass, linestyle=':') 116 | plot.xlabel('False Positive Rate') 117 | plot.ylabel('True Positive Rate') 118 | plot.show() 119 | 120 | #pick some threshold values and calc confusion matrix for best predictions 121 | #notice that GBM predictions don't fall in range of (0, 1) 122 | #pick threshold values at 25th, 50th and 75th percentiles 123 | idx25 = int(len(thresh) * 0.25) 124 | idx50 = int(len(thresh) * 0.50) 125 | idx75 = int(len(thresh) * 
0.75) 126 | 127 | #calculate total points, total positives and total negatives 128 | totalPts = len(yTest) 129 | P = sum(yTest) 130 | N = totalPts - P 131 | 132 | print('') 133 | print('Confusion Matrices for Different Threshold Values') 134 | 135 | #25th 136 | TP = tpr[idx25] * P; FN = P - TP; FP = fpr[idx25] * N; TN = N - FP 137 | print('') 138 | print('Threshold Value = ', thresh[idx25]) 139 | print('TP = ', TP/totalPts, 'FP = ', FP/totalPts) 140 | print('FN = ', FN/totalPts, 'TN = ', TN/totalPts) 141 | 142 | #50th 143 | TP = tpr[idx50] * P; FN = P - TP; FP = fpr[idx50] * N; TN = N - FP 144 | print('') 145 | print('Threshold Value = ', thresh[idx50]) 146 | print('TP = ', TP/totalPts, 'FP = ', FP/totalPts) 147 | print('FN = ', FN/totalPts, 'TN = ', TN/totalPts) 148 | 149 | #75th 150 | TP = tpr[idx75] * P; FN = P - TP; FP = fpr[idx75] * N; TN = N - FP 151 | print('') 152 | print('Threshold Value = ', thresh[idx75]) 153 | print('TP = ', TP/totalPts, 'FP = ', FP/totalPts) 154 | print('FN = ', FN/totalPts, 'TN = ', TN/totalPts) 155 | 156 | 157 | # Printed Output: 158 | # 159 | # Best AUC 160 | # 0.936105476673 161 | # Number of Trees for Best AUC 162 | # 1989 163 | # 164 | # Confusion Matrices for Different Threshold Values 165 | # 166 | # ('Threshold Value = ', 6.2941249291909935) 167 | # ('TP = ', 0.23809523809523808, 'FP = ', 0.015873015873015872) 168 | # ('FN = ', 0.30158730158730157, 'TN = ', 0.44444444444444442) 169 | # 170 | # ('Threshold Value = ', 2.2710265370949441) 171 | # ('TP = ', 0.44444444444444442, 'FP = ', 0.063492063492063489) 172 | # ('FN = ', 0.095238095238095233, 'TN = ', 0.3968253968253968) 173 | # 174 | # ('Threshold Value = ', -3.0947902666953317) 175 | # ('TP = ', 0.53968253968253965, 'FP = ', 0.22222222222222221) 176 | # ('FN = ', 0.0, 'TN = ', 0.23809523809523808) 177 | # 178 | # 179 | # Printed Output with max_features = 20 (Random Forest base learners): 180 | # 181 | # Best AUC 182 | # 0.956389452333 183 | # Number of Trees for Best AUC 184 | # 1426 185 | # 186 | # Confusion Matrices for Different Threshold Values 187 | # 188 | # ('Threshold Value = ', 5.8332200248698536) 189 | # ('TP = ', 0.23809523809523808, 'FP = ', 0.015873015873015872) 190 | # ('FN = ', 0.30158730158730157, 'TN = ', 0.44444444444444442) 191 | # 192 | # ('Threshold Value = ', 2.0281780133610567) 193 | # ('TP = ', 0.47619047619047616, 'FP = ', 0.031746031746031744) 194 | # ('FN = ', 0.063492063492063489, 'TN = ', 0.42857142857142855) 195 | # 196 | # ('Threshold Value = ', -1.2965629080181333) 197 | # ('TP = ', 0.53968253968253965, 'FP = ', 0.22222222222222221) 198 | # ('FN = ', 0.0, 'TN = ', 0.23809523809523808) -------------------------------------------------------------------------------- /07/rocksVMinesRF.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike_bowles' 2 | 3 | import urllib2 4 | from math import sqrt, fabs, exp 5 | import matplotlib.pyplot as plot 6 | from sklearn.cross_validation import train_test_split 7 | from sklearn import ensemble 8 | from sklearn.metrics import roc_auc_score, roc_curve 9 | import numpy 10 | 11 | #read data from uci data repository 12 | target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data" 13 | data = urllib2.urlopen(target_url) 14 | 15 | #arrange data into list for labels and list of lists for attributes 16 | xList = [] 17 | 18 | for line in data: 19 | #split on comma 20 | row = line.strip().split(",") 21 | xList.append(row) 22 
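#Aside: this script and rocksVMinesGBM.py both turn a point on the ROC curve into a
#confusion matrix via TP = tpr * P, FN = P - TP, FP = fpr * N, TN = N - FP.
#A minimal helper sketch of that arithmetic (confusionFromROC is an illustrative
#name, not defined in the original scripts):
#    def confusionFromROC(tprVal, fprVal, P, N):
#        TP = tprVal * P; FN = P - TP
#        FP = fprVal * N; TN = N - FP
#        return TP, FP, FN, TN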
| 23 | #separate labels from attributes, convert attributes from string to numeric and convert "M" to 1 and "R" to 0 24 | 25 | xNum = [] 26 | labels = [] 27 | 28 | for row in xList: 29 | lastCol = row.pop() 30 | if lastCol == "M": 31 | labels.append(1) 32 | else: 33 | labels.append(0) 34 | attrRow = [float(elt) for elt in row] 35 | xNum.append(attrRow) 36 | 37 | #number of rows and columns in x matrix 38 | nrows = len(xNum) 39 | ncols = len(xNum[1]) 40 | 41 | #form x and y into numpy arrays and make up column names 42 | X = numpy.array(xNum) 43 | y = numpy.array(labels) 44 | rocksVMinesNames = numpy.array(['V' + str(i) for i in range(ncols)]) 45 | 46 | #break into training and test sets. 47 | xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.30, random_state=531) 48 | 49 | auc = [] 50 | nTreeList = range(50, 2000, 50) 51 | for iTrees in nTreeList: 52 | depth = None 53 | maxFeat = 8 #try tweaking 54 | rocksVMinesRFModel = ensemble.RandomForestClassifier(n_estimators=iTrees, max_depth=depth, max_features=maxFeat, 55 | oob_score=False, random_state=531) 56 | 57 | rocksVMinesRFModel.fit(xTrain,yTrain) 58 | 59 | #Accumulate auc on test set 60 | prediction = rocksVMinesRFModel.predict_proba(xTest) 61 | aucCalc = roc_auc_score(yTest, prediction[:,1:2]) 62 | auc.append(aucCalc) 63 | 64 | print("AUC") 65 | print(auc[-1]) 66 | 67 | 68 | #plot test-set AUC vs number of trees in ensemble 69 | plot.plot(nTreeList, auc) 70 | plot.xlabel('Number of Trees in Ensemble') 71 | plot.ylabel('Area Under ROC Curve - AUC') 72 | #plot.ylim([0.0, 1.1*max(mseOob)]) 73 | plot.show() 74 | 75 | # Plot feature importance 76 | featureImportance = rocksVMinesRFModel.feature_importances_ 77 | 78 | # normalize by max importance 79 | featureImportance = featureImportance / featureImportance.max() 80 | 81 | #plot importance of top 30 82 | idxSorted = numpy.argsort(featureImportance)[30:60] 83 | idxTemp = numpy.argsort(featureImportance)[::-1] 84 | print(idxTemp) 85 | barPos = numpy.arange(idxSorted.shape[0]) + .5 86 | plot.barh(barPos, featureImportance[idxSorted], align='center') 87 | plot.yticks(barPos, rocksVMinesNames[idxSorted]) 88 | plot.xlabel('Variable Importance') 89 | plot.show() 90 | 91 | #plot ROC curve for the final (largest) ensemble 92 | fpr, tpr, thresh = roc_curve(yTest, list(prediction[:,1:2])) 93 | ctClass = [i*0.01 for i in range(101)] 94 | 95 | plot.plot(fpr, tpr, linewidth=2) 96 | plot.plot(ctClass, ctClass, linestyle=':') 97 | plot.xlabel('False Positive Rate') 98 | plot.ylabel('True Positive Rate') 99 | plot.show() 100 | 101 | #pick some threshold values and calc confusion matrix for the probability predictions 102 | #unlike the GBM decision-function scores, predict_proba outputs do fall in the range (0, 1) 103 | #pick threshold values at 25th, 50th and 75th percentiles 104 | idx25 = int(len(thresh) * 0.25) 105 | idx50 = int(len(thresh) * 0.50) 106 | idx75 = int(len(thresh) * 0.75) 107 | 108 | #calculate total points, total positives and total negatives 109 | totalPts = len(yTest) 110 | P = sum(yTest) 111 | N = totalPts - P 112 | 113 | print('') 114 | print('Confusion Matrices for Different Threshold Values') 115 | 116 | #25th 117 | TP = tpr[idx25] * P; FN = P - TP; FP = fpr[idx25] * N; TN = N - FP 118 | print('') 119 | print('Threshold Value = ', thresh[idx25]) 120 | print('TP = ', TP/totalPts, 'FP = ', FP/totalPts) 121 | print('FN = ', FN/totalPts, 'TN = ', TN/totalPts) 122 | 123 | #50th 124 | TP = tpr[idx50] * P; FN = P - TP; FP = fpr[idx50] * N; TN = N - FP 125 | print('') 126 | print('Threshold Value = ', thresh[idx50]) 127
| print('TP = ', TP/totalPts, 'FP = ', FP/totalPts) 128 | print('FN = ', FN/totalPts, 'TN = ', TN/totalPts) 129 | 130 | #75th 131 | TP = tpr[idx75] * P; FN = P - TP; FP = fpr[idx75] * N; TN = N - FP 132 | print('') 133 | print('Threshold Value = ', thresh[idx75]) 134 | print('TP = ', TP/totalPts, 'FP = ', FP/totalPts) 135 | print('FN = ', FN/totalPts, 'TN = ', TN/totalPts) 136 | 137 | 138 | # Printed Output: 139 | # 140 | # AUC 141 | # 0.950304259635 142 | # 143 | # Confusion Matrices for Different Threshold Values 144 | # 145 | # ('Threshold Value = ', 0.76051282051282054) 146 | # ('TP = ', 0.25396825396825395, 'FP = ', 0.0) 147 | # ('FN = ', 0.2857142857142857, 'TN = ', 0.46031746031746029) 148 | # 149 | # ('Threshold Value = ', 0.62461538461538457) 150 | # ('TP = ', 0.46031746031746029, 'FP = ', 0.047619047619047616) 151 | # ('FN = ', 0.079365079365079361, 'TN = ', 0.41269841269841268) 152 | # 153 | # ('Threshold Value = ', 0.46564102564102566) 154 | # ('TP = ', 0.53968253968253965, 'FP = ', 0.22222222222222221) 155 | # ('FN = ', 0.0, 'TN = ', 0.23809523809523808) -------------------------------------------------------------------------------- /07/timingComparisons.txt: -------------------------------------------------------------------------------- 1 | timing and perf comparisons 2 | 3 | dataset algo training time perf perf metric 4 | 5 | glass - RF 2000 trees 0:00:02.354401 0.227272727273 class error 6 | glass - gbm 500 trees 0:00:03.879308 0.227272727273 7 | glass - lasso 0:00:12.296948 0.373831775701 8 | 9 | rvmines rf 2000 trees 0:00:02.760755 0.950304259635 auc 10 | rvmines gbm 2000 trees 0:00:04.201122 0.956389452333 auc 11 | rvmines enet 0:00:00.519870* 0.868672796508 12 | 13 | abalone rf 500 trees 0:00:08.060850 4.30971555911 MSE 14 | abalone gbm 2000 trees 0:00:22.726849 4.20153525438 mse 15 | 16 | wine rf 500 trees 0:00:02.665874 0.314125711509 mse 17 | wine gbm 2000 trees 0:00:13.081342 0.355898056894 mse 18 | wine lasso-expanded 0:00:00.646788* 0.434528740430 19 | 20 | 21 | 22 | *time per cross-validation fold 23 | 24 | 25 | -------------------------------------------------------------------------------- /07/wineBagging.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mike-bowles' 2 | 3 | import urllib2 4 | import numpy 5 | import matplotlib.pyplot as plot 6 | from sklearn import tree 7 | from sklearn.tree import DecisionTreeRegressor 8 | from math import floor 9 | import random 10 | 11 | 12 | # Read wine quality data from UCI website 13 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 14 | data = urllib2.urlopen(target_url) 15 | 16 | xList = [] 17 | labels = [] 18 | names = [] 19 | firstLine = True 20 | for line in data: 21 | if firstLine: 22 | names = line.strip().split(";") 23 | firstLine = False 24 | else: 25 | #split on semi-colon 26 | row = line.strip().split(";") 27 | #put labels in separate array 28 | labels.append(float(row[-1])) 29 | #remove label from row 30 | row.pop() 31 | #convert row to floats 32 | floatRow = [float(num) for num in row] 33 | xList.append(floatRow) 34 | 35 | nrows = len(xList) 36 | ncols = len(xList[0]) 37 | 38 | 39 | #take fixed test set 30% of sample 40 | nSample = int(nrows * 0.30) 41 | idxTest = random.sample(range(nrows), nSample) 42 | idxTest.sort() 43 | idxTrain = [idx for idx in range(nrows) if not(idx in idxTest)] 44 | 45 | #Define test and training attribute and label sets 46 | xTrain = [xList[r] for r in idxTrain] 47 | 
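#Aside: the 30% test split above is drawn with random.sample without a fixed seed,
#so the bagging MSE figures reported at the bottom of this file can vary from run
#to run. One way to make the split repeatable is to seed the module random-number
#generator before sampling; 531 is only a suggested value, chosen to match the
#random_state used in the other chapter 07 scripts:
#    random.seed(531)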
xTest = [xList[r] for r in idxTest] 48 | yTrain = [labels[r] for r in idxTrain] 49 | yTest = [labels[r] for r in idxTest] 50 | 51 | #train a series of models on random subsets of the training data 52 | #collect the models in a list and check error of composite as list grows 53 | 54 | #maximum number of models to generate 55 | numTreesMax = 100 56 | 57 | #tree depth - typically at the high end 58 | treeDepth = 5 59 | 60 | #initialize a list to hold models 61 | modelList = [] 62 | predList = [] 63 | 64 | #number of samples to draw for stochastic bagging 65 | bagFract = 0.5 66 | nBagSamples = int(len(xTrain) * bagFract) 67 | 68 | for iTrees in range(numTreesMax): 69 | idxBag = [] 70 | for i in range(nBagSamples): 71 | idxBag.append(random.choice(range(len(xTrain)))) 72 | xTrainBag = [xTrain[i] for i in idxBag] 73 | yTrainBag = [yTrain[i] for i in idxBag] 74 | 75 | modelList.append(DecisionTreeRegressor(max_depth=treeDepth)) 76 | modelList[-1].fit(xTrainBag, yTrainBag) 77 | 78 | #make prediction with latest model and add to list of predictions 79 | latestPrediction = modelList[-1].predict(xTest) 80 | predList.append(list(latestPrediction)) 81 | 82 | 83 | #build cumulative prediction from first "n" models 84 | mse = [] 85 | allPredictions = [] 86 | for iModels in range(len(modelList)): 87 | 88 | #average first "iModels" of the predictions 89 | prediction = [] 90 | for iPred in range(len(xTest)): 91 | prediction.append(sum([predList[i][iPred] for i in range(iModels + 1)])/(iModels + 1)) 92 | 93 | allPredictions.append(prediction) 94 | errors = [(yTest[i] - prediction[i]) for i in range(len(yTest))] 95 | mse.append(sum([e * e for e in errors]) / len(yTest)) 96 | 97 | nModels = [i + 1 for i in range(len(modelList))] 98 | 99 | plot.plot(nModels,mse) 100 | plot.axis('tight') 101 | plot.xlabel('Number of Models in Ensemble') 102 | plot.ylabel('Mean Squared Error') 103 | plot.ylim((0.0, max(mse))) 104 | plot.show() 105 | 106 | print('Minimum MSE') 107 | print(min(mse)) 108 | 109 | 110 | #With treeDepth = 5 111 | # bagFract = 0.5 112 | #Minimum MSE 113 | #0.429310223079 114 | 115 | #With treeDepth = 8 116 | # bagFract = 0.5 117 | #Minimum MSE 118 | #0.395838627928 119 | 120 | #With treeDepth = 10 121 | # bagFract = 1.0 122 | #Minimum MSE 123 | #0.313120547589 -------------------------------------------------------------------------------- /07/wineGBM.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import numpy 3 | from sklearn.cross_validation import train_test_split 4 | from sklearn import ensemble 5 | from sklearn.metrics import mean_squared_error 6 | import pylab as plot 7 | 8 | # Read wine quality data from UCI website 9 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 10 | data = urllib2.urlopen(target_url) 11 | 12 | xList = [] 13 | labels = [] 14 | names = [] 15 | firstLine = True 16 | for line in data: 17 | if firstLine: 18 | names = line.strip().split(";") 19 | firstLine = False 20 | else: 21 | #split on semi-colon 22 | row = line.strip().split(";") 23 | #put labels in separate array 24 | labels.append(float(row[-1])) 25 | #remove label from row 26 | row.pop() 27 | #convert row to floats 28 | floatRow = [float(num) for num in row] 29 | xList.append(floatRow) 30 | 31 | nrows = len(xList) 32 | ncols = len(xList[0]) 33 | 34 | X = numpy.array(xList) 35 | y = numpy.array(labels) 36 | wineNames = numpy.array(names) 37 | 38 | #take fixed holdout set 30% of data rows 39 | xTrain, xTest, 
yTrain, yTest = train_test_split(X, y, test_size=0.30, random_state=531) 40 | 41 | # Train gradient boosting model to minimize mean squared error 42 | nEst = 2000 43 | depth = 7 44 | learnRate = 0.01 45 | subSamp = 0.5 46 | wineGBMModel = ensemble.GradientBoostingRegressor(n_estimators=nEst, 47 | max_depth=depth, 48 | learning_rate=learnRate, 49 | subsample = subSamp, 50 | loss='ls') 51 | 52 | wineGBMModel.fit(xTrain, yTrain) 53 | 54 | # compute mse on test set 55 | msError = [] 56 | predictions = wineGBMModel.staged_predict(xTest) 57 | for p in predictions: 58 | msError.append(mean_squared_error(yTest, p)) 59 | 60 | print("MSE" ) 61 | print(min(msError)) 62 | print(msError.index(min(msError))) 63 | 64 | #plot training and test errors vs number of trees in ensemble 65 | plot.figure() 66 | plot.plot(range(1, nEst + 1), wineGBMModel.train_score_, label='Training Set MSE') 67 | plot.plot(range(1, nEst + 1), msError, label='Test Set MSE') 68 | plot.legend(loc='upper right') 69 | plot.xlabel('Number of Trees in Ensemble') 70 | plot.ylabel('Mean Squared Error') 71 | plot.show() 72 | 73 | # Plot feature importance 74 | featureImportance = wineGBMModel.feature_importances_ 75 | 76 | # normalize by max importance 77 | featureImportance = featureImportance / featureImportance.max() 78 | idxSorted = numpy.argsort(featureImportance) 79 | barPos = numpy.arange(idxSorted.shape[0]) + .5 80 | plot.barh(barPos, featureImportance[idxSorted], align='center') 81 | plot.yticks(barPos, wineNames[idxSorted]) 82 | plot.xlabel('Variable Importance') 83 | plot.subplots_adjust(left=0.2, right=0.9, top=0.9, bottom=0.1) 84 | plot.show() 85 | 86 | 87 | # Printed Output: 88 | # for: 89 | #nEst = 2000 90 | #depth = 7 91 | #learnRate = 0.01 92 | #subSamp = 0.5 93 | # 94 | # MSE 95 | # 0.313361215728 96 | # 840 97 | -------------------------------------------------------------------------------- /07/wineRF.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import numpy 3 | from sklearn.cross_validation import train_test_split 4 | from sklearn import ensemble 5 | from sklearn.metrics import mean_squared_error 6 | import pylab as plot 7 | 8 | 9 | # Read wine quality data from UCI website 10 | target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 11 | data = urllib2.urlopen(target_url) 12 | 13 | xList = [] 14 | labels = [] 15 | names = [] 16 | firstLine = True 17 | for line in data: 18 | if firstLine: 19 | names = line.strip().split(";") 20 | firstLine = False 21 | else: 22 | #split on semi-colon 23 | row = line.strip().split(";") 24 | #put labels in separate array 25 | labels.append(float(row[-1])) 26 | #remove label from row 27 | row.pop() 28 | #convert row to floats 29 | floatRow = [float(num) for num in row] 30 | xList.append(floatRow) 31 | 32 | nrows = len(xList) 33 | ncols = len(xList[0]) 34 | 35 | X = numpy.array(xList) 36 | y = numpy.array(labels) 37 | wineNames = numpy.array(names) 38 | 39 | #take fixed holdout set 30% of data rows 40 | xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.30, random_state=531) 41 | 42 | #train random forest at a range of ensemble sizes in order to see how the mse changes 43 | mseOos = [] 44 | nTreeList = range(50, 500, 10) 45 | for iTrees in nTreeList: 46 | depth = None 47 | maxFeat = 4 #try tweaking 48 | wineRFModel = ensemble.RandomForestRegressor(n_estimators=iTrees, max_depth=depth, max_features=maxFeat, 49 | oob_score=False, random_state=531) 50 | 51 | 
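#Aside: this loop refits a new forest from scratch for each ensemble size in
#nTreeList. RandomForestRegressor also supports warm_start=True, which keeps the
#trees already grown and only adds new ones when n_estimators is raised. A possible
#alternative sketch (incrementalRF is an illustrative name, not part of the
#original script):
#    incrementalRF = ensemble.RandomForestRegressor(warm_start=True, n_estimators=50,
#                                                   max_features=4, random_state=531)
#    for nTrees in nTreeList:
#        incrementalRF.set_params(n_estimators=nTrees)
#        incrementalRF.fit(xTrain, yTrain)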
wineRFModel.fit(xTrain,yTrain) 52 | 53 | #Accumulate mse on test set 54 | prediction = wineRFModel.predict(xTest) 55 | mseOos.append(mean_squared_error(yTest, prediction)) 56 | 57 | 58 | print("MSE") 59 | print(mseOos[-1]) 60 | 61 | 62 | #plot test-set MSE vs number of trees in ensemble 63 | plot.plot(nTreeList, mseOos) 64 | plot.xlabel('Number of Trees in Ensemble') 65 | plot.ylabel('Mean Squared Error') 66 | #plot.ylim([0.0, 1.1*max(mseOob)]) 67 | plot.show() 68 | 69 | # Plot feature importance 70 | featureImportance = wineRFModel.feature_importances_ 71 | 72 | # normalize by max importance 73 | featureImportance = featureImportance / featureImportance.max() 74 | sorted_idx = numpy.argsort(featureImportance) 75 | barPos = numpy.arange(sorted_idx.shape[0]) + .5 76 | plot.barh(barPos, featureImportance[sorted_idx], align='center') 77 | plot.yticks(barPos, wineNames[sorted_idx]) 78 | plot.xlabel('Variable Importance') 79 | plot.subplots_adjust(left=0.2, right=0.9, top=0.9, bottom=0.1) 80 | plot.show() 81 | 82 | #printed output 83 | #MSE 84 | #0.314125711509 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Machine Learning in Python: Essential Techniques for Predictive Analysis 2 | =================== 3 | This repository is a clone of the source code that accompanies the book. The original source is distributed by the publisher at: http://www.wiley.com/WileyCDA/WileyTitle/productCd-1118961749.html 4 | 5 | In the original source code, all of the data is fetched on the fly through urllib calls to the UCI repository, which is slow. This repository changes that so the data can be loaded from local files. --------------------------------------------------------------------------------
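A note on data loading: the README states that data loading was changed from on-the-fly urllib calls to local files, but the listings above still show the urllib2 pattern. A minimal sketch of the local-file variant for the wine-quality scripts, assuming winequality-red.csv has already been downloaded into the working directory; the helper name readLocalCsv is illustrative and not part of the original code:

def readLocalCsv(path, delimiter=";"):
    #read a delimiter-separated file with a header row of column names;
    #last column is the label, remaining columns are numeric attributes
    xList = []
    labels = []
    with open(path) as f:
        names = f.readline().strip().split(delimiter)
        for line in f:
            row = line.strip().split(delimiter)
            labels.append(float(row[-1]))
            xList.append([float(num) for num in row[:-1]])
    return names, xList, labels

names, xList, labels = readLocalCsv("winequality-red.csv")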
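The training times in 07/timingComparisons.txt are formatted like Python datetime.timedelta values (for example 0:00:02.354401). The scripts above do not show how those timings were collected; a minimal sketch of one way such a measurement could be taken around a model fit (the variable names are illustrative):

from datetime import datetime

startTime = datetime.now()
#fit the model of interest here, e.g. wineRFModel.fit(xTrain, yTrain)
elapsed = datetime.now() - startTime
print(elapsed)    #prints a timedelta such as 0:00:02.354401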