├── .gitignore ├── LICENSE ├── Makefile ├── R ├── main.Rmd └── make.R ├── README.markdown ├── measurements ├── GADT.dat ├── Python.dat ├── ST.dat ├── STU.dat └── Vector.dat ├── result ├── figure │ ├── haskell-densities-1.png │ └── haskell-python-densities-1.png └── main.md ├── sh └── poorManBenchmarTool.sh └── src ├── Main.py ├── MainGADT.hs ├── MainST.hs ├── MainSTU.hs └── MainVector.hs /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | *.swp 3 | *.hi 4 | *.o 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | result/main.md: R/make.R measurements/Python.dat measurements/GADT.dat measurements/STU.dat measurements/ST.dat measurements/Vector.dat R/main.Rmd 2 | R --slave -f $< 3 | mv main.md result/ 4 | mv figure result/ 5 | 6 | measurements/Python.dat: src/Main.py sh/poorManBenchmarTool.sh 7 | bash sh/poorManBenchmarTool.sh "python $<" > $@ 8 | 9 | measurements/GADT.dat: build/MainGADT sh/poorManBenchmarTool.sh 10 | bash sh/poorManBenchmarTool.sh $< > $@ 11 | 12 | measurements/ST.dat: build/MainST sh/poorManBenchmarTool.sh 13 | bash sh/poorManBenchmarTool.sh $< > $@ 14 | 15 | measurements/STU.dat: build/MainSTU sh/poorManBenchmarTool.sh 16 | bash sh/poorManBenchmarTool.sh $< > $@ 17 | 18 | measurements/Vector.dat: build/MainVector sh/poorManBenchmarTool.sh 19 | bash sh/poorManBenchmarTool.sh $< > $@ 20 | 21 | build/MainGADT: src/MainGADT.hs 22 | ghc -O2 $< -o $@ 23 | 24 | build/MainST: src/MainST.hs 25 | ghc -O2 $< -o $@ 26 | 27 | build/MainSTU: src/MainSTU.hs 28 | ghc -O2 $< -o $@ 29 | 30 | build/MainVector: src/MainVector.hs 31 | ghc -O2 $< -o $@ 32 | -------------------------------------------------------------------------------- /R/main.Rmd: -------------------------------------------------------------------------------- 1 | # "Haskell's stochastic gradient descent implementations comparison. 2 | 3 | **Date:** "2016-07-14" **Author:** "Aner Oscar Lucero" 4 | 5 | ## Abstract 6 | 7 | I made three implementations of *stochastic gradient descent* using GADTs, boxed 8 | arrays, unboxed arrays and `Data.Vector`. I found that the GADTs implementation runs faster. 
I 9 | also compared performance with python. 10 | 11 | ## Method 12 | 13 | I made various implementations of *stochastic gradient descent* **(SGD)** to train logistic 14 | regression models. SGD finds the optimal parameters for the model using one 15 | example at a time, in contrast to batch-gradient descent which uses many samples 16 | at once to train the model. SGD only requires to define vector dot product. The 17 | implementations differ in the way I implemented the vectors. 18 | 19 | In pseudo-Haskell-code, SGD looks like: 20 | 21 | ```haskell 22 | model :: Vector (nFeatures + 1) Double 23 | features :: Vector (nFeatures) Double 24 | 25 | model := initial_model --Usually all zeros 26 | foreach example in examples: 27 | (features, target) := example 28 | prediction = hypotesis model features 29 | difference = prediction - target 30 | model = (model - learningRate * (difference * features + lambda * model')) 31 | 32 | where 33 | --here `dot` represents the scalar product 34 | hypotesis :: model -> features -> target 35 | hypotesis = 1 / ( 1 + exp (negate $ model `dot` (1 :- features))) 36 | model' = "model with it first element set to 0" 37 | ``` 38 | 39 | **GADTs** 40 | 41 | The first implementation performs vector operations using: 42 | 43 | ```haskell 44 | infixr 5 :- 45 | data Vector :: Nat -> * -> * where 46 | Nil :: Vector 'Z a 47 | (:-) :: a -> Vector n a -> Vector ('S n) a 48 | 49 | -- Inner product 50 | {-# INLINE (%.) #-} 51 | (%.) :: Num a => Vector ('S n) a -> Vector ('S n) a -> a 52 | (%.) (a :- Nil) (b :- Nil) = a * b 53 | (%.) (a :- as@(_ :- _)) (b :- bs@(_ :- _)) = a * b + (as %. bs) 54 | 55 | ``` 56 | 57 | The `foreach` part of the algorithm is handled by 58 | ```haskell 59 | nextModel :: Num a => 60 | a -> --lambda 61 | a -> --learningRate 62 | a -> --difference 63 | Vector ('S n) a -> --model 64 | Vector n a -> --Features 65 | Vector ('S n) a --the resulting model 66 | 67 | nextModel lambda learningRate difference (m :- model) features = 68 | (m - learningRate * difference) :- nextModel' lambda learningRate difference model features 69 | 70 | nextModel' :: Num a => a -> a -> a -> Vector n a -> Vector n a -> Vector n a 71 | nextModel' _ _ _ Nil Nil = Nil 72 | nextModel' lambda learningRate difference (m :- ms) (f :- fs) = 73 | (m - learningRate * (difference * f + lambda * m)) :- nextModel' lambda learningRate difference ms fs 74 | ``` 75 | 76 | With the hypothesis calculated as: 77 | 78 | ```haskell 79 | sigmoidHypothesis :: Model -> Features -> Target 80 | sigmoidHypothesis model features = 81 | 1 / ( 1 + exp (negate $ model %. (1 :- features))) 82 | ``` 83 | The whole code can be found [here](../src/MainGADT.hs). 84 | 85 | **MonadST, STArray & IArray** 86 | 87 | The second implementation performs vector operations using an `IArray` 88 | parametrized by its number of elements: 89 | 90 | ```haskell 91 | data Vector :: Nat -> * where 92 | Vector :: SNat n -> Int -> Array Int Double -> Vector n 93 | ``` 94 | 95 | The `foreach` part of the algorithm and the hypothesis are handled by the 96 | following imperative-style code. 
97 | 98 | ```haskell 99 | {-# INLINE nextModel #-} 100 | nextModel :: 101 | Double -> --lambda 102 | Double -> --learningRate 103 | Double -> --difference 104 | Model -> --model 105 | Features -> 106 | Model --the resulting model 107 | 108 | nextModel lambda learningRate difference (Vector sn nResultElements modelArr) (Vector _ _ featureArr) = Vector sn nResultElements $ runSTArray $ do 109 | result <- newArray (1, nResultElements) 0 :: ST s (STArray s Int Double) 110 | writeArray result 1 $ (modelArr ! 1) - learningRate * difference 111 | forM_ [2..nResultElements] (\elementIndex -> 112 | do 113 | let modelElement = modelArr ! elementIndex 114 | let featureElement = featureArr ! (elementIndex - 1) 115 | writeArray result elementIndex $ modelElement - learningRate * (difference * featureElement + lambda * modelElement)) 116 | return result 117 | 118 | sigmoidHypothesis :: Model -> Features -> Target 119 | sigmoidHypothesis (Vector _ nElements modelArr) (Vector _ _ featuresArr) = runST $ do 120 | expo <- newSTRef $ modelArr ! 1 121 | forM_ [2..nElements] (\elementIndex -> 122 | modifySTRef expo (+ (modelArr ! elementIndex) * (featuresArr ! (elementIndex - 1)))) 123 | readSTRef expo >>= \e -> return $ 1 / (1 + exp (negate e)) 124 | 125 | ``` 126 | 127 | The whole code can be found [here](../src/MainST.hs). 128 | 129 | **MonadST, STUArray & UArray** 130 | 131 | According to the [wiki](https://wiki.haskell.org/Arrays#Unboxed_arrays), `unboxed` 132 | arrays are a lighter version of `IArray`. It is also easy to change code using 133 | `IArray` and `STArray` to start using `UArray` and `STUArray`. This third 134 | implementation does just that. 135 | 136 | The whole code can be found [here](../src/MainSTU.hs). 137 | 138 | **Data.Vector** 139 | 140 | Haskell's [Data.Vector](https://hackage.haskell.org/package/vector) library is 141 | an efficient implementation of Int-indexed arrays (both mutable and immutable), 142 | with a powerful loop optimisation framework. 143 | 144 | Here are the highlights of implementing SGD using this library: 145 | 146 | ```haskell 147 | {-# INLINE nextModel #-} 148 | import qualified Data.Vector as V 149 | 150 | data Vector :: Nat -> * where 151 | Vector :: SNat n -> V.Vector Double -> Vector n 152 | 153 | nextModel :: 154 | Double -> --lambda 155 | Double -> --learningRate 156 | Double -> --difference 157 | Model -> --model 158 | Features -> 159 | Model --the resulting model 160 | 161 | nextModel lambda learningRate difference (Vector sn modelArr) (Vector _ featureArr) = Vector sn $ 162 | V.update 163 | (V.zipWith (-) modelArr $ (* learningRate) <$> V.zipWith (+) (fmap (* difference) featureArr) (fmap (* lambda) modelArr)) 164 | (V.singleton (0, (modelArr V.! 0) - learningRate * difference)) 165 | 166 | sigmoidHypothesis :: Model -> Features -> Target 167 | sigmoidHypothesis (Vector _ modelArr) (Vector _ featuresArr) = 168 | 1 / ( 1 + exp (negate $ V.foldl' (+) 0 $ V.zipWith (*) modelArr (V.cons 1 featuresArr))) 169 | ``` 170 | 171 | The whole code can be found [here](../src/MainVector.hs). 172 | 173 | **Python** 174 | 175 | I also made a python-3.5.2 implementation to compare. This version uses the 176 | [`numpy`](http://www.numpy.org/)-1.11.1 python library to perform the vector operations. 177 | 178 | The code can be found [here](../src/Main.py). 179 | 180 | **Benchmark** 181 | 182 | For each implementation I measured the time needed to train a model using 183 | 1,000,000 examples. This measurement is repeated 100 times to observe the 184 | variations. 
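In outline, each run simply wraps the program invocation between `date +%s.%N` timestamps in a shell loop and prints the elapsed seconds.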
The code performing this operation is found 185 | [here](../sh/poorManBenchmarTool.sh). 186 | 187 | All code was compiled using `-O2`. 188 | 189 | ## Results 190 | 191 | **Comparing Haskell implementations** 192 | ```{r haskell-densities, echo=FALSE} 193 | library("ggplot2") 194 | 195 | GADT <- read.table("../measurements/GADT.dat") 196 | ST <- read.table("../measurements/ST.dat") 197 | STU <- read.table("../measurements/STU.dat") 198 | VECTOR <- read.table("../measurements/Vector.dat") 199 | 200 | DF <- data.frame(GADT= GADT$V1, ST=ST$V1, STU= STU$V1, VECTOR=VECTOR$V1) 201 | dfs <- stack(DF) 202 | names(dfs)<- c("Time","Implementation") 203 | 204 | p <- ggplot(dfs, aes(x=Time)) 205 | p <- p + geom_density(aes(group=Implementation, colour=Implementation, fill=Implementation), alpha=0.3) 206 | p <- p + ggtitle("Running times for the Haskell implementations.") 207 | p + xlab("Time in seconds") 208 | ``` 209 | 210 | * The GADT implementation is the fastest. 211 | 212 | Here are some summary statistics of the running times for each Haskell 213 | implementation. 214 | 215 | ```{r haskell-summaries, echo=FALSE} 216 | summary(DF) 217 | ``` 218 | 219 | **Comparing to (python + numpy)** 220 | 221 | ```{r haskell-python-densities, echo=FALSE} 222 | 223 | PYTHON <- read.table("../measurements/Python.dat") 224 | 225 | DF <- data.frame(GADT= GADT$V1, PYTHON= PYTHON$V1) 226 | dfs <- stack(DF) 227 | names(dfs)<- c("Time","Implementation") 228 | 229 | pp <- ggplot(dfs, aes(x=Time)) 230 | pp <- pp + geom_density(aes(group=Implementation, colour=Implementation, fill=Implementation), alpha=0.3) 231 | pp <- pp + ggtitle("Comparing running times with (python + numpy)") 232 | pp + xlab("Time in seconds") 233 | ``` 234 | 235 | My Haskell implementation is about 9 times faster than my Python one. 236 | 237 | Here are some summary statistics of the running times for both 238 | implementations. 239 | 240 | ```{r haskell-python-summaries, echo=FALSE} 241 | summary(DF) 242 | ``` 243 | 244 | ## Notes 245 | 246 | I wanted my Haskell implementations to use type-safe scalar products. The 247 | convoluted computation of the next model is a consequence of performance 248 | optimizations. 249 | 250 | For the Python comparison, I wanted to see how Haskell compares to a Python 251 | solution using what appears to be a popular library. In other words, I don't claim 252 | that my Python implementation is the best possible one. 253 | 254 | All the raw data can be found in the [measurements](../measurements) folder. 255 | -------------------------------------------------------------------------------- /R/make.R: -------------------------------------------------------------------------------- 1 | require("knitr") 2 | knit("R/main.Rmd") 3 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | # Comparing Different Stochastic Gradient Descent implementations in Haskell against Python.
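To reproduce the measurements and regenerate the report, running `make` from the repository root should suffice, assuming GHC, Python 3 with `numpy`, and R with `knitr` and `ggplot2` are installed; note that the Makefile compiles the Haskell binaries into `build/`, which it does not create itself.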
2 | 3 | See [result/main.md](result/main.md) 4 | 5 | Check criterion and vector 6 | -------------------------------------------------------------------------------- /measurements/GADT.dat: -------------------------------------------------------------------------------- 1 | 3.296215387 2 | 3.296006663 3 | 3.297082328 4 | 3.273238485 5 | 3.274475402 6 | 3.280059504 7 | 3.261387070 8 | 3.319965886 9 | 3.309378759 10 | 3.256174888 11 | 3.282809945 12 | 3.332691167 13 | 3.284747167 14 | 3.323907751 15 | 3.269530507 16 | 3.301798704 17 | 3.300946360 18 | 3.307245725 19 | 3.287295620 20 | 3.311520708 21 | 3.332461928 22 | 3.277716368 23 | 3.268542010 24 | 3.295990599 25 | 3.279582568 26 | 3.275063380 27 | 3.283725500 28 | 3.293785629 29 | 3.325427969 30 | 3.290225296 31 | 3.281507695 32 | 3.277553079 33 | 3.264322647 34 | 3.281448330 35 | 3.338029872 36 | 3.278580023 37 | 3.306868908 38 | 3.270407070 39 | 3.299840862 40 | 3.318207692 41 | 3.339362023 42 | 3.319625747 43 | 3.395011324 44 | 3.314813481 45 | 3.264964418 46 | 3.290781554 47 | 3.289775144 48 | 3.317314214 49 | 3.286261992 50 | 3.344243082 51 | 3.300509661 52 | 3.312652665 53 | 3.293939562 54 | 3.261842029 55 | 3.276131532 56 | 3.319239176 57 | 3.286303828 58 | 3.327178580 59 | 3.343896810 60 | 3.409834834 61 | 3.296540447 62 | 3.333961066 63 | 3.304066950 64 | 3.293266782 65 | 3.292796961 66 | 3.338154538 67 | 3.278089319 68 | 3.294620234 69 | 3.264330400 70 | 3.328510521 71 | 3.261334143 72 | 3.267222872 73 | 3.319535652 74 | 3.264449829 75 | 3.295406926 76 | 3.291554626 77 | 3.298569612 78 | 3.296716448 79 | 3.278584354 80 | 3.294950443 81 | 3.283095953 82 | 3.314441995 83 | 3.296246556 84 | 3.286817859 85 | 3.299936055 86 | 3.308276068 87 | 3.303014096 88 | 3.295106538 89 | 3.327679551 90 | 3.262077673 91 | 3.317587083 92 | 3.316162742 93 | 3.263759379 94 | 3.361914853 95 | 3.288955976 96 | 3.298148330 97 | 3.314143006 98 | 3.296001972 99 | 3.315617981 100 | 3.275872910 101 | -------------------------------------------------------------------------------- /measurements/Python.dat: -------------------------------------------------------------------------------- 1 | 30.298400912 2 | 29.646914065 3 | 29.491292726 4 | 29.442574150 5 | 29.517731458 6 | 30.332312829 7 | 31.034016463 8 | 30.258981508 9 | 29.964825087 10 | 30.508294144 11 | 29.755600720 12 | 29.621505226 13 | 29.829198055 14 | 29.616244644 15 | 30.246631434 16 | 30.458986920 17 | 30.532132587 18 | 29.953698865 19 | 30.048406519 20 | 30.425799976 21 | 29.925913799 22 | 29.838416322 23 | 29.701259311 24 | 29.757962086 25 | 29.682085784 26 | 29.544527766 27 | 29.975810036 28 | 30.174723492 29 | 30.354882438 30 | 30.232282105 31 | 30.178085799 32 | 29.931873160 33 | 29.956431196 34 | 29.654940103 35 | 29.872497846 36 | 29.479407384 37 | 29.728331444 38 | 29.639233225 39 | 29.973660968 40 | 30.668204076 41 | 29.676698571 42 | 29.883602060 43 | 29.955546944 44 | 29.808134944 45 | 30.102873520 46 | 30.020422101 47 | 29.708554750 48 | 30.739526168 49 | 30.425453385 50 | 29.610809509 51 | 30.193681636 52 | 30.090943808 53 | 29.618582898 54 | 29.938321433 55 | 29.418308761 56 | 29.736085800 57 | 29.687941609 58 | 29.819914306 59 | 29.928811333 60 | 29.749843895 61 | 29.738862402 62 | 29.695532632 63 | 29.737328074 64 | 30.052884300 65 | 30.003238512 66 | 30.017225722 67 | 29.975433716 68 | 29.834310414 69 | 29.770559314 70 | 29.838074942 71 | 29.799793231 72 | 29.818956613 73 | 29.755912091 74 | 29.726201424 75 | 29.659150859 76 | 29.858447740 77 | 29.999107963 78 | 29.746406519 79 
| 29.788389595 80 | 30.181551680 81 | 29.955214381 82 | 29.996634494 83 | 29.779631768 84 | 29.861305548 85 | 30.015473586 86 | 29.460564507 87 | 30.109260229 88 | 29.714336432 89 | 29.821912641 90 | 29.759745362 91 | 29.911053618 92 | 29.931780615 93 | 29.495626618 94 | 29.932425584 95 | 30.072814013 96 | 29.880054503 97 | 29.995265241 98 | 29.866600678 99 | 29.847838456 100 | 29.689209891 101 | -------------------------------------------------------------------------------- /measurements/ST.dat: -------------------------------------------------------------------------------- 1 | 4.510327859 2 | 4.528415857 3 | 4.517268519 4 | 4.487048794 5 | 4.509418690 6 | 4.478174886 7 | 4.596751308 8 | 4.458624346 9 | 4.557297130 10 | 4.510487250 11 | 4.508649894 12 | 4.578268928 13 | 4.470667418 14 | 4.615239512 15 | 4.420900982 16 | 4.516285637 17 | 4.487450523 18 | 4.741545827 19 | 4.521133403 20 | 4.472828650 21 | 4.539857759 22 | 4.487808889 23 | 4.474381315 24 | 4.463283434 25 | 4.479497668 26 | 4.546725545 27 | 4.506791757 28 | 4.463916607 29 | 4.486865631 30 | 4.473385190 31 | 4.479705228 32 | 4.466956933 33 | 4.596750467 34 | 4.459687997 35 | 4.476494766 36 | 4.456027619 37 | 4.545636815 38 | 4.511618935 39 | 4.638233362 40 | 4.450113890 41 | 4.477192210 42 | 4.465284597 43 | 4.499870987 44 | 4.488199508 45 | 4.493534369 46 | 4.485691657 47 | 4.505824127 48 | 4.477119214 49 | 4.513393457 50 | 4.477723839 51 | 4.414702724 52 | 4.495018492 53 | 4.420915252 54 | 4.525098208 55 | 4.485285541 56 | 4.540433744 57 | 4.469086373 58 | 4.472239327 59 | 4.603926728 60 | 4.490721561 61 | 4.487828152 62 | 4.434849581 63 | 4.528821260 64 | 4.584543207 65 | 4.516533481 66 | 4.509857264 67 | 4.462967943 68 | 4.565627795 69 | 4.499846705 70 | 4.489967640 71 | 4.533268329 72 | 4.588531837 73 | 4.473569686 74 | 4.545657035 75 | 4.499818898 76 | 4.518090033 77 | 4.496478135 78 | 4.497368727 79 | 4.501748585 80 | 4.476235847 81 | 4.510225236 82 | 4.500144194 83 | 4.495126080 84 | 4.588006203 85 | 4.468447532 86 | 4.449564754 87 | 4.476711030 88 | 4.418360777 89 | 4.498161247 90 | 4.497945277 91 | 4.485399611 92 | 4.591220468 93 | 4.477688821 94 | 4.466295674 95 | 4.430039479 96 | 4.489140901 97 | 4.478452593 98 | 4.455832196 99 | 4.524251271 100 | 4.501041973 101 | -------------------------------------------------------------------------------- /measurements/STU.dat: -------------------------------------------------------------------------------- 1 | 4.486657715 2 | 4.582209960 3 | 4.483951885 4 | 4.469898482 5 | 4.478890096 6 | 4.515619024 7 | 4.545208691 8 | 4.500610279 9 | 4.475105537 10 | 4.471619912 11 | 4.503291831 12 | 4.528003492 13 | 4.509146119 14 | 4.489187416 15 | 4.589508353 16 | 4.568899147 17 | 4.716448942 18 | 4.775688869 19 | 4.567365067 20 | 4.505835637 21 | 4.446328748 22 | 4.483875726 23 | 4.616809104 24 | 4.463984168 25 | 4.512609600 26 | 4.634628855 27 | 4.529211926 28 | 4.486485838 29 | 4.482608337 30 | 4.467952979 31 | 4.555757973 32 | 4.469992398 33 | 4.504105621 34 | 4.529341091 35 | 4.552961340 36 | 4.744020910 37 | 4.714937222 38 | 4.541657370 39 | 4.483436065 40 | 4.475052678 41 | 4.480307223 42 | 4.450951372 43 | 4.524391518 44 | 4.571955703 45 | 4.801147962 46 | 4.593816459 47 | 4.669484471 48 | 4.469363960 49 | 4.452508676 50 | 4.534838649 51 | 4.451288394 52 | 4.456801493 53 | 4.609436117 54 | 4.476353998 55 | 4.474063997 56 | 4.609712214 57 | 4.493748991 58 | 4.530868394 59 | 4.466770858 60 | 4.476693395 61 | 4.452480882 62 | 4.493457485 63 | 4.493686525 64 | 4.498395890 65 | 
4.467074434 66 | 4.446811874 67 | 4.472354367 68 | 4.525520524 69 | 4.494533679 70 | 4.475903102 71 | 4.498812644 72 | 4.465059663 73 | 4.544269740 74 | 4.463816132 75 | 4.485574269 76 | 4.495808858 77 | 4.553937941 78 | 4.483238174 79 | 4.463068855 80 | 4.514022295 81 | 4.456431198 82 | 4.490871412 83 | 4.469347242 84 | 4.462389469 85 | 4.468954592 86 | 4.587808652 87 | 4.483116117 88 | 4.468824093 89 | 4.463696933 90 | 4.493237773 91 | 4.496167095 92 | 4.522091619 93 | 4.517451253 94 | 4.532871843 95 | 4.499790397 96 | 4.463998546 97 | 4.495559199 98 | 4.543034115 99 | 4.548142487 100 | 4.465079626 101 | -------------------------------------------------------------------------------- /measurements/Vector.dat: -------------------------------------------------------------------------------- 1 | 4.381525027 2 | 4.321898336 3 | 4.309435375 4 | 4.306354677 5 | 4.241382336 6 | 4.263372136 7 | 4.341590570 8 | 4.312216038 9 | 4.302324142 10 | 4.339984295 11 | 4.291249067 12 | 4.269288742 13 | 4.322213189 14 | 4.290221704 15 | 4.284731555 16 | 4.391741698 17 | 4.257465111 18 | 4.273852384 19 | 4.314358707 20 | 4.330159803 21 | 4.341721884 22 | 4.438109382 23 | 4.660207510 24 | 4.347878603 25 | 4.412454457 26 | 4.327152303 27 | 4.292894610 28 | 4.369824335 29 | 4.308720920 30 | 4.272479245 31 | 4.330189913 32 | 4.313136845 33 | 4.331769655 34 | 4.333777382 35 | 4.297734615 36 | 4.287562796 37 | 4.357182377 38 | 4.275180711 39 | 4.291167934 40 | 4.271563075 41 | 4.382081695 42 | 4.337492943 43 | 4.284755671 44 | 4.336062665 45 | 4.357808998 46 | 4.270954343 47 | 4.313024621 48 | 4.324928925 49 | 4.327551814 50 | 4.302085318 51 | 4.342806752 52 | 4.322148408 53 | 4.312466315 54 | 4.384090689 55 | 4.307851765 56 | 4.328899196 57 | 4.314034952 58 | 4.383216768 59 | 4.294856057 60 | 4.355612502 61 | 4.315254664 62 | 4.340346951 63 | 4.332020967 64 | 4.323801349 65 | 4.331991145 66 | 4.318269296 67 | 4.299857814 68 | 4.344833697 69 | 4.344076479 70 | 4.652798230 71 | 4.422235691 72 | 4.358091736 73 | 4.353898189 74 | 4.379755588 75 | 4.379133582 76 | 4.360338393 77 | 4.340188563 78 | 4.273112881 79 | 4.348832188 80 | 4.313335160 81 | 4.307931613 82 | 4.247739067 83 | 4.237909469 84 | 4.293818862 85 | 4.284115957 86 | 4.333447700 87 | 4.328057492 88 | 4.333768553 89 | 4.294733646 90 | 4.379720818 91 | 4.337699430 92 | 4.298632187 93 | 4.294046690 94 | 4.344551418 95 | 4.280726218 96 | 4.310755235 97 | 4.414407623 98 | 4.317421235 99 | 4.332724433 100 | 4.293658098 101 | -------------------------------------------------------------------------------- /result/figure/haskell-densities-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argent0/haskell-ml-benchmark/4a36d1afc25ba97cc6d5661714c71627daf5d1e9/result/figure/haskell-densities-1.png -------------------------------------------------------------------------------- /result/figure/haskell-python-densities-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argent0/haskell-ml-benchmark/4a36d1afc25ba97cc6d5661714c71627daf5d1e9/result/figure/haskell-python-densities-1.png -------------------------------------------------------------------------------- /result/main.md: -------------------------------------------------------------------------------- 1 | # "Haskell's stochastic gradient descent implementations comparison. 
2 | 3 | **Date:** "2016-07-14" **Author:** "Aner Oscar Lucero" 4 | 5 | ## Abstract 6 | 7 | I made three implementations of *stochastic gradient descent* using GADTs, boxed 8 | arrays, unboxed arrays and `Data.Vector`. I found that the GADTs implementation runs faster. I 9 | also compared performance with python. 10 | 11 | ## Method 12 | 13 | I made various implementations of *stochastic gradient descent* **(SGD)** to train logistic 14 | regression models. SGD finds the optimal parameters for the model using one 15 | example at a time, in contrast to batch-gradient descent which uses many samples 16 | at once to train the model. SGD only requires to define vector dot product. The 17 | implementations differ in the way I implemented the vectors. 18 | 19 | In pseudo-Haskell-code, SGD looks like: 20 | 21 | ```haskell 22 | model :: Vector (nFeatures + 1) Double 23 | features :: Vector (nFeatures) Double 24 | 25 | model := initial_model --Usually all zeros 26 | foreach example in examples: 27 | (features, target) := example 28 | prediction = hypotesis model features 29 | difference = prediction - target 30 | model = (model - learningRate * (difference * features + lambda * model')) 31 | 32 | where 33 | --here `dot` represents the scalar product 34 | hypotesis :: model -> features -> target 35 | hypotesis = 1 / ( 1 + exp (negate $ model `dot` (1 :- features))) 36 | model' = "model with it first element set to 0" 37 | ``` 38 | 39 | **GADTs** 40 | 41 | The first implementation performs vector operations using: 42 | 43 | ```haskell 44 | infixr 5 :- 45 | data Vector :: Nat -> * -> * where 46 | Nil :: Vector 'Z a 47 | (:-) :: a -> Vector n a -> Vector ('S n) a 48 | 49 | -- Inner product 50 | {-# INLINE (%.) #-} 51 | (%.) :: Num a => Vector ('S n) a -> Vector ('S n) a -> a 52 | (%.) (a :- Nil) (b :- Nil) = a * b 53 | (%.) (a :- as@(_ :- _)) (b :- bs@(_ :- _)) = a * b + (as %. bs) 54 | 55 | ``` 56 | 57 | The `foreach` part of the algorithm is handled by 58 | ```haskell 59 | nextModel :: Num a => 60 | a -> --lambda 61 | a -> --learningRate 62 | a -> --difference 63 | Vector ('S n) a -> --model 64 | Vector n a -> --Features 65 | Vector ('S n) a --the resulting model 66 | 67 | nextModel lambda learningRate difference (m :- model) features = 68 | (m - learningRate * difference) :- nextModel' lambda learningRate difference model features 69 | 70 | nextModel' :: Num a => a -> a -> a -> Vector n a -> Vector n a -> Vector n a 71 | nextModel' _ _ _ Nil Nil = Nil 72 | nextModel' lambda learningRate difference (m :- ms) (f :- fs) = 73 | (m - learningRate * (difference * f + lambda * m)) :- nextModel' lambda learningRate difference ms fs 74 | ``` 75 | 76 | With the hypothesis calculated as: 77 | 78 | ```haskell 79 | sigmoidHypothesis :: Model -> Features -> Target 80 | sigmoidHypothesis model features = 81 | 1 / ( 1 + exp (negate $ model %. (1 :- features))) 82 | ``` 83 | The whole code can be found [here](../src/MainGADT.hs). 84 | 85 | **MonadST, STArray & IArray** 86 | 87 | The second implementation performs vector operations using an `IArray` 88 | parametrized by its number of elements: 89 | 90 | ```haskell 91 | data Vector :: Nat -> * where 92 | Vector :: SNat n -> Int -> Array Int Double -> Vector n 93 | ``` 94 | 95 | The `foreach` part of the algorithm and the hypothesis are handled by the 96 | following imperative-style code. 
97 | 98 | ```haskell 99 | {-# INLINE nextModel #-} 100 | nextModel :: 101 | Double -> --lambda 102 | Double -> --learningRate 103 | Double -> --difference 104 | Model -> --model 105 | Features -> 106 | Model --the resulting model 107 | 108 | nextModel lambda learningRate difference (Vector sn nResultElements modelArr) (Vector _ _ featureArr) = Vector sn nResultElements $ runSTArray $ do 109 | result <- newArray (1, nResultElements) 0 :: ST s (STArray s Int Double) 110 | writeArray result 1 $ (modelArr ! 1) - learningRate * difference 111 | forM_ [2..nResultElements] (\elementIndex -> 112 | do 113 | let modelElement = modelArr ! elementIndex 114 | let featureElement = featureArr ! (elementIndex - 1) 115 | writeArray result elementIndex $ modelElement - learningRate * (difference * featureElement + lambda * modelElement)) 116 | return result 117 | 118 | sigmoidHypothesis :: Model -> Features -> Target 119 | sigmoidHypothesis (Vector _ nElements modelArr) (Vector _ _ featuresArr) = runST $ do 120 | expo <- newSTRef $ modelArr ! 1 121 | forM_ [2..nElements] (\elementIndex -> 122 | modifySTRef expo (+ (modelArr ! elementIndex) * (featuresArr ! (elementIndex - 1)))) 123 | readSTRef expo >>= \e -> return $ 1 / (1 + exp (negate e)) 124 | 125 | ``` 126 | 127 | The whole code can be found [here](../src/MainST.hs). 128 | 129 | **MonadST, STUArray & UArray** 130 | 131 | According to the [wiki](https://wiki.haskell.org/Arrays#Unboxed_arrays), `unboxed` 132 | arrays are a lighter version of `IArray`. It is also easy to change code using 133 | `IArray` and `STArray` to start using `UArray` and `STUArray`. This third 134 | implementation does just that. 135 | 136 | The whole code can be found [here](../src/MainSTU.hs). 137 | 138 | **Data.Vector** 139 | 140 | Haskell's [Data.Vector](https://hackage.haskell.org/package/vector) library is 141 | an efficient implementation of Int-indexed arrays (both mutable and immutable), 142 | with a powerful loop optimisation framework. 143 | 144 | Here are the highlights of implementing SGD using this library: 145 | 146 | ```haskell 147 | {-# INLINE nextModel #-} 148 | import qualified Data.Vector as V 149 | 150 | data Vector :: Nat -> * where 151 | Vector :: SNat n -> V.Vector Double -> Vector n 152 | 153 | nextModel :: 154 | Double -> --lambda 155 | Double -> --learningRate 156 | Double -> --difference 157 | Model -> --model 158 | Features -> 159 | Model --the resulting model 160 | 161 | nextModel lambda learningRate difference (Vector sn modelArr) (Vector _ featureArr) = Vector sn $ 162 | V.update 163 | (V.zipWith (-) modelArr $ (* learningRate) <$> V.zipWith (+) (fmap (* difference) featureArr) (fmap (* lambda) modelArr)) 164 | (V.singleton (0, (modelArr V.! 0) - learningRate * difference)) 165 | 166 | sigmoidHypothesis :: Model -> Features -> Target 167 | sigmoidHypothesis (Vector _ modelArr) (Vector _ featuresArr) = 168 | 1 / ( 1 + exp (negate $ V.foldl' (+) 0 $ V.zipWith (*) modelArr (V.cons 1 featuresArr))) 169 | ``` 170 | 171 | The whole code can be found [here](../src/MainVector.hs). 172 | 173 | **Python** 174 | 175 | I also made a python-3.5.2 implementation to compare. This version uses the 176 | [`numpy`](http://www.numpy.org/)-1.11.1 python library to perform the vector operations. 177 | 178 | The code can be found [here](../src/Main.py). 179 | 180 | **Benchmark** 181 | 182 | For each implementation I measured the time needed to train a model using 183 | 1,000,000 examples. This measurement is repeated 100 times to observe the 184 | variations. 
The code performing this operation is found 185 | [here](../sh/poorManBenchmarTool.sh). 186 | 187 | All code was compiled using `-O2`. 188 | 189 | ## Results 190 | 191 | **Comparing haskell implementations** 192 | ![plot of chunk haskell-densities](figure/haskell-densities-1.png) 193 | 194 | * The GADT implementation is the best. 195 | 196 | Here are some summary statistics of the running times for each Haskell 197 | implementation. 198 | 199 | 200 | ``` 201 | ## GADT ST STU VECTOR 202 | ## Min. :3.256 Min. :4.415 Min. :4.446 Min. :4.238 203 | ## 1st Qu.:3.280 1st Qu.:4.474 1st Qu.:4.471 1st Qu.:4.295 204 | ## Median :3.296 Median :4.494 Median :4.494 Median :4.323 205 | ## Mean :3.299 Mean :4.503 Mean :4.518 Mean :4.330 206 | ## 3rd Qu.:3.315 3rd Qu.:4.517 3rd Qu.:4.537 3rd Qu.:4.344 207 | ## Max. :3.410 Max. :4.742 Max. :4.801 Max. :4.660 208 | ``` 209 | 210 | **Comparing to (python + numpy)** 211 | 212 | ![plot of chunk haskell-python-densities](figure/haskell-python-densities-1.png) 213 | 214 | My Haskell implementation is 9 times faster than my python one. 215 | 216 | Here are some summary statistics of the running times for both 217 | implementations. 218 | 219 | 220 | ``` 221 | ## GADT PYTHON 222 | ## Min. :3.256 Min. :29.42 223 | ## 1st Qu.:3.280 1st Qu.:29.73 224 | ## Median :3.296 Median :29.86 225 | ## Mean :3.299 Mean :29.91 226 | ## 3rd Qu.:3.315 3rd Qu.:30.02 227 | ## Max. :3.410 Max. :31.03 228 | ``` 229 | 230 | ## Notes 231 | 232 | I wanted my Haskell implementations to make type-safe scalar products. The 233 | convoluted computations of the next model are the consequence of performance 234 | optimizations. 235 | 236 | For the python comparison, I wanted to see how Haskell compared to a python 237 | solution using what appears to be a popular library. In other words, I don't claim 238 | that my python implementation is the best possible one. 239 | 240 | All the raw data can be found in the [measurements](../measurements) folder. 241 | -------------------------------------------------------------------------------- /sh/poorManBenchmarTool.sh: -------------------------------------------------------------------------------- 1 | count=0 2 | while test $count -lt 100 3 | do 4 | start="$(date +%s.%N)" 5 | $1 > /dev/null 6 | end="$(date +%s.%N)" 7 | 8 | runtime=$( echo "$end - $start" | bc -l ) 9 | 10 | printf "%s\n" $runtime 11 | count=$((count + 1)) 12 | done 13 | -------------------------------------------------------------------------------- /src/Main.py: -------------------------------------------------------------------------------- 1 | """ Stochastic gradient descent.""" 2 | import math 3 | import numpy as np 4 | 5 | N_FEATURES = 2 6 | 7 | def gradient(regularization_parameter, 8 | learning_rate, difference, model, 9 | features): 10 | """ The gradient """ 11 | model[0] = model[0] - learning_rate * difference 12 | model[1:] = model[1:] - learning_rate * (difference * features + regularization_parameter * model[1:]) 13 | return(model) 14 | 15 | def stochastic_gradient_descent( 16 | hypothesis, 17 | regularization_parameter, 18 | learning_rate, 19 | example, 20 | model): 21 | """ A stochastic_gradient_descent step. 
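
    Given one (features, target) example it computes the logistic prediction,
    the log-loss of that prediction against the target, and then applies one
    regularized gradient step via `gradient`:

        difference = prediction - target
        model[0]   = model[0]  - learning_rate * difference
        model[1:]  = model[1:] - learning_rate * (difference * features
                                 + regularization_parameter * model[1:])

    The bias term model[0] is not regularized. Returns (cost, next_model);
    note that `gradient` modifies the numpy model array in place.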
""" 22 | (features, target) = example 23 | approximation = hypothesis(model, features) 24 | difference = approximation - target 25 | cost = -(math.log(approximation)*target + math.log(1-approximation)*(1-target)) 26 | next_model = gradient(regularization_parameter, 27 | learning_rate, difference, model, 28 | features) 29 | return (cost, next_model) 30 | 31 | def sigmoid_hypothesis(model, features): 32 | """ The sigmoid function """ 33 | return(1 / (1 + math.exp (-(np.dot(model[1:], features) + model[0])))) 34 | 35 | def examples(): 36 | """ Generator of examples """ 37 | n_examples = 1 38 | while n_examples < 1000000: 39 | features = np.random.rand(N_FEATURES) 40 | if features[1] > 0.5: 41 | target = 1 42 | else: 43 | target = 0 44 | yield(features, target) 45 | n_examples += 1 46 | 47 | if __name__ == "__main__": 48 | (err, model) = (0, np.zeros(N_FEATURES + 1)) 49 | for example in examples(): 50 | #print(example) 51 | (err, model) = stochastic_gradient_descent( 52 | sigmoid_hypothesis, 53 | 0.01, 54 | 0.1, 55 | example, 56 | model) 57 | print((err,model)) 58 | -------------------------------------------------------------------------------- /src/MainGADT.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -Wall #-} 2 | 3 | {-# LANGUAGE GADTs #-} 4 | {-# LANGUAGE DataKinds #-} 5 | {-# LANGUAGE TypeFamilies #-} 6 | {-# LANGUAGE TypeOperators #-} 7 | {-# LANGUAGE TupleSections #-} 8 | {-# LANGUAGE Strict #-} 9 | 10 | module Main (main) where 11 | 12 | --import Naturals 13 | 14 | import System.Random 15 | 16 | data Nat :: * where 17 | Z :: Nat 18 | S :: Nat -> Nat 19 | 20 | data SNat :: Nat -> * where 21 | SZ :: SNat 'Z 22 | SS :: SNat n -> SNat ('S n) 23 | 24 | infixr 5 :- 25 | data Vector :: Nat -> * -> * where 26 | Nil :: Vector 'Z a 27 | (:-) :: a -> Vector n a -> Vector ('S n) a 28 | 29 | instance Show a => Show (Vector n a) where 30 | show Nil = "Nil" 31 | show (x :- xs) = show x ++ " :- " ++ show xs 32 | 33 | zeros :: Num a => SNat n -> Vector n a 34 | zeros SZ = Nil 35 | zeros (SS s) = 0 :- zeros s 36 | 37 | -- Inner product 38 | {-# INLINE (%.) #-} 39 | (%.) :: Num a => Vector ('S n) a -> Vector ('S n) a -> a 40 | (%.) (a :- Nil) (b :- Nil) = a * b 41 | (%.) (a :- as@(_ :- _)) (b :- bs@(_ :- _)) = a * b + (as %. 
bs) 42 | 43 | type NFeatures = ('S ('S 'Z)) 44 | 45 | sNFeatures :: SNat NFeatures 46 | sNFeatures = SS (SS SZ) 47 | 48 | type Model = Vector ('S NFeatures) Double 49 | type Target = Double 50 | type Features = Vector NFeatures Double 51 | 52 | type Hypothesis = Model -> Features -> Target 53 | 54 | -- General purpose example 55 | data Example = Example 56 | Features 57 | Target deriving Show 58 | 59 | {-# INLINE nextModel #-} 60 | nextModel :: Num a => 61 | a -> --lambda 62 | a -> --learningRate 63 | a -> --difference 64 | Vector ('S n) a -> --model 65 | Vector n a -> --Features 66 | Vector ('S n) a --the resulting model 67 | 68 | nextModel lambda learningRate difference (m :- model) features = 69 | (m - learningRate * difference) :- nextModel' lambda learningRate difference model features 70 | 71 | nextModel' :: Num a => a -> a -> a -> Vector n a -> Vector n a -> Vector n a 72 | nextModel' _ _ _ Nil Nil = Nil 73 | nextModel' lambda learningRate difference (m :- ms) (f :- fs) = 74 | (m - learningRate * (difference * f + lambda * m)) :- nextModel' lambda learningRate difference ms fs 75 | 76 | stochaticGradientDescentUpdate :: Hypothesis -> Double -> Double -> Example -> Model -> (Target, Model) 77 | stochaticGradientDescentUpdate hypothesis lambda learningRate (Example features target) model = 78 | let 79 | cost = negate (log approximation*target + log(1 - approximation)*(1-target)) 80 | grad = nextModel lambda learningRate difference model features 81 | in ((,) $! cost) $! grad 82 | where 83 | difference :: Target 84 | difference = approximation - target 85 | approximation :: Target 86 | approximation = hypothesis model features 87 | {-- Octave code 88 | h = sigmoid(theta'*X'); 89 | reg_theta = [0; theta(2:end)]; 90 | 91 | first_cost_term = (log(h)*y); 92 | second_const_term = log(1 - h) * (1 - y); 93 | third_cost_term = (lambda/2)*sum(reg_theta.^2); 94 | 95 | J = ((-1)/m)*( first_cost_term + second_const_term - third_cost_term); 96 | grad = (1/m)*( ( (h - y') * X) + (lambda) *reg_theta'); 97 | -} 98 | 99 | -- Test main 100 | main :: IO () 101 | main = 102 | print $ foldr folder (1000,zeroModel) $ take 1000000 d 103 | where 104 | folder ex (_, model) = 105 | let 106 | (err, nxtModel) = stochaticGradientDescentUpdate sigmoidHypothesis 0.01 0.1 ex model 107 | in (err, nxtModel) 108 | 109 | d :: [Example] 110 | d = example <$> randomRs (0,1) (mkStdGen 332) <*> randomRs (0,1) (mkStdGen 2132) 111 | example :: Double -> Double -> Example 112 | example x y = Example (x :- y :- Nil) (if y > 0.5 then 1 else 0) 113 | zeroModel = 0 :- zeros sNFeatures 114 | 115 | sigmoidHypothesis :: Model -> Features -> Target 116 | sigmoidHypothesis model features = 117 | 1 / ( 1 + exp (negate $ model %. 
(1 :- features))) 118 | 119 | -- vim: expandtab 120 | -------------------------------------------------------------------------------- /src/MainST.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -Wall #-} 2 | 3 | {-# LANGUAGE GADTs #-} 4 | {-# LANGUAGE DataKinds #-} 5 | {-# LANGUAGE TypeFamilies #-} 6 | {-# LANGUAGE TypeOperators #-} 7 | {-# LANGUAGE TupleSections #-} 8 | {-# LANGUAGE Strict #-} 9 | 10 | module Main (main) where 11 | 12 | import System.Random 13 | import Data.Array.ST 14 | import Control.Monad.ST 15 | import Data.STRef 16 | import Control.Monad 17 | import Data.Array.IArray 18 | import Data.Foldable (toList) 19 | 20 | data Nat :: * where 21 | Z :: Nat 22 | S :: Nat -> Nat 23 | 24 | data SNat :: Nat -> * where 25 | SZ :: SNat 'Z 26 | SS :: SNat n -> SNat ('S n) 27 | 28 | instance Show (SNat n) where 29 | show = ("SNat " ++) . show . snatToInt 30 | 31 | snatToInt :: SNat n -> Int 32 | snatToInt = snatToInt' 0 33 | where 34 | snatToInt' :: Int -> SNat n -> Int 35 | snatToInt' acc SZ = acc 36 | snatToInt' acc (SS sn) = snatToInt' (acc+1) sn 37 | 38 | infixr 5 :- 39 | data Vector' :: Nat -> * -> * where 40 | Nil :: Vector' 'Z a 41 | (:-) :: a -> Vector' n a -> Vector' ('S n) a 42 | 43 | instance Foldable (Vector' n) where 44 | foldMap _ Nil = mempty 45 | foldMap f (x :- xs) = f x `mappend` foldMap f xs 46 | 47 | data Vector :: Nat -> * where 48 | Vector :: SNat n -> Int -> Array Int Double -> Vector n 49 | 50 | instance Show (Vector n) where 51 | show (Vector _ nElements a) = "Vector " ++ show nElements ++ " " ++ show a 52 | 53 | vector :: SNat ('S n) -> Vector' ('S n) Double -> Vector ('S n) 54 | vector sn = Vector sn (snatToInt sn) . listArray (1, snatToInt sn) . toList 55 | 56 | zeros :: SNat ('S n) -> Vector ('S n) 57 | zeros sn = Vector sn (snatToInt sn) $ listArray (1, snatToInt sn) $ replicate (snatToInt sn) 0 58 | 59 | type NFeatures = ('S ('S 'Z)) 60 | 61 | sNFeatures :: SNat NFeatures 62 | sNFeatures = SS (SS SZ) 63 | 64 | type Model = Vector ('S NFeatures) 65 | type Target = Double 66 | type Features = Vector NFeatures 67 | 68 | type Hypothesis = Model -> Features -> Target 69 | 70 | -- General purpose example 71 | data Example = Example 72 | Features 73 | Target deriving Show 74 | 75 | {-# INLINE nextModel #-} 76 | nextModel :: 77 | Double -> --lambda 78 | Double -> --learningRate 79 | Double -> --difference 80 | Model -> --model 81 | Features -> 82 | Model --the resulting model 83 | 84 | nextModel lambda learningRate difference (Vector sn nResultElements modelArr) (Vector _ _ featureArr) = Vector sn nResultElements $ runSTArray $ do 85 | result <- newArray (1, nResultElements) 0 :: ST s (STArray s Int Double) 86 | writeArray result 1 $ (modelArr ! 1) - learningRate * difference 87 | forM_ [2..nResultElements] (\elementIndex -> 88 | do 89 | let modelElement = modelArr ! elementIndex 90 | let featureElement = featureArr ! (elementIndex - 1) 91 | writeArray result elementIndex $ modelElement - learningRate * (difference * featureElement + lambda * modelElement)) 92 | return result 93 | 94 | stochaticGradientDescentUpdate :: Hypothesis -> Double -> Double -> Example -> Model -> (Target, Model) 95 | stochaticGradientDescentUpdate hypothesis lambda learningRate (Example features target) model = 96 | let 97 | cost = negate (log approximation*target + log(1 - approximation)*(1-target)) 98 | nxtModel = nextModel lambda learningRate difference model features 99 | in ((,) $! cost) $! 
nxtModel 100 | where 101 | difference :: Target 102 | difference = approximation - target 103 | approximation :: Target 104 | approximation = hypothesis model features 105 | {-- Octave code 106 | h = sigmoid(theta'*X'); 107 | reg_theta = [0; theta(2:end)]; 108 | 109 | first_cost_term = (log(h)*y); 110 | second_const_term = log(1 - h) * (1 - y); 111 | third_cost_term = (lambda/2)*sum(reg_theta.^2); 112 | 113 | J = ((-1)/m)*( first_cost_term + second_const_term - third_cost_term); 114 | grad = (1/m)*( ( (h - y') * X) + (lambda) *reg_theta'); 115 | -} 116 | 117 | -- Test main 118 | main :: IO () 119 | main = 120 | print $ foldr folder (1000,zeroModel) $ take 1000000 d 121 | where 122 | folder ex (_, model) = 123 | let 124 | (err, nxtModel) = stochaticGradientDescentUpdate sigmoidHypothesis 0.01 0.1 ex model 125 | in (err, nxtModel) 126 | 127 | d :: [Example] 128 | d = example <$> randomRs (0,1) (mkStdGen 332) <*> randomRs (0,1) (mkStdGen 2132) 129 | example :: Double -> Double -> Example 130 | example x y = 131 | Example 132 | (vector 133 | sNFeatures -- SS because of the 1 134 | (x :- y :- Nil)) 135 | (if y > 0.5 then 1 else 0) 136 | zeroModel = zeros (SS sNFeatures) 137 | 138 | sigmoidHypothesis :: Model -> Features -> Target 139 | sigmoidHypothesis (Vector _ nElements modelArr) (Vector _ _ featuresArr) = runST $ do 140 | expo <- newSTRef $ modelArr ! 1 141 | forM_ [2..nElements] (\elementIndex -> 142 | modifySTRef expo (+ (modelArr ! elementIndex) * (featuresArr ! (elementIndex - 1)))) 143 | readSTRef expo >>= \e -> return $ 1 / (1 + exp (negate e)) 144 | 145 | 146 | -- vim: expandtab 147 | -------------------------------------------------------------------------------- /src/MainSTU.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -Wall #-} 2 | 3 | {-# LANGUAGE GADTs #-} 4 | {-# LANGUAGE DataKinds #-} 5 | {-# LANGUAGE TypeFamilies #-} 6 | {-# LANGUAGE TypeOperators #-} 7 | {-# LANGUAGE TupleSections #-} 8 | {-# LANGUAGE Strict #-} 9 | 10 | module Main (main) where 11 | 12 | import System.Random 13 | 14 | import Control.Monad.ST 15 | import Data.STRef 16 | import Data.Array.ST 17 | 18 | import Control.Monad 19 | import Data.Array.IArray 20 | import Data.Array.Unboxed 21 | import Data.Foldable (toList) 22 | 23 | data Nat :: * where 24 | Z :: Nat 25 | S :: Nat -> Nat 26 | 27 | data SNat :: Nat -> * where 28 | SZ :: SNat 'Z 29 | SS :: SNat n -> SNat ('S n) 30 | 31 | instance Show (SNat n) where 32 | show = ("SNat " ++) . show . snatToInt 33 | 34 | snatToInt :: SNat n -> Int 35 | snatToInt = snatToInt' 0 36 | where 37 | snatToInt' :: Int -> SNat n -> Int 38 | snatToInt' acc SZ = acc 39 | snatToInt' acc (SS sn) = snatToInt' (acc+1) sn 40 | 41 | infixr 5 :- 42 | data Vector' :: Nat -> * -> * where 43 | Nil :: Vector' 'Z a 44 | (:-) :: a -> Vector' n a -> Vector' ('S n) a 45 | 46 | instance Foldable (Vector' n) where 47 | foldMap _ Nil = mempty 48 | foldMap f (x :- xs) = f x `mappend` foldMap f xs 49 | 50 | data Vector :: Nat -> * where 51 | Vector :: SNat n -> Int -> UArray Int Double -> Vector n 52 | 53 | instance Show (Vector n) where 54 | show (Vector _ nElements a) = "Vector " ++ show nElements ++ " " ++ show a 55 | 56 | vector :: SNat ('S n) -> Vector' ('S n) Double -> Vector ('S n) 57 | vector sn = Vector sn (snatToInt sn) . listArray (1, snatToInt sn) . 
toList 58 | 59 | zeros :: SNat ('S n) -> Vector ('S n) 60 | zeros sn = Vector sn (snatToInt sn) $ listArray (1, snatToInt sn) $ replicate (snatToInt sn) 0 61 | 62 | type NFeatures = ('S ('S 'Z)) 63 | 64 | sNFeatures :: SNat NFeatures 65 | sNFeatures = SS (SS SZ) 66 | 67 | type Model = Vector ('S NFeatures) 68 | type Target = Double 69 | type Features = Vector NFeatures 70 | 71 | type Hypothesis = Model -> Features -> Target 72 | 73 | -- General purpose example 74 | data Example = Example 75 | Features 76 | Target deriving Show 77 | 78 | {-# INLINE nextModel #-} 79 | nextModel :: 80 | Double -> --lambda 81 | Double -> --learningRate 82 | Double -> --difference 83 | Model -> --model 84 | Features -> 85 | Model --the resulting model 86 | 87 | nextModel lambda learningRate difference (Vector sn nResultElements modelArr) (Vector _ _ featureArr) = 88 | Vector sn nResultElements $ runSTUArray $ do 89 | result <- newArray (1, nResultElements) 0 :: ST s (STUArray s Int Double) 90 | writeArray result 1 $ (modelArr ! 1) - learningRate * difference 91 | forM_ [2..nResultElements] (\elementIndex -> 92 | do 93 | let modelElement = modelArr ! elementIndex 94 | let featureElement = featureArr ! (elementIndex - 1) 95 | writeArray result elementIndex $ modelElement - learningRate * (difference * featureElement + lambda * modelElement)) 96 | return result 97 | 98 | stochaticGradientDescentUpdate :: Hypothesis -> Double -> Double -> Example -> Model -> (Target, Model) 99 | stochaticGradientDescentUpdate hypothesis lambda learningRate (Example features target) model = 100 | let 101 | cost = negate (log approximation*target + log(1 - approximation)*(1-target)) 102 | nxtModel = nextModel lambda learningRate difference model features 103 | in ((,) $! cost) $! nxtModel 104 | where 105 | difference :: Target 106 | difference = approximation - target 107 | approximation :: Target 108 | approximation = hypothesis model features 109 | {-- Octave code 110 | h = sigmoid(theta'*X'); 111 | reg_theta = [0; theta(2:end)]; 112 | 113 | first_cost_term = (log(h)*y); 114 | second_const_term = log(1 - h) * (1 - y); 115 | third_cost_term = (lambda/2)*sum(reg_theta.^2); 116 | 117 | J = ((-1)/m)*( first_cost_term + second_const_term - third_cost_term); 118 | grad = (1/m)*( ( (h - y') * X) + (lambda) *reg_theta'); 119 | -} 120 | 121 | -- Test main 122 | main :: IO () 123 | main = 124 | print $ foldr folder (1000,zeroModel) $ take 1000000 d 125 | where 126 | folder ex (_, model) = 127 | let 128 | (err, nxtModel) = stochaticGradientDescentUpdate sigmoidHypothesis 0.01 0.1 ex model 129 | in (err, nxtModel) 130 | 131 | d :: [Example] 132 | d = example <$> randomRs (0,1) (mkStdGen 332) <*> randomRs (0,1) (mkStdGen 2132) 133 | example :: Double -> Double -> Example 134 | example x y = 135 | Example 136 | (vector 137 | sNFeatures -- SS because of the 1 138 | (x :- y :- Nil)) 139 | (if y > 0.5 then 1 else 0) 140 | zeroModel = zeros (SS sNFeatures) 141 | 142 | sigmoidHypothesis :: Model -> Features -> Target 143 | sigmoidHypothesis (Vector _ nElements modelArr) (Vector _ _ featuresArr) = runST $ do 144 | expo <- newSTRef $ modelArr ! 1 145 | forM_ [2..nElements] (\elementIndex -> 146 | modifySTRef expo (+ (modelArr ! elementIndex) * (featuresArr ! 
(elementIndex - 1)))) 147 | readSTRef expo >>= \e -> return $ 1 / (1 + exp (negate e)) 148 | 149 | -- vim: expandtab 150 | -------------------------------------------------------------------------------- /src/MainVector.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -Wall #-} 2 | 3 | {-# LANGUAGE GADTs #-} 4 | {-# LANGUAGE DataKinds #-} 5 | {-# LANGUAGE TypeFamilies #-} 6 | {-# LANGUAGE TypeOperators #-} 7 | {-# LANGUAGE TupleSections #-} 8 | {-# LANGUAGE Strict #-} 9 | 10 | module Main (main) where 11 | 12 | import System.Random 13 | import Data.Foldable (toList, foldl') 14 | 15 | import qualified Data.Vector as V 16 | 17 | data Nat :: * where 18 | Z :: Nat 19 | S :: Nat -> Nat 20 | 21 | data SNat :: Nat -> * where 22 | SZ :: SNat 'Z 23 | SS :: SNat n -> SNat ('S n) 24 | 25 | instance Show (SNat n) where 26 | show = ("SNat " ++) . show . snatToInt 27 | 28 | snatToInt :: SNat n -> Int 29 | snatToInt = snatToInt' 0 30 | where 31 | snatToInt' :: Int -> SNat n -> Int 32 | snatToInt' acc SZ = acc 33 | snatToInt' acc (SS sn) = snatToInt' (acc+1) sn 34 | 35 | infixr 5 :- 36 | data Vector' :: Nat -> * -> * where 37 | Nil :: Vector' 'Z a 38 | (:-) :: a -> Vector' n a -> Vector' ('S n) a 39 | 40 | instance Foldable (Vector' n) where 41 | foldMap _ Nil = mempty 42 | foldMap f (x :- xs) = f x `mappend` foldMap f xs 43 | 44 | data Vector :: Nat -> * where 45 | Vector :: SNat n -> V.Vector Double -> Vector n 46 | 47 | instance Show (Vector n) where 48 | show (Vector _ a) = "Vector " ++ show (V.length a) ++ " " ++ show a 49 | 50 | vector :: SNat ('S n) -> Vector' ('S n) Double -> Vector ('S n) 51 | vector sn = Vector sn . foldl' V.snoc V.empty . toList 52 | 53 | zeros :: SNat ('S n) -> Vector ('S n) 54 | zeros sn = Vector sn $ V.replicate (snatToInt sn) 0 55 | 56 | type NFeatures = ('S ('S 'Z)) 57 | 58 | sNFeatures :: SNat NFeatures 59 | sNFeatures = SS (SS SZ) 60 | 61 | type Model = Vector ('S NFeatures) 62 | type Target = Double 63 | type Features = Vector NFeatures 64 | 65 | type Hypothesis = Model -> Features -> Target 66 | 67 | -- General purpose example 68 | data Example = Example 69 | Features 70 | Target deriving Show 71 | 72 | {-# INLINE nextModel #-} 73 | nextModel :: 74 | Double -> --lambda 75 | Double -> --learningRate 76 | Double -> --difference 77 | Model -> --model 78 | Features -> 79 | Model --the resulting model 80 | 81 | nextModel lambda learningRate difference (Vector sn modelArr) (Vector _ featureArr) = Vector sn $ 82 | V.update 83 | (V.zipWith (-) modelArr $ (* learningRate) <$> V.zipWith (+) (fmap (* difference) featureArr) (fmap (* lambda) modelArr)) 84 | (V.singleton (0, (modelArr V.! 0) - learningRate * difference)) 85 | 86 | stochaticGradientDescentUpdate :: Hypothesis -> Double -> Double -> Example -> Model -> (Target, Model) 87 | stochaticGradientDescentUpdate hypothesis lambda learningRate (Example features target) model = 88 | let 89 | cost = negate (log approximation*target + log(1 - approximation)*(1-target)) 90 | nxtModel = nextModel lambda learningRate difference model features 91 | in ((,) $! cost) $! 
nxtModel 92 | where 93 | difference :: Target 94 | difference = approximation - target 95 | approximation :: Target 96 | approximation = hypothesis model features 97 | {-- Octave code 98 | h = sigmoid(theta'*X'); 99 | reg_theta = [0; theta(2:end)]; 100 | 101 | first_cost_term = (log(h)*y); 102 | second_const_term = log(1 - h) * (1 - y); 103 | third_cost_term = (lambda/2)*sum(reg_theta.^2); 104 | 105 | J = ((-1)/m)*( first_cost_term + second_const_term - third_cost_term); 106 | grad = (1/m)*( ( (h - y') * X) + (lambda) *reg_theta'); 107 | -} 108 | 109 | -- Test main 110 | main :: IO () 111 | main = 112 | print $ foldr folder (1000,zeroModel) $ take 1000000 d 113 | where 114 | folder ex (_, model) = 115 | let 116 | (err, nxtModel) = stochaticGradientDescentUpdate sigmoidHypothesis 0.01 0.1 ex model 117 | in (err, nxtModel) 118 | 119 | d :: [Example] 120 | d = example <$> randomRs (0,1) (mkStdGen 332) <*> randomRs (0,1) (mkStdGen 2132) 121 | example :: Double -> Double -> Example 122 | example x y = 123 | Example 124 | (vector 125 | sNFeatures -- SS because of the 1 126 | (x :- y :- Nil)) 127 | (if y > 0.5 then 1 else 0) 128 | zeroModel = zeros (SS sNFeatures) 129 | 130 | sigmoidHypothesis :: Model -> Features -> Target 131 | sigmoidHypothesis (Vector _ modelArr) (Vector _ featuresArr) = 132 | 1 / ( 1 + exp (negate $ V.foldl' (+) 0 $ V.zipWith (*) modelArr (V.cons 1 featuresArr))) 133 | 134 | -- vim: expandtab 135 | --------------------------------------------------------------------------------