├── .gitignore ├── README.md ├── julia ├── chapter10 │ ├── linearregressionexample │ │ ├── data.txt │ │ ├── linearregression-multivariable.jl │ │ └── readme.md │ ├── logisticregressionexample │ │ ├── dataset1.txt │ │ ├── logisticregression.jl │ │ └── readme.md │ └── readme.md ├── chapter11 │ ├── annexample │ │ ├── checkNNGradients.jl │ │ ├── computeNumericalGradient.jl │ │ ├── dataset1.mat │ │ ├── debugInitializeWeights.jl │ │ ├── displayData.jl │ │ ├── neural-network.jl │ │ ├── nnCostFunction.jl │ │ ├── predict.jl │ │ ├── randInitializeWeights.jl │ │ ├── readme.md │ │ ├── sigmoid.jl │ │ ├── sigmoidGradient.jl │ │ ├── submit.jl │ │ └── weights.mat │ ├── dlexample │ │ ├── autoencoder │ │ │ ├── README.md │ │ │ ├── autoencoder.jl │ │ │ ├── digits.jl │ │ │ ├── getdata-sparseautoencoder.sh │ │ │ ├── getdata-vectorization.sh │ │ │ └── matlab │ │ │ │ ├── loadMNISTImages.m │ │ │ │ └── loadMNISTLabels.m │ │ ├── datautils.jl │ │ ├── plottingutils.jl │ │ └── readme.md │ └── readme.md ├── chapter12 │ ├── readme.md │ └── rlexample │ │ ├── DeepQLearning.jl │ │ ├── dqn-example.jl │ │ ├── dqn.jl │ │ ├── dqnruntest.jl │ │ ├── dqntest1.jl │ │ └── readme.md ├── chapter13 │ ├── ensembleexample │ │ ├── Ensemble.jl │ │ ├── decisiontree.jl │ │ ├── decisiontree_test.jl │ │ ├── dimensionalityreduction.jl │ │ ├── dimensionalityreduction_test.jl │ │ ├── iris.csv │ │ ├── mlbase.jl │ │ ├── mlbase_test.jl │ │ ├── readme.md │ │ ├── transformers.jl │ │ ├── types.jl │ │ └── util.jl │ └── readme.md ├── chapter5 │ ├── decisiontreeexample │ │ ├── DecisionTree.jl │ │ ├── decision_tree_test1.jl │ │ ├── decision_tree_test2.jl │ │ ├── measures.jl │ │ └── readme.md │ ├── randomforstexample │ │ ├── RandomForests.jl │ │ ├── classifier.jl │ │ ├── example.jl │ │ ├── randomforest.jl │ │ ├── readme.md │ │ ├── regressor.jl │ │ ├── sort.jl │ │ ├── split.jl │ │ ├── tree.jl │ │ └── util.jl │ └── readme.md ├── chapter6 │ ├── knnexample │ │ ├── knn.jl │ │ ├── readme.md │ │ ├── sampledata.csv │ │ ├── test.zip │ │ ├── train.zip │ │ └── training.csv │ ├── readme.md │ └── svmexample │ │ ├── examplesvm1.jl │ │ ├── readme.md │ │ ├── reference │ │ ├── libsvm_wrapper.c │ │ ├── svm.cpp │ │ └── svm.h │ │ └── svm.jl ├── chapter7 │ ├── aprioriexample │ │ ├── apriori.jl │ │ ├── aprioritest.jl │ │ ├── common.jl │ │ └── readme.md │ ├── fpgrowthexample │ │ ├── common.jl │ │ ├── fpgrowth.jl │ │ └── readme.md │ └── readme.md ├── chapter8 │ ├── k-meansexample │ │ ├── k-means.jl │ │ └── readme.md │ └── readme.md └── chapter9 │ ├── naivebayesexample │ ├── NaiveBayes.jl │ ├── datastats.jl │ ├── nbexampledata-iris.jl │ ├── nbfunctions.jl │ ├── nbtest1.jl │ ├── nbtest2.jl │ ├── nbtypes.jl │ └── readme.md │ └── readme.md ├── mahout ├── chapter10 │ ├── linearregressionexample │ │ └── readme.md │ ├── logisticregressionexample │ │ ├── pom.xml │ │ ├── readme.md │ │ └── src │ │ │ ├── main │ │ │ └── java │ │ │ │ └── com │ │ │ │ └── packt │ │ │ │ └── pml │ │ │ │ └── mahout │ │ │ │ └── logreg │ │ │ │ ├── LogisticRegreesionBase.java │ │ │ │ ├── LogisticRegressionApp.java │ │ │ │ └── LogisticRegressionBase.java │ │ │ └── test │ │ │ └── java │ │ │ └── com │ │ │ └── packt │ │ │ └── pml │ │ │ └── mahout │ │ │ └── logreg │ │ │ └── LogisticRegressionTest.java │ └── readme.md ├── chapter11 │ ├── annexample │ │ └── readme.md │ ├── dlexample │ │ ├── pom.xml │ │ ├── readme.md │ │ └── src │ │ │ ├── main │ │ │ └── java │ │ │ │ ├── Autoencoder.java │ │ │ │ ├── AutoencoderComputedParams.java │ │ │ │ ├── AutoencoderConfig.java │ │ │ │ ├── AutoencoderFct.java │ │ │ │ ├── 
AutoencoderFctGrd.java │ │ │ │ ├── AutoencoderGradient3.java │ │ │ │ ├── AutoencoderLearner.java │ │ │ │ ├── AutoencoderLinAlgebra.java │ │ │ │ ├── AutoencoderLineSearch.java │ │ │ │ ├── AutoencoderParams.java │ │ │ │ ├── AutoencoderSigmoid.java │ │ │ │ └── two_layers_autoencoders_model.prototxt │ │ │ └── test │ │ │ └── java │ │ │ ├── AutoencoderTest.java │ │ │ ├── ExtractPatchesTest.java │ │ │ ├── ExtractPatchesTuplesTest.java │ │ │ ├── FFTConvolutionTest.java │ │ │ ├── FeatureExtractionTest.java │ │ │ ├── LinAlgebraIOUtilsTest.java │ │ │ ├── LoadSaveModelTest.java │ │ │ ├── MaxPoolerTest.java │ │ │ ├── OneLayerTest.java │ │ │ ├── PreProcessTest.java │ │ │ ├── RankTest.java │ │ │ ├── ThreeLayerTest.java │ │ │ └── TwoLayersTest.java │ └── readme.md ├── chapter12 │ ├── readme.md │ └── rlexample │ │ └── readme.md ├── chapter13 │ ├── ensembleexample │ │ ├── data │ │ │ └── input │ │ │ │ ├── u.data │ │ │ │ ├── u1.base │ │ │ │ └── ua.base │ │ ├── pom.xml │ │ ├── readme.md │ │ └── src │ │ │ └── main │ │ │ └── java │ │ │ └── com │ │ │ └── packt │ │ │ └── pml │ │ │ └── mahout │ │ │ └── ensemble │ │ │ ├── Hadoop.java │ │ │ ├── ItemRecommender.java │ │ │ ├── RecommenderEvaluator.java │ │ │ ├── Recommenders.java │ │ │ ├── SlopeOneBasedRecommender.java │ │ │ └── Utilities.java │ └── readme.md ├── chapter5 │ ├── decisiontreeexample │ │ └── readme.md │ └── randomforestexample │ │ ├── pom.xml │ │ ├── readme.md │ │ └── src │ │ ├── main │ │ └── java │ │ │ └── com │ │ │ └── packt │ │ │ └── pml │ │ │ └── mahout │ │ │ └── randomforest │ │ │ └── RandomForest.java │ │ └── test │ │ └── java │ │ └── com │ │ └── packt │ │ └── pml │ │ └── mahout │ │ └── randomforest │ │ └── RandomForestTest.java ├── chapter6 │ ├── knnexample │ │ ├── main │ │ │ └── java │ │ │ │ └── KNearestNeighbor.java │ │ ├── readme.md │ │ └── test │ │ │ └── java │ │ │ └── WeightedMatrixTest.java │ └── svmexample │ │ └── readme.md ├── chapter7 │ ├── aprioriexample │ │ └── readme.md │ └── fpgrowthexample │ │ ├── pom.xml │ │ ├── readme.md │ │ └── src │ │ ├── main │ │ └── java │ │ │ └── com │ │ │ └── packt │ │ │ └── pml │ │ │ └── mahout │ │ │ └── fpgrowth │ │ │ ├── FrequentPatternMetrics.java │ │ │ └── FrequentPatternMiningJava.java │ │ └── test │ │ └── java │ │ └── com │ │ └── packt │ │ └── pml │ │ └── mahout │ │ └── fpgrowth │ │ └── FPgrowthTest.java ├── chapter8 │ ├── k-meansexample │ │ ├── pom.xml │ │ ├── readme.md │ │ └── src │ │ │ ├── main │ │ │ └── java │ │ │ │ └── com │ │ │ │ └── packt │ │ │ │ └── pml │ │ │ │ └── mahout │ │ │ │ └── kmeans │ │ │ │ ├── DataPreprocessing.java │ │ │ │ ├── InputDriver.java │ │ │ │ └── MahoutClusteringExample.java │ │ │ └── test │ │ │ └── java │ │ │ └── com │ │ │ └── packt │ │ │ └── pml │ │ │ └── mahout │ │ │ └── kmeans │ │ │ └── KMeansTest.java │ └── readme.md └── chapter9 │ ├── naivebayesexample │ ├── pom.xml │ ├── readme.md │ └── src │ │ ├── main │ │ └── java │ │ │ ├── com │ │ │ └── packt │ │ │ │ └── pml │ │ │ │ └── mahout │ │ │ │ └── naivebayes │ │ │ │ └── NaiveBayes.java │ │ │ └── start.sh │ │ └── test │ │ └── java │ │ └── com │ │ └── packt │ │ └── pml │ │ └── mahout │ │ └── naivebayes │ │ └── NaiveBayesTest.java │ └── readme.md ├── python-sckit-learn ├── chapter10 │ ├── linearregressionexample │ │ ├── data │ │ │ ├── winequality-red.csv │ │ │ └── winequality-white.csv │ │ ├── linear-regression-wine-data.py │ │ └── readme.md │ ├── logisticregressionexample │ │ ├── data │ │ │ ├── SMSSpamCollection │ │ │ ├── sms.csv │ │ │ ├── test.tsv │ │ │ └── train.tsv │ │ └── logistic-regression.py │ └── readme.md ├── chapter11 
│ ├── annexample │ │ ├── ann.py │ │ └── readme.md │ ├── dlexample │ │ ├── example-1-data.ods │ │ ├── perceptron-data.ods │ │ ├── perceptron.py │ │ └── readme.md │ └── readme.md ├── chapter12 │ ├── readme.md │ └── rlexample │ │ ├── q-learning.py │ │ └── readme.md ├── chapter13 │ ├── ensembleexample │ │ ├── ensemble.py │ │ ├── ensemble_predict.py │ │ ├── ensemble_train.py │ │ ├── model_library.py │ │ └── readme.md │ └── readme.md ├── chapter5 │ ├── decisiontreeexample │ │ ├── data │ │ │ ├── ad.data │ │ │ └── ad.names │ │ ├── decision-tree.py │ │ └── information-gain.ods │ ├── randomforstexample │ │ ├── random-forests.py │ │ └── readme.md │ └── readme.md ├── chapter6 │ ├── knnexample │ │ ├── KNN.py │ │ ├── iris_data │ │ │ ├── README.md │ │ │ ├── iris.data │ │ │ └── iris.names │ │ ├── knn_example.png │ │ └── readme.md │ ├── readme.md │ └── svmexample │ │ ├── data │ │ ├── stopwords_en.txt │ │ └── titanic.csv │ │ ├── readme.md │ │ ├── svm.py │ │ └── svm_test.py ├── chapter7 │ ├── aprioriexample │ │ ├── INTEGRATED-DATASET.csv │ │ ├── apriori.py │ │ └── readme.md │ └── fpgrowthexample │ │ ├── data │ │ ├── numeric.csv │ │ └── tsk.csv │ │ ├── fp_growth.py │ │ ├── readme.md │ │ └── test-fpgrowth.py ├── chapter8 │ ├── k-meansexample │ │ ├── k-means.py │ │ └── readme.md │ └── readme.md ├── chapter9 │ ├── naivebayesexample │ │ ├── data-types.py │ │ ├── feature-selection.py │ │ ├── naivebayes-classifier.py │ │ ├── read-spam-data.py │ │ └── readme.md │ └── readme.md ├── data │ ├── stopwords_en.txt │ ├── titanic.csv │ └── titanic.png └── readme.md ├── r ├── chapter10 │ ├── linearregressionexample │ │ ├── Rplots.pdf │ │ ├── insurance.csv │ │ ├── linearregression.R │ │ └── readme.md │ ├── logisticregressionexample │ │ ├── dataset1.txt │ │ ├── dataset2.txt │ │ ├── logisticregression.R │ │ └── readme.md │ └── readme.md ├── chapter11 │ ├── annexample │ │ ├── Rplots.pdf │ │ ├── Rplots1.pdf │ │ ├── ann.R │ │ ├── concrete.csv │ │ └── readme.md │ ├── dlexample │ │ ├── autoencoder.R │ │ └── readme.md │ └── readme.md ├── chapter12 │ ├── readme.md │ └── rlexample │ │ ├── Results.pdf │ │ ├── qlaci.zip │ │ ├── qlearning │ │ ├── DESCRIPTION │ │ ├── INDEX │ │ ├── MD5 │ │ ├── Meta │ │ │ ├── Rd.rds │ │ │ ├── data.rds │ │ │ ├── hsearch.rds │ │ │ ├── links.rds │ │ │ ├── nsInfo.rds │ │ │ └── package.rds │ │ ├── NAMESPACE │ │ ├── R │ │ │ ├── qlearning │ │ │ ├── qlearning.rdb │ │ │ └── qlearning.rdx │ │ ├── data │ │ │ └── DataEx.RData │ │ ├── help │ │ │ ├── AnIndex │ │ │ ├── aliases.rds │ │ │ ├── paths.rds │ │ │ ├── qlearning.rdb │ │ │ └── qlearning.rdx │ │ └── html │ │ │ ├── 00Index.html │ │ │ └── R.css │ │ └── readme.md ├── chapter13 │ ├── ensembleexample │ │ ├── bagging-random-forest.R │ │ ├── credit.csv │ │ └── readme.md │ └── readme.md ├── chapter5 │ ├── decisiontreeexample │ │ ├── data │ │ │ ├── credit.csv │ │ │ └── mushrooms.csv │ │ ├── decision-trees.r │ │ └── readme.md │ ├── randomforstexample │ │ ├── data │ │ │ ├── test.csv │ │ │ └── train.csv │ │ ├── output │ │ │ ├── predict1.csv │ │ │ └── predict2.csv │ │ ├── randomforest.R │ │ └── readme.md │ └── readme.md ├── chapter6 │ ├── knnexample │ │ ├── knn.R │ │ ├── readme.md │ │ └── wisc_bc_data.csv │ ├── readme.md │ └── svmexample │ │ ├── letterdata.csv │ │ ├── readme.md │ │ └── svm.R ├── chapter7 │ ├── aprioriexample │ │ ├── Rplots.pdf │ │ ├── association-rules.R │ │ ├── groceries.csv │ │ ├── groceryrules.csv │ │ └── readme.md │ ├── fpgrowthexample │ │ └── readme.md │ └── readme.md ├── chapter8 │ ├── k-meansexample │ │ ├── kmeans-clustering.R │ │ ├── readme.md │ │ └── 
snsdata.csv │ └── readme.md └── chapter9 │ ├── naivebayesexample │ ├── Rplots.pdf │ ├── readme.md │ ├── sms_spam.csv │ └── snaive-bayes.R │ └── readme.md └── spark ├── chapter10 ├── linearregressionexample │ ├── ClickRate.py │ ├── readme.md │ └── startClickRate.sh ├── logisticregressionexample │ ├── dataset │ │ ├── spambase.DOCUMENTATION │ │ ├── spambase.data │ │ └── spambase.names │ ├── readme.md │ └── src │ │ └── main │ │ └── scala │ │ └── default │ │ └── SpamClassification-Logreg.scala └── readme.md ├── chapter11 ├── annexample │ ├── build.sbt │ ├── readme.md │ └── src │ │ └── main │ │ ├── resources │ │ └── log4j.properties │ │ └── scala │ │ └── default │ │ ├── Util.scala │ │ ├── algo │ │ ├── CostGradient.scala │ │ ├── DistCostGradientComputer.scala │ │ ├── GradientDescendOptimizer.scala │ │ ├── LoggingAbility.scala │ │ ├── NaiveCostGradientComputer.scala │ │ └── Predictor.scala │ │ ├── example │ │ └── MNIST.scala │ │ └── model │ │ ├── NeuralNetworkClassifier.scala │ │ ├── NeuralNetworkModel.scala │ │ └── Topology.scala ├── dlexample │ ├── RBM.scala │ └── readme.md └── readme.md ├── chapter12 ├── readme.md └── rlexample │ └── readme.md ├── chapter13 ├── ensembleexample │ ├── data │ │ ├── housing.txt │ │ └── sample_libsvm_data.txt │ ├── readme.md │ ├── result │ │ ├── GBT_clas.txt │ │ └── GBT_regression.txt │ └── src │ │ ├── GradientBoostTree_classification.scala │ │ ├── GradientBoostTree_regression.scala │ │ └── test.txt └── readme.md ├── chapter5 ├── decisiontreeexample │ ├── data │ │ ├── housing.txt │ │ └── sample_libsvm_data.txt │ ├── readme.md │ ├── result │ │ ├── classification.txt │ │ └── regression.txt │ └── src │ │ ├── decisiontree-classification.scala │ │ ├── decisiontree-regression.scala │ │ └── decsiontree-test.txt ├── randomforstexample │ ├── data │ │ ├── housing.txt │ │ └── sample_libsvm_data.txt │ ├── readme.md │ ├── result │ │ ├── RandomForest_regression.txt │ │ └── RandomForests_classification.txt │ └── src │ │ ├── RandomForest_regression.scala │ │ ├── RandomForests_classification.scala │ │ └── test.txt └── readme.md ├── chapter6 ├── knnexample │ ├── example-run │ ├── project │ │ ├── Build.scala │ │ ├── META-INF │ │ │ └── MANIFEST.MF │ │ ├── assembly.sbt │ │ └── plugins.sbt │ ├── readme.md │ └── src │ │ └── Knn-recommender.scala └── svmexample │ ├── build.sbt │ ├── doc │ └── usage.txt │ ├── readme.md │ └── src │ └── main │ └── scala │ ├── KernelSVM.scala │ ├── Kernels.scala │ └── main.scala ├── chapter7 ├── aprioriexample │ ├── pom.xml │ ├── readme.md │ └── src │ │ └── main │ │ └── scala │ │ └── default │ │ ├── Apriori.scala │ │ ├── BloomFilter.scala │ │ ├── FrequentItemSets.scala │ │ ├── NaiveFrequentItemSets.scala │ │ └── TestMain.scala ├── fpgrowthexample │ ├── readme.md │ └── src │ │ └── main │ │ └── scala │ │ └── default │ │ ├── FPGrowth.scala │ │ ├── FPTree.scala │ │ ├── ParallelFPGrowth.scala │ │ ├── Test.scala │ │ └── TreeNode.scala └── readme.md ├── chapter8 ├── k-meansexample │ ├── build.sbt │ ├── input │ │ ├── centroids.txt │ │ └── points.txt │ ├── readme.md │ ├── run.sh │ └── src │ │ └── main │ │ └── scala │ │ └── default │ │ └── KMeans.scala └── readme.md └── chapter9 ├── naivebayesexample ├── build.sbt ├── download-reuters.sh ├── project │ └── plugins.sbt ├── readme.md └── src │ └── main │ └── scala │ └── default │ ├── NaiveBayes.scala │ ├── ReutersParser.scala │ ├── Tokenizer.scala │ └── VectorUtil.scala └── readme.md /.gitignore: -------------------------------------------------------------------------------- 1 | 
mahout/chapter10/logisticregressionexample/pom.xml -------------------------------------------------------------------------------- /julia/chapter10/linearregressionexample/data.txt: -------------------------------------------------------------------------------- 1 | 2104,3,399900 2 | 1600,3,329900 3 | 2400,3,369000 4 | 1416,2,232000 5 | 3000,4,539900 6 | 1985,4,299900 7 | 1534,3,314900 8 | 1427,3,198999 9 | 1380,3,212000 10 | 1494,3,242500 11 | 1940,4,239999 12 | 2000,3,347000 13 | 1890,3,329999 14 | 4478,5,699900 15 | 1268,3,259900 16 | 2300,4,449900 17 | 1320,2,299900 18 | 1236,3,199900 19 | 2609,4,499998 20 | 3031,4,599000 21 | 1767,3,252900 22 | 1888,2,255000 23 | 1604,3,242900 24 | 1962,4,259900 25 | 3890,3,573900 26 | 1100,3,249900 27 | 1458,3,464500 28 | 2526,3,469000 29 | 2200,3,475000 30 | 2637,3,299900 31 | 1839,2,349900 32 | 1000,1,169900 33 | 2040,4,314900 34 | 3137,3,579900 35 | 1811,4,285900 36 | 1437,3,249900 37 | 1239,3,229900 38 | 2132,4,345000 39 | 4215,4,549000 40 | 2162,4,287000 41 | 1664,2,368500 42 | 2238,3,329900 43 | 2567,4,314000 44 | 1200,3,299000 45 | 852,2,179900 46 | 1852,4,299900 47 | 1203,3,239500 48 | -------------------------------------------------------------------------------- /julia/chapter10/linearregressionexample/linearregression-multivariable.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Regression Analysis - Linear Regression example 3 | # Chapter 10 4 | 5 | using Gadfly 6 | 7 | # Normalizes the features in x 8 | # The mean value of each feature is 0 and the standard deviation is 1 9 | # Returns normalized x, μ and σ 10 | function featureNormalize(x) 11 | rows = size(x,1) 12 | cols = size(x,2) 13 | 14 | μ = mean(x,1) 15 | σ = std(x,1) 16 | xNorm = zeros(x) 17 | 18 | # normalize 19 | for i in 1:cols 20 | for j in 1:rows 21 | xNorm[j,i] = (x[j,i] - μ[i]) / σ[i]; 22 | end 23 | end 24 | 25 | (xNorm, μ, σ) 26 | end 27 | 28 | 29 | println("Loading data ... 
") 30 | data = readdlm("data.txt",',') 31 | x = data[:,1:2] 32 | y = data[:, 3] 33 | m = length(y) 34 | 35 | @printf("First 10 examples from the dataset: \n"); 36 | t = [x[1:10,:] y[1:10,:]]' 37 | for i in 1:10 38 | @printf(" x = [%.0f %.0f], y = %.0f \n", t[1,i], t[2,i], t[3,i]); 39 | end 40 | 41 | # Scale features and set them to zero mean 42 | (x, μ, σ) = featureNormalize(x); 43 | 44 | # Add intercept term to x 45 | x = [ones(m,1) x] 46 | 47 | #### Run Gradient Descent 48 | α = 0.001 49 | numIter = 4000 50 | θ = zeros(3,1) 51 | jHist = zeros(numIter, 1) 52 | 53 | for i in 1:numIter 54 | # next theta 55 | θ = θ - (α/m) * (x' * ((x*θ)-y)) 56 | # compute cost 57 | jHist[i] = sum((x*θ-y).^2)/(2m) 58 | end 59 | 60 | # plot convergence graph 61 | pl = plot( 62 | x=collect(1:numIter), 63 | y=jHist, 64 | Guide.xlabel("Iterations"), 65 | Guide.ylabel("Error"), 66 | Guide.title("Convergence Graph"), 67 | Geom.line 68 | ) 69 | draw(SVGJS("jHist.js.svg", 6inch, 6inch), pl) 70 | 71 | # Estimate the price of a 1650 sq-ft, 3 br house 72 | price = [1, (1650-μ[1])/σ[1], (3-μ[2])/σ[2]]' * θ 73 | println("Estimated price for a 1650 sq-ft, 3 br house: $price") 74 | 75 | println("done!") 76 | -------------------------------------------------------------------------------- /julia/chapter10/linearregressionexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter10/logisticregressionexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter10/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter11/annexample/checkNNGradients.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Artificial Neural Network 3 | # Chapter 11 4 | 5 | include("nnCostFunction.jl") 6 | include("computeNumericalGradient.jl") 7 | include("debugInitializeWeights.jl") 8 | 9 | function checkNNGradients(lambda = 0) 10 | input_layer_size = 3 11 | hidden_layer_size = 5 12 | num_labels = 3 13 | m = 5 14 | 15 | # We generate some 'random' test data 16 | Theta1 = debugInitializeWeights(hidden_layer_size, input_layer_size) 17 | Theta2 = debugInitializeWeights(num_labels, hidden_layer_size) 18 | # Reusing debugInitializeWeights to generate X 19 | X = debugInitializeWeights(m, input_layer_size - 1) 20 | y = (1 + mod(1:m, num_labels)')' 21 | 22 | # Unroll parameters 23 | nn_params = [Theta1[:] ; Theta2[:]] 24 | 25 | # Short hand for cost function 26 | costFunc = p -> nnCostFunction(p, input_layer_size, hidden_layer_size, 27 | num_labels, X, y, lambda) 28 | CHECKNNGRADIENTS(lambda) 29 | cost, grad = costFunc(nn_params) 30 | numgrad = computeNumericalGradient(costFunc, nn_params) 31 | 32 | # Visually examine the two gradient computations. The two columns 33 | # you get should be very similar. 34 | show([numgrad grad]) 35 | @printf(""" 36 | 37 | The above two columns you get should be very similar. 38 | (Left-Your Numerical Gradient, Right-Analytical Gradient) 39 | 40 | """) 41 | 42 | # Evaluate the norm of the difference between two solutions. 
43 | # If you have a correct implementation, and assuming you used EPSILON = 0.0001 44 | # in computeNumericalGradient.m, then diff below should be less than 1e-9 45 | diff = norm(numgrad - grad) / norm(numgrad + grad) 46 | 47 | @printf(""" 48 | If your backpropagation implementation is correct, then 49 | the relative difference will be small (less than 1e-9). 50 | 51 | Relative Difference: %g 52 | """, diff) 53 | 54 | end 55 | -------------------------------------------------------------------------------- /julia/chapter11/annexample/computeNumericalGradient.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Artificial Neural Network 3 | # Chapter 11 4 | 5 | function computeNumericalGradient(J, theta) 6 | 7 | # Notes: The following code implements numerical gradient checking, and 8 | # returns the numerical gradient.It sets numgrad(i) to (a numerical 9 | # approximation of) the partial derivative of J with respect to the 10 | # i-th input argument, evaluated at theta. (i.e., numgrad(i) should 11 | # be the (approximately) the partial derivative of J with respect 12 | # to theta(i).) 13 | 14 | numgrad = COMPUTENUMERICALGRADIENT(J, theta) 15 | perturb = zeros(size(theta)) 16 | e = 1e-4 17 | for p in 1:length(theta) 18 | # Set perturbation vector 19 | perturb[p] = e 20 | loss1, _ = J(theta - perturb) 21 | loss2, _ = J(theta + perturb) 22 | # Compute Numerical Gradient 23 | numgrad[p] = (loss2 - loss1) / 2e 24 | perturb[p] = 0 25 | end 26 | return numgrad 27 | end 28 | -------------------------------------------------------------------------------- /julia/chapter11/annexample/dataset1.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/julia/chapter11/annexample/dataset1.mat -------------------------------------------------------------------------------- /julia/chapter11/annexample/debugInitializeWeights.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Artificial Neural Network 3 | # Chapter 11 4 | 5 | function debugInitializeWeights(fan_out, fan_in) 6 | # Note that W should be set to a matrix of size(1 + fan_in, fan_out) as 7 | # the first row of W handles the "bias" terms 8 | 9 | # Set W to zeros 10 | W = DEBUGINITIALIZEWEIGHTS(fan_in, fan_out) 11 | 12 | # Initialize W using "sin", this ensures that W is always of the same 13 | # values and will be useful for debugging 14 | W = reshape(sin(1:length(W)), size(W)) / 10 15 | 16 | return W 17 | end 18 | -------------------------------------------------------------------------------- /julia/chapter11/annexample/displayData.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Artificial Neural Network 3 | # Chapter 11 4 | using Images, ImageView 5 | 6 | function displayData(X, example_width = round(Int, sqrt(size(X, 2)))) 7 | # Compute rows, cols 8 | m, n = size(X) 9 | example_height = round(Int, (n / example_width)) 10 | 11 | # Compute number of items to display 12 | display_rows = round(Int, sqrt(m)) 13 | display_cols = round(Int, ceil(m / display_rows)) 14 | 15 | # Between images padding 16 | pad = 1 17 | 18 | # Setup blank display 19 | display_array = - ones(pad + display_rows * (example_height + pad), 20 | pad + display_cols * (example_width + pad)) 21 | 22 | # Copy each example into 
a patch on the display array 23 | curr_ex = 1 24 | for j in 1:display_rows, i in 1:display_cols 25 | if curr_ex > m 26 | break 27 | end 28 | 29 | # Get the max value of the patch 30 | max_val = maximum(abs(X[curr_ex, :])) 31 | display_array[pad + (j - 1) * (example_height + pad) + (1:example_height), 32 | pad + (i - 1) * (example_width + pad) + (1:example_width)] = 33 | reshape(X[curr_ex, :], (example_height, example_width)) / max_val 34 | curr_ex += 1 35 | end 36 | 37 | # Display Image 38 | img = Image(display_array) 39 | [canvas, img] = DISPLAYDATA(X, example_width) 40 | return (canvas, img) 41 | end 42 | -------------------------------------------------------------------------------- /julia/chapter11/annexample/predict.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Artificial Neural Network 3 | # Chapter 11 4 | 5 | function predict(Theta1, Theta2, X) 6 | # Useful values 7 | m = size(X, 1) 8 | num_labels = size(Theta2, 1) 9 | 10 | # You need to return the following variables correctly 11 | p = PREDICT(Theta1, Theta2, X) 12 | h1 = sigmoid([ones(m, 1) X] * Theta1') 13 | h2 = sigmoid([ones(m, 1) h1] * Theta2') 14 | 15 | for i in 1:m 16 | p[i] = findmax(h2[i, :])[2] 17 | end 18 | return p 19 | end 20 | -------------------------------------------------------------------------------- /julia/chapter11/annexample/randInitializeWeights.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Artificial Neural Network 3 | # Chapter 11 4 | 5 | function randInitializeWeights(L_in, L_out) 6 | W = RANDINITIALIZEWEIGHTS(L_in, L_out) 7 | return W 8 | end 9 | -------------------------------------------------------------------------------- /julia/chapter11/annexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter11/annexample/sigmoid.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Artificial Neural Network 3 | # Chapter 11 4 | @doc """ 5 | SIGMOID Compute sigmoid functoon 6 | J = SIGMOID(z) computes the sigmoid of z. 
7 | """ -> 8 | function sigmoid(z) 9 | g = SIGMOID(z) 10 | return g 11 | end 12 | -------------------------------------------------------------------------------- /julia/chapter11/annexample/sigmoidGradient.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Artificial Neural Network 3 | # Chapter 11 4 | function sigmoidGradient(z) 5 | g = SIGMOIDGRADIENT(z) 6 | return g 7 | end 8 | -------------------------------------------------------------------------------- /julia/chapter11/annexample/submit.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Artificial Neural Network 3 | # Chapter 11 4 | 5 | export submit 6 | 7 | include("../data.jl") 8 | include("../submit.jl") 9 | 10 | include("nnCostFunction.jl") 11 | include("sigmoidGradient.jl") 12 | 13 | function submit() 14 | parts = [ 15 | Part(1, "Feedforward and Cost Function"), 16 | Part(2, "Regularized Cost Function"), 17 | Part(3, "Sigmoid Gradient"), 18 | Part(4, "Neural Network Gradient (Backpropagation)"), 19 | Part(5, "Regularized Gradient") 20 | ] 21 | conf = Conf("neural-network-learning", 22 | "Neural Networks Learning", parts, solver) 23 | 24 | submitWithConf(conf) 25 | end 26 | 27 | function solver(partId) 28 | # Random Test Cases 29 | X = reshape(3 * sin(1:1:30), (3, 10)) 30 | Xm = reshape(sin(1:32), (16, 2)) / 5 31 | ym = (1 + mod(1:16, 4)')' 32 | t1 = sin(reshape(1:2:24, (4, 3))) 33 | t2 = cos(reshape(1:2:40, (4, 5))) 34 | t = [t1[:] ; t2[:]] 35 | if partId == 1 36 | J, _ = nnCostFunction(t, 2, 4, 4, Xm, ym, 0) 37 | return @sprintf("%0.5f", J) 38 | elseif partId == 2 39 | J, _ = nnCostFunction(t, 2, 4, 4, Xm, ym, 1.5) 40 | return @sprintf("%0.5f", J) 41 | elseif partId == 3 42 | return join(map(x -> @sprintf("%0.5f", x), sigmoidGradient(X)), " ") 43 | elseif partId == 4 44 | J, grad = nnCostFunction(t, 2, 4, 4, Xm, ym, 0) 45 | return @sprintf("%0.5f ", J) * join(map(x -> @sprintf("%0.5f", x), grad), " ") 46 | elseif partId == 5 47 | J, grad = nnCostFunction(t, 2, 4, 4, Xm, ym, 1.5) 48 | return @sprintf("%0.5f ", J) * join(map(x -> @sprintf("%0.5f", x), grad), " ") 49 | end 50 | end 51 | -------------------------------------------------------------------------------- /julia/chapter11/annexample/weights.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/julia/chapter11/annexample/weights.mat -------------------------------------------------------------------------------- /julia/chapter11/dlexample/autoencoder/README.md: -------------------------------------------------------------------------------- 1 | # Deep Learning with Julia 2 | 3 | ## The tutorial 4 | The "Unsupervised Feature Learning and Deep Learning" tutorial can be found [here](http://deeplearning.stanford.edu/wiki/index.php/UFLD_TUTORIAL). The tutorial provides a fair amount of starter code in Matlab, which also contains the data we need to follow along with Julia. If you just want the data, you can use the shell scripts provided in this repository which have the form **getdata-*.sh**. Just fill in the name of the exercise you're working on. 5 | 6 | ## Julia scripts 7 | The scripts included are complete solutions; they are NOT starter code. I'm still working through the tutorial. 
But once I'm done, and have learned quite a bit more about Julia, I will include a directory with just starter code. 8 | 9 | -------------------------------------------------------------------------------- /julia/chapter11/dlexample/autoencoder/digits.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Deep learning - Sparse autoencoder example 3 | # Chapter 11 4 | 5 | using SparseAutoencoder 6 | using MAT 7 | 8 | vars = matread("./data/mnist-images.mat") 9 | data = vars["images"] 10 | 11 | visiblesize = 28*28 12 | hiddensize = 196 13 | sparsityparameter = 0.1 14 | lambda = 3e-3 15 | beta = 3.0 16 | patches = data[:,1:10000] 17 | 18 | minf,W1,W2,b1,b2 = autoencode(patches,hiddensize,visiblesize,lambda=lambda,beta=beta,rho=sparsityparameter) 19 | 20 | using HDF5, JLD 21 | @save "./digits-results.jld" 22 | -------------------------------------------------------------------------------- /julia/chapter11/dlexample/autoencoder/getdata-sparseautoencoder.sh: -------------------------------------------------------------------------------- 1 | mkdir -p data || exit 1 2 | cd data 3 | wget http://ufldl.stanford.edu/wiki/resources/sparseae_exercise.zip 4 | unzip sparseae_exercise.zip 5 | cp starter/IMAGES.mat IMAGES.mat 6 | rm -rf starter 7 | rm sparseae_exercise.zip 8 | cd .. 9 | -------------------------------------------------------------------------------- /julia/chapter11/dlexample/autoencoder/getdata-vectorization.sh: -------------------------------------------------------------------------------- 1 | mkdir -p data || exit 1 2 | cd data 3 | wget http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz 4 | gunzip train-images-idx3-ubyte.gz 5 | cd .. 6 | -------------------------------------------------------------------------------- /julia/chapter11/dlexample/autoencoder/matlab/loadMNISTImages.m: -------------------------------------------------------------------------------- 1 | function images = loadMNISTImages(filename) 2 | %loadMNISTImages returns a 28x28x[number of MNIST images] matrix containing 3 | %the raw MNIST images 4 | 5 | fp = fopen(filename, 'rb'); 6 | assert(fp ~= -1, ['Could not open ', filename, '']); 7 | 8 | magic = fread(fp, 1, 'int32', 0, 'ieee-be'); 9 | display(magic) 10 | assert(magic == 2051, ['Bad magic number in ', filename, '']); 11 | 12 | numImages = fread(fp, 1, 'int32', 0, 'ieee-be'); 13 | numRows = fread(fp, 1, 'int32', 0, 'ieee-be'); 14 | numCols = fread(fp, 1, 'int32', 0, 'ieee-be'); 15 | 16 | images = fread(fp, inf, 'unsigned char'); 17 | images = reshape(images, numCols, numRows, numImages); 18 | images = permute(images,[2 1 3]); 19 | 20 | fclose(fp); 21 | 22 | % Reshape to #pixels x #examples 23 | images = reshape(images, size(images, 1) * size(images, 2), size(images, 3)); 24 | % Convert to double and rescale to [0,1] 25 | images = double(images) / 255; 26 | 27 | end 28 | -------------------------------------------------------------------------------- /julia/chapter11/dlexample/autoencoder/matlab/loadMNISTLabels.m: -------------------------------------------------------------------------------- 1 | function labels = loadMNISTLabels(filename) 2 | %loadMNISTLabels returns a [number of MNIST images]x1 matrix containing 3 | %the labels for the MNIST images 4 | 5 | fp = fopen(filename, 'rb'); 6 | assert(fp ~= -1, ['Could not open ', filename, '']); 7 | 8 | magic = fread(fp, 1, 'int32', 0, 'ieee-be'); 9 | assert(magic == 2049, ['Bad magic number in ', filename, '']); 10 | 11 | numLabels = 
fread(fp, 1, 'int32', 0, 'ieee-be'); 12 | 13 | labels = fread(fp, inf, 'unsigned char'); 14 | 15 | assert(size(labels,1) == numLabels, 'Mismatch in label count'); 16 | 17 | fclose(fp); 18 | 19 | end 20 | -------------------------------------------------------------------------------- /julia/chapter11/dlexample/datautils.jl: -------------------------------------------------------------------------------- 1 | module DataUtils 2 | 3 | export sampleimages 4 | function sampleimages(images::Array{Float64,3},patchwidth::Int,patchheight::Int,numsamples::Int; scalevariance=true) 5 | width, height = size(images[:,:,1]) 6 | array::Array{Float64,2} = zeros(patchwidth*patchheight,numsamples) 7 | for index=1:numsamples 8 | image_index = rand(1:size(images,3)) 9 | x = rand(1:width-patchwidth+1) 10 | y = rand(1:height-patchheight+1) 11 | sample = images[x:x+patchwidth-1,y:y+patchheight-1,image_index] 12 | array[:,index] = reshape(sample,patchwidth*patchheight) 13 | array[:,index] -= mean(array[:,index]) #subtract mean 14 | end 15 | 16 | if scalevariance 17 | # rescale images to fit in range 0.1 to 0.9 18 | stddev = std(array) 19 | array = max(min(array,3*stddev),-3*stddev) / (3*stddev) 20 | array = (array + 1.0) * 0.4 + 0.1 21 | end 22 | return array 23 | end 24 | 25 | import Plotly 26 | export displaynetwork_plotly 27 | function displaynetwork_plotly(A,filename,username,userkey) 28 | m,n = size(A) 29 | sz = int(sqrt(m)) 30 | A -= mean(A) 31 | layout = [ 32 | "autosize" => false, 33 | "width" => 500, 34 | "height"=> 500 35 | ] 36 | 37 | gridsize = int(ceil(sqrt(n))) 38 | buffer = 1 39 | griddata = ones(gridsize*(sz+1)+1,gridsize*(sz+1)+1) 40 | index = 1 41 | for i = 1:gridsize 42 | for j = 1:gridsize 43 | if index > n 44 | continue 45 | end 46 | columnlimit = maximum(abs(A[:,index])) 47 | griddata[buffer+(i-1)*(sz+buffer)+(1:sz),buffer+(j-1)*(sz+buffer)+(1:sz)] = reshape(A[:,index],sz,sz)/columnlimit 48 | index += 1 49 | end 50 | end 51 | 52 | Plotly.signin(username, userkey) 53 | data = [ 54 | [ 55 | "z" => griddata, 56 | "colorscale" => "Greys", 57 | "type" => "heatmap" 58 | ] 59 | ] 60 | response = Plotly.plot(data, ["layout" => layout, "filename" => filename, "fileopt" => "overwrite"]) 61 | plot_url = response["url"] 62 | end 63 | 64 | end 65 | -------------------------------------------------------------------------------- /julia/chapter11/dlexample/plottingutils.jl: -------------------------------------------------------------------------------- 1 | module PlottingUtils 2 | 3 | function reshapedata(A) 4 | m,n = size(A) 5 | sz = int(sqrt(m)) 6 | A -= mean(A) 7 | 8 | gridsize = int(ceil(sqrt(n))) 9 | buffer = 1 10 | griddata = ones(gridsize*(sz+1)+1,gridsize*(sz+1)+1) 11 | index = 1 12 | for i = 1:gridsize 13 | for j = 1:gridsize 14 | if index > n 15 | continue 16 | end 17 | columnlimit = maximum(abs(A[:,index])) 18 | griddata[buffer+(i-1)*(sz+buffer)+(1:sz),buffer+(j-1)*(sz+buffer)+(1:sz)] = reshape(A[:,index],sz,sz)/columnlimit 19 | index += 1 20 | end 21 | end 22 | 23 | return griddata 24 | end 25 | 26 | import Plotly 27 | export displaynetwork_plotly 28 | function displaynetwork_plotly(A,filename,username,userkey) 29 | griddata = reshapedata(A) 30 | Plotly.signin(username, userkey) 31 | data = [ 32 | [ 33 | "z" => griddata, 34 | "colorscale" => "Greys", 35 | "type" => "heatmap" 36 | ] 37 | ] 38 | layout = [ 39 | "autosize" => false, 40 | "width" => 500, 41 | "height"=> 500 42 | ] 43 | response = Plotly.plot(data, ["layout" => layout, "filename" => filename, "fileopt" => "overwrite"]) 44 | plot_url 
= response["url"] 45 | end 46 | 47 | import Gadfly 48 | export displaynetwork_gadfly 49 | function displaynetwork_gadfly(A) 50 | griddata = reshapedata(A) 51 | Gadfly.spy(A) 52 | end 53 | 54 | import Winston 55 | export displaynetwork_winston 56 | function displaynetwork_winston(A) 57 | griddata = reshapedata(A) 58 | p = Winston.FramedPlot() 59 | Winston.colormap("grays") 60 | Winston.add(p,Winston.imagesc(griddata)) 61 | display(p) 62 | end 63 | 64 | end 65 | -------------------------------------------------------------------------------- /julia/chapter11/dlexample/readme.md: -------------------------------------------------------------------------------- 1 | Autoencoder example with a base from MATlab code 2 | -------------------------------------------------------------------------------- /julia/chapter11/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter12/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter12/rlexample/DeepQLearning.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Reinforcement learning - Q learning example 3 | # Chapter 12 4 | 5 | module DeepQLearning 6 | 7 | using NNGraph 8 | 9 | export DQN, forward, act, learn 10 | include("dqn.jl") 11 | 12 | end # module 13 | -------------------------------------------------------------------------------- /julia/chapter12/rlexample/dqn-example.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Reinforcement learning - Q learning example 3 | # Chapter 12 4 | 5 | using NNGraph, DeepQLearning 6 | reload("DeepQLearning") 7 | 8 | dqn = DeepQLearning.DQN(10,100,5) 9 | 10 | s0 = randNNMat(10,1) 11 | a = DeepQLearning.forward(dqn, s0) 12 | DeepQLearning.act(dqn,s0) 13 | DeepQLearning.learn(dqn, 0.) 14 | 15 | s1 = randNNMat(10,1) 16 | a = DeepQLearning.forward(dqn, s1) 17 | DeepQLearning.act(dqn,s1) 18 | DeepQLearning.learn(dqn, 0.) 19 | -------------------------------------------------------------------------------- /julia/chapter12/rlexample/dqnruntest.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Reinforcement learning - Q learning example 3 | # Chapter 12 4 | 5 | using DeepQLearning 6 | using Base.Test 7 | 8 | # write your own tests here 9 | @test 1 == 1 10 | -------------------------------------------------------------------------------- /julia/chapter12/rlexample/dqntest1.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Reinforcement learning - Q learning example 3 | # Chapter 12 4 | 5 | using DeepQLearning, NNGraph 6 | # reload("DeepQLearning") 7 | xs = linspace(0,360,100) 8 | ys = round(sin(deg2rad(xs)),3) 9 | deg2rad(xs) 10 | # plot(x=xs,y=ys) 11 | 12 | m = DQN(2,100,2) 13 | alpha=0.0001; t_alpha =0.15 14 | epsilon = 0.2; t_epsilon =0.45 15 | 16 | init = [0. 0.] 17 | s0 = NNMatrix(init'); a0 = 1; r0 = 0. 
18 | t = 0 19 | for epoch = 1:1000 #0000 20 | t += 1 21 | avgReward = 0 22 | m.epsilon = epsilon * 1/t^t_epsilon 23 | m.alpha = alpha * 1/t^t_alpha 24 | for i = 2:length(xs) 25 | x, x2, y = xs[i],xs[i-1], ys[i] 26 | s = [x x2] 27 | s1 = NNMatrix(s') 28 | a1 = act(m,s1) 29 | r1 = (a1==1?-1:1) * sign(y) 30 | avgReward += r1 31 | if i > 2 learn(m,s0,a0,r1,s1) end 32 | s0 = s1; a0 = a1; r0 = r1 33 | end 34 | avgReward = avgReward / (length(xs)-2) 35 | if epoch % 100 == 0 println("$t $epoch avgReward = $(round(avgReward,3)) m.alpha=$(round(m.alpha,6)) m.epsilon=$(round(m.epsilon,6))") end 36 | end 37 | -------------------------------------------------------------------------------- /julia/chapter12/rlexample/readme.md: -------------------------------------------------------------------------------- 1 | This folder has a deep Q learning example code 2 | -------------------------------------------------------------------------------- /julia/chapter13/ensembleexample/Ensemble.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Ensemble learning example 3 | # Chapter 13 4 | 5 | # Ensemble module. 6 | module Ensemble 7 | 8 | # Load source files 9 | include("types.jl") 10 | include("util.jl") 11 | include("transformers.jl") 12 | 13 | end # module 14 | -------------------------------------------------------------------------------- /julia/chapter13/ensembleexample/decisiontree_test.jl: -------------------------------------------------------------------------------- 1 | module TestDecisionTreeWrapper 2 | 3 | include(joinpath("..", "fixture_learners.jl")) 4 | using .FixtureLearners 5 | nfcp = NumericFeatureClassification() 6 | 7 | using FactCheck 8 | 9 | 10 | importall Orchestra.Transformers.DecisionTreeWrapper 11 | using DecisionTree 12 | 13 | facts("DecisionTree learners") do 14 | context("PrunedTree gives same results as its backend") do 15 | # Predict with Orchestra learner 16 | learner = PrunedTree() 17 | orchestra_predictions = fit_and_transform!(learner, nfcp) 18 | 19 | # Predict with original backend learner 20 | srand(1) 21 | model = build_tree(nfcp.train_labels, nfcp.train_instances) 22 | model = prune_tree(model, 1.0) 23 | original_predictions = apply_tree(model, nfcp.test_instances) 24 | 25 | # Verify same predictions 26 | @fact orchestra_predictions => original_predictions 27 | end 28 | 29 | context("RandomForest gives same results as its backend") do 30 | # Predict with Orchestra learner 31 | learner = RandomForest() 32 | orchestra_predictions = fit_and_transform!(learner, nfcp) 33 | 34 | # Predict with original backend learner 35 | srand(1) 36 | model = build_forest( 37 | nfcp.train_labels, 38 | nfcp.train_instances, 39 | size(nfcp.train_instances, 2), 40 | 10, 41 | 0.7 42 | ) 43 | original_predictions = apply_forest(model, nfcp.test_instances) 44 | 45 | # Verify same predictions 46 | @fact orchestra_predictions => original_predictions 47 | end 48 | 49 | context("DecisionStumpAdaboost gives same results as its backend") do 50 | # Predict with Orchestra learner 51 | learner = DecisionStumpAdaboost() 52 | orchestra_predictions = fit_and_transform!(learner, nfcp) 53 | 54 | # Predict with original backend learner 55 | srand(1) 56 | model, coeffs = build_adaboost_stumps( 57 | nfcp.train_labels, 58 | nfcp.train_instances, 59 | 7 60 | ) 61 | original_predictions = apply_adaboost_stumps( 62 | model, coeffs, nfcp.test_instances 63 | ) 64 | 65 | # Verify same predictions 66 | @fact orchestra_predictions => original_predictions 
67 | end 68 | 69 | context("RandomForest handles training-dependent options") do 70 | # Predict with Orchestra learner 71 | learner = RandomForest({:impl_options => {:num_subfeatures => 2}}) 72 | orchestra_predictions = fit_and_transform!(learner, nfcp) 73 | 74 | # Verify RandomForest didn't die 75 | @fact 1 => 1 76 | end 77 | end 78 | 79 | end # module 80 | -------------------------------------------------------------------------------- /julia/chapter13/ensembleexample/dimensionalityreduction.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Ensemble learning example 3 | # Chapter 13 4 | 5 | # Dimensionality Reduction transformers. 6 | module DimensionalityReductionWrapper 7 | 8 | importall Orchestra.Types 9 | importall Orchestra.Util 10 | import DimensionalityReduction: pca 11 | 12 | export PCA, 13 | fit!, 14 | transform! 15 | 16 | # Principal Component Analysis rotation 17 | # on features. 18 | # Features ordered by maximal variance descending. 19 | # 20 | # Fails if zero-variance feature exists. 21 | type PCA <: Transformer 22 | model 23 | options 24 | 25 | function PCA(options=Dict()) 26 | default_options = { 27 | :center => true, 28 | :scale => true 29 | } 30 | new(nothing, nested_dict_merge(default_options, options)) 31 | end 32 | end 33 | 34 | function fit!(p::PCA, instances::Matrix, labels::Vector) 35 | pca_model = pca(instances; p.options...) 36 | p.model = pca_model 37 | end 38 | 39 | function transform!(p::PCA, instances::Matrix) 40 | return instances * p.model.rotation 41 | end 42 | 43 | end # module 44 | -------------------------------------------------------------------------------- /julia/chapter13/ensembleexample/dimensionalityreduction_test.jl: -------------------------------------------------------------------------------- 1 | module TestDimensionalityReductionWrapper 2 | 3 | include(joinpath("..", "fixture_learners.jl")) 4 | using .FixtureLearners 5 | fcp = FeatureClassification() 6 | 7 | using FactCheck 8 | 9 | 10 | importall Orchestra.Transformers.DimensionalityReductionWrapper 11 | 12 | facts("DimensionalityReduction transformers") do 13 | context("PCA transforms features") do 14 | instances = [ 15 | 5 10; 16 | -5 0; 17 | 0 5; 18 | ] 19 | labels = ["x"; "y"; "z"] 20 | options = {:center => false, :scale => false} 21 | pca = PCA(options) 22 | fit!(pca, instances, labels) 23 | transformed = transform!(pca, instances) 24 | 25 | @fact true => maximum(instances - transformed * pca.model.rotation') < 10e-4 26 | end 27 | end 28 | 29 | end # module 30 | -------------------------------------------------------------------------------- /julia/chapter13/ensembleexample/mlbase.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Ensemble learning example 3 | # Chapter 13 4 | 5 | # MLBase transformers. 6 | module MLBaseWrapper 7 | 8 | importall Orchestra.Types 9 | importall Orchestra.Util 10 | 11 | import MLBase: Standardize, estimate, transform 12 | 13 | export StandardScaler, 14 | fit!, 15 | transform! 16 | 17 | # Standardizes each feature using (X - mean) / stddev. 18 | # Will produce NaN if standard deviation is zero. 
19 | type StandardScaler <: Transformer 20 | model 21 | options 22 | 23 | function StandardScaler(options=Dict()) 24 | default_options = { 25 | :center => true, 26 | :scale => true 27 | } 28 | new(nothing, nested_dict_merge(default_options, options)) 29 | end 30 | end 31 | 32 | function fit!(st::StandardScaler, instances::Matrix, labels::Vector) 33 | st_transform = estimate(Standardize, instances'; st.options...) 34 | st.model = { 35 | :standardize_transform => st_transform 36 | } 37 | end 38 | 39 | function transform!(st::StandardScaler, instances::Matrix) 40 | st_transform = st.model[:standardize_transform] 41 | transposed_instances = instances' 42 | return transform(st_transform, transposed_instances)' 43 | end 44 | 45 | end # module 46 | -------------------------------------------------------------------------------- /julia/chapter13/ensembleexample/mlbase_test.jl: -------------------------------------------------------------------------------- 1 | module TestMLBaseWrapper 2 | 3 | include(joinpath("..", "fixture_learners.jl")) 4 | using .FixtureLearners 5 | fcp = FeatureClassification() 6 | 7 | using FactCheck 8 | 9 | 10 | importall Orchestra.Transformers.MLBaseWrapper 11 | 12 | facts("MLBase transformers") do 13 | context("StandardScaler transforms features") do 14 | instances = [ 15 | 5 10; 16 | -5 0; 17 | 0 5; 18 | ] 19 | labels = [ 20 | "x"; 21 | "y"; 22 | "z"; 23 | ] 24 | expected_transformed = [ 25 | 1.0 1.0; 26 | -1.0 -1.0; 27 | 0.0 0.0; 28 | ] 29 | standard_scaler = StandardScaler() 30 | fit!(standard_scaler, instances, labels) 31 | transformed = transform!(standard_scaler, instances) 32 | 33 | @fact transformed => expected_transformed 34 | end 35 | end 36 | 37 | end # module 38 | -------------------------------------------------------------------------------- /julia/chapter13/ensembleexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter13/ensembleexample/transformers.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Ensemble learning example 3 | # Chapter 13 4 | 5 | # Transformer definitions and implementations. 6 | module Transformers 7 | 8 | export Transformer, 9 | Learner, 10 | OneHotEncoder, 11 | Imputer, 12 | Pipeline, 13 | Wrapper, 14 | Identity, 15 | Baseline, 16 | PrunedTree, 17 | RandomForest, 18 | DecisionStumpAdaboost, 19 | StandardScaler, 20 | PCA, 21 | VoteEnsemble, 22 | StackEnsemble, 23 | BestLearner, 24 | SKLLearner, 25 | CRTLearner, 26 | fit!, 27 | transform! 
28 | 29 | # Obtain system details 30 | import Orchestra.System: LIB_SKL_AVAILABLE, LIB_CRT_AVAILABLE 31 | 32 | # Include abstract types as convenience 33 | importall Orchestra.Types 34 | 35 | # Include atomic Orchestra transformers 36 | include(joinpath("orchestra", "baseline.jl")) 37 | importall .BaselineMethods 38 | include(joinpath("orchestra", "transformers.jl")) 39 | importall .OrchestraTransformers 40 | 41 | # Include Julia transformers 42 | include(joinpath("julia", "decisiontree.jl")) 43 | importall .DecisionTreeWrapper 44 | include(joinpath("julia", "mlbase.jl")) 45 | importall .MLBaseWrapper 46 | include(joinpath("julia", "dimensionalityreduction.jl")) 47 | importall .DimensionalityReductionWrapper 48 | 49 | # Include Python transformers 50 | if LIB_SKL_AVAILABLE 51 | include(joinpath("python", "scikit_learn.jl")) 52 | importall .ScikitLearnWrapper 53 | end 54 | 55 | # Include R transformers 56 | if LIB_CRT_AVAILABLE 57 | include(joinpath("r", "caret.jl")) 58 | importall .CaretWrapper 59 | end 60 | 61 | # Include aggregate transformers last, dependent on atomic transformers 62 | include(joinpath("orchestra", "ensemble.jl")) 63 | importall .EnsembleMethods 64 | 65 | end # module 66 | -------------------------------------------------------------------------------- /julia/chapter13/ensembleexample/types.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Ensemble learning example 3 | # Chapter 13 4 | 5 | # types. 6 | module Types 7 | 8 | export Transformer, 9 | Learner, 10 | TestLearner, 11 | fit!, 12 | transform! 13 | 14 | # All transformer types must have implementations 15 | # of function `fit!` and `transform!`. 16 | abstract Transformer 17 | 18 | # Learner abstract type which all machine learners implement. 19 | abstract Learner <: Transformer 20 | 21 | # Test learner. 22 | # Used to separate production learners from test. 23 | abstract TestLearner <: Learner 24 | 25 | # Trains transformer on provided instances and labels. 26 | # 27 | # @param transformer Target transformer. 28 | # @param instances Training instances. 29 | # @param labels Training labels. 30 | function fit!(transformer::Transformer, instances::Matrix, labels::Vector) 31 | error(typeof(transformer), " does not implement fit!") 32 | end 33 | 34 | # Trains transformer on provided instances and labels. 35 | # 36 | # @param transformer Target transformer. 37 | # @param instances Original instances. 38 | # @return Transformed instances. 
39 | function transform!(transformer::Transformer, instances::Matrix) 40 | error(typeof(transformer), " does not implement transform!") 41 | end 42 | 43 | end # module 44 | -------------------------------------------------------------------------------- /julia/chapter13/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter5/decisiontreeexample/decision_tree_test1.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Decision Trees example 3 | # Chapter 5 4 | 5 | using Base.Test 6 | using DecisionTree 7 | 8 | n,m = 10^3, 5 ; 9 | features = rand(n,m); 10 | weights = rand(-1:1,m); 11 | labels = _int(features * weights); 12 | 13 | println("\n##### nfoldCV Classification Tree #####") 14 | accuracy = nfoldCV_tree(labels, features, 0.9, 3) 15 | @test mean(accuracy) > 0.7 16 | 17 | println("\n##### nfoldCV Classification Forest #####") 18 | accuracy = nfoldCV_forest(labels, features, 2, 10, 3) 19 | @test mean(accuracy) > 0.7 20 | 21 | println("\n##### nfoldCV Adaboosted Stumps #####") 22 | nfoldCV_stumps(labels, features, 7, 3) 23 | -------------------------------------------------------------------------------- /julia/chapter5/decisiontreeexample/decision_tree_test2.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Decision Trees example 3 | # Chapter 5 4 | 5 | 6 | using Base.Test 7 | using RDatasets 8 | using DecisionTree 9 | 10 | iris = dataset("datasets", "iris") 11 | features = array(iris[:, 1:4]); 12 | labels = array(iris[:, 5]); 13 | 14 | # train full-tree classifier 15 | model = build_tree(labels, features) 16 | # prune tree: merge leaves having >= 90% combined purity (default: 100%) 17 | model = prune_tree(model, 0.9) 18 | # pretty print of the tree, to a depth of 5 nodes (optional) 19 | print_tree(model, 5) 20 | # apply learned model 21 | apply_tree(model, [5.9,3.0,5.1,1.9]) 22 | # run n-fold cross validation for pruned tree, using 90% purity threshold purning, and 3 CV folds 23 | println("\n##### nfoldCV Classification Tree #####") 24 | accuracy = nfoldCV_tree(labels, features, 0.9, 3) 25 | @test mean(accuracy) > 0.8 26 | 27 | # train random forest classifier, using 2 random features, 10 trees and 0.5 of samples per tree (optional, defaults to 0.7) 28 | model = build_forest(labels, features, 2, 10, 0.5) 29 | # apply learned model 30 | apply_forest(model, [5.9,3.0,5.1,1.9]) 31 | # run n-fold cross validation for forests, using 2 random features, 10 trees, 3 folds, 0.5 of samples per tree (optional, defaults to 0.7) 32 | println("\n##### nfoldCV Classification Forest #####") 33 | accuracy = nfoldCV_forest(labels, features, 2, 10, 3, 0.5) 34 | @test mean(accuracy) > 0.8 35 | 36 | # train adaptive-boosted decision stumps, using 7 iterations 37 | model, coeffs = build_adaboost_stumps(labels, features, 7); 38 | # apply learned model 39 | apply_adaboost_stumps(model, coeffs, [5.9,3.0,5.1,1.9]) 40 | # run n-fold cross validation for boosted stumps, using 7 iterations and 3 folds 41 | println("\n##### nfoldCV Classification Adaboosted Stumps #####") 42 | nfoldCV_stumps(labels, features, 7, 3) 43 | 44 | -------------------------------------------------------------------------------- /julia/chapter5/decisiontreeexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | 
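Stepping back to the `Transformer` protocol defined in chapter 13's `types.jl` above: a transformer only has to provide `fit!` and `transform!` methods. The following is a minimal illustrative sketch, not part of the repository, written in the same pre-1.0 Julia style as the existing wrappers and assuming the same `Orchestra.Types` module path they import; the `MeanCenter` name and its behavior are hypothetical.

```julia
# Hypothetical example (not in the repository): a transformer that subtracts
# the per-feature mean, implementing the fit!/transform! protocol from types.jl.
module MeanCenterExample

importall Orchestra.Types

export MeanCenter, fit!, transform!

type MeanCenter <: Transformer
  model    # holds the per-feature means after fit!
  options

  function MeanCenter(options=Dict())
    new(nothing, options)
  end
end

function fit!(mc::MeanCenter, instances::Matrix, labels::Vector)
  # Learn the column means from the training instances.
  mc.model = mean(instances, 1)
end

function transform!(mc::MeanCenter, instances::Matrix)
  # Subtract the learned means from every row.
  return instances .- mc.model
end

end # module
```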
-------------------------------------------------------------------------------- /julia/chapter5/randomforstexample/RandomForests.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Decision Trees based learning - Random Forests example 3 | # Chapter 5 4 | 5 | module RandomForests 6 | 7 | export 8 | RandomForestClassifier, 9 | RandomForestRegressor, 10 | fit, 11 | predict, 12 | feature_importances, 13 | oob_error 14 | 15 | using DataFrames 16 | 17 | include("util.jl") 18 | include("tree.jl") 19 | include("randomforest.jl") 20 | include("classifier.jl") 21 | include("regressor.jl") 22 | 23 | end # RandomForests module 24 | -------------------------------------------------------------------------------- /julia/chapter5/randomforstexample/example.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Decision Trees based learning - Random Forests example 3 | # Chapter 5 4 | 5 | typealias TabularData Union{AbstractMatrix,DataFrame} 6 | 7 | type Example{T<:TabularData} 8 | x::T # tabular data 9 | y::AbstractVector 10 | n_labels::Int 11 | n_features::Int 12 | sample_weight::Vector{Float64} 13 | 14 | function Example(x::T, y::AbstractVector, sample_weight::Vector{Float64}) 15 | n_labels = length(unique(y)) 16 | n_features = size(x, 2) 17 | new(x, y, n_labels, n_features, sample_weight) 18 | end 19 | 20 | Example(x::T, y::AbstractVector) = Example{T}(x, y, ones(Float64, size(x, 1))) 21 | end 22 | -------------------------------------------------------------------------------- /julia/chapter5/randomforstexample/readme.md: -------------------------------------------------------------------------------- 1 | # RandomForests.jl 2 | 3 | CART-based random forest implementation in Julia. 4 | This package supports: 5 | * Classification model 6 | * Regression model 7 | * Out-of-bag (OOB) error 8 | * Feature importances 9 | * Various configurable parameters 10 | **Please be aware that this package is not yet fully examined implementation. You can use it at your own risk.** 11 | And your bug report or suggestion is welcome! 12 | ```julia 13 | RandomForestClassifier(;n_estimators::Int=10, 14 | max_features::Union(Integer, FloatingPoint, Symbol)=:sqrt, 15 | max_depth=nothing, 16 | min_samples_split::Int=2, 17 | criterion::Symbol=:gini) 18 | ``` 19 | 20 | ```julia 21 | RandomForestRegressor(;n_estimators::Int=10, 22 | max_features::Union(Integer, FloatingPoint, Symbol)=:third, 23 | max_depth=nothing, 24 | min_samples_split::Int=2) 25 | ``` 26 | 27 | * `n_estimators`: the number of weak estimators 28 | * `max_features`: the number of candidate features at each split 29 | * if `Integer` is given, the fixed number of features are used 30 | * if `FloatingPoint` is given, the proportion of given value (0.0, 1.0] are used 31 | * if `Symbol` is given, the number of candidate features is decided by a strategy 32 | * `:sqrt`: `ifloor(sqrt(n_features))` 33 | * `:third`: `div(n_features, 3)` 34 | * `max_depth`: the maximum depth of each tree 35 | * the default argument `nothing` means there is no limitation of the maximum depth 36 | * `min_samples_split`: the minimum number of sub-samples to try to split a node 37 | * `criterion`: the criterion of impurity measure (classification only) 38 | * `:gini`: Gini index 39 | * `:entropy`: Cross entropy 40 | 41 | `RandomForestRegressor` always uses the mean squared error for its impurity measure. 
42 | At the current moment, there is no configurable criteria for regression model. 43 | 44 | 45 | ## Related package 46 | * [DecisionTree.jl] 47 | * DecisionTree.jl is based on the ID3 (Iterative Dichotomiser 3) algorithm while RandomForests.jl uses CART (Classification And Regression Tree). 48 | 49 | ## Acknowledgement 50 | The algorithm and interface are highly inspired by those of [scikit-learn](http://scikit-learn.org). 51 | -------------------------------------------------------------------------------- /julia/chapter5/randomforstexample/sort.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Decision Trees based learning - Random Forests example 3 | # Chapter 5 4 | 5 | import Base: sort! 6 | 7 | const SMALL_THRESHOLD = 40 8 | 9 | function sort!(samples::AbstractVector, feature::AbstractVector, range::UnitRange{Int}) 10 | # inplace sort `samples` and `feature` vector in one shot, along with `feature`, between `range` 11 | len = length(range) 12 | @assert len == length(feature) <= length(samples) 13 | sort!(sub(samples, range), feature, 1, endof(feature)) 14 | end 15 | 16 | function sort!(x::AbstractVector, y::AbstractVector, lo::Int, hi::Int) 17 | # if the range subject to sorting is small, the insertion sort would be faster 18 | if hi - lo <= SMALL_THRESHOLD 19 | isort!(x, y, lo, hi) 20 | else 21 | # quick sort 22 | p = partition(x, y, lo, hi) 23 | sort!(x, y, lo, p - 1) 24 | sort!(x, y, p + 1, hi) 25 | end 26 | return 27 | end 28 | 29 | # insertion sort 30 | function isort!(x::AbstractVector, y::AbstractVector, lo::Int, hi::Int) 31 | @inbounds for i in lo+1:hi 32 | elm = y[i] 33 | tmp = x[i] 34 | j = i 35 | while j > lo && y[j-1] > elm 36 | # shift elements 37 | y[j] = y[j-1] 38 | x[j] = x[j-1] 39 | j -= 1 40 | end 41 | y[j] = elm 42 | x[j] = tmp 43 | end 44 | end 45 | 46 | function median(x::AbstractVector, i::Int, j::Int, k::Int) 47 | if x[i] < x[j] 48 | if x[j] < x[k] 49 | return j 50 | elseif x[k] < x[i] 51 | return i 52 | else 53 | return k 54 | end 55 | else 56 | # implies x[j] <= x[i] 57 | if x[k] <= x[j] 58 | return j 59 | elseif x[i] <= x[k] 60 | return i 61 | else 62 | return k 63 | end 64 | end 65 | end 66 | 67 | function partition(x::AbstractVector, y::AbstractVector, lo::Int, hi::Int) 68 | # choose pivot 69 | pivot_index = median(y, lo, hi, div(lo + hi, 2)) 70 | pivot_value = y[pivot_index] 71 | 72 | # swap elements at pivot_index and hi 73 | y[pivot_index], y[hi] = y[hi], y[pivot_index] 74 | x[pivot_index], x[hi] = x[hi], x[pivot_index] 75 | 76 | p = lo 77 | @inbounds for i in lo:hi-1 78 | if y[i] <= pivot_value 79 | # swap elements at i and p 80 | y[i], y[p] = y[p], y[i] 81 | x[i], x[p] = x[p], x[i] 82 | p += 1 83 | end 84 | end 85 | 86 | # swap elements at p and hi 87 | y[p], y[hi] = y[hi], y[p] 88 | x[p], x[hi] = x[hi], x[p] 89 | 90 | p 91 | end 92 | -------------------------------------------------------------------------------- /julia/chapter5/randomforstexample/util.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Decision Trees based learning - Random Forests example 3 | # Chapter 5 4 | 5 | function Base.vec(x::DataFrame) 6 | n, m = size(x) 7 | n == 1 || error("x must be a single record") 8 | [x[n, j] for j in 1:m] 9 | end 10 | -------------------------------------------------------------------------------- /julia/chapter5/readme.md: -------------------------------------------------------------------------------- 1 
| 2 | -------------------------------------------------------------------------------- /julia/chapter6/knnexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter6/knnexample/test.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/julia/chapter6/knnexample/test.zip -------------------------------------------------------------------------------- /julia/chapter6/knnexample/train.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/julia/chapter6/knnexample/train.zip -------------------------------------------------------------------------------- /julia/chapter6/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter6/svmexample/examplesvm1.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Support Vector Machine example 3 | # Chapter 6 4 | 5 | reposDir = EnvHash()["JuliaRepos"] 6 | 7 | ## Load julia-svm 8 | load("svm.jl") 9 | 10 | n = int(1e3) 11 | p = 20 12 | X = rand(n, p) 13 | y = float(randi((0, 1), n)) 14 | 15 | svp = svmproblem(y, X) 16 | svparam = svmparameter("epsilon_svr", "rbf", int32(3), 17 | 1., 0., 40., 0.001, 18 | 1., 0.5, 19 | 1., int32(1), int32(0)) 20 | model = svmtrain(svp, svparam) 21 | 22 | X2 = rand(10, p) 23 | 24 | pred = svmpredict(model, X2) 25 | -------------------------------------------------------------------------------- /julia/chapter6/svmexample/readme.md: -------------------------------------------------------------------------------- 1 | Julia bindings to libsvm 2 | -------------------------------------------------------------------------------- /julia/chapter7/aprioriexample/apriori.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Association rule based learning - Apriori example 3 | # Chapter 7 4 | 5 | # Find k-freq-itemset in given transactions of items queried together 6 | using StatsBase 7 | 8 | # Find frequent itemsets from transactions 9 | # @T: transaction list 10 | # @minsupp: minimum support 11 | function find_freq_itemset(T, minsupp) 12 | N = length(T) 13 | # Find itemset I from transaction list T 14 | I = Array(Int64,0) 15 | for t in T 16 | for i in t 17 | push!(I,i) 18 | end 19 | end 20 | I = Set(I) 21 | 22 | # Find freq-itemset when k = 1: Fₖ = {i | i ∈ I^σ({i}) ≥ N × minsupp} 23 | k = 1 24 | F = [] 25 | push!(F,map(x->[x],filter(i->σ(i,T) >= N * minsupp, I))) # F₁ 26 | while true 27 | Cₖ = gen_candidate(F[end]) # Generate candidate set Cₖ from Fₖ₋₁ 28 | Fₖ = filter(c->σ(c,T) >= Nbumanzu * minsupp, Cₖ) 29 | if !isempty(Fₖ) 30 | push!(F,Fₖ) # Eliminate infrequent candidates, then set to Fₖ 31 | else break 32 | end 33 | end 34 | F 35 | end 36 | 37 | # Generate freq-itemset from a list of itemsets 38 | # @x: list of itemsets 39 | function gen_candidate(x) 40 | n = length(x) 41 | Cₖ = Array(Array{Int64,1},0) 42 | for a = 1:n, b = 1:n 43 | if a >= b;continue 44 | end 45 | is_candidate = true 46 | sort!(x[a]); sort!(x[b]) 47 | for i in 
1:length(x[1])-1 48 | if x[a][i] == x[b][i]; continue 49 | else is_candidate = false; break 50 | end 51 | end 52 | if is_candidate 53 | push!(Cₖ, sort!([ x[a][1:end-1], x[a][end], x[b][end] ])) 54 | end 55 | end 56 | Cₖ 57 | end 58 | 59 | # Generate rules from frequent itemsets 60 | # @x: list of frequent itemsets 61 | # @T: Transaction list 62 | function gen_rules(x, T) 63 | if length(x) <= 1; return [] # F contains 1-itemsets only, hence no rules generated. 64 | end 65 | x = reduce(append!,x[2:end]) 66 | R = Array(Rule,0) 67 | for f in x # f as each freq-f-itemset fₖ 68 | ap_genrules!(R,f,map(i->Array([i]),f),T) # H₁ itemset is same as f 69 | end 70 | R 71 | end 72 | 73 | function ap_genrules!(R, f, H, T) 74 | k, m = length(f), length(H[1]) 75 | if k > m + 1 76 | H = gen_candidate(H) 77 | H_plus_1 = [] 78 | for h in H 79 | p = setdiff(f,h) 80 | if conf(p, h, T) >= minconf 81 | push!(R, Rule(p,h)) 82 | push!(H_plus_1, h) 83 | end 84 | end 85 | ap_genrules(R, f, H_plus_1, T) 86 | end 87 | end 88 | -------------------------------------------------------------------------------- /julia/chapter7/aprioriexample/aprioritest.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Association rule based learning - Apriori TEST 3 | # Chapter 7 4 | 5 | using Base.Test 6 | include("apriori.jl") 7 | 8 | function _gen_dummy_data!(transactions) 9 | range = [1:10] 10 | for i in 1:length(transactions) 11 | transactions[i] = sample(range, sample(range, 1)[1], replace = false) 12 | end 13 | transactions 14 | end 15 | 16 | minsupp = 0.5 17 | minconf = 0.2 18 | T = Array(Array{Int64,1},10) 19 | T[1] = [1,2] 20 | T[2] = [1,3,4,5] 21 | T[3] = [2,3,4,6] 22 | T[4] = [1,2,3,4] 23 | T[5] = [1,2,3,6] 24 | T[6] = [1,2,3,5] 25 | T[7] = [1,2,3,5,6] 26 | T[8] = [1,3,4,5,6] 27 | T[9] = [1,2,3,4,5,6] 28 | T[10] = [1,2,3,4,5] 29 | 30 | @test_approx_eq σ([2,3,4],T[1:5]) 2 31 | @test_approx_eq supp([2,3],4,T[1:5]) 0.4 32 | @test_approx_eq_eps conf([2,3],4,T[1:5]) 0.67 1e-2 33 | 34 | F₂ = Array(Array{Int64,1},4) 35 | F₂[1] = [3,4] 36 | F₂[2] = [1,3] 37 | F₂[3] = [1,2] 38 | F₂[4] = [2,3] 39 | @test length(gen_candidate(F₂)) == 1 40 | @test gen_candidate(F₂)[1] == [1,2,3] 41 | 42 | F = find_freq_itemset(T[1:5], minsupp) 43 | @test length(F) == 2 44 | @test Set(F[1]) == Set([1],[2],[3],[4]) 45 | @test Set(F[2]) == Set([3,4],[1,3],[1,2],[2,3]) 46 | -------------------------------------------------------------------------------- /julia/chapter7/aprioriexample/common.jl: -------------------------------------------------------------------------------- 1 | # Common types and functions 2 | 3 | type Rule 4 | P::Array{Int64} # Antecedent 5 | Q::Array{Int64} # Consequent 6 | end 7 | 8 | # Support Count: σ(x) = | {tᵢ|x ⊆ tᵢ,tᵢ∈ T}| 9 | function σ(x, T) 10 | ret = 0 11 | for t in T 12 | ⊆(x,t) && (ret += 1) 13 | end 14 | ret 15 | end 16 | 17 | # Support of itemset x -> y, which x does not intersect y. 18 | supp(x,y,T) = σ(∪(x,y),T)/length(T) 19 | 20 | # Confidence of itemset x-> y, which x does not intersect y. 
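# Worked example (illustrative, not from the original sources): for
# T = [[1,2], [1,3], [1,2,3]]
#   σ([1,2], T)      == 2      (two transactions contain both items)
#   supp([1],[2], T) == 2/3    (support of the rule 1 -> 2)
#   conf([2],[1], T) == 2/2    (every transaction holding item 2 also holds item 1)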
21 | conf(x,y,T) = σ(∪(x,y),T)/σ(x,T) 22 | -------------------------------------------------------------------------------- /julia/chapter7/aprioriexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter7/fpgrowthexample/common.jl: -------------------------------------------------------------------------------- 1 | # Common types and functions 2 | 3 | type Rule 4 | P::Array{Int64} # Antecedent 5 | Q::Array{Int64} # Consequent 6 | end 7 | 8 | # Support Count: σ(x) = | {tᵢ|x ⊆ tᵢ,tᵢ∈ T}| 9 | function σ(x, T) 10 | ret = 0 11 | for t in T 12 | ⊆(x,t) && (ret += 1) 13 | end 14 | ret 15 | end 16 | 17 | # Support of itemset x -> y, which x does not intersect y. 18 | supp(x,y,T) = σ(∪(x,y),T)/length(T) 19 | 20 | # Confidence of itemset x-> y, which x does not intersect y. 21 | conf(x,y,T) = σ(∪(x,y),T)/σ(x,T) 22 | -------------------------------------------------------------------------------- /julia/chapter7/fpgrowthexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter7/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter8/k-meansexample/k-means.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Clustering based learning - L-mena clustering 3 | # Chapter 8 4 | 5 | using Images 6 | using ImageView 7 | using Color 8 | 9 | # Run non-interactively, accept either an image or a directory 10 | # If directory, iterate through image filetypes 11 | 12 | # Enter filepath and number of dominant colors wanted, k 13 | function dominant_colors (filename, k) 14 | if isfile(filename) 15 | img = imread(filename,RGB) 16 | init(img, filename, k) 17 | elseif isdir(filename) 18 | files = filter!(abspath(r"\.(?:jpe?g|gif|png|tiff)$")), readdir()) #creates an array of filenames by filtering out only files that in file extensions. 19 | for i in files 20 | dominant_colors(i) 21 | end 22 | else 23 | error("No image found.") 24 | end 25 | end 26 | 27 | type Point 28 | coords::Array # Color associated with pt, a 3D array. 29 | ct:Int # Count 30 | end 31 | 32 | type Cluster 33 | points::Array # Points associated with cluster, C_k 34 | centroid::Point # Center of cluster, assumed mean of pt values 35 | k::Int # Cluster count 36 | end 37 | 38 | function init(img, filename, k) 39 | # Convert color space from sRGB (linear) to CIEXYZ to CIELAB 40 | run(`convert $filename -thumbnail 200x200 $filename`) #convert to thumbnail via ImageMagick CLI 41 | img = convert(Image{LAB}, img) #use Color to convert to LAB automagically 42 | points = getpoints(img) 43 | randclusters(points, k) 44 | kmeans(points, k) 45 | 46 | function getpoints(img) 47 | points = [] 48 | count = 0 49 | for count, color in img[1:width(img)] 50 | for count, color in img[2:height(img)] 51 | count += 1 52 | points.append(Point(color, count)) 53 | end 54 | return points 55 | end 56 | 57 | # Sq. 
euclidean distance 58 | function distance (pt1, pt2) 59 | return mapreduce((pt1.coords[i]-pt2.coords[i])**2, +, 1:length(pt1.coords)) 60 | end 61 | 62 | # Randomly assign pixels to represent intial centroid/clusters 63 | function randclusters(pts::Array, k) 64 | kclusters = [] 65 | for n = 1:k 66 | kclusters.append(Cluster(pts, pts[rand(1:end)], n) 67 | return kclusters 68 | end 69 | 70 | # Recalculate Centroid 71 | function recenter(points, ) 72 | 73 | # K-Means Algorithm (alternate between resigning points to a cluster based on similarity and cluster centroid based on the points assigned) 74 | function kmeans() 75 | # Repeat n number of times 76 | 77 | # Optionally convert returned clusters back to sRGB -------------------------------------------------------------------------------- /julia/chapter8/k-meansexample/readme.md: -------------------------------------------------------------------------------- 1 | Finding dominant colors in an image ala Google's Palette using K-Means clustering in Julia. 2 | Reference from a MIT license code 3 | -------------------------------------------------------------------------------- /julia/chapter8/readme.md: -------------------------------------------------------------------------------- 1 | Finding dominant colors in an image ala Google's Palette using K-Means clustering in Julia. 2 | Reference from a MIT license code 3 | -------------------------------------------------------------------------------- /julia/chapter9/naivebayesexample/NaiveBayes.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Bayesian learning - Naive Bayes example 3 | # Chapter 9 4 | module NaiveBayes 5 | 6 | export NBModel, 7 | MultinomialNB, 8 | GaussianNB, 9 | fit, 10 | predict, 11 | predict_proba 12 | 13 | include("nbtypes.jl") 14 | include("core.jl") 15 | 16 | end 17 | -------------------------------------------------------------------------------- /julia/chapter9/naivebayesexample/datastats.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Bayesian learning - Naive Bayes example 3 | # Chapter 9 4 | 5 | using Base.BLAS 6 | 7 | # type for collecting data statistics incrementally 8 | type DataStats 9 | x_sums::Vector{Float64} # sum(x_i) 10 | cross_sums::Matrix{Float64} # sum(x_i'*x_i) (lower-triangular matrix) 11 | n_obs::Uint64 # number of observations 12 | obs_axis::Int64 # observation axis, e.g. size(X, obs_axis) 13 | # should return number of observations 14 | function DataStats(n_vars, obs_axis=1) 15 | @assert obs_axis == 1 || obs_axis == 2 16 | new(zeros(Float64, n_vars), zeros(Float64, n_vars, n_vars), 0, obs_axis) 17 | end 18 | end 19 | 20 | 21 | function Base.show(io::IO, dstats::DataStats) 22 | print(io, "DataStats(n_vars=$(length(dstats.x_sums))," * 23 | "n_obs=$(dstats.n_obs),obs_axis=$(dstats.obs_axis))") 24 | end 25 | 26 | 27 | # Collect data statistics. 28 | # This method may be called multiple times on different 29 | # data samples to collect aggregative statistics. 30 | function updatestats(dstats::DataStats, X::Matrix{Float64}) 31 | trans = dstats.obs_axis == 1 ? 
'T' : 'N' 32 | axpy!(1.0, sum(X, dstats.obs_axis), dstats.x_sums) 33 | syrk!('L', trans, 1.0, X, 1.0, dstats.cross_sums) 34 | dstats.n_obs += size(X, dstats.obs_axis) 35 | return dstats 36 | end 37 | 38 | function Base.mean(dstats::DataStats) 39 | @assert (dstats.n_obs >= 1) "At least 1 observations is requied" 40 | return dstats.x_sums ./ dstats.n_obs 41 | end 42 | 43 | function Base.cov(dstats::DataStats) 44 | @assert (dstats.n_obs >= 2) "At least 2 observations are requied" 45 | mu = mean(dstats) 46 | C = (dstats.cross_sums - dstats.n_obs * (mu*mu')) / (dstats.n_obs - 1) 47 | Base.LinAlg.copytri!(C, 'L') 48 | return C 49 | end 50 | 51 | 52 | -------------------------------------------------------------------------------- /julia/chapter9/naivebayesexample/nbexampledata-iris.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Bayesian learning - Naive Bayes example 3 | # Chapter 9 4 | 5 | using NaiveBayes 6 | using RDatasets 7 | 8 | iris = dataset("datasets", "iris") 9 | 10 | # observations in columns and variables in rows 11 | X = array(iris[:, 1:4])' 12 | p, n = size(X) 13 | # by default species is a PooledDataArray, 14 | y = [species for species in iris[:, 5]] 15 | 16 | # how much data use for training 17 | train_frac = 0.9 18 | k = int(floor(train_frac * n)) 19 | idxs = randperm(n) 20 | train = idxs[1:k] 21 | test = idxs[k+1:end] 22 | 23 | model = GaussianNB(unique(y), p) 24 | fit(model, X[:, train], y[train]) 25 | 26 | accuracy = countnz(predict(model, X[:,test]) .== y[test]) / countnz(test) 27 | 28 | println("Accuracy: $accuracy") 29 | -------------------------------------------------------------------------------- /julia/chapter9/naivebayesexample/nbtest1.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Bayesian learning - Naive Bayes example 3 | # Chapter 9 4 | 5 | using StatsBase 6 | using NaiveBayes 7 | 8 | function test_multinomial() 9 | print("testing MultinomialNB... ") 10 | m = MultinomialNB([:a, :b, :c], 5) 11 | X = [1 2 5 2; 12 | 5 3 -2 1; 13 | 0 2 1 11; 14 | 6 -1 3 3; 15 | 5 7 7 1] 16 | y = [:a, :b, :a, :c] 17 | 18 | fit(m, X, y) 19 | @assert predict(m, X) == y 20 | println("OK") 21 | end 22 | 23 | function test_gaussian() 24 | print("testing GaussianNB... 
") 25 | n_obs = 100 26 | m = GaussianNB([:a, :b, :c], 5) 27 | X = randn(5, n_obs) 28 | y = sample([:a, :b, :c], n_obs) 29 | 30 | fit(m, X, y) 31 | accuracy = sum(predict(m, X) .== y) / n_obs 32 | println(accuracy) 33 | println("OK") 34 | end 35 | 36 | 37 | -------------------------------------------------------------------------------- /julia/chapter9/naivebayesexample/nbtest2.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Bayesian learning - Naive Bayes example 3 | # Chapter 9 4 | 5 | include("ntests1.jl") 6 | 7 | # normal (variables on columns) 8 | X = rand(40, 10) 9 | ds = DataStats(10) 10 | updatestats(ds, X[1:20, :]) 11 | updatestats(ds, X[21:end, :]) 12 | 13 | @assert all((cov(X) - cov(ds)) .< 0.0001) 14 | 15 | # transposed (variables on rows) 16 | X = rand(40, 10) 17 | ds = DataStats(10, 2) 18 | updatestats(ds, X') 19 | 20 | @assert all((cov(X) - cov(ds)) .< 0.0001) 21 | 22 | println("All OK") 23 | -------------------------------------------------------------------------------- /julia/chapter9/naivebayesexample/nbtypes.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Bayesian learning - Naive Bayes example 3 | # Chapter 9 4 | 5 | using Distributions 6 | 7 | include("datastats.jl") 8 | 9 | # Base type for Naive Bayes models. 10 | # Inherited classes should have at least following fields: 11 | # c_counts::Dict{C, Int64} - count of ocurrences of each class 12 | # n_obs::Int64 - total number of observations 13 | abstract NBModel{C} 14 | 15 | ##################################### 16 | ##### Multinomial Naive Bayes ##### 17 | ##################################### 18 | 19 | type MultinomialNB{C} <: NBModel 20 | c_counts::Dict{C, Int64} # count of ocurrences of each class 21 | x_counts::Dict{C, Vector{Number}} # count/sum of occurrences of each var 22 | x_totals::Vector{Number} # total occurrences of each var 23 | n_obs::Int64 # total number of seen observations 24 | end 25 | 26 | 27 | # Multinomial Naive Bayes classifier 28 | # 29 | # classes : array of objects 30 | # Class names 31 | # n_vars : Int64 32 | # Number of variables in observations 33 | # alpha : Number (optional, default 1) 34 | # Smoothing parameter. E.g. if alpha equals 1, each variable in each class 35 | # is believed to have 1 observation by default 36 | function MultinomialNB{C}(classes::Vector{C}, n_vars::Int64; alpha=1) 37 | c_counts = Dict(classes, ones(Int64, length(classes)) * alpha) 38 | x_counts = Dict{C, Vector{Int64}}() 39 | for c in classes 40 | x_counts[c] = ones(Int64, n_vars) * alpha 41 | end 42 | x_totals = ones(Float64, n_vars) * alpha * length(c_counts) 43 | MultinomialNB{C}(c_counts, x_counts, x_totals, sum(x_totals)) 44 | end 45 | 46 | 47 | function Base.show(io::IO, m::MultinomialNB) 48 | print(io, "MultinomialNB($(m.c_counts))") 49 | end 50 | 51 | 52 | ##################################### 53 | ###### Gaussian Naive Bayes ####### 54 | ##################################### 55 | 56 | type GaussianNB{C} <: NBModel 57 | c_counts::Dict{C, Int64} # count of ocurrences of each class 58 | c_stats::Dict{C, DataStats} # aggregative data statistics 59 | gaussians::Dict{C, MvNormal} # precomputed distribution 60 | # x_counts::Dict{C, Vector{Number}} # ?? count/sum of occurrences of each var 61 | # x_totals::Vector{Number} # ?? 
total occurrences of each var 62 | n_obs::Int64 # total number of seen observations 63 | end 64 | 65 | 66 | function GaussianNB{C}(classes::Vector{C}, n_vars::Int64) 67 | c_counts = Dict(classes, zeros(Int64, length(classes))) 68 | c_stats = Dict(classes, [DataStats(n_vars, 2) for i=1:length(classes)]) 69 | gaussians = Dict{C, MvNormal}() 70 | GaussianNB{C}(c_counts, c_stats, gaussians, 0) 71 | end 72 | 73 | 74 | function Base.show(io::IO, m::GaussianNB) 75 | print(io, "GaussianNB($(m.c_counts))") 76 | end 77 | -------------------------------------------------------------------------------- /julia/chapter9/naivebayesexample/readme.md: -------------------------------------------------------------------------------- 1 | Naive Bayes classifier. Currently 2 types of NB are supported: 2 | 3 | * **MultinomialNB** - assumes variables have a multinomial distribution. Good e.g. for text classification. See `examples/nums.jl` for usage. 4 | * **GaussianNB** - assumes variables have a multivariate normal distribution. Good for real-valued data. See `examples/iris.jl` for usage. 5 | 6 | Since `GaussianNB` models multivariate distribution, it's not really a "naive" classifier (i.e. no independence assumption is made), so the name may change in the future. 7 | -------------------------------------------------------------------------------- /julia/chapter9/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mahout/chapter10/linearregressionexample/readme.md: -------------------------------------------------------------------------------- 1 | There are no direct APIs for implementing Linear Regression Algorithm. There are some alternative ways of solving this problem using Hadoop MapReduce and Python or R packages. 
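For instance, a plain least-squares fit can be computed with the linear algebra classes bundled in `mahout-math`. The sketch below only illustrates that route and is not part of this example: the `LeastSquaresSketch` class name and the toy data are invented, and it assumes the `DenseMatrix` and `QRDecomposition` classes from `mahout-math`.

```java
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.QRDecomposition;

public class LeastSquaresSketch {
    public static void main(String[] args) {
        // Toy data generated from y ≈ 2 + 3x; the first column of A is the intercept term.
        double[][] a = {{1, 1}, {1, 2}, {1, 3}, {1, 4}};
        double[][] y = {{5.1}, {7.9}, {11.2}, {13.8}};

        Matrix A = new DenseMatrix(a);
        Matrix Y = new DenseMatrix(y);

        // Least-squares solution of A * beta = Y via a QR decomposition.
        Matrix beta = new QRDecomposition(A).solve(Y);

        System.out.println("intercept = " + beta.get(0, 0));
        System.out.println("slope     = " + beta.get(1, 0));
    }
}
```

For an over-determined system such as this one, `solve` returns the coefficient matrix that minimizes the squared residual, which is the ordinary least-squares estimate of the intercept and slope.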
2 | 3 | This is a placeholder folder and implementation details will be added as and when supporting APIs are made available with the maout distributions 4 | 5 | Reference to list of algorithms supported by mahout can be found here: 6 | https://mahout.apache.org/users/basics/algorithms.html 7 | 8 | -------------------------------------------------------------------------------- /mahout/chapter10/logisticregressionexample/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.packt.pml.mahout.logreg 6 | logistic-regression 7 | 1.0-SNAPSHOT 8 | jar 9 | 10 | logistic-regression 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | 26 | net.sf.opencsv 27 | opencsv 28 | 2.3 29 | 30 | 31 | org.apache.mahout 32 | mahout-examples 33 | 0.9 34 | jar 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /mahout/chapter10/logisticregressionexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mahout/chapter10/logisticregressionexample/src/main/java/com/packt/pml/mahout/logreg/LogisticRegressionApp.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Practical Machine learning 3 | * Logistic Regression Example 4 | * Chapter 10 5 | */ 6 | package com.packt.pml.mahout.logreg; 7 | 8 | import au.com.bytecode.opencsv.CSVReader; 9 | import au.com.bytecode.opencsv.CSVWriter; 10 | import java.io.FileNotFoundException; 11 | import java.io.FileReader; 12 | import java.io.FileWriter; 13 | import java.io.IOException; 14 | 15 | 16 | /** 17 | * Hello world! 
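 * Prepares the training data for this logistic regression example: reads
 * $WORK_DIR/train/train.csv, appends an "action" column whose value (SELL or BUY)
 * is derived by comparing the value in column index 4 of consecutive rows, and
 * writes the result to $WORK_DIR/train/final.csv.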
18 | * 19 | */ 20 | public class LogisticRegressionApp 21 | { 22 | public static void main( String[] args ) throws FileNotFoundException, IOException 23 | { 24 | 25 | CSVReader reader = new CSVReader(new FileReader("$WORK_DIR/train/train.csv")); 26 | 27 | String [] nextLine; 28 | String [] previousLine; 29 | String [] headernew = new String [reader.readNext().length + 1]; 30 | 31 | CSVWriter writer = new CSVWriter(new FileWriter("$WORK_DIR/train/final.csv"), ','); 32 | 33 | nextLine = reader.readNext(); 34 | 35 | for (int i = 0; i < nextLine.length;i++) 36 | { 37 | headernew[i] = nextLine[i]; 38 | } 39 | 40 | headernew[headernew.length-1] = "action"; 41 | writer.writeNext(headernew); 42 | 43 | previousLine = reader.readNext(); 44 | 45 | 46 | while ((nextLine = reader.readNext()) != null) { 47 | // nextLine[] is an array of values from the line 48 | System.out.println(nextLine[0] + nextLine[1] + "etc..."); 49 | headernew = new String [nextLine.length + 1]; 50 | 51 | for (int i = 0; i < headernew.length-1;i++) 52 | { 53 | headernew[i] = nextLine[i]; 54 | } 55 | 56 | if ( 57 | Double.parseDouble(previousLine[4]) < Double.parseDouble(nextLine[4]) 58 | ) 59 | { 60 | headernew[headernew.length] = "SELL"; 61 | } else { 62 | headernew[headernew.length] = "BUY"; 63 | } 64 | 65 | writer.writeNext(headernew); 66 | 67 | previousLine = nextLine; 68 | 69 | 70 | } 71 | 72 | reader.close(); 73 | writer.close(); 74 | 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /mahout/chapter10/logisticregressionexample/src/test/java/com/packt/pml/mahout/logreg/LogisticRegressionTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Practical Machine learning 3 | * Logistic Regression Example 4 | * Chapter 10 5 | */ 6 | package com.packt.pml.mahout.logreg; 7 | 8 | import junit.framework.Test; 9 | import junit.framework.TestCase; 10 | import junit.framework.TestSuite; 11 | 12 | /** 13 | * Unit test for simple App. 
14 | */ 15 | public class LogisticRegressionTest 16 | extends TestCase 17 | { 18 | /** 19 | * Create the test case 20 | * 21 | * @param testName name of the test case 22 | */ 23 | public LogisticRegressionTest( String testName ) 24 | { 25 | super( testName ); 26 | } 27 | 28 | /** 29 | * @return the suite of tests being tested 30 | */ 31 | public static Test suite() 32 | { 33 | return new TestSuite( LogisticRegressionTest.class ); 34 | } 35 | 36 | /** 37 | * Rigourous Test :-) 38 | */ 39 | public void testLogisticRegression() 40 | { 41 | assertTrue( true ); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /mahout/chapter10/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mahout/chapter11/annexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mahout/chapter11/dlexample/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | EPFL 4 | DeepManuscriptLearning 5 | 0.0.1 6 | 7 | src 8 | 9 | 10 | maven-compiler-plugin 11 | 3.1 12 | 13 | 1.7 14 | 1.7 15 | 16 | 17 | 18 | 19 | 20 | 21 | org.apache.spark 22 | spark-core_2.10 23 | 1.3.0 24 | 25 | 26 | org.apache.spark 27 | spark-mllib_2.10 28 | 1.3.0 29 | 30 | 31 | com.google.protobuf 32 | protobuf-java 33 | 2.6.1 34 | 35 | 36 | junit 37 | junit-dep 38 | 4.8.2 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /mahout/chapter11/dlexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mahout/chapter11/dlexample/src/main/java/AutoencoderComputedParams.java: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Deep learning - Autoencoder example 3 | // Chapter 11 4 | 5 | package main.java; 6 | 7 | import java.io.Serializable; 8 | 9 | public class AutoencoderComputedParams implements Serializable { 10 | 11 | private long numSamples; 12 | private double[] sparsityArray; 13 | 14 | public AutoencoderComputedParams(long numSamples, double[] sparsityArray) { 15 | super(); 16 | this.numSamples = numSamples; 17 | this.sparsityArray = sparsityArray; 18 | } 19 | 20 | public long getNumSamples() { 21 | return numSamples; 22 | } 23 | 24 | public void setNumSamples(long numSamples) { 25 | this.numSamples = numSamples; 26 | } 27 | 28 | public double[] getSparsityArray() { 29 | return sparsityArray; 30 | } 31 | 32 | public void setSparsityArray(double[] sparsityArray) { 33 | this.sparsityArray = sparsityArray; 34 | } 35 | 36 | 37 | 38 | 39 | } 40 | -------------------------------------------------------------------------------- /mahout/chapter11/dlexample/src/main/java/AutoencoderFctGrd.java: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Deep learning - Autoencoder example 3 | // Chapter 11 4 | package main.java; 5 | 6 | import java.io.Serializable; 7 | 8 | import org.apache.spark.mllib.linalg.DenseMatrix; 9 | import org.apache.spark.mllib.linalg.DenseVector; 10 | 11 | public class AutoencoderFctGrd implements Serializable{ 12 | 13 | private DenseMatrix w1; 14 | private 
DenseMatrix w2; 15 | private DenseVector b1; 16 | private DenseVector b2; 17 | private double value; 18 | 19 | public AutoencoderFctGrd(DenseMatrix w1, DenseMatrix w2, DenseVector b1, DenseVector b2,double value) { 20 | this.w1 = w1; 21 | this.w2 = w2; 22 | this.b1 = b1; 23 | this.b2 = b2; 24 | this.value = value; 25 | } 26 | 27 | public DenseMatrix getW1() { 28 | return w1; 29 | } 30 | 31 | public void setW1(DenseMatrix w1) { 32 | this.w1 = w1; 33 | } 34 | 35 | public DenseMatrix getW2() { 36 | return w2; 37 | } 38 | 39 | public void setW2(DenseMatrix w2) { 40 | this.w2 = w2; 41 | } 42 | 43 | public DenseVector getB1() { 44 | return b1; 45 | } 46 | 47 | public void setB1(DenseVector b1) { 48 | this.b1 = b1; 49 | } 50 | 51 | public DenseVector getB2() { 52 | return b2; 53 | } 54 | 55 | public void setB2(DenseVector b2) { 56 | this.b2 = b2; 57 | } 58 | 59 | public double getValue() { 60 | return value; 61 | } 62 | 63 | public void setValue(double value) { 64 | this.value = value; 65 | } 66 | 67 | @Override 68 | public String toString() { 69 | String r = new String(); 70 | //r = "Contains: "+w1.toString()+" "+w2.toString()+" "+b1.toString()+" "+b2.toString()+" "+value+"\n"; 71 | r = "Contains: "+value+"\n"; 72 | return r; 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /mahout/chapter11/dlexample/src/main/java/AutoencoderLearner.java: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Deep learning - Autoencoder example 3 | // Chapter 11 4 | 5 | package main.java; 6 | 7 | import main.java.DeepModelSettings.ConfigBaseLayer; 8 | import main.java.DeepModelSettings.ConfigKMeans; 9 | 10 | import org.apache.spark.api.java.JavaRDD; 11 | import org.apache.spark.mllib.linalg.Vector; 12 | 13 | public class AutoencoderLearner implements Learner{ 14 | 15 | private AutoencoderConfig conf; 16 | 17 | public AutoencoderLearner(ConfigBaseLayer configLayer) { 18 | this.conf = new AutoencoderConfig(configLayer); 19 | } 20 | 21 | @Override 22 | public Vector[] call(JavaRDD data) throws Exception { 23 | return new Autoencoder(conf).train(data); 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /mahout/chapter11/dlexample/src/main/java/AutoencoderParams.java: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Deep learning - Autoencoder example 3 | // Chapter 11 4 | 5 | package main.java; 6 | 7 | import java.io.Serializable; 8 | 9 | import org.apache.spark.mllib.linalg.DenseMatrix; 10 | import org.apache.spark.mllib.linalg.DenseVector; 11 | 12 | public class AutoencoderParams implements Serializable { 13 | 14 | private DenseMatrix w1; 15 | private DenseMatrix w2; 16 | private DenseVector b1; 17 | private DenseVector b2; 18 | 19 | public AutoencoderParams(DenseMatrix w1, DenseMatrix w2, DenseVector b1, DenseVector b2) { 20 | this.w1 = w1; 21 | this.w2 = w2; 22 | this.b1 = b1; 23 | this.b2 = b2; 24 | } 25 | 26 | public DenseMatrix getW1() { 27 | return w1; 28 | } 29 | 30 | public void setW1(DenseMatrix w1) { 31 | this.w1 = w1; 32 | } 33 | 34 | public DenseMatrix getW2() { 35 | return w2; 36 | } 37 | 38 | public void setW2(DenseMatrix w2) { 39 | this.w2 = w2; 40 | } 41 | 42 | public DenseVector getB1() { 43 | return b1; 44 | } 45 | 46 | public void setB1(DenseVector b1) { 47 | this.b1 = b1; 48 | } 49 | 50 | public DenseVector getB2() { 51 | return b2; 52 | } 53 | 54 | 
public void setB2(DenseVector b2) { 55 | this.b2 = b2; 56 | } 57 | 58 | 59 | 60 | } 61 | -------------------------------------------------------------------------------- /mahout/chapter11/dlexample/src/main/java/AutoencoderSigmoid.java: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Deep learning - Autoencoder example 3 | // Chapter 11 4 | 5 | package main.java; 6 | 7 | public class AutoencoderSigmoid { 8 | 9 | 10 | //Thread safe singleton class since Spark is run one thread per Partition 11 | 12 | private static volatile AutoencoderSigmoid instance = null; 13 | 14 | 15 | 16 | private AutoencoderSigmoid(){ 17 | 18 | } 19 | 20 | public static AutoencoderSigmoid getInstance(){ 21 | 22 | if (instance == null){ 23 | synchronized (AutoencoderSigmoid.class) { 24 | if (instance == null){ 25 | instance = new AutoencoderSigmoid(); 26 | } 27 | } 28 | } 29 | return instance; 30 | } 31 | 32 | 33 | public static double getValue(double x){ 34 | int i = (int) Math.round(x*100); 35 | return values[i]; 36 | //alternatively make a 3 point average 37 | } 38 | //to complete with generated values 39 | private static double[] values = new double[]{-1.0,0.0,1.0}; 40 | } 41 | -------------------------------------------------------------------------------- /mahout/chapter11/dlexample/src/main/java/two_layers_autoencoders_model.prototxt: -------------------------------------------------------------------------------- 1 | config_layer { 2 | config_preprocess { 3 | eps_1 : 0.1 4 | eps_2 : 10 5 | } 6 | 7 | config_autoencoders { 8 | number_of_units : 700 9 | rho : 0.5 10 | lambda : 0.0001 11 | beta : 1 12 | numEpochs : 1 13 | numBatches : 2 14 | alpha_init : 0.05 15 | alpha_step : 2.0 16 | alpha_max_steps : 10 17 | } 18 | 19 | config_feature_extractor { 20 | input_dim1: 128 21 | input_dim2: 32 22 | feature_dim1: 32 23 | feature_dim2: 32 24 | } 25 | config_pooler { 26 | pool_size: 2 27 | } 28 | } 29 | 30 | config_layer { 31 | config_autoencoders { 32 | number_of_units : 700 33 | rho : 0.5 34 | lambda : 0.0001 35 | beta : 1 36 | numEpochs : 1 37 | numBatches : 2 38 | alpha_init : 0.05 39 | alpha_step : 2.0 40 | alpha_max_steps : 10 41 | } 42 | config_pooler { 43 | pool_size: 2 44 | } 45 | } -------------------------------------------------------------------------------- /mahout/chapter11/dlexample/src/test/java/RankTest.java: -------------------------------------------------------------------------------- 1 | package test.java; 2 | 3 | import java.io.Serializable; 4 | import java.util.ArrayList; 5 | import java.util.Arrays; 6 | import java.util.Comparator; 7 | import java.util.List; 8 | 9 | import main.java.ComputeSimilarity; 10 | import main.java.MatrixOps; 11 | 12 | import org.apache.commons.lang.ArrayUtils; 13 | import org.apache.spark.api.java.JavaRDD; 14 | import org.apache.spark.api.java.JavaSparkContext; 15 | import org.apache.spark.mllib.linalg.DenseMatrix; 16 | import org.apache.spark.mllib.linalg.Vector; 17 | import org.apache.spark.mllib.linalg.Vectors; 18 | import org.junit.After; 19 | import org.junit.Assert; 20 | import org.junit.Before; 21 | import org.junit.Ignore; 22 | import org.junit.Test; 23 | 24 | public class RankTest implements Serializable { 25 | 26 | /** 27 | * 28 | */ 29 | private static final long serialVersionUID = 8707243339400493968L; 30 | private transient JavaSparkContext sc; 31 | 32 | 33 | /** 34 | * @throws java.lang.Exception 35 | */ 36 | @Before 37 | public void setUp() throws Exception { 38 | sc = new 
JavaSparkContext("local", "FeatureExtractionTest"); 39 | } 40 | 41 | 42 | /** 43 | * @throws java.lang.Exception 44 | */ 45 | @After 46 | public void tearDown() throws Exception { 47 | sc.stop(); 48 | sc = null; 49 | } 50 | 51 | 52 | @Test @Ignore 53 | public void rankTest() { 54 | 55 | // simple example 56 | double[] x1 = {0.35, 0.65, 0.28, 0.12}; 57 | double[] x2 = {0.86, 0.96, 0.34, 0.57}; 58 | double[] query = {0.46, 0.92, 0.78, 0.34}; 59 | 60 | double[] expected_output = {0.955073918586867, 0.897967422096528}; 61 | 62 | Vector queryV = Vectors.dense(query); 63 | 64 | // create a parallel dataset from the local matrix 65 | List matX = new ArrayList(2); 66 | matX.add(Vectors.dense(x1)); 67 | matX.add(Vectors.dense(x2)); 68 | JavaRDD matRDD = sc.parallelize(matX); 69 | 70 | // compute cosine similarities 71 | JavaRDD sims = matRDD.map(new ComputeSimilarity(queryV)); 72 | 73 | final Double[] output = sims.collect().toArray(new Double[2]); 74 | final double[] outputD = ArrayUtils.toPrimitive(output); 75 | 76 | // sort the similarities and the indices 77 | final Integer[] idx = new Integer[2]; 78 | for (int i = 0; i < 2; i++) { 79 | idx[i] = i; 80 | } 81 | Arrays.sort(idx, new Comparator() { 82 | @Override 83 | public int compare(final Integer o1, final Integer o2) { 84 | return Double.compare(outputD[o1], outputD[o2]); 85 | } 86 | }); 87 | System.out.println("Sorted indices"); 88 | for (int i = 0; i < 2; i++) { 89 | System.out.println(idx[i]); 90 | } 91 | 92 | Assert.assertArrayEquals(expected_output, outputD, 1e-6); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /mahout/chapter11/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mahout/chapter12/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mahout/chapter12/rlexample/readme.md: -------------------------------------------------------------------------------- 1 | There are no direct APIs for implementing reinforcement learning. This is a placeholder folder and implementation details will be added as and when supporting APIs are made available with the maout distributions 2 | 3 | Reference to list of algorithms supported by mahout can be found here: 4 | https://mahout.apache.org/users/basics/algorithms.html 5 | -------------------------------------------------------------------------------- /mahout/chapter13/ensembleexample/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.packt.pml.mahout.ensemble 6 | ensemble 7 | 1.0-SNAPSHOT 8 | jar 9 | 10 | ensemble 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | org.apache.hadoop 26 | hadoop-core 27 | 0.20.2 28 | jar 29 | 30 | 31 | org.apache.mahout 32 | mahout-core 33 | 0.9 34 | jar 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /mahout/chapter13/ensembleexample/readme.md: -------------------------------------------------------------------------------- 1 | A ensemble of different distributed recommendation systems using Apache Mahout. 
2 | -------------------------------------------------------------------------------- /mahout/chapter13/ensembleexample/src/main/java/com/packt/pml/mahout/ensemble/ItemRecommender.java: -------------------------------------------------------------------------------- 1 | /** Practical Machine learning 2 | * Ensemble learning 3 | * Chapter 13 4 | **/ 5 | 6 | package com.packt.pml.mahout.ensemble; 7 | 8 | /* 9 | * A item based recommender model. 10 | */ 11 | 12 | import java.io.File; 13 | import java.io.IOException; 14 | 15 | import org.apache.mahout.cf.taste.common.TasteException; 16 | import org.apache.mahout.cf.taste.eval.RecommenderBuilder; 17 | import org.apache.mahout.cf.taste.impl.eval.RMSRecommenderEvaluator; 18 | import org.apache.mahout.cf.taste.impl.model.file.FileDataModel; 19 | import org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender; 20 | import org.apache.mahout.cf.taste.model.DataModel; 21 | import org.apache.mahout.cf.taste.recommender.Recommender; 22 | import org.apache.mahout.cf.taste.similarity.ItemSimilarity; 23 | import org.apache.mahout.cf.taste.impl.similarity.EuclideanDistanceSimilarity; 24 | import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity; 25 | 26 | class ItembasedBuilder implements RecommenderBuilder{ 27 | int k; 28 | ItemSimilarity similarity; 29 | 30 | ItembasedBuilder(int similarityMeasure, DataModel dataModel) throws TasteException{ 31 | 32 | if(similarityMeasure==0) 33 | similarity = new EuclideanDistanceSimilarity(dataModel); 34 | else 35 | similarity = new PearsonCorrelationSimilarity(dataModel); 36 | } 37 | public Recommender buildRecommender(DataModel dataModel) throws TasteException { 38 | return new GenericItemBasedRecommender(dataModel, similarity); 39 | } 40 | 41 | } 42 | 43 | public class ItemRecommender { 44 | 45 | public static void main(String args[]) throws IOException, TasteException{ 46 | 47 | DataModel model = new FileDataModel(new File("data/input/u1.base")); 48 | RMSRecommenderEvaluator evaluator = new RMSRecommenderEvaluator(); 49 | ItembasedBuilder builder; 50 | double score; 51 | builder = new ItembasedBuilder(1,model); 52 | score = evaluator.evaluate(builder, null, model, 0.8, 0.7); 53 | System.out.println(score); 54 | } 55 | } -------------------------------------------------------------------------------- /mahout/chapter13/ensembleexample/src/main/java/com/packt/pml/mahout/ensemble/SlopeOneBasedRecommender.java: -------------------------------------------------------------------------------- 1 | /** Practical Machine learning 2 | * Ensemble learning 3 | * Chapter 13 4 | **/ 5 | 6 | package com.packt.pml.mahout.ensemble; 7 | 8 | /* 9 | * A slope one based recommender model. 
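 * Slope One predicts a user's unknown rating for an item from the average
 * rating differences observed between that item and the items the user has
 * already rated; it is a deliberately simple and cheap collaborative filter.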
10 | */ 11 | 12 | import java.io.File; 13 | import java.io.IOException; 14 | 15 | import org.apache.mahout.cf.taste.common.TasteException; 16 | import org.apache.mahout.cf.taste.eval.RecommenderBuilder; 17 | import org.apache.mahout.cf.taste.impl.eval.RMSRecommenderEvaluator; 18 | import org.apache.mahout.cf.taste.impl.model.file.FileDataModel; 19 | import org.apache.mahout.cf.taste.impl.recommender.slopeone.SlopeOneRecommender; 20 | import org.apache.mahout.cf.taste.model.DataModel; 21 | import org.apache.mahout.cf.taste.recommender.Recommender; 22 | 23 | 24 | public class SlopeOneBasedRecommender { 25 | 26 | public static void main(String args[]) throws IOException, TasteException{ 27 | 28 | DataModel model = new FileDataModel(new File("data/input/u.data")); 29 | RMSRecommenderEvaluator evaluator = new RMSRecommenderEvaluator(); 30 | 31 | RecommenderBuilder builder = new RecommenderBuilder() { 32 | public Recommender buildRecommender(DataModel model)throws TasteException { 33 | SlopeOneRecommender slope = new SlopeOneRecommender(model); 34 | System.out.println(slope.recommend(199, 3).toString()); 35 | 36 | return slope; 37 | } 38 | }; 39 | 40 | double score = evaluator.evaluate(builder, null, model, 0.8, 0.7); 41 | System.out.println(score); 42 | } 43 | } 44 | 45 | /*OutPut 46 | * Score = 0.9507197266125407 47 | * [RecommendedItem[item:1175, value:7.0], RecommendedItem[item:1158, value:6.0], RecommendedItem[item:1026, value:5.7245636]] 48 | */ -------------------------------------------------------------------------------- /mahout/chapter13/ensembleexample/src/main/java/com/packt/pml/mahout/ensemble/Utilities.java: -------------------------------------------------------------------------------- 1 | /** Practical Machine learning 2 | * Ensemble learning 3 | * Chapter 13 4 | **/ 5 | 6 | package com.packt.pml.mahout.ensemble; 7 | 8 | /* 9 | * A collection of utility functions for working with the ensemble 10 | * 11 | */ 12 | import java.util.ArrayList; 13 | import java.util.HashMap; 14 | import java.util.Iterator; 15 | import java.util.List; 16 | import java.util.Set; 17 | 18 | class Utilities{ 19 | HashMap> hm; 20 | public Utilities() { 21 | // TODO Auto-generated constructor stub 22 | hm = new HashMap>(); 23 | } 24 | 25 | public void insert(Long item, Float value){ 26 | 27 | if(!hm.containsKey(item)) 28 | hm.put(item, new ArrayList()); 29 | hm.get(item).add(value); 30 | } 31 | 32 | public void show(){ 33 | System.out.println(hm); 34 | } 35 | 36 | public HashMap getAverage(){ 37 | HashMap result = new HashMap(); 38 | Set items = hm.keySet(); 39 | Iterator it = items.iterator(); 40 | float sum,avg; 41 | while(it.hasNext()){ 42 | Long key = it.next(); 43 | List values = hm.get(key); 44 | Iterator itv = values.iterator(); 45 | sum=0; 46 | while(itv.hasNext()) 47 | sum = sum + itv.next(); 48 | avg = sum/values.size(); 49 | result.put(key, new Float(avg)); 50 | } 51 | 52 | return result; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /mahout/chapter13/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mahout/chapter5/decisiontreeexample/readme.md: -------------------------------------------------------------------------------- 1 | Please refer to the RandomForest implementation as a reference. The DecisionFOrest API is used for this purpose, but with the following specific implementation lines.. 
2 | 3 | int numberOfTrees = 1; 4 | Data data = loadData(...); 5 | DecisionForest forest = buildForest(numberOfTrees, data); 6 | 7 | String path = "saved-trees/" + numberOfTrees + "-trees.txt"; 8 | DataOutputStream dos = new DataOutputStream(new FileOutputStream(path)); 9 | 10 | forest.write(dos); 11 | -------------------------------------------------------------------------------- /mahout/chapter5/randomforestexample/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.packt.pml.mahout.randomforest 6 | random-forest 7 | 1.0-SNAPSHOT 8 | jar 9 | 10 | random-forest 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | org.apache.mahout 26 | mahout-core 27 | 0.8-SNAPSHOT 28 | jar 29 | 30 | 31 | org.uncommons.maths 32 | uncommons-maths 33 | 1.2.2 34 | jar 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /mahout/chapter5/randomforestexample/readme.md: -------------------------------------------------------------------------------- 1 | This folder has an example implementation for random forest overed as a part of chapter 5 using Apache Mahout 0.9 distribution. 2 | -------------------------------------------------------------------------------- /mahout/chapter5/randomforestexample/src/test/java/com/packt/pml/mahout/randomforest/RandomForestTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Practical Machine learning 3 | * Random Forest Example 4 | * Chapter 05 5 | * @author sunilag 6 | */ 7 | package com.packt.pml.mahout.randomforest; 8 | 9 | import junit.framework.Test; 10 | import junit.framework.TestCase; 11 | import junit.framework.TestSuite; 12 | 13 | /** 14 | * Unit test for simple App. 15 | */ 16 | public class RandomForestTest 17 | extends TestCase 18 | { 19 | /** 20 | * Create the test case 21 | * 22 | * @param testName name of the test case 23 | */ 24 | public RandomForestTest( String testName ) 25 | { 26 | super( testName ); 27 | } 28 | 29 | /** 30 | * @return the suite of tests being tested 31 | */ 32 | public static Test suite() 33 | { 34 | return new TestSuite( RandomForestTest.class ); 35 | } 36 | 37 | /** 38 | * Rigourous Test :-) 39 | */ 40 | public void testRandomForest() 41 | { 42 | assertTrue( true ); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /mahout/chapter6/knnexample/readme.md: -------------------------------------------------------------------------------- 1 | There are no direct APIs for implementing KNN algorithm. 
A derived implementation is presented in this folder and implementation details will be added as and when supporting APIs are made available with the maout distributions 2 | 3 | Reference to list of algorithms supported by mahout can be found here: 4 | https://mahout.apache.org/users/basics/algorithms.html 5 | -------------------------------------------------------------------------------- /mahout/chapter6/knnexample/test/java/WeightedMatrixTest.java: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // K Nearest Neighbor example 3 | // Chapter 6 4 | 5 | package test.java; 6 | 7 | import main.java.KNearestNeighbor; 8 | 9 | import org.apache.spark.mllib.linalg.Matrix; 10 | import org.apache.spark.mllib.linalg.Vector; 11 | import org.apache.spark.mllib.linalg.Vectors; 12 | 13 | public class WeightedMatrixTest { 14 | 15 | public static void main(String[] args) { 16 | double testArray[][] = {{1,2,3,4,5,6},{1,2,3,4,5,7},{2,4,6,8,10,12},{2,3,4,5,6,7},{5,5,5,5,5,5}}; 17 | System.out.println(testArray.length); 18 | Vector[] vectors = new Vector[5]; 19 | 20 | for (int i = 0; i < 5; i++) { 21 | vectors[i] = Vectors.dense(testArray[i]); 22 | System.out.println("Vector " + i +": " + vectors[i].toArray()[0] + " " + vectors[i].toArray()[1] + " " + vectors[i].toArray()[2] + " " + vectors[i].toArray()[3] + " " + vectors[i].toArray()[4] + " " + vectors[i].toArray()[5] + " "); 23 | } 24 | KNearestNeighbor w = new KNearestNeighbor(vectors, 3,0,1,1); 25 | Matrix m = w.getWeightedMatrix(); 26 | System.out.println("Matrix:"); 27 | int size = m.numCols(); 28 | double[] mA = m.toArray(); 29 | for(int i=0; i 3 | 4.0.0 4 | 5 | com.packt.pml.mahout.fpgrowth 6 | fp-growth 7 | 1.0-SNAPSHOT 8 | jar 9 | 10 | fp-growth 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | org.apache.hadoop 26 | hadoop-core 27 | 0.20.2 28 | jar 29 | 30 | 31 | org.apache.mahout 32 | mahout-core 33 | 0.9 34 | jar 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /mahout/chapter7/fpgrowthexample/readme.md: -------------------------------------------------------------------------------- 1 | This folder has an example implementation for fp-growth algorithm using Apache Mahout 0.9 distribution covered as a part of chapter 7 association rule based learning methods 2 | -------------------------------------------------------------------------------- /mahout/chapter7/fpgrowthexample/src/test/java/com/packt/pml/mahout/fpgrowth/FPgrowthTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Practical Machine Learning 3 | * Aoocition rule based learning - FP growth example 4 | * Chapter 07 5 | **/ 6 | package com.packt.pml.mahout.fpgrowth; 7 | 8 | import junit.framework.Test; 9 | import junit.framework.TestCase; 10 | import junit.framework.TestSuite; 11 | 12 | /** 13 | * Unit test for simple App. 
14 | */ 15 | public class FPgrowthTest 16 | extends TestCase 17 | { 18 | /** 19 | * Create the test case 20 | * 21 | * @param testName name of the test case 22 | */ 23 | public FPgrowthTest( String testName ) 24 | { 25 | super( testName ); 26 | } 27 | 28 | /** 29 | * @return the suite of tests being tested 30 | */ 31 | public static Test suite() 32 | { 33 | return new TestSuite( FPgrowthTest.class ); 34 | } 35 | 36 | /** 37 | * Rigourous Test :-) 38 | */ 39 | public void testApp() 40 | { 41 | assertTrue( true ); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /mahout/chapter8/k-meansexample/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.packt.pml.mahout.clustering 6 | k-means 7 | 1.0-SNAPSHOT 8 | jar 9 | 10 | k-means 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | org.apache.hadoop 26 | hadoop-core 27 | 0.20.2 28 | jar 29 | 30 | 31 | org.apache.mahout 32 | mahout-core 33 | 0.9 34 | jar 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /mahout/chapter8/k-meansexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mahout/chapter8/k-meansexample/src/main/java/com/packt/pml/mahout/kmeans/DataPreprocessing.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Practical Machine learning 3 | * Clustering based learning - K-means clustering Example 4 | * Chapter 8 5 | */ 6 | package com.packt.pml.mahout.kmeans; 7 | 8 | import java.io.IOException; 9 | 10 | import org.apache.hadoop.conf.Configuration; 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.fs.FileSystem; 13 | 14 | import chapter7.src.InputDriver; 15 | 16 | public class DataPreprocessing { 17 | 18 | public static void main(String args[]) throws ClassNotFoundException, IOException, InterruptedException 19 | { 20 | 21 | Configuration conf = new Configuration(); 22 | conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml")); 23 | conf.addResource(new Path("/usr/local/hadoop/conf/hdfs-site.xml")); 24 | 25 | //create the file system object and pass the configuration object 26 | FileSystem fileSystem = FileSystem.get(conf); 27 | //We then create the input and output Path Objects. 
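        // The steps below: point the job at the raw clustering input directory,
        // remove any previously generated sequence-file output, and run InputDriver
        // to encode each record as a RandomAccessSparseVector for Mahout's k-means job.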
28 | 29 | 30 | //define the input and sequence file directory 31 | String inputPath="chapter7/clustering_input"; 32 | String inputSeq="clustering_seq"; 33 | 34 | Path inputDir = new Path(inputPath); 35 | Path inputSeqDir = new Path(inputSeq); 36 | 37 | if (fileSystem.exists(inputSeqDir)) { 38 | System.out.println("Output already exists"); 39 | fileSystem.delete(inputSeqDir, true); 40 | System.out.println("deleted output directory"); 41 | } 42 | 43 | //The last step is to encode the vectors using the //RandomAccessSparseVector 44 | InputDriver.runJob(inputDir, inputSeqDir, 45 | "org.apache.mahout.math.RandomAccessSparseVector",conf); 46 | 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /mahout/chapter8/k-meansexample/src/test/java/com/packt/pml/mahout/kmeans/KMeansTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Practical Machine learning 3 | * Clustering based learning - K-means clustering Example 4 | * Chapter 8 5 | */ 6 | package com.packt.pml.mahout.kmeans; 7 | 8 | import junit.framework.Test; 9 | import junit.framework.TestCase; 10 | import junit.framework.TestSuite; 11 | 12 | /** 13 | * Unit test for simple App. 14 | */ 15 | public class KMeansTest 16 | extends TestCase 17 | { 18 | /** 19 | * Create the test case 20 | * 21 | * @param testName name of the test case 22 | */ 23 | public KMeansTest( String testName ) 24 | { 25 | super( testName ); 26 | } 27 | 28 | /** 29 | * @return the suite of tests being tested 30 | */ 31 | public static Test suite() 32 | { 33 | return new TestSuite( KMeansTest.class ); 34 | } 35 | 36 | /** 37 | * Rigourous Test :-) 38 | */ 39 | public void testApp() 40 | { 41 | assertTrue( true ); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /mahout/chapter8/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mahout/chapter9/naivebayesexample/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.packt.pml.mahout.naivebayes 6 | naive-bayes 7 | 1.0-SNAPSHOT 8 | jar 9 | 10 | naive-bayes 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | org.apache.mahout 26 | mahout-core 27 | 0.9 28 | jar 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /mahout/chapter9/naivebayesexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mahout/chapter9/naivebayesexample/src/main/java/start.sh: -------------------------------------------------------------------------------- 1 | export WORK_DIR=/chapter9/mahout/naive-bayes 2 | mkdir $WORK_DIR 3 | cd $WORK_DIR 4 | wget http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz 5 | tar –xvzf 20news-bydate.tar.gz 6 | mkdir ${WORK_DIR}/20news-all 7 | mkdir ${WORK_DIR}/20news-seq 8 | cp -R ${WORK_DIR}/20news-bydate*/*/* ${WORK_DIR}/20news-all 9 | 10 | 11 | mahout seqdirectory -i ${WORK_DIR}/20news-all -o ${WORK_DIR}/20news-seq -ow 12 | 13 | mahout seq2sparse -i ${WORK_DIR}/20news-seq -o ${WORK_DIR}/20news-vectors -lnorm -nv -wt tfidf -ow 14 | 15 | mahout split -i ${WORK_DIR}/20news-vectors/tfidf-vectors --trainingOutput 
${WORK_DIR}/20news-train-vectors --testOutput ${WORK_DIR}/20news-test-vectors --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential -------------------------------------------------------------------------------- /mahout/chapter9/naivebayesexample/src/test/java/com/packt/pml/mahout/naivebayes/NaiveBayesTest.java: -------------------------------------------------------------------------------- 1 | package com.packt.pml.mahout.naivebayes; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 9 | */ 10 | public class NaiveBayesTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public NaiveBayesTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( NaiveBayesTest.class ); 29 | } 30 | 31 | /** 32 | * Rigourous Test :-) 33 | */ 34 | public void testNaiveBayes() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /mahout/chapter9/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter10/linearregressionexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter10/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter11/annexample/ann.py: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Deep learning - Artificial Neural Networks example 3 | # Chapter 11 4 | 5 | from pybrain.supervised.trainers import BackpropTrainer 6 | from pybrain.tools.shortcuts import buildNetwork 7 | from pybrain.datasets import SupervisedDataSet 8 | from pybrain.structure import SigmoidLayer 9 | 10 | network = buildNetwork(2, 5, 1, hiddenclass=SigmoidLayer) 11 | 12 | data_set = SupervisedDataSet(2, 1) 13 | data_set.addSample((0, 0), [0]) 14 | data_set.addSample((0, 1), [1]) 15 | data_set.addSample((1, 0), [1]) 16 | data_set.addSample((1, 1), [0]) 17 | 18 | trainer = BackpropTrainer(module=network, dataset=data_set, momentum=0.00, learningrate=0.10, weightdecay=0.0, 19 | lrdecay=1.0) 20 | 21 | error = 1 22 | epochsToTrain = 0 23 | while error > 0.0001: 24 | epochsToTrain += 1 25 | error = trainer.train() 26 | 27 | results = network.activateOnDataset(data_set) 28 | for i in range(len(results)): 29 | print data_set['input'][i][0], 'xor', data_set['input'][i][1], '=', int(results[i] > 0.5) 30 | 31 | """ 32 | 0.0 xor 0.0 = 0 33 | 0.0 xor 1.0 = 1 34 | 1.0 xor 0.0 = 1 35 | 1.0 xor 1.0 = 0 36 | """ 37 | 38 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter11/annexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter11/dlexample/example-1-data.ods: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/python-sckit-learn/chapter11/dlexample/example-1-data.ods -------------------------------------------------------------------------------- /python-sckit-learn/chapter11/dlexample/perceptron-data.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/python-sckit-learn/chapter11/dlexample/perceptron-data.ods -------------------------------------------------------------------------------- /python-sckit-learn/chapter11/dlexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter11/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter12/readme.md: -------------------------------------------------------------------------------- 1 | Reference from Andrew Ng example for reinforcement learning: 2 | http://cs229.stanford.edu/notes/cs229-notes12.pdf 3 | 4 | scikit-learn provides excellent tools for supervised and unsupervised learning but explicitly does not deal with reinforcement learning. 5 | This example implementation is intended to compliment the functionality of scikit-learn. 6 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter12/rlexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter13/ensembleexample/ensemble_predict.py: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Ensemble learning 3 | # Chapter 13 4 | 5 | """ 6 | ========================================================== 7 | Prediction utility for trained EnsembleSelectionClassifier 8 | ========================================================== 9 | 10 | Get predictions from trained EnsembleSelectionClassifier given 11 | svm format data file. 12 | 13 | Can output predicted classes or probabilities from the full 14 | ensemble or just the best model. 15 | 16 | Expects to find a trained ensemble in the sqlite db specified. 
17 | 18 | usage: ensemble_predict.py [-h] [-s {best,ens}] [-p] db_file data_file 19 | 20 | Get EnsembleSelectionClassifier predictions 21 | 22 | positional arguments: 23 | db_file sqlite db file containing model 24 | data_file testing data in svm format 25 | 26 | optional arguments: 27 | -h, --help show this help message and exit 28 | -s {best,ens} choose source of prediction ["best", "ens"] 29 | -p predict probabilities 30 | """ 31 | from __future__ import print_function 32 | 33 | import numpy as np 34 | 35 | from argparse import ArgumentParser 36 | 37 | from sklearn.datasets import load_svmlight_file 38 | 39 | from ensemble import EnsembleSelectionClassifier 40 | 41 | 42 | def parse_args(): 43 | desc = 'Get EnsembleSelectionClassifier predictions' 44 | parser = ArgumentParser(description=desc) 45 | 46 | parser.add_argument('db_file', help='sqlite db file containing model') 47 | parser.add_argument('data_file', help='testing data in svm format') 48 | 49 | help_fmt = 'choose source of prediction ["best", "ens"] (default "ens")' 50 | parser.add_argument('-s', dest='pred_src', 51 | choices=['best', 'ens'], 52 | help=help_fmt, default='ens') 53 | 54 | parser.add_argument('-p', dest='return_probs', 55 | action='store_true', default=False, 56 | help='predict probabilities') 57 | 58 | return parser.parse_args() 59 | 60 | 61 | if (__name__ == '__main__'): 62 | res = parse_args() 63 | 64 | X, _ = load_svmlight_file(res.data_file) 65 | X = X.toarray() 66 | 67 | ens = EnsembleSelectionClassifier(db_file=res.db_file, models=None) 68 | 69 | if (res.pred_src == 'best'): 70 | preds = ens.best_model_predict_proba(X) 71 | else: 72 | preds = ens.predict_proba(X) 73 | 74 | if (not res.return_probs): 75 | preds = np.argmax(preds, axis=1) 76 | 77 | for p in preds: 78 | if (res.return_probs): 79 | mesg = " ".join(["%.5f" % v for v in p]) 80 | else: 81 | mesg = p 82 | 83 | print(mesg) 84 | 85 | # Original Author: David C. Lambert [dcl -at- panix -dot- com] 86 | # Copyright(c) 2013 87 | # License: Simple BSD -------------------------------------------------------------------------------- /python-sckit-learn/chapter13/ensembleexample/readme.md: -------------------------------------------------------------------------------- 1 | ######An implementation of [Caruana et al's Ensemble Selection algorithm] (http://www.cs.cornell.edu/~caruana/ctp/ct.papers/caruana.icml04.icdm06long.pdf) [1][2] in Python, based on [scikit-learn](http://scikit-learn.org). 
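A typical invocation of the `ensemble_predict.py` utility above, assuming an ensemble already trained into an sqlite db and a test set in svmlight format (both file names below are placeholders):

```
# full-ensemble class probabilities
python ensemble_predict.py -s ens -p ensemble.db test.svm

# hard class predictions from the single best model only
python ensemble_predict.py -s best ensemble.db test.svm
```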
2 | 3 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter13/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter5/decisiontreeexample/data/ad.names: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/python-sckit-learn/chapter5/decisiontreeexample/data/ad.names -------------------------------------------------------------------------------- /python-sckit-learn/chapter5/decisiontreeexample/information-gain.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/python-sckit-learn/chapter5/decisiontreeexample/information-gain.ods -------------------------------------------------------------------------------- /python-sckit-learn/chapter5/randomforstexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter5/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter6/knnexample/iris_data/README.md: -------------------------------------------------------------------------------- 1 | # Iris Dataset 2 | 3 | This Dataset has been obtained from [UCI ML Repository](http://archive.ics.uci.edu/ml/datasets/Iris). 4 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter6/knnexample/knn_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/python-sckit-learn/chapter6/knnexample/knn_example.png -------------------------------------------------------------------------------- /python-sckit-learn/chapter6/knnexample/readme.md: -------------------------------------------------------------------------------- 1 | # kNN using the scikit-learn package 2 | A basic k-Nearest Neighbour implementation in python using the scikitlearn package. 3 | 4 | # Usage 5 | Look for code in `demo.py`. 
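A minimal sketch of the scikit-learn kNN workflow this example follows (a hypothetical snippet for illustration, not the contents of `demo.py`; it assumes the bundled iris data and the pre-0.18 `sklearn.cross_validation` module used elsewhere in this repository):

```python
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.25, random_state=0)

# fit a 5-nearest-neighbour classifier and report held-out accuracy
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))
```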
6 | 7 | # Dependencies 8 | - Numpy 9 | - Matplotlib 10 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter6/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter6/svmexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter6/svmexample/svm.py: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Support Vector Machines example 3 | # Chapter 6 4 | 5 | # Example: ExImage Recognition with Support Vector Machines 6 | 7 | import sklearn as sk 8 | import numpy as np 9 | import matplotlib 10 | import matplotlib.pyplot as plt 11 | 12 | # print 'IPython version:', IPython.__version__ 13 | # print 'numpy version:', np.__version__ 14 | # print 'scikit-learn version:', sk.__version__ 15 | # print 'matplotlib version:', matplotlib.__version__ 16 | from sklearn.datasets import fetch_olivetti_faces 17 | 18 | # fetch the faces data 19 | faces = fetch_olivetti_faces() 20 | 21 | # print faces.DESCR 22 | 23 | def print_faces(images, target, top_n): 24 | # set up the figure size in inches 25 | fig = plt.figure(figsize=(12, 12)) 26 | fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05) 27 | for i in range(top_n): 28 | # plot the images in a matrix of 20x20 29 | p = fig.add_subplot(20, 20, i + 1, xticks=[], yticks=[]) 30 | p.imshow(images[i], cmap=plt.cm.bone) 31 | 32 | # label the image with the target value 33 | p.text(0, 14, str(target[i])) 34 | p.text(0, 60, str(i)) 35 | 36 | print_faces(faces.images, faces.target, 20) 37 | print_faces(faces.images, faces.target, 400) 38 | 39 | # Build training and testing sets 40 | from sklearn.svm import SVC 41 | svc_1 = SVC(kernel='linear') 42 | from sklearn.cross_validation import train_test_split 43 | 44 | X_train, X_test, y_train, y_test = train_test_split( 45 | faces.data, faces.target, test_size=0.25, random_state=0) 46 | 47 | # Perform 5-fold cross-validation 48 | from sklearn.cross_validation import cross_val_score, KFold 49 | from scipy.stats import sem 50 | 51 | def evaluate_cross_validation(clf, X, y, K): 52 | # create a k-fold croos validation iterator 53 | cv = KFold(len(y), K, shuffle=True, random_state=0) 54 | # by default the score used is the one returned by score method of the estimator (accuracy) 55 | scores = cross_val_score(clf, X, y, cv=cv) 56 | print scores 57 | print ("Mean score: {0:.3f} (+/-{1:.3f})").format( 58 | np.mean(scores), sem(scores)) 59 | 60 | evaluate_cross_validation(svc_1, X_train, y_train, 5) 61 | 62 | # measure precision and recall on the evaluation set, for each class. 
63 | train_and_evaluate(svc_1, X_train, X_test, y_train, y_test) -------------------------------------------------------------------------------- /python-sckit-learn/chapter6/svmexample/svm_test.py: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Support Vector Machines example 3 | # Chapter 6 4 | 5 | #SVM test code 6 | import sklearn as sk 7 | import numpy as np 8 | import matplotlib 9 | import matplotlib.pyplot as plt 10 | 11 | # the index ranges of images of people with glasses 12 | glasses = [ 13 | (10, 19), (30, 32), (37, 38), (50, 59), (63, 64), 14 | (69, 69), (120, 121), (124, 129), (130, 139), (160, 161), 15 | (164, 169), (180, 182), (185, 185), (189, 189), (190, 192), 16 | (194, 194), (196, 199), (260, 269), (270, 279), (300, 309), 17 | (330, 339), (358, 359), (360, 369) 18 | ] 19 | 20 | def create_target(segments): 21 | # create a new y array of target size initialized with zeros 22 | y = np.zeros(faces.target.shape[0]) 23 | # put 1 in the specified segments 24 | for (start, end) in segments: 25 | y[start:end + 1] = 1 26 | return y 27 | 28 | 29 | target_glasses = create_target(glasses) 30 | 31 | X_train, X_test, y_train, y_test = train_test_split( 32 | faces.data, target_glasses, test_size=0.25, random_state=0) 33 | 34 | svc_2 = SVC(kernel='linear') 35 | evaluate_cross_validation(svc_2, X_train, y_train, 5) 36 | train_and_evaluate(svc_2, X_train, X_test, y_train, y_test) 37 | 38 | X_test = faces.data[30:40] 39 | y_test = target_glasses[30:40] 40 | 41 | print y_test.shape[0] 42 | 43 | select = np.ones(target_glasses.shape[0]) 44 | select[30:40] = 0 45 | X_train = faces.data[select == 1] 46 | y_train = target_glasses[select == 1] 47 | 48 | print y_train.shape[0] 49 | 50 | svc_3 = SVC(kernel='linear') 51 | train_and_evaluate(svc_3, X_train, X_test, y_train, y_test) 52 | y_pred = svc_3.predict(X_test) 53 | 54 | eval_faces = [np.reshape(a, (64, 64)) for a in X_test] 55 | print_faces(eval_faces, y_pred, 10) -------------------------------------------------------------------------------- /python-sckit-learn/chapter7/aprioriexample/readme.md: -------------------------------------------------------------------------------- 1 | This folder conatins python Implementation of Apriori Algorithm 2 | 3 | The dataset is a copy of the “Online directory of certified businesses with a detailed profile” file from the Small Business Services (SBS) 4 | dataset in the `NYC Open Data Sets `_ -------------------------------------------------------------------------------- /python-sckit-learn/chapter7/fpgrowthexample/data/numeric.csv: -------------------------------------------------------------------------------- 1 | 1,2,3,4,5 2 | 1,2,4 3 | 2,3,5 4 | 3,4,5 5 | 1,2,3,4 6 | 1,2,3 -------------------------------------------------------------------------------- /python-sckit-learn/chapter7/fpgrowthexample/data/tsk.csv: -------------------------------------------------------------------------------- 1 | a,b 2 | b,c,d 3 | a,c,d,e 4 | a,d,e 5 | a,b,c 6 | a,b,c,d 7 | a 8 | a,b,c 9 | a,b,d 10 | b,c,e 11 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter7/fpgrowthexample/readme.md: -------------------------------------------------------------------------------- 1 | This folder provides an implementation for fpgrowth using vanilla python libraries and not scikit-learn packages 2 | -------------------------------------------------------------------------------- 
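The transaction files under `data/` shown above (`numeric.csv`, `tsk.csv`) are plain comma-separated baskets, one transaction per line. A small sketch of how such a file could be read into the list-of-itemsets form an FP-growth miner typically consumes (the real loader lives in `common.py`/`fpgrowth.py` and may differ):

```python
import csv

def load_transactions(path):
    # one transaction per row; keep the non-empty items
    with open(path) as f:
        return [[item for item in row if item] for row in csv.reader(f)]

transactions = load_transactions("data/tsk.csv")
print(transactions[0])  # first basket in tsk.csv, i.e. ['a', 'b']
```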
/python-sckit-learn/chapter8/k-meansexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter8/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter9/naivebayesexample/data-types.py: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Bayesian learning - Naive Bayes example 3 | # Chapter 9 4 | 5 | from collections import namedtuple 6 | 7 | """ 8 | This module defines the datatypes used by the other modules. 9 | """ 10 | 11 | Dataset = namedtuple("Dataset", ["data", "target"] ) 12 | 13 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter9/naivebayesexample/feature-selection.py: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Bayesian learning - Naive Bayes example 3 | # Chapter 9 4 | 5 | from datatypes import Dataset 6 | 7 | from sklearn.feature_selection import SelectKBest, f_classif 8 | from sklearn.lda import LDA 9 | from sklearn.qda import QDA 10 | from sklearn.decomposition import PCA 11 | 12 | def univariate_feature_selection(ds, n): 13 | """ 14 | Selects 'n' features in the dataset. Returns the Reduced Dataset 15 | n (int), ds (Dataset) -> Dataset 16 | """ 17 | 18 | selector = SelectKBest(f_classif, n) 19 | selector.fit(ds.data, ds.target) 20 | features = selector.get_support(indices=True) 21 | return Dataset(selector.transform(ds.data), ds.target) 22 | 23 | def lda(ds, n): 24 | ''' 25 | Outputs the projection of the data in the best 26 | discriminant dimension. 27 | Maximum of 2 dimensions for our binary case (values of n greater than this will be ignored by sklearn) 28 | ''' 29 | selector = LDA(n_components=n) 30 | selector.fit(ds.data, ds.target) 31 | new_data = selector.transform(ds.data) 32 | return Dataset(new_data, ds.target) 33 | 34 | def pca(ds,n): 35 | ''' 36 | Uses the PCA classifier to reduces the dimensionality by choosing the n lastest elements 37 | of the transform. 38 | ''' 39 | selector = PCA() 40 | selector.fit(ds.data, ds.target) 41 | new_data = selector.transform(ds.data)[:, :-n] 42 | return Dataset(new_data, ds.target) 43 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter9/naivebayesexample/readme.md: -------------------------------------------------------------------------------- 1 | Naive Bayes Classifier implementation with scikit-learn with the spambase dataset. 
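A minimal sketch of the kind of scikit-learn workflow this example uses (assuming a local comma-separated `spambase.data` file with the spam label in the last column, which is how the UCI spambase set ships; the scripts in this folder may organize the steps differently):

```python
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import train_test_split

data = np.loadtxt("spambase.data", delimiter=",")
X, y = data[:, :-1], data[:, -1]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)

# fit Gaussian Naive Bayes and report held-out accuracy
clf = GaussianNB()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
```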
2 | 3 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter9/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/data/titanic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/python-sckit-learn/data/titanic.png -------------------------------------------------------------------------------- /python-sckit-learn/readme.md: -------------------------------------------------------------------------------- 1 | # Introduction to Machine Learning with Scikit-Learn 2 | 3 | **Code & Data for Introduction to Machine Learning with Scikit-Learn** 4 | 5 | [](http://scikit-learn.org/stable/tutorial/machine_learning_map/) 6 | 7 | ## Installing Scikit-Learn with pip 8 | 9 | See the full [installation instructions](http://scikit-learn.org/stable/install.html) for more details; these are provided for convenience only. 10 | 11 | Scikit-Learn requires: 12 | 13 | - Python >= 2.6 or >= 3.3 14 | - Numpy >= 1.6.1 15 | - SciPy >= 0.9 16 | 17 | Once you have installed `pip` (the python package manager): 18 | 19 | ### Mac OS X 20 | 21 | This should be super easy: 22 | 23 | pip install -U numpy scipy scikit-learn 24 | 25 | Now just wait! Also, you have no excuse not to do this in a virtualenv. 26 | 27 | ### Windows 28 | 29 | Install [numpy](http://numpy.scipy.org/) and [scipy](http://www.scipy.org/) with their official installers. You can then use PyPi to install scikit-learn: 30 | 31 | pip install -U scikit-learn 32 | 33 | If you're having trouble, consider one of the unofficial windows installers or anacondas (see the Scikit-Learn page for more). 34 | 35 | ### Ubuntu Linux 36 | 37 | Unfortunately there are no official binary packages for Linux. First install the build dependencies: 38 | 39 | sudo apt-get install build-essential python-dev python-setuptools \ 40 | python-numpy python-scipy \ 41 | libatlas-dev libatlas3gf-base 42 | 43 | Then you can build (hopefully) Scikit-learn with pip: 44 | 45 | pip install --user --install-option="--prefix=" -U scikit-learn 46 | 47 | Keep in mind however, that there are other dependencies and might be issues with ATLAS and BLAS - see the official installation for more. 
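Whichever platform you are on, a quick sanity check that the stack is importable and which versions you ended up with (not part of the original instructions, just a convenience):

    python -c "import numpy, scipy, sklearn; print(numpy.__version__, scipy.__version__, sklearn.__version__)"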
48 | -------------------------------------------------------------------------------- /r/chapter10/linearregressionexample/Rplots.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter10/linearregressionexample/Rplots.pdf -------------------------------------------------------------------------------- /r/chapter10/linearregressionexample/linearregression.R: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Regression Analysis - Linear Regression 3 | # Chapter 10 4 | 5 | 6 | ins <- read.csv("insurance.csv", stringsAsFactors = TRUE) 7 | 8 | str(ins) 9 | summary(ins$charges) 10 | hist(ins$charges) 11 | 12 | table(ins$region) 13 | 14 | cor(ins[c("age", "bmi", "children", "charges")]) 15 | 16 | pairs(ins[c("age", "bmi", "children", "charges")]) 17 | 18 | library(psych) 19 | 20 | pairs.panels(ins[c("age", "bmi", "children", "charges")]) 21 | 22 | 23 | ins_model <- lm(charges ~ age + children + bmi + sex + smoker + region, data = ins) 24 | 25 | ins_model <- lm(charges ~ ., data = ins) 26 | 27 | summary(ins_model) 28 | 29 | 30 | ins$age2 <- ins$age^2 31 | 32 | ins$bmi30 <- ifelse(ins$bmi >= 30, 1, 0) 33 | 34 | ins_model2 <- lm(charges ~ age + age2 + children + bmi + sex + bmi30*smoker + region, data = ins) 35 | 36 | summary(ins_model2) 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /r/chapter10/linearregressionexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter10/logisticregressionexample/dataset1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter10/logisticregressionexample/dataset1.txt -------------------------------------------------------------------------------- /r/chapter10/logisticregressionexample/dataset2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter10/logisticregressionexample/dataset2.txt -------------------------------------------------------------------------------- /r/chapter10/logisticregressionexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter10/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter11/annexample/Rplots.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter11/annexample/Rplots.pdf -------------------------------------------------------------------------------- /r/chapter11/annexample/Rplots1.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter11/annexample/Rplots1.pdf -------------------------------------------------------------------------------- /r/chapter11/annexample/ann.R: -------------------------------------------------------------------------------- 1 | # Practical Machine Learning 2 | # Neural Networks (to predict the strength of concrete) 3 | # Chapter 11 4 | 5 | conc <- read.csv("concrete.csv") 6 | str(conc) 7 | 8 | 9 | normalize <- function(x) { 10 | return((x - min(x))/(max(x) - min(x))) 11 | } 12 | 13 | 14 | conc_norm <- as.data.frame(lapply(conc, normalize)) 15 | 16 | summary(conc_norm$strength) 17 | summary(conc$strength) 18 | 19 | 20 | conc_train <- conc_norm[1:773, ] 21 | conc_test <- conc_norm[774:1030, ] 22 | 23 | library(neuralnet) 24 | 25 | # deault: 1 hidden nodes 26 | conc_model <- neuralnet(strength ~ cement + slag + ash + water + superplastic + coarseagg + fineagg + age, data = conc_train) 27 | plot(conc_model) 28 | 29 | model_results <- compute(conc_model, conc_test[1:8]) 30 | 31 | pred_strength <- model_results$net.result 32 | 33 | cor(pred_strength, conc_test$strength) 34 | 35 | 36 | # 5 hidden layers 37 | conc_model2 <- neuralnet(strength ~ cement + slag + ash + water + superplastic + coarseagg + fineagg + age, data = conc_train, hidden = 5) 38 | plot(conc_model2) 39 | 40 | model_results2 <- compute(conc_model2, conc_test[1:8]) 41 | 42 | pred_strength2 <- model_results2$net.result 43 | 44 | cor(pred_strength2, conc_test$strength) 45 | 46 | -------------------------------------------------------------------------------- /r/chapter11/annexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter11/dlexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter11/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter12/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter12/rlexample/Results.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/Results.pdf -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlaci.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlaci.zip -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: qlearning 2 | Type: Package 3 | Title: Q-Learning: A Data Analysis Method for Constructing Adaptive 4 | Interventions 5 | Version: 2.0 6 | Date: 2012-01-16 7 | Author: Min Qian, Inbal Nahum-Shani, Ashkan Ertefaie, Amarpreet Kaur and Susan A. 
Murphy 8 | Maintainer: Min Qian 9 | Description: The qlearning package uses q-learning method to analyze data from a SMART trial, to inform the construction of high-quality adaptive interventions. See Nahum-Shani et al. (2010) Q-Learning: A Data Analysis Method for Constructing Adaptive Interventions for more details. 10 | Depends: R(>= 2.12.0) 11 | License: GPL (version 2 or later) 12 | Packaged: 2012-01-16 16:35:40 UTC; min 13 | Built: R 2.14.1; ; 2012-01-16 16:35:51 UTC; windows 14 | -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/INDEX: -------------------------------------------------------------------------------- 1 | DataEx An example data set collected from a SMART 2 | trial 3 | qlearning Q-Learning: A Data Analysis Method for 4 | Constructing Adaptive Interventions 5 | -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/MD5: -------------------------------------------------------------------------------- 1 | be2a63cdb1381190e83b83026e2a3e56 *DESCRIPTION 2 | 1db07346d2729caed9473132bd1ef715 *INDEX 3 | fdfc9b6890d9cefb85bd7af2188d7ab4 *Meta/Rd.rds 4 | f2f59fc5c8c7f3884203e050db360fb5 *Meta/data.rds 5 | 09602eeaeca4b9942ea7487e10034e9b *Meta/hsearch.rds 6 | 311171ce88715c62d1ba8ce79ec59b7d *Meta/links.rds 7 | c43efce1f9e580a609ff991e5bf452d6 *Meta/nsInfo.rds 8 | 44bee8c0ebb6a5b7d963b96dc99190bf *Meta/package.rds 9 | df390c53434517b304ac5db487184641 *NAMESPACE 10 | 240d28d145138a75831809e31a480bad *R/qlearning 11 | b877002c67ec87e7354dff3fb8a40c61 *R/qlearning.rdb 12 | 7efa81702a1b5f6229018ce17df72e2b *R/qlearning.rdx 13 | dc2d1ebe7a40fbd6d8359c3fd99686ae *data/DataEx.RData 14 | 13a2ace3051dd4135315513fedac1985 *help/AnIndex 15 | ddcd8f1047bf7890c0bfdf0d141837b7 *help/aliases.rds 16 | 4d3736ed70bd147d49eeb12c91202084 *help/paths.rds 17 | df801dd385d4c58da7fa648c46f222ca *help/qlearning.rdb 18 | b8290f01c2be1ce797d6d647c3e09a43 *help/qlearning.rdx 19 | 8b319481a8c2dc47aad8462c0e98f856 *html/00Index.html 20 | 444535b9cb76ddff1bab1e1865a3fb14 *html/R.css 21 | -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/Meta/Rd.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/Meta/Rd.rds -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/Meta/data.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/Meta/data.rds -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/Meta/hsearch.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/Meta/hsearch.rds -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/Meta/links.rds: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/Meta/links.rds -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/Meta/nsInfo.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/Meta/nsInfo.rds -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/Meta/package.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/Meta/package.rds -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/NAMESPACE: -------------------------------------------------------------------------------- 1 | exportPattern("^[[:alpha:]]+") 2 | -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/R/qlearning: -------------------------------------------------------------------------------- 1 | local({ 2 | info <- loadingNamespaceInfo() 3 | ns <- .Internal(getRegisteredNamespace(as.name(info$pkgname))) 4 | if (is.null(ns)) 5 | stop("cannot find namespace environment"); 6 | barepackage <- sub("([^-]+)_.*", "\\1", info$pkgname) 7 | dbbase <- file.path(info$libname, info$pkgname, "R", barepackage) 8 | lazyLoad(dbbase, ns, filter = function(n) n != ".__NAMESPACE__.") 9 | }) 10 | -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/R/qlearning.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/R/qlearning.rdb -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/R/qlearning.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/R/qlearning.rdx -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/data/DataEx.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/data/DataEx.RData -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/help/AnIndex: -------------------------------------------------------------------------------- 1 | DataEx DataEx 2 | qlearning qlearning 3 | -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/help/aliases.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/help/aliases.rds 
-------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/help/paths.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/help/paths.rds -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/help/qlearning.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/help/qlearning.rdb -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/help/qlearning.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/help/qlearning.rdx -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/html/00Index.html: -------------------------------------------------------------------------------- 1 | 2 | R: Q-Learning: A Data Analysis Method for Constructing Adaptive 3 | Interventions 4 | 5 | 6 | 7 | Q-Learning: A Data Analysis Method for Constructing Adaptive 8 | Interventions 9 | 10 | 11 | 12 | 13 | 14 | 15 | Documentation for package ‘qlearning’ version 2.0 16 | 17 | DESCRIPTION file. 18 | 19 | 20 | Help Pages 21 | 22 | 23 | 24 | DataEx 25 | An example data set collected from a SMART trial 26 | qlearning 27 | Q-Learning: A Data Analysis Method for Constructing Adaptive Interventions 28 | 29 | 30 | -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/html/R.css: -------------------------------------------------------------------------------- 1 | BODY{ background: white; 2 | color: black } 3 | 4 | A:link{ background: white; 5 | color: blue } 6 | A:visited{ background: white; 7 | color: rgb(50%, 0%, 50%) } 8 | 9 | H1{ background: white; 10 | color: rgb(55%, 55%, 55%); 11 | font-family: monospace; 12 | font-size: x-large; 13 | text-align: center } 14 | 15 | H2{ background: white; 16 | color: rgb(40%, 40%, 40%); 17 | font-family: monospace; 18 | font-size: large; 19 | text-align: center } 20 | 21 | H3{ background: white; 22 | color: rgb(40%, 40%, 40%); 23 | font-family: monospace; 24 | font-size: large } 25 | 26 | H4{ background: white; 27 | color: rgb(40%, 40%, 40%); 28 | font-family: monospace; 29 | font-style: italic; 30 | font-size: large } 31 | 32 | H5{ background: white; 33 | color: rgb(40%, 40%, 40%); 34 | font-family: monospace } 35 | 36 | H6{ background: white; 37 | color: rgb(40%, 40%, 40%); 38 | font-family: monospace; 39 | font-style: italic } 40 | 41 | IMG.toplogo{ vertical-align: middle } 42 | 43 | IMG.arrow{ width: 30px; 44 | height: 30px; 45 | border: 0 } 46 | 47 | span.acronym{font-size: small} 48 | span.env{font-family: monospace} 49 | span.file{font-family: monospace} 50 | span.option{font-family: monospace} 51 | span.pkg{font-weight: bold} 52 | span.samp{font-family: monospace} 53 | 54 | div.vignettes a:hover { 55 | background: rgb(85%, 85%, 85%); 56 | } 57 | 58 | -------------------------------------------------------------------------------- /r/chapter12/rlexample/readme.md: 
-------------------------------------------------------------------------------- 1 | This example implementation for Q-learning uses an R library built by Penn State University. 2 | The details of this library can be found at the link below: 3 | https://methodology.psu.edu/downloads/qlearning 4 | 5 | The qlaci library zip can be downloaded at the link below: 6 | https://methodology.psu.edu/downloads/qlaci 7 | 8 | This library for reinforcement learning techniques is evolving. This code will be updated as and when new implementations are published in the CRAN repository. 9 | 10 | Reference from the above link: 11 | The qlaci (Q-learning with adaptive confidence intervals) R package can be used with data from a sequential, multiple assignment, randomized trial (SMART) to design an adaptive intervention. The qlaci R package requires R 2.15, available for free download. This is the recommended platform for running this package. At the time of release, R version 3.0.x was released recently. We will only support installation of qlaci on R 2.15.x. 12 | -------------------------------------------------------------------------------- /r/chapter13/ensembleexample/bagging-random-forest.R: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Bagging & Random Forest example 3 | # Chapter 13 4 | 5 | credit <- read.csv("credit.csv") 6 | 7 | library(caret) 8 | 9 | m <- train(default ~ ., data = credit, method = "C5.0") 10 | p <- predict(m, credit) 11 | 12 | table(p, credit$default) 13 | 14 | head(predict(m, credit)) 15 | head(predict(m, credit, type = "prob")) 16 | 17 | ctrl <- trainControl(method = "cv", number = 10, selectionFunction = "oneSE") 18 | 19 | grid <- expand.grid(.model = "tree", .trials = c(1, 5, 10, 15, 20, 25, 30, 35), .winnow = "FALSE") 20 | 21 | grid 22 | 23 | m <- train(default ~ ., data = credit, method = "C5.0", metric = "Kappa", trControl = ctrl, tuneGrid = grid) 24 | m 25 | 26 | 27 | library(ipred) 28 | 29 | mybag <- bagging(default ~ ., data = credit, nbagg = 25) 30 | 31 | credit_pred <- predict(mybag, credit) 32 | table(credit_pred, credit$default) 33 | 34 | library(caret) 35 | ctrl <- trainControl(method = "cv", number = 10) 36 | train(default ~ ., data = credit, method = "treebag", trControl = ctrl) 37 | 38 | 39 | 40 | # Bagging 41 | 42 | str(svmBag) 43 | svmBag$fit 44 | 45 | bagctrl <- bagControl(fit = svmBag$fit, predict = svmBag$pred, aggregate = svmBag$aggregate) 46 | 47 | svmbag <- train(default ~ ., data = credit, "bag", trControl = ctrl, bagControl = bagctrl) 48 | svmbag 49 | 50 | 51 | # Random Forest 52 | 53 | library(randomForest) 54 | rf <- randomForest(default ~ ., data = credit) 55 | rf 56 | 57 | library(caret) 58 | ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 10) 59 | 60 | grid_rf <- expand.grid(.mtry = c(2, 4, 8, 16)) 61 | m_rf <- train(default ~ ., data = credit, method = "rf", metric = "Kappa", trControl = ctrl, tuneGrid = grid_rf) 62 | 63 | 64 | grid_c50 <- expand.grid(.model = "tree", .trials = c(10, 20, 30, 40), .winnow = "FALSE") 65 | m_c50 <- train(default ~ ., data = credit, method = "C5.0", metric = "Kappa", trControl = ctrl, tuneGrid = grid_c50) 66 | 67 | m_rf 68 | m_c50 69 | 70 | 71 | -------------------------------------------------------------------------------- /r/chapter13/ensembleexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | --------------------------------------------------------------------------------
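A natural follow-up to the `bagging-random-forest.R` script above is to compare the two tuned models on their resampling results instead of eyeballing the printed objects; a hedged sketch using caret's `resamples()`, assuming `m_rf` and `m_c50` from that script are still in the workspace:

```r
library(caret)

# collect the cross-validation results of both tuned models
results <- resamples(list(rf = m_rf, c50 = m_c50))

summary(results)   # Accuracy and Kappa distributions side by side
bwplot(results)    # visual comparison of the two learners
```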
/r/chapter13/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter5/decisiontreeexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter5/randomforstexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter5/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter6/knnexample/knn.R: -------------------------------------------------------------------------------- 1 | # Practical Machine Learning 2 | # K- Nearest Neighbor algorithm 3 | # Chapter 6 4 | 5 | wbcd <- read.csv("wisc_bc_data.csv", stringsAsFactors = FALSE) 6 | wbcd <- wbcd[-1] 7 | table(wbcd$diagnosis) 8 | wbcd$diagnosis <- factor(wbcd$diagnosis, levels = c("B", "M"), labels = c("Benign", "Malignant")) 9 | round(prop.table(table(wbcd$diagnosis))*100, digits = 1) 10 | summary(wbcd[c("radius_mean", "area_mean", "smoothness_mean")]) 11 | 12 | normalize <- function(x) { 13 | return((x - min(x))/(max(x) - min(x))) 14 | } 15 | 16 | wbcd_n <- as.data.frame(lapply(wbcd[2:31], normalize)) 17 | 18 | summary(wbcd_n$area_mean) 19 | 20 | wbcd_train <- wbcd_n[1:469, ] 21 | wbcd_test <- wbcd_n[470:569, ] 22 | 23 | wbcd_train_labels <- wbcd[1:469, 1] 24 | wbcd_test_labels <- wbcd[470:569, 1] 25 | 26 | library("class") 27 | 28 | # call the knn function of the class package 29 | wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test, cl = wbcd_train_labels, k = 21) 30 | 31 | # need gmodels for CrossTable 32 | library(gmodels) 33 | 34 | CrossTable(x = wbcd_test_labels, y = wbcd_test_pred, prop.chisq = FALSE) 35 | 36 | # z score 37 | wbcd_z <- as.data.frame(scale(wbcd[-1])) 38 | 39 | 40 | summary(wbcd_z$area_mean) 41 | 42 | wbcd_train <- wbcd_z[1:469, ] 43 | wbcd_test <- wbcd_z[470:569, ] 44 | wbcd_train_labels <- wbcd[1:469, 1] 45 | wbcd_test_labels <- wbcd[470:569, 1] 46 | 47 | wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test, cl = wbcd_train_labels, k = 21) 48 | 49 | CrossTable(x = wbcd_test_labels, y = wbcd_test_pred, prop.chisq = FALSE) 50 | -------------------------------------------------------------------------------- /r/chapter6/knnexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter6/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter6/svmexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter6/svmexample/svm.R: -------------------------------------------------------------------------------- 1 | # Practical Machine Learning 2 | # Support Vector Machines (SVM) 3 | # Chapter 6 4 | 5 | 6 | let <- read.csv("letterdata.csv") 7 | str(let) 8 | 9 | let_train <- let[1:16000, ] 10 | let_test <- let[16001:20000, ] 11 | 12 | 13 | 14 | # linear kernel (vanilla) 15 | 16 
| library(kernlab) 17 | let_classifier <- ksvm(letter ~ ., data = let_train, kernel = "vanilladot") 18 | let_classifier 19 | 20 | let_pred <- predict(let_classifier, let_test) 21 | 22 | head(let_pred) 23 | table(let_pred, let_test$letter) 24 | 25 | agreement <- let_pred == let_test$letter 26 | table(agreement) 27 | prop.table(table(agreement)) 28 | 29 | 30 | 31 | 32 | # RBF kernel 33 | 34 | let_classifier2 <- ksvm(letter ~ ., data = let_train, kernel = "rbfdot") 35 | let_pred2 <- predict(let_classifier2, let_test) 36 | 37 | agreement2 <- let_pred2 == let_test$letter 38 | table(agreement2) 39 | prop.table(table(agreement2)) 40 | 41 | -------------------------------------------------------------------------------- /r/chapter7/aprioriexample/Rplots.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter7/aprioriexample/Rplots.pdf -------------------------------------------------------------------------------- /r/chapter7/aprioriexample/association-rules.R: -------------------------------------------------------------------------------- 1 | # Practical Machine Learning 2 | # Association Rules - Apriori 3 | # Chapter 7 4 | 5 | library(arules) 6 | 7 | g <- read.transactions("groceries.csv", sep = ",") 8 | 9 | summary(g) 10 | 11 | inspect(g[1:5]) 12 | 13 | itemFrequency(g[, 1:3]) 14 | itemFrequencyPlot(g, support = 0.1) 15 | itemFrequencyPlot(g, topN = 20) 16 | 17 | image(g[1:5]) 18 | image(sample(g, 100)) 19 | 20 | apriori(g) 21 | 22 | grules <- apriori(g, parameter = list(support = 0.006, confidence = 0.25, minlen = 2)) 23 | 24 | grules 25 | summary(grules) 26 | 27 | inspect(grules[1:3]) 28 | inspect(sort(grules, by = "lift")[1:5]) 29 | 30 | berryrules <- subset(grules, items %in% "berries") 31 | inspect(berryrules) 32 | 33 | write(grules, file = "groceryrules.csv", sep = ",", quote = TRUE, row.names = FALSE) 34 | 35 | grules_df <- as(grules, "data.frame") 36 | 37 | str(grules_df) 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /r/chapter7/aprioriexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter7/fpgrowthexample/readme.md: -------------------------------------------------------------------------------- 1 | There are no explicit libraries available in R for an FP-growth implementation. The folder is left as a placeholder for implementing it in future.
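Until such an implementation lands here, frequent itemsets can still be mined natively in R with the ECLAT algorithm from the `arules` package used by the apriori example; a hedged sketch, assuming the same `groceries.csv` transactions file:

```r
library(arules)

g <- read.transactions("groceries.csv", sep = ",")

# eclat() mines frequent itemsets without Apriori-style candidate generation,
# similar in spirit to what an FP-growth implementation would return
itemsets <- eclat(g, parameter = list(support = 0.006, minlen = 2))

inspect(sort(itemsets, by = "support")[1:5])
```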
2 | -------------------------------------------------------------------------------- /r/chapter7/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter8/k-meansexample/kmeans-clustering.R: -------------------------------------------------------------------------------- 1 | # Practical machine learning 2 | # K Means Clustering 3 | # Chapter 8 4 | 5 | 6 | teens <- read.csv("snsdata.csv") 7 | 8 | str(teens) 9 | 10 | table(teens$gender) 11 | table(teens$gender, useNA = "ifany") 12 | 13 | summary(teens$age) 14 | 15 | teens$age <- ifelse(teens$age >= 13 & teens$age < 20, teens$age, NA) 16 | 17 | summary(teens$age) 18 | 19 | teens$female <- ifelse(teens$gender == "F" & !is.na(teens$gender), 1, 0) 20 | teens$no_gender <- ifelse(is.na(teens$gender), 1, 0) 21 | 22 | table(teens$gender, useNA = "ifany") 23 | table(teens$female, useNA = "ifany") 24 | table(teens$no_gender, useNA = "ifany") 25 | 26 | mean(teens$age) 27 | mean(teens$age, na.rm = TRUE) 28 | 29 | aggregate(data = teens, age ~ gradyear, mean, na.rm = TRUE) 30 | 31 | 32 | ave_age <- mean(teens$age, na.rm = TRUE) 33 | ave_age 34 | 35 | teens$age <- ifelse(is.na(teens$age), ave_age, teens$age) 36 | 37 | summary(teens$age) 38 | 39 | interests <- teens[5:40] 40 | 41 | interests_z <- as.data.frame(lapply(interests, scale)) 42 | 43 | teen_clusters <- kmeans(interests_z, 5) 44 | teen_clusters$size 45 | 46 | teen_clusters$centers 47 | 48 | teens$cluster <- teen_clusters$cluster 49 | 50 | teens[1:5, c("cluster", "gender", "age", "friends")] 51 | 52 | aggregate(data = teens, age ~ cluster, mean) 53 | aggregate(data = teens, female ~ cluster, mean) 54 | aggregate(data = teens, friends ~ cluster, mean) 55 | -------------------------------------------------------------------------------- /r/chapter8/k-meansexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter8/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter9/naivebayesexample/Rplots.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter9/naivebayesexample/Rplots.pdf -------------------------------------------------------------------------------- /r/chapter9/naivebayesexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter9/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter10/linearregressionexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter10/linearregressionexample/startClickRate.sh: -------------------------------------------------------------------------------- 1 | cd /home/sunilag/spark-1.4.1-bin-hadoop2.6/bin 2 | ./spark-submit /home/sunilag/Spark_Linear_Regression/ClickRate.py 3 | 
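# Note: the two paths above are specific to the original author's machine.
# A more portable invocation (hypothetical paths, assuming SPARK_HOME points
# at a Spark 1.4.x install) would look something like:
#   $SPARK_HOME/bin/spark-submit --master local[*] /path/to/ClickRate.py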
-------------------------------------------------------------------------------- /spark/chapter10/logisticregressionexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter10/logisticregressionexample/src/main/scala/default/SpamClassification-Logreg.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Logistic Regression example 3 | // Chapter 10 4 | 5 | package default 6 | 7 | import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS 8 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics 9 | import org.apache.spark.mllib.linalg.Vectors 10 | import org.apache.spark.mllib.optimization.L1Updater 11 | import org.apache.spark.mllib.regression.LabeledPoint 12 | import org.apache.spark.{SparkConf, SparkContext} 13 | 14 | /** 15 | * @author Oleksiy Dyagilev 16 | */ 17 | object SpamClassification extends App { 18 | 19 | runSpark() 20 | 21 | def runSpark() { 22 | val conf = new SparkConf().setAppName("Spam classification").setMaster("local[*]") 23 | val sc = new SparkContext(conf) 24 | val file = sc.textFile("./dataset/spambase.data") 25 | val examples = file.map { line => 26 | val parts = line.split(",").map(_.toDouble) 27 | LabeledPoint(parts.last, Vectors.dense(parts.init)) 28 | } 29 | 30 | val splits = examples.randomSplit(Array(0.8, 0.2)) 31 | val training = splits(0).cache() 32 | val test = splits(1).cache() 33 | 34 | val numTraining = training.count() 35 | val numTest = test.count() 36 | println(s"Training: $numTraining, test: $numTest.") 37 | 38 | examples.unpersist(blocking = false) 39 | 40 | val algorithm = new LogisticRegressionWithLBFGS() 41 | 42 | // new SquaredL2Updater() 43 | val updater = new L1Updater() 44 | 45 | algorithm.optimizer 46 | .setNumIterations(1000) 47 | .setUpdater(updater) 48 | // .setRegParam(0.0) 49 | 50 | val model = algorithm.run(training).clearThreshold() 51 | 52 | val prediction = model.predict(test.map(_.features)) 53 | val predictionAndLabel = prediction.zip(test.map(_.label)) 54 | 55 | val metrics = new BinaryClassificationMetrics(predictionAndLabel) 56 | 57 | println(s"Test areaUnderPR = ${metrics.areaUnderPR()}.") 58 | println(s"Test areaUnderROC = ${metrics.areaUnderROC()}.") 59 | 60 | sc.stop() 61 | 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /spark/chapter10/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter11/annexample/build.sbt: -------------------------------------------------------------------------------- 1 | name := "NeuralNetwork" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies ++= Seq( 8 | "org.apache.spark" % "spark-core_2.10" % "1.2.1", 9 | "org.apache.spark" % "spark-mllib_2.10" % "1.2.1", 10 | "org.scalanlp" % "breeze_2.10" % "0.10", 11 | "org.scalanlp" % "breeze-natives_2.10" % "0.10", 12 | "org.scalatest" % "scalatest_2.10" % "2.2.4" % "test" 13 | ) -------------------------------------------------------------------------------- /spark/chapter11/annexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
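For reference, the Scala examples under `spark/` are plain sbt projects; a hedged sketch of how one of them might be built and submitted (assumes sbt and a matching Spark distribution are installed; `<MainClass>` and the jar name are placeholders that follow from each example's `build.sbt`, e.g. `default.SpamClassification` for the logistic regression example above):

```
sbt package
spark-submit --class <MainClass> target/scala-2.10/<artifact>_2.10-1.0.jar
```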
/spark/chapter11/annexample/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015 Meihua Wu 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # Set everything to be logged to the console 18 | log4j.rootCategory=WARN, console 19 | log4j.appender.console=org.apache.log4j.ConsoleAppender 20 | log4j.appender.console.target=System.err 21 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 22 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 23 | 24 | # Settings to quiet third party logs that are too verbose 25 | log4j.logger.org.eclipse.jetty=WARN 26 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR 27 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 28 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 29 | 30 | 31 | # 32 | log4j.logger.rotationsymmetry.neuralnetwork.algo.GradientDescendOptimizer=TRACE -------------------------------------------------------------------------------- /spark/chapter11/annexample/src/main/scala/default/Util.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Neural Network example 3 | // Chapter 11 4 | 5 | package default 6 | 7 | import breeze.linalg.{DenseVector=>BDV, DenseMatrix=>BDM} 8 | import breeze.numerics.{sigmoid} 9 | import org.apache.spark.mllib.linalg.{Vectors, Vector} 10 | import scala.math.abs 11 | 12 | object Util { 13 | 14 | def toBreeze(v: Vector): BDV[Double] = { 15 | new BDV(v.toArray) 16 | } 17 | 18 | def addBias(v: BDV[Double]) : BDV[Double] ={ 19 | BDV.vertcat(BDV(1d), v) 20 | } 21 | 22 | def removeBias(v: BDV[Double]): BDV[Double] ={ 23 | v(1 to -1) 24 | } 25 | 26 | def sigmoidGradient(v: BDV[Double]) : BDV[Double] = { 27 | val s = sigmoid(v) 28 | s :* ((-s) + 1d) 29 | } 30 | 31 | def doubleEqual(v1: Double, v2: Double, p: Double = 1e-4): Boolean = { 32 | abs(v1-v2) <= p 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /spark/chapter11/annexample/src/main/scala/default/algo/CostGradient.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Neural Network example 3 | // Chapter 11 4 | 5 | package default.algo 6 | 7 | import breeze.linalg.DenseMatrix 8 | 9 | case class CostGradient(val cost: Double, val thetaGradient: List[DenseMatrix[Double]], val n: Int) 10 | -------------------------------------------------------------------------------- /spark/chapter11/annexample/src/main/scala/default/algo/GradientDescendOptimizer.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Neural Network example 3 | // Chapter 11 4 | 5 | package default.algo 6 | 7 | import rotationsymmetry.neuralnetwork.model.{NeuralNetworkModel, 
Topology} 8 | import org.apache.spark.mllib.regression.LabeledPoint 9 | import org.apache.spark.rdd.RDD 10 | import breeze.linalg.{DenseMatrix=>BDM, DenseVector=>BDV} 11 | 12 | import scala.util.Random 13 | 14 | 15 | object GradientDescendOptimizer extends LoggingAbility{ 16 | 17 | def optimize(data: RDD[LabeledPoint], 18 | neuralNetworkModel: NeuralNetworkModel, 19 | topology: Topology, 20 | initTheta: List[BDM[Double]], 21 | rate: Double, 22 | lambda: Double, 23 | normalFactor: Double, 24 | maxIter: Int, 25 | batchProp: Double = 1, 26 | batchSeed: Integer): GradientDescendSolution ={ 27 | 28 | 29 | var theta = initTheta 30 | 31 | val costHistory: Array[Double] = new Array[Double](maxIter) 32 | 33 | var i: Integer = 0 34 | while (i < maxIter){ 35 | val costGradient = DistCostGradientComputer.compute(data, 36 | theta, 37 | neuralNetworkModel, 38 | lambda, 39 | normalFactor, 40 | batchProp, 41 | batchSeed + i) 42 | 43 | costHistory(i) = costGradient.cost 44 | 45 | val unrolledThetaVector: BDV[Double] = new BDV(Topology.unrollTheta(theta)) 46 | val unrolledGradientVector: BDV[Double] = new BDV(Topology.unrollTheta(costGradient.thetaGradient)) 47 | 48 | val updatedUnrolledThetaVector: BDV[Double] = unrolledThetaVector - (unrolledGradientVector * rate) 49 | 50 | theta = topology.generateThetaFrom(updatedUnrolledThetaVector.toArray) 51 | i = i + 1 52 | logger.trace("Iteration: " + i + "/" + maxIter + " Cost: " + costGradient.cost) 53 | } 54 | 55 | GradientDescendSolution(costHistory.toList, theta) 56 | 57 | } 58 | } 59 | 60 | case class GradientDescendSolution(val costHistory: List[Double], val theta: List[BDM[Double]] ) 61 | -------------------------------------------------------------------------------- /spark/chapter11/annexample/src/main/scala/default/algo/LoggingAbility.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Neural Network example 3 | // Chapter 11 4 | 5 | package default.algo 6 | 7 | import org.apache.log4j.Logger 8 | 9 | 10 | trait LoggingAbility { 11 | val loggerName = this.getClass.getName 12 | lazy val logger = Logger.getLogger(loggerName.split("\\$").head) 13 | 14 | } 15 | -------------------------------------------------------------------------------- /spark/chapter11/annexample/src/main/scala/default/algo/NaiveCostGradientComputer.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Neural Network example 3 | // Chapter 11 4 | 5 | package default.algo 6 | 7 | import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} 8 | import breeze.numerics.sigmoid 9 | import rotationsymmetry.neuralnetwork.model.{NeuralNetworkModel, Topology} 10 | import org.apache.spark.mllib.regression.LabeledPoint 11 | import rotationsymmetry.neuralnetwork.Util 12 | 13 | 14 | object NaiveCostGradientComputer { 15 | 16 | def compute_cost(data: List[LabeledPoint], theta: List[BDM[Double]], neuralNetworkModel: NeuralNetworkModel): Double = { 17 | val costList = data map {d => 18 | val acc = theta.foldLeft(Util.toBreeze(d.features))( 19 | (a, th)=> sigmoid (th * Util.addBias(a)) 20 | ) 21 | neuralNetworkModel.cost(acc, d.label) 22 | } 23 | costList.sum / data.size.toDouble 24 | } 25 | 26 | def compute_gradient(data: List[LabeledPoint], 27 | theta: List[BDM[Double]], 28 | neuralNetworkModel: NeuralNetworkModel, 29 | eps: Double): BDV[Double] ={ 30 | 31 | val topology = Topology(theta) 32 | 33 | val thetaUnrolled: Array[Double] = 
Topology.unrollTheta(theta) 34 | 35 | val thetaUnrolledWithEps: List[Array[Double]] = addEps(thetaUnrolled, eps) 36 | 37 | val thetaWithEps = thetaUnrolledWithEps map (topology.generateThetaFrom(_)) 38 | 39 | val costWithEps: List[Double] = thetaWithEps map (th => 40 | compute_cost(data, th, neuralNetworkModel) 41 | ) 42 | 43 | val costAtOrigin = compute_cost(data, theta, neuralNetworkModel) 44 | 45 | val diff = (BDV(costWithEps.toArray) - costAtOrigin) 46 | 47 | diff / eps 48 | 49 | } 50 | 51 | def addEps(thetaUnrolled: Array[Double], eps: Double): List[Array[Double]] ={ 52 | val out = for (i <- 0 until thetaUnrolled.length) yield { 53 | val tmp = thetaUnrolled.clone() 54 | tmp(i) = tmp(i) + eps 55 | tmp 56 | } 57 | out.toList 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /spark/chapter11/annexample/src/main/scala/default/algo/Predictor.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Neural Network example 3 | // Chapter 11 4 | 5 | package default.algo 6 | 7 | import breeze.numerics.sigmoid 8 | import rotationsymmetry.neuralnetwork.model.NeuralNetworkModel 9 | import org.apache.spark.mllib.linalg.Vector 10 | import org.apache.spark.rdd.RDD 11 | import breeze.linalg.{DenseMatrix=>BDM} 12 | import rotationsymmetry.neuralnetwork.Util 13 | 14 | 15 | object Predictor { 16 | def predict(features: RDD[Vector], neuralNetworkModel: NeuralNetworkModel, theta: List[BDM[Double]]): RDD[Double] ={ 17 | features.map(x=>{ 18 | val xVec = Util.toBreeze(x) 19 | val outputActivation = theta.foldLeft(xVec)((a, th)=> sigmoid( th * Util.addBias(a))) 20 | neuralNetworkModel.predict(outputActivation) 21 | }) 22 | } 23 | 24 | def predict(features: Array[Vector], neuralNetworkModel: NeuralNetworkModel, theta: List[BDM[Double]]): Array[Double] ={ 25 | features.map(x=>{ 26 | val xVec = Util.toBreeze(x) 27 | val outputActivation = theta.foldLeft(xVec)((a, th)=> sigmoid( th * Util.addBias(a))) 28 | neuralNetworkModel.predict(outputActivation) 29 | }) 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /spark/chapter11/annexample/src/main/scala/default/example/MNIST.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Neural Network example 3 | // Chapter 11 4 | 5 | package default.example 6 | 7 | import org.apache.spark.mllib.linalg.Vectors 8 | import org.apache.spark.mllib.regression.LabeledPoint 9 | import org.apache.spark.mllib.util.MLUtils._ 10 | import org.apache.spark.{SparkContext, SparkConf} 11 | 12 | object MNIST { 13 | 14 | 15 | def processData(): Unit = { 16 | val conf = new SparkConf().setAppName("Simple Application").setMaster("local") 17 | val sc = new SparkContext(conf) 18 | val xData=sc.textFile("x.txt") 19 | val xValue = xData.map(line => line.trim().split(" ").map(_.toDouble)) 20 | 21 | val yData=sc.textFile("y.txt") 22 | val yValue = yData.map(line => { 23 | val yInt = line.trim().toInt 24 | yInt match { 25 | case 10 => 0 26 | case _ => yInt 27 | } 28 | }) 29 | 30 | val data = yValue.zip(xValue).map( 31 | line => LabeledPoint(line._1, Vectors.dense(line._2)) 32 | ) 33 | 34 | saveAsLibSVMFile(data, "data.libsvm") 35 | sc.stop() 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /spark/chapter11/annexample/src/main/scala/default/model/NeuralNetworkClassifier.scala: 
-------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Neural Network example 3 | // Chapter 11 4 | 5 | package default.model 6 | 7 | import breeze.linalg.{argmax, DenseVector, sum} 8 | import breeze.numerics.log 9 | 10 | 11 | 12 | class NeuralNetworkClassifier(nGroup: Int) extends NeuralNetworkModel(){ 13 | 14 | override def cost(activations: DenseVector[Double], y: Double): Double ={ 15 | 16 | handelException(activations, y) 17 | 18 | val tmp_act: DenseVector[Double] = ((- activations) + 1d) 19 | 20 | val yInt: Int = y.floor.toInt 21 | 22 | tmp_act(yInt) = activations(yInt) 23 | 24 | sum(-log(tmp_act)) 25 | 26 | } 27 | 28 | override def delta(activations: DenseVector[Double], y: Double): DenseVector[Double] = { 29 | 30 | handelException(activations, y) 31 | 32 | val tmp_act: DenseVector[Double] = activations.copy 33 | 34 | val yInt: Int = y.floor.toInt 35 | 36 | tmp_act(yInt) = tmp_act(yInt) - 1d 37 | 38 | tmp_act 39 | } 40 | 41 | override def predict(activations: DenseVector[Double]): Double ={ 42 | require(activations.length == nGroup, "Number of output activations is not equal to number of group.") 43 | 44 | val groupWithMaxActivation = argmax(activations) 45 | groupWithMaxActivation.toDouble 46 | } 47 | 48 | 49 | 50 | private def handelException(activations: DenseVector[Double], y: Double): Unit ={ 51 | 52 | require(activations.length == nGroup, "Number of output activations is not equal to number of group.") 53 | 54 | require(0 <= y && y < nGroup, "y is out of range: " + "y=" + y + "; nGroup=" + nGroup) 55 | 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /spark/chapter11/annexample/src/main/scala/default/model/NeuralNetworkModel.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Neural Network example 3 | // Chapter 11 4 | 5 | package default.model 6 | 7 | import breeze.linalg.{DenseVector => BDV} 8 | 9 | 10 | abstract class NeuralNetworkModel extends Serializable { 11 | def cost(activations: BDV[Double], y: Double): Double 12 | def delta(activations: BDV[Double], y: Double): BDV[Double] 13 | def predict(activations: BDV[Double]): Double 14 | 15 | } 16 | -------------------------------------------------------------------------------- /spark/chapter11/annexample/src/main/scala/default/model/Topology.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Neural Network example 3 | // Chapter 11 4 | 5 | package default.model 6 | 7 | import breeze.linalg.DenseMatrix 8 | import breeze.numerics._ 9 | 10 | import scala.util.Random 11 | 12 | 13 | class Topology(val self: List[Int]) { 14 | require(self.size >= 2, "Neural Network is less than 2 layers.") 15 | 16 | require(self.forall(_ >0), "The number of activations in a layout should be positive.") 17 | 18 | 19 | private[this] val rowDim = self.tail 20 | 21 | private[this] val colDim = self.dropRight(1) map (_ + 1) 22 | 23 | private[this]val dimPair = (rowDim zip colDim) map (rc => RowColPair(rc._1, rc._2)) 24 | 25 | private[this] val start = dimPair.scanLeft(0) ( 26 | (s: Int, pair: RowColPair) => s + pair.row * pair.col 27 | ) 28 | 29 | def generateThetaFrom(values: Array[Double]): List[DenseMatrix[Double]] ={ 30 | require(values.length == start.last, "input is of incorrect length.") 31 | 32 | val dimPair_start = dimPair zip start.dropRight(1) 33 | 34 | dimPair_start.map( 
35 | _ match { 36 | case (pair: RowColPair, s: Int) =>{ 37 | new DenseMatrix(pair.row, pair.col, values.slice(s, s + pair.row * pair.col)) 38 | } 39 | } 40 | ) 41 | } 42 | 43 | def generateThetaFrom(rand: Random): List[DenseMatrix[Double]] ={ 44 | 45 | val dimPair_start = dimPair zip start.dropRight(1) 46 | 47 | dimPair_start.map( 48 | _ match { 49 | case (pair: RowColPair, s: Int) =>{ 50 | val eps = sqrt(6d / (pair.row + pair.col - 1)) 51 | 52 | val value: Array[Double] = (for (i <- 0 until pair.row * pair.col) yield (rand.nextDouble() - 0.5) * 2 * eps).toArray 53 | new DenseMatrix(pair.row, pair.col, value) 54 | } 55 | } 56 | ) 57 | } 58 | 59 | private[this] case class RowColPair(val row: Int, val col: Int) 60 | 61 | } 62 | 63 | object Topology { 64 | def apply(theta: List[DenseMatrix[Double]]): Topology ={ 65 | new Topology((theta.head.cols - 1) +: theta.map(_.rows)) 66 | } 67 | 68 | def unrollTheta(theta: List[DenseMatrix[Double]]): Array[Double] = { 69 | val unrolledMatrixList: List[List[Double]] = theta map (_.toArray.toList) 70 | unrolledMatrixList reduce(_ ::: _) toArray 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /spark/chapter11/dlexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter11/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter12/readme.md: -------------------------------------------------------------------------------- 1 | There are no specific libraries available in MLlib for reinforcement learning techniques like Q-learning / TD learning. 2 | -------------------------------------------------------------------------------- /spark/chapter12/rlexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter13/ensembleexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter13/ensembleexample/result/GBT_clas.txt: -------------------------------------------------------------------------------- 1 | Test Error = 0.0 2 | Learned classification GBT model: 3 | TreeEnsembleModel classifier with 3 trees 4 | 5 | Tree 0: 6 | If (feature 434 <= 0.0) 7 | If (feature 100 <= 165.0) 8 | Predict: -1.0 9 | Else (feature 100 > 165.0) 10 | Predict: 1.0 11 | Else (feature 434 > 0.0) 12 | Predict: 1.0 13 | Tree 1: 14 | If (feature 434 <= 0.0) 15 | If (feature 568 <= 253.0) 16 | If (feature 211 <= 250.0) 17 | Predict: 0.47681168808847024 18 | Else (feature 211 > 250.0) 19 | Predict: 0.4768116880884703 20 | Else (feature 568 > 253.0) 21 | Predict: -0.4768116880884694 22 | Else (feature 434 > 0.0) 23 | If (feature 351 <= 58.0) 24 | Predict: -0.4768116880884702 25 | Else (feature 351 > 58.0) 26 | Predict: -0.47681168808847035 27 | Tree 2: 28 | If (feature 434 <= 0.0) 29 | If (feature 295 <= 253.0) 30 | If (feature 152 <= 253.0) 31 | Predict: -0.5183379293761909 32 | Else (feature 152 > 253.0) 33 | Predict: -0.5183379293761909 34 | Else (feature 295 > 253.0) 35 | Predict: 0.5183379293761909 36 | Else (feature 434 > 0.0) 37 | If (feature 157 <= 252.0) 38 | Predict:
0.5183379293761909 39 | Else (feature 157 > 252.0) 40 | If (feature 156 <= 85.0) 41 | Predict: 0.5183379293761909 42 | Else (feature 156 > 85.0) 43 | Predict: 0.5183379293761909 44 | -------------------------------------------------------------------------------- /spark/chapter13/ensembleexample/src/GradientBoostTree_classification.scala: -------------------------------------------------------------------------------- 1 | // https://spark.apache.org/docs/1.2.0/mllib-ensembles.html 2 | // perform classification using Gradient-Boosted Trees with log loss. 3 | // The test error is calculated to measure the algorithm accuracy. 4 | 5 | 6 | 7 | import org.apache.spark.mllib.tree.GradientBoostedTrees 8 | import org.apache.spark.mllib.tree.configuration.BoostingStrategy 9 | import org.apache.spark.mllib.util.MLUtils 10 | 11 | // Load and parse the data file. 12 | val data = MLUtils.loadLibSVMFile(sc, "/home/yyan/Desktop/data/sample_libsvm_data.txt") 13 | // Split the data into training and test sets (30% held out for testing) 14 | val splits = data.randomSplit(Array(0.7, 0.3)) 15 | val (trainingData, testData) = (splits(0), splits(1)) 16 | 17 | // Train a GradientBoostedTrees model. 18 | // The defaultParams for Classification use LogLoss by default. 19 | val boostingStrategy = BoostingStrategy.defaultParams("Classification") 20 | boostingStrategy.numIterations = 3 // Note: Use more iterations in practice. 21 | 22 | // It is numClasses rather than numClassesForclassification 23 | // chech in API // https://spark.apache.org/docs/1.2.0/api/scala/index.html#org.apache.spark.mllib.tree.configuration.Strategy 24 | boostingStrategy.treeStrategy.numClasses= 2 25 | 26 | boostingStrategy.treeStrategy.maxDepth = 5 27 | // Empty categoricalFeaturesInfo indicates all features are continuous. 28 | boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]() 29 | 30 | val model = GradientBoostedTrees.train(trainingData, boostingStrategy) 31 | 32 | // Evaluate model on test instances and compute test error 33 | val labelAndPreds = testData.map { point => 34 | val prediction = model.predict(point.features) 35 | (point.label, prediction) 36 | } 37 | val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() 38 | println("Test Error = " + testErr) 39 | println("Learned classification GBT model:\n" + model.toDebugString) -------------------------------------------------------------------------------- /spark/chapter13/ensembleexample/src/GradientBoostTree_regression.scala: -------------------------------------------------------------------------------- 1 | // https://spark.apache.org/docs/1.2.0/mllib-ensembles.html 2 | // Gradient-Boosted Trees with Squared Error as the loss. 3 | // The Mean Squared Error (MSE) is computed at the end to evaluate goodness of fit. 4 | 5 | 6 | import org.apache.spark.mllib.tree.GradientBoostedTrees 7 | import org.apache.spark.mllib.tree.configuration.BoostingStrategy 8 | import org.apache.spark.mllib.util.MLUtils 9 | 10 | // Load and parse the data file. 11 | val data = MLUtils.loadLibSVMFile(sc, "/home/yyan/Desktop/data/housing.txt") 12 | // Split the data into training and test sets (30% held out for testing) 13 | val splits = data.randomSplit(Array(0.7, 0.3)) 14 | val (trainingData, testData) = (splits(0), splits(1)) 15 | 16 | // Train a GradientBoostedTrees model. 17 | // The defaultParams for Regression use SquaredError by default. 
18 | val boostingStrategy = BoostingStrategy.defaultParams("Regression") 19 | boostingStrategy.numIterations = 3 // Note: Use more iterations in practice. 20 | boostingStrategy.treeStrategy.maxDepth = 5 21 | // Empty categoricalFeaturesInfo indicates all features are continuous. 22 | boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]() 23 | 24 | val model = GradientBoostedTrees.train(trainingData, boostingStrategy) 25 | 26 | // Evaluate model on test instances and compute test error 27 | val labelsAndPredictions = testData.map { point => 28 | val prediction = model.predict(point.features) 29 | (point.label, prediction) 30 | } 31 | val testMSE = labelsAndPredictions.map{ case(v, p) => math.pow((v - p), 2)}.mean() 32 | println("Test Mean Squared Error = " + testMSE) 33 | println("Learned regression GBT model:\n" + model.toDebugString) -------------------------------------------------------------------------------- /spark/chapter13/ensembleexample/src/test.txt: -------------------------------------------------------------------------------- 1 | // source: https://spark.apache.org/docs/1.2.0/mllib-decision-tree.html 2 | // Decision Tree with Gini impurity as an impurity 3 | // The test error is calculated to measure the algorithm accuracy. 4 | 5 | import org.apache.spark.mllib.tree.DecisionTree 6 | import org.apache.spark.mllib.util.MLUtils 7 | 8 | // Load and parse the data file. 9 | val data = MLUtils.loadLibSVMFile(sc, "/home/yyan/Desktop/data/svm.txt") 10 | // Split the data into training and test sets (30% held out for testing) 11 | val splits = data.randomSplit(Array(0.5, 0.5)) 12 | val (trainingData, testData) = (splits(0), splits(1)) 13 | 14 | // Train a DecisionTree model. 15 | // Empty categoricalFeaturesInfo indicates all features are continuous. 
16 | val numClasses = 2 17 | val categoricalFeaturesInfo = Map[Int, Int]() 18 | val impurity = "gini" 19 | val maxDepth = 10 20 | val maxBins = 32 21 | val minInstancesPerNode =5 22 | 23 | val model = DecisionTree.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo, 24 | impurity, maxDepth, maxBins, minInstancesPerNode) 25 | 26 | // Evaluate model on test instances and compute test error 27 | val labelAndPreds = testData.map { point => 28 | val prediction = model.predict(point.features) 29 | (point.label, prediction) 30 | } 31 | val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() 32 | println("Test Error = " + testErr) 33 | println("Learned classification tree model:\n" + model.toDebugString) -------------------------------------------------------------------------------- /spark/chapter13/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter5/decisiontreeexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter5/decisiontreeexample/result/classification.txt: -------------------------------------------------------------------------------- 1 | Test Error = 0.03125 2 | 3 | Learned classification tree model: 4 | DecisionTreeModel classifier of depth 2 with 5 nodes 5 | If (feature 434 <= 0.0) 6 | If (feature 99 <= 0.0) 7 | Predict: 0.0 8 | Else (feature 99 > 0.0) 9 | Predict: 1.0 10 | Else (feature 434 > 0.0) 11 | Predict: 1.0 12 | -------------------------------------------------------------------------------- /spark/chapter5/decisiontreeexample/result/regression.txt: -------------------------------------------------------------------------------- 1 | Test Mean Squared Error = 22.935061460324544 2 | 3 | Learned regression tree model: 4 | DecisionTreeModel regressor of depth 4 with 29 nodes 5 | If (feature 5 <= 0.269592) 6 | If (feature 12 <= -0.279249) 7 | If (feature 5 <= 0.140832) 8 | If (feature 7 <= -0.95335) 9 | Predict: 38.95 10 | Else (feature 7 > -0.95335) 11 | Predict: 21.594078947368423 12 | Else (feature 5 > 0.140832) 13 | If (feature 4 <= -0.427984) 14 | Predict: 28.96785714285715 15 | Else (feature 4 > -0.427984) 16 | Predict: 23.650000000000006 17 | Else (feature 12 > -0.279249) 18 | If (feature 0 <= -0.987141) 19 | If (feature 7 <= -0.857305) 20 | Predict: 15.514285714285714 21 | Else (feature 7 > -0.857305) 22 | Predict: 19.866666666666667 23 | Else (feature 0 > -0.987141) 24 | If (feature 0 <= -0.78628) 25 | Predict: 14.6 26 | Else (feature 0 > -0.78628) 27 | Predict: 11.236842105263158 28 | Else (feature 5 > 0.269592) 29 | If (feature 5 <= 0.491857) 30 | If (feature 12 <= -0.566225) 31 | If (feature 7 <= -0.857305) 32 | Predict: 45.65 33 | Else (feature 7 > -0.857305) 34 | Predict: 33.66923076923077 35 | Else (feature 12 > -0.566225) 36 | If (feature 2 <= -0.579179) 37 | Predict: 27.580000000000002 38 | Else (feature 2 > -0.579179) 39 | Predict: 15.0 40 | Else (feature 5 > 0.491857) 41 | If (feature 11 <= 0.798729) 42 | Predict: 21.9 43 | Else (feature 11 > 0.798729) 44 | If (feature 10 <= 0.106383) 45 | Predict: 46.4470588235294 46 | Else (feature 10 > 0.106383) 47 | Predict: 40.125 48 | -------------------------------------------------------------------------------- /spark/chapter5/decisiontreeexample/src/decisiontree-classification.scala: 
-------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Decision Tree example 3 | // Chapter 5 4 | 5 | // source: https://spark.apache.org/docs/1.2.0/mllib-decision-tree.html 6 | // Decision Tree with Gini impurity as an impurity 7 | // The test error is calculated to measure the algorithm accuracy. 8 | 9 | import org.apache.spark.mllib.tree.DecisionTree 10 | import org.apache.spark.mllib.util.MLUtils 11 | 12 | // Load and parse the data file. 13 | val data = MLUtils.loadLibSVMFile(sc, "/home/yyan/Desktop/data/sample_libsvm_data.txt") 14 | // Split the data into training and test sets (30% held out for testing) 15 | val splits = data.randomSplit(Array(0.7, 0.3)) 16 | val (trainingData, testData) = (splits(0), splits(1)) 17 | 18 | // Train a DecisionTree model. 19 | // Empty categoricalFeaturesInfo indicates all features are continuous. 20 | val numClasses = 2 21 | val categoricalFeaturesInfo = Map[Int, Int]() 22 | val impurity = "gini" 23 | val maxDepth = 4 // rather than 5, easy to test 24 | val maxBins = 32 25 | 26 | val model = DecisionTree.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo, 27 | impurity, maxDepth, maxBins) 28 | 29 | // Evaluate model on test instances and compute test error 30 | val labelAndPreds = testData.map { point => 31 | val prediction = model.predict(point.features) 32 | (point.label, prediction) 33 | } 34 | val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() 35 | println("Test Error = " + testErr) 36 | println("Learned classification tree model:\n" + model.toDebugString) -------------------------------------------------------------------------------- /spark/chapter5/decisiontreeexample/src/decisiontree-regression.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Decision Tree example 3 | // Chapter 5 4 | 5 | // source: https://spark.apache.org/docs/1.2.0/mllib-decision-tree.html 6 | // decision tree with variance as an impurity measure and a maximum tree depth of 4 7 | // The Mean Squared Error (MSE) is computed at the end to evaluate goodness of fit. 8 | 9 | import org.apache.spark.mllib.tree.DecisionTree 10 | import org.apache.spark.mllib.util.MLUtils 11 | 12 | // Load and parse the data file. 13 | val data = MLUtils.loadLibSVMFile(sc, "/home/yyan/Desktop/data/housing.txt") 14 | // Split the data into training and test sets (30% held out for testing) 15 | val splits = data.randomSplit(Array(0.7, 0.3)) 16 | val (trainingData, testData) = (splits(0), splits(1)) 17 | 18 | // Train a DecisionTree model. 19 | // Empty categoricalFeaturesInfo indicates all features are continuous. 
20 | val categoricalFeaturesInfo = Map[Int, Int]() 21 | val impurity = "variance" 22 | val maxDepth = 4 23 | val maxBins = 32 24 | 25 | val model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo, impurity, 26 | maxDepth, maxBins) 27 | 28 | // Evaluate model on test instances and compute test error 29 | val labelsAndPredictions = testData.map { point => 30 | val prediction = model.predict(point.features) 31 | (point.label, prediction) 32 | } 33 | val testMSE = labelsAndPredictions.map{ case(v, p) => math.pow((v - p), 2)}.mean() 34 | println("Test Mean Squared Error = " + testMSE) 35 | println("Learned regression tree model:\n" + model.toDebugString) -------------------------------------------------------------------------------- /spark/chapter5/decisiontreeexample/src/decsiontree-test.txt: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Decision Tree example 3 | // Chapter 5 4 | 5 | // source: https://spark.apache.org/docs/1.2.0/mllib-decision-tree.html 6 | // Decision Tree with Gini impurity as an impurity 7 | // The test error is calculated to measure the algorithm accuracy. 8 | 9 | import org.apache.spark.mllib.tree.DecisionTree 10 | import org.apache.spark.mllib.util.MLUtils 11 | 12 | // Load and parse the data file. 13 | val data = MLUtils.loadLibSVMFile(sc, "data/svm.txt") 14 | // Split the data into training and test sets (30% held out for testing) 15 | val splits = data.randomSplit(Array(0.5, 0.5)) 16 | val (trainingData, testData) = (splits(0), splits(1)) 17 | 18 | // Train a DecisionTree model. 19 | // Empty categoricalFeaturesInfo indicates all features are continuous. 20 | val numClasses = 2 21 | val categoricalFeaturesInfo = Map[Int, Int]() 22 | val impurity = "gini" 23 | val maxDepth = 10 24 | val maxBins = 32 25 | val minInstancesPerNode =5 26 | 27 | val model = DecisionTree.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo, 28 | impurity, maxDepth, maxBins, minInstancesPerNode) 29 | 30 | // Evaluate model on test instances and compute test error 31 | val labelAndPreds = testData.map { point => 32 | val prediction = model.predict(point.features) 33 | (point.label, prediction) 34 | } 35 | val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() 36 | println("Test Error = " + testErr) 37 | println("Learned classification tree model:\n" + model.toDebugString) -------------------------------------------------------------------------------- /spark/chapter5/randomforstexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter5/randomforstexample/result/RandomForests_classification.txt: -------------------------------------------------------------------------------- 1 | Test Error = 0.034482758620689655 2 | 3 | Learned classification forest model: 4 | TreeEnsembleModel classifier with 3 trees 5 | 6 | Tree 0: 7 | If (feature 434 <= 0.0) 8 | If (feature 632 <= 0.0) 9 | Predict: 1.0 10 | Else (feature 632 > 0.0) 11 | Predict: 0.0 12 | Else (feature 434 > 0.0) 13 | Predict: 1.0 14 | Tree 1: 15 | If (feature 399 <= 15.0) 16 | If (feature 356 <= 0.0) 17 | If (feature 126 <= 254.0) 18 | Predict: 1.0 19 | Else (feature 126 > 254.0) 20 | Predict: 0.0 21 | Else (feature 356 > 0.0) 22 | Predict: 0.0 23 | Else (feature 399 > 15.0) 24 | Predict: 0.0 25 | Tree 2: 26 | If (feature 517 <= 41.0) 27 | Predict: 0.0 28 | 
Else (feature 517 > 41.0) 29 | If (feature 548 <= 251.0) 30 | Predict: 1.0 31 | Else (feature 548 > 251.0) 32 | Predict: 0.0 33 | -------------------------------------------------------------------------------- /spark/chapter5/randomforstexample/src/RandomForest_regression.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Decision Tree based learning - Random Forest example 3 | // Chapter 5 4 | 5 | // source https://spark.apache.org/docs/1.2.0/mllib-ensembles.html 6 | // Random Forest. 7 | // The Mean Squared Error (MSE) is computed at the end to evaluate goodness of fit. 8 | 9 | 10 | import org.apache.spark.mllib.tree.RandomForest 11 | import org.apache.spark.mllib.util.MLUtils 12 | 13 | // Load and parse the data file. 14 | val data = MLUtils.loadLibSVMFile(sc, "/home/yyan/Desktop/data/housing.txt") 15 | // Split the data into training and test sets (30% held out for testing) 16 | val splits = data.randomSplit(Array(0.7, 0.3)) 17 | val (trainingData, testData) = (splits(0), splits(1)) 18 | 19 | // Train a RandomForest model. 20 | // Empty categoricalFeaturesInfo indicates all features are continuous. 21 | val numClasses = 2 22 | val categoricalFeaturesInfo = Map[Int, Int]() 23 | val numTrees = 3 // Use more in practice. 24 | val featureSubsetStrategy = "auto" // Let the algorithm choose. 25 | val impurity = "variance" 26 | val maxDepth = 4 27 | val maxBins = 32 28 | 29 | val model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo, 30 | numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins) 31 | 32 | // Evaluate model on test instances and compute test error 33 | val labelsAndPredictions = testData.map { point => 34 | val prediction = model.predict(point.features) 35 | (point.label, prediction) 36 | } 37 | val testMSE = labelsAndPredictions.map{ case(v, p) => math.pow((v - p), 2)}.mean() 38 | println("Test Mean Squared Error = " + testMSE) 39 | println("Learned regression forest model:\n" + model.toDebugString) -------------------------------------------------------------------------------- /spark/chapter5/randomforstexample/src/RandomForests_classification.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Decision Tree based learning - Random Forest example 3 | // Chapter 5 4 | 5 | // source https://spark.apache.org/docs/1.2.0/mllib-ensembles.html 6 | // Random Forest. 7 | // The test error is calculated to measure the algorithm accuracy. 8 | 9 | import org.apache.spark.mllib.tree.RandomForest 10 | import org.apache.spark.mllib.util.MLUtils 11 | 12 | // Load and parse the data file. 13 | val data = MLUtils.loadLibSVMFile(sc, "/home/yyan/Desktop/data/sample_libsvm_data.txt") 14 | // Split the data into training and test sets (30% held out for testing) 15 | val splits = data.randomSplit(Array(0.7, 0.3)) 16 | val (trainingData, testData) = (splits(0), splits(1)) 17 | 18 | // Train a RandomForest model. 19 | // Empty categoricalFeaturesInfo indicates all features are continuous. 20 | val numClasses = 2 21 | val categoricalFeaturesInfo = Map[Int, Int]() 22 | val numTrees = 3 // Use more in practice. 23 | val featureSubsetStrategy = "auto" // Let the algorithm choose.
24 | val impurity = "gini" 25 | val maxDepth = 4 26 | val maxBins = 32 27 | 28 | val model = RandomForest.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo, 29 | numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins) 30 | 31 | // Evaluate model on test instances and compute test error 32 | val labelAndPreds = testData.map { point => 33 | val prediction = model.predict(point.features) 34 | (point.label, prediction) 35 | } 36 | val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() 37 | println("Test Error = " + testErr) 38 | println("Learned classification forest model:\n" + model.toDebugString) -------------------------------------------------------------------------------- /spark/chapter5/randomforstexample/src/test.txt: -------------------------------------------------------------------------------- 1 | // source: https://spark.apache.org/docs/1.2.0/mllib-decision-tree.html 2 | // Decision Tree with Gini impurity as an impurity 3 | // The test error is calculated to measure the algorithm accuracy. 4 | 5 | import org.apache.spark.mllib.tree.DecisionTree 6 | import org.apache.spark.mllib.util.MLUtils 7 | 8 | // Load and parse the data file. 9 | val data = MLUtils.loadLibSVMFile(sc, "/home/yyan/Desktop/data/svm.txt") 10 | // Split the data into training and test sets (30% held out for testing) 11 | val splits = data.randomSplit(Array(0.5, 0.5)) 12 | val (trainingData, testData) = (splits(0), splits(1)) 13 | 14 | // Train a DecisionTree model. 15 | // Empty categoricalFeaturesInfo indicates all features are continuous. 16 | val numClasses = 2 17 | val categoricalFeaturesInfo = Map[Int, Int]() 18 | val impurity = "gini" 19 | val maxDepth = 10 20 | val maxBins = 32 21 | val minInstancesPerNode =5 22 | 23 | val model = DecisionTree.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo, 24 | impurity, maxDepth, maxBins, minInstancesPerNode) 25 | 26 | // Evaluate model on test instances and compute test error 27 | val labelAndPreds = testData.map { point => 28 | val prediction = model.predict(point.features) 29 | (point.label, prediction) 30 | } 31 | val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() 32 | println("Test Error = " + testErr) 33 | println("Learned classification tree model:\n" + model.toDebugString) -------------------------------------------------------------------------------- /spark/chapter5/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter6/knnexample/example-run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | RCM_PATH="$(pwd)/target/scala-2.10/SparkRecommender-assembly-0.1.jar" 3 | LIB_PATH="$(pwd)/lib/" 4 | /usr/bin/spark-submit --driver-memory 2G --executor-memory 6G \ 5 | --driver-class-path $LIB_PATH --class Boot $RCM_PATH $@ 6 | -------------------------------------------------------------------------------- /spark/chapter6/knnexample/project/Build.scala: -------------------------------------------------------------------------------- 1 | import sbt.Keys._ 2 | import sbt._ 3 | 4 | object MyBuild extends Build { 5 | 6 | lazy val copyDependencies = TaskKey[Unit]("copy-dependencies") 7 | 8 | def copyDepTask = copyDependencies <<= (update, crossTarget, scalaVersion) map { 9 | (updateReport, out, scalaVer) => 10 | updateReport.allFiles foreach { srcPath => 11 | val destPath = 
out / "lib" / srcPath.getName 12 | IO.copyFile(srcPath, destPath, preserveLastModified=true) 13 | } 14 | } 15 | 16 | lazy val root = Project( 17 | "root", 18 | file("."), 19 | settings = Defaults.defaultSettings ++ Seq( 20 | copyDepTask 21 | ) 22 | ) 23 | } -------------------------------------------------------------------------------- /spark/chapter6/knnexample/project/META-INF/MANIFEST.MF: -------------------------------------------------------------------------------- 1 | Manifest-Version: 1.0 2 | 3 | -------------------------------------------------------------------------------- /spark/chapter6/knnexample/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /spark/chapter6/knnexample/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | logLevel := Level.Warn -------------------------------------------------------------------------------- /spark/chapter6/knnexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter6/svmexample/build.sbt: -------------------------------------------------------------------------------- 1 | name := "Spark Kernel SVM" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | resolvers += "Spark Packages Repo" at "http://dl.bintray.com/spark-packages/maven" 8 | 9 | libraryDependencies ++= Seq( 10 | "org.apache.spark" %% "spark-core" % "1.3.1", 11 | "org.apache.spark" %% "spark-mllib" % "1.3.1", 12 | "amplab" % "spark-indexedrdd" % "0.1" 13 | ) 14 | -------------------------------------------------------------------------------- /spark/chapter6/svmexample/doc/usage.txt: -------------------------------------------------------------------------------- 1 | Build: 2 | First cd into .../Spark_kernel_svm then: 3 | $ sbt package 4 | 5 | Run: 6 | First cd into .../Spark_kernel_svm then: 7 | $ (your spark dir)/bin/spark-submit \ 8 | --packages amplab:spark-indexedrdd:0.1 \ 9 | target/scala-2.10/spark-kernel-svm_2.10-1.0.jar -------------------------------------------------------------------------------- /spark/chapter6/svmexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter6/svmexample/src/main/scala/Kernels.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Support Vector machine example 3 | // Chapter 6 4 | 5 | /* 6 | * Some Kernel functions. 
7 | */ 8 | import org.apache.spark.mllib.linalg.Vector 9 | import org.apache.spark.mllib.linalg.Vectors 10 | 11 | /** Rbf Kernel, parametrized by gamma */ 12 | class RbfKernelFunc(gamma_s: Double) extends java.io.Serializable{ 13 | var gamma: Double = gamma_s 14 | def evaluate(x_1: Vector, x_2: Vector): Double = { 15 | math.exp(-1 * gamma * math.pow(Vectors.sqdist(x_1, x_2),2)) 16 | } 17 | } -------------------------------------------------------------------------------- /spark/chapter6/svmexample/src/main/scala/main.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Support Vector machine example 3 | // Chapter 6 4 | 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.SparkContext._ 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.rdd._ 9 | 10 | import org.apache.spark.mllib.util.MLUtils 11 | 12 | import java.io._ 13 | import java.lang.System 14 | 15 | object TestKernelSVM { 16 | def main(args: Array[String]) { 17 | 18 | if (args.length != 1 ) { 19 | println("Usage: /path/to/spark/bin/spark-submit --packages amplab:spark-indexedrdd:0.1" + 20 | "target/scala-2.10/spark-kernel-svm_2.10-1.0.jar ") 21 | sys.exit(1) 22 | } 23 | 24 | val logFile = "README.md" // Should be some file on your system 25 | val conf = new SparkConf().setAppName("KernelSVM Test") 26 | val sc = new SparkContext(conf) 27 | 28 | val data = MLUtils.loadLibSVMFile(sc, args(0)) 29 | 30 | val splits = data.randomSplit(Array(0.8,0.2)) 31 | val training = splits(0) 32 | val test = splits(1).collect() 33 | 34 | val m = training.count() 35 | 36 | var pack_size = 100 37 | 38 | val iterations = List((0.5*m).toLong,m.toLong,(1.5*m).toLong,(2*m).toLong) 39 | var num_iter = 0 40 | 41 | val pw = new PrintWriter(new File("result.txt" )) 42 | 43 | for (num_iter <- iterations) { 44 | val t1 = System.currentTimeMillis 45 | val svm = new KernelSVM(training, 1.0/m, "rbf", 1.0) 46 | svm.train(num_iter,pack_size) 47 | val t2 = System.currentTimeMillis 48 | val runtime = (t2 - t1)/1000 49 | 50 | var ss = m.toString + " " + num_iter.toString + " " + pack_size.toString + " " + svm.getAccuracy(test).toString + " " + runtime.toString + "\n" 51 | pw.write(ss) 52 | } 53 | 54 | pw.close 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /spark/chapter7/aprioriexample/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | spark 3 | spark-apriori 4 | 4.0.0 5 | Spark A Priori 6 | jar 7 | 1.0 8 | 9 | 10 | scala-tools.org 11 | Scala-tools Maven2 Repository 12 | http://scala-tools.org/repo-releases 13 | 14 | 15 | 16 | 17 | scala-tools.org 18 | Scala-tools Maven2 Repository 19 | http://scala-tools.org/repo-releases 20 | 21 | 22 | 23 | 24 | org.scala-lang 25 | scala-library 26 | 2.11.5 27 | 28 | 29 | org.apache.spark 30 | spark-core_2.10 31 | 1.2.1 32 | 33 | 34 | 35 | src/main/scala 36 | 37 | 38 | org.apache.maven.plugins 39 | maven-compiler-plugin 40 | 41 | 1.7 42 | 1.7 43 | 44 | 45 | 46 | net.alchim31.maven 47 | scala-maven-plugin 48 | 3.1.6 49 | 50 | 51 | 52 | compile 53 | testCompile 54 | 55 | 56 | 57 | 58 | 59 | 60 | -nobootcp 61 | 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /spark/chapter7/aprioriexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
/spark/chapter7/aprioriexample/src/main/scala/default/BloomFilter.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Association rule based learning - Apriori example 3 | // Chapter 7 4 | 5 | package default 6 | 7 | import scala.collection.BitSet 8 | 9 | class BloomFilter[T]( 10 | buckets: Int, 11 | multiplier: Int, 12 | increment: Int, 13 | private val bitset: BitSet = BitSet()) extends Set[T] { 14 | import BloomFilter._ 15 | 16 | override def contains(elem: T): Boolean = 17 | bitset.contains((reHash(multiplier, increment)(elem) % buckets)) 18 | 19 | override def +(elem: T): Set[T] = 20 | new BloomFilter( 21 | buckets, 22 | multiplier, 23 | increment, 24 | bitset + (reHash(multiplier, increment)(elem) % buckets) 25 | ) 26 | 27 | override def -(elem: T): Set[T] = ??? 28 | 29 | override def iterator: Iterator[T] = ??? 30 | } 31 | 32 | object BloomFilter extends App { 33 | def ??? : Nothing = throw new UnsupportedOperationException() 34 | 35 | def apply[T](buckets: Int, multiplier: Int = 12568, increment: Int = 76509)(elems: T*) = { 36 | new BloomFilter[T]( 37 | buckets, 38 | multiplier, 39 | increment, 40 | BitSet(elems.map { elem: T => reHash(multiplier, increment)(elem) % buckets }: _*) 41 | ) 42 | } 43 | 44 | def reHash( 45 | multiplier: Int, 46 | increment: Int 47 | )(a: Any): Int = a.hashCode() * multiplier + increment 48 | } 49 | -------------------------------------------------------------------------------- /spark/chapter7/aprioriexample/src/main/scala/default/FrequentItemSets.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Association rule based learning - Apriori example 3 | // Chapter 7 4 | 5 | package default 6 | 7 | import scala.collection.SortedSet 8 | 9 | trait FrequentItemSets { 10 | /** 11 | * Collection type for frequent item sets. 12 | * 13 | * @tparam T item type. 14 | */ 15 | type ItemSet[T] = SortedSet[T] 16 | } 17 | -------------------------------------------------------------------------------- /spark/chapter7/aprioriexample/src/main/scala/default/NaiveFrequentItemSets.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Association rule based learning - Apriori example 3 | // Chapter 7 4 | 5 | package default 6 | 7 | import scala.collection.Map 8 | import scala.reflect.ClassTag 9 | 10 | import org.apache.spark.SparkContext 11 | import org.apache.spark.SparkContext.rddToPairRDDFunctions 12 | import org.apache.spark.rdd.RDD 13 | 14 | /** 15 | * Utility to compute frequent item sets for association rule mining. 16 | * 17 | * @see [[http://en.wikipedia.org/wiki/Association_rule_learning]] 18 | */ 19 | object NaiveFrequentItemSets extends FrequentItemSets { 20 | /** 21 | * Computes frequent item sets up to the specified size. 22 | * 23 | * @param sparkContext context in which to run. This is used to broadcast shared memory. 24 | * @param baskets to analyze. 25 | * @param supportThreshold minimum # of times an item set must occur to be considered frequent. 26 | * @param maxSize maximum item set size. 27 | * @tparam T item type. 28 | * @return map of frequent item sets and their counts. 29 | */ 30 | def apply[T: Ordering: ClassTag](sparkContext: SparkContext)( 31 | baskets: RDD[ItemSet[T]], 32 | supportThreshold: Int, 33 | maxSize: Int): Map[ItemSet[T], Int] = { 34 | // Count item subsets from size 1 up to maxSize. 
35 | baskets.flatMap { basket: ItemSet[T] => 36 | (1 to maxSize).map(basket.subsets(_)).reduce(_ ++ _).map((_, 1)) 37 | } 38 | .reduceByKey(_ + _) 39 | // Filter by support threshold. 40 | .filter { case (itemSet: ItemSet[T], count: Int) => count >= supportThreshold } 41 | .collectAsMap() 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /spark/chapter7/aprioriexample/src/main/scala/default/TestMain.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Association rule based learning - Apriori example 3 | // Chapter 7 4 | 5 | package default 6 | 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.SparkConf 10 | import scala.collection.SortedSet 11 | 12 | /** 13 | * Delete me and write real tests. 14 | */ 15 | object TestMain extends App with FrequentItemSets { 16 | 17 | val conf: SparkConf = new SparkConf().setMaster("local").setAppName("Simple Application") 18 | val sparkContext: SparkContext = new SparkContext(conf) 19 | 20 | val filePath: String = "/home/shashir/data.txt" 21 | 22 | val data: RDD[ItemSet[String]] = sparkContext.textFile(filePath, 2).map { line: String => 23 | SortedSet(line.split(" "): _*) 24 | }.cache() 25 | 26 | APriori(sparkContext)(data, 400, 3).foreach(println) 27 | } 28 | -------------------------------------------------------------------------------- /spark/chapter7/fpgrowthexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter7/fpgrowthexample/src/main/scala/default/Test.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Association rule based learning - FPGrowth example 3 | // Chapter 7 4 | 5 | package default 6 | 7 | object Test { 8 | def main (args: Array[String]) { 9 | val data = Array(Array("f", "c", "a", "d", "g", "i", "m", "p"), Array("a", "b", "c", "f", "l", "m", "o"), Array("b", "f", "h", "j", "o"), Array("b", "c", "k", "s", "p"), Array("a", "f", "c", "e", "l", "p", "m", "n")) 10 | FPTree(data, 3) 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /spark/chapter7/fpgrowthexample/src/main/scala/default/TreeNode.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Association rule based learning - FPGrowth example 3 | // Chapter 7 4 | 5 | package default 6 | 7 | import scala.collection.mutable.ArrayBuffer 8 | 9 | /** 10 | * TreeNode.scala 11 | * Description: This is the definition of TreeNode of FP-Tree 12 | * Author: Lin, Chen 13 | * E-mail: chlin.ecnu@gmail.com 14 | * Version: 1.0 15 | */ 16 | 17 | class TreeNode (val name: String = null, var count: Long = 0, var parent: TreeNode = null, val children: ArrayBuffer[TreeNode] = new ArrayBuffer[TreeNode](), var nextHomonym: TreeNode = null){ 18 | def findChild(name: String): TreeNode = { 19 | children.find(_.name == name) match { 20 | case Some(node) => node 21 | case None => null 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /spark/chapter7/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
/spark/chapter8/k-meansexample/build.sbt: -------------------------------------------------------------------------------- 1 | name := "Spark K-Means" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.0.1" 8 | 9 | resolvers += "Akka Repository" at "http://repo.akka.io/releases/" 10 | 11 | -------------------------------------------------------------------------------- /spark/chapter8/k-meansexample/input/centroids.txt: -------------------------------------------------------------------------------- 1 | 1 2024854310.3956 6731000548.2243 2 | 2 721344868.9113 8856893681.4588 3 | 3 8857754499.5514 5251661686.7686 4 | 4 1077585262.2174 2699556533.0996 5 | 5 4180772568.7333 8490011787.7244 6 | 6 9695729913.2825 2127828538.6125 7 | 7 4056302822.5753 3828270338.5277 8 | 8 9588172531.2373 9453674798.3649 9 | 9 1838580091.0374 1032953856.2467 10 | 10 9081427080.8788 245825688.9429 11 | -------------------------------------------------------------------------------- /spark/chapter8/k-meansexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter8/k-meansexample/run.sh: -------------------------------------------------------------------------------- 1 | sbt package 2 | rm -rf final 3 | spark-submit \ 4 | --class "com.jgalilee.spark.kmeans.JobDriver" \ 5 | --master local[4] \ 6 | ./target/scala-2.10/spark-k-means_2.10-1.0.jar \ 7 | input/points.txt input/centroids.txt final 10 0.0 3 8 | cat final/p* | sort 9 | -------------------------------------------------------------------------------- /spark/chapter8/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter9/naivebayesexample/build.sbt: -------------------------------------------------------------------------------- 1 | name := "blog-spark-naive-bayes-reuters" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | val sparkVersion = "1.0.0" 8 | 9 | libraryDependencies <<= scalaVersion { 10 | scala_version => Seq( 11 | // Spark and Mllib 12 | "org.apache.spark" %% "spark-core" % sparkVersion, 13 | "org.apache.spark" %% "spark-mllib" % sparkVersion, 14 | // Lucene 15 | "org.apache.lucene" % "lucene-core" % "4.8.1", 16 | // for Porter Stemmer 17 | "org.apache.lucene" % "lucene-analyzers-common" % "4.8.1", 18 | // Guava for the dictionary 19 | "com.google.guava" % "guava" % "17.0", 20 | // article extractor 21 | "com.gravity" %% "goose" % "2.1.23" 22 | ) 23 | } 24 | 25 | // used for goose 26 | resolvers += Resolver.mavenLocal 27 | -------------------------------------------------------------------------------- /spark/chapter9/naivebayesexample/download-reuters.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | REUTERS_FILE="reuters21578.tar.gz" 4 | if [ ! 
-f $REUTERS_FILE ] 5 | then 6 | wget http://www.daviddlewis.com/resources/testcollections/reuters21578/$REUTERS_FILE 7 | fi 8 | mkdir -p reuters 9 | (cd reuters; tar xvfz ../$REUTERS_FILE) 10 | -------------------------------------------------------------------------------- /spark/chapter9/naivebayesexample/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | logLevel := Level.Warn 2 | 3 | addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") 4 | 5 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.5.0") 6 | -------------------------------------------------------------------------------- /spark/chapter9/naivebayesexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter9/naivebayesexample/src/main/scala/default/ReutersParser.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Bayesian learning - Naive Bayes example 3 | // Chapter 9 4 | 5 | package default 6 | 7 | 8 | import scala.xml.pull.{EvText, EvElemEnd, EvElemStart, XMLEventReader} 9 | import scala.io.Source 10 | import scala.collection.mutable 11 | 12 | object ReutersParser { 13 | def PopularCategories = Seq("money", "fx", "crude", "grain", "trade", "interest", "wheat", "ship", "corn", "oil", "dlr", "gas", "oilseed", "supply", "sugar", "gnp", "coffee", "veg", "gold", "nat", "soybean", "bop", "livestock", "cpi") 14 | 15 | def parseAll(xmlFiles: Iterable[String]) = xmlFiles flatMap parse 16 | 17 | def parse(xmlFile: String) = { 18 | val docs = mutable.ArrayBuffer.empty[Document] 19 | val xml = new XMLEventReader(Source.fromFile(xmlFile, "latin1")) 20 | var currentDoc: Document = null 21 | var inTopics = false 22 | var inLabel = false 23 | var inBody = false 24 | for (event <- xml) { 25 | event match { 26 | case EvElemStart(_, "REUTERS", attrs, _) => 27 | currentDoc = Document(attrs.get("NEWID").get.head.text) 28 | 29 | case EvElemEnd(_, "REUTERS") => 30 | if (currentDoc.labels.nonEmpty) { 31 | docs += currentDoc 32 | } 33 | 34 | case EvElemStart(_, "TOPICS", _, _) => inTopics = true 35 | 36 | case EvElemEnd(_, "TOPICS") => inTopics = false 37 | 38 | case EvElemStart(_, "D", _, _) => inLabel = true 39 | 40 | case EvElemEnd(_, "D") => inLabel = false 41 | 42 | case EvElemStart(_, "BODY", _, _) => inBody = true 43 | 44 | case EvElemEnd(_, "BODY") => inBody = false 45 | 46 | case EvText(text) => 47 | if (text.trim.nonEmpty) { 48 | if (inTopics && inLabel && PopularCategories.contains(text)) { 49 | currentDoc = currentDoc.copy(labels = currentDoc.labels + text) 50 | } else if (inBody) { 51 | currentDoc = currentDoc.copy(body = currentDoc.body + text.trim) 52 | } 53 | } 54 | 55 | case _ => 56 | } 57 | } 58 | docs 59 | } 60 | } 61 | 62 | case class Document(docId: String, body: String = "", labels: Set[String] = Set.empty) 63 | -------------------------------------------------------------------------------- /spark/chapter9/naivebayesexample/src/main/scala/default/Tokenizer.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Bayesian learning - Naive Bayes example 3 | // Chapter 9 4 | 5 | package default 6 | 7 | 8 | import java.io.StringReader 9 | import org.apache.lucene.analysis.en.EnglishAnalyzer 10 | import org.apache.lucene.util.Version 11 | import 
org.apache.lucene.analysis.tokenattributes.CharTermAttribute 12 | import scala.collection.mutable 13 | 14 | object Tokenizer { 15 | val LuceneVersion = Version.LUCENE_48 16 | 17 | def tokenizeAll(docs: Iterable[Document]) = docs.map(tokenize) 18 | 19 | def tokenize(doc: Document): TermDoc = TermDoc(doc.docId, doc.labels, tokenize(doc.body)) 20 | 21 | def tokenize(content: String): Seq[String] = { 22 | val tReader = new StringReader(content) 23 | val analyzer = new EnglishAnalyzer(LuceneVersion) 24 | val tStream = analyzer.tokenStream("contents", tReader) 25 | val term = tStream.addAttribute(classOf[CharTermAttribute]) 26 | tStream.reset() 27 | 28 | val result = mutable.ArrayBuffer.empty[String] 29 | while(tStream.incrementToken()) { 30 | val termValue = term.toString 31 | if (!(termValue matches ".*[\\d\\.].*")) { 32 | result += term.toString 33 | } 34 | } 35 | result 36 | } 37 | } 38 | 39 | case class TermDoc(doc: String, labels: Set[String], terms: Seq[String]) -------------------------------------------------------------------------------- /spark/chapter9/naivebayesexample/src/main/scala/default/VectorUtil.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Bayesian learning - Naive Bayes example 3 | // Chapter 9 4 | 5 | package default 6 | 7 | import com.google.common.collect.ImmutableBiMap 8 | import scala.collection.JavaConversions._ 9 | import org.apache.spark.mllib.linalg.Vectors 10 | 11 | class Dictionary(dict: Seq[String]) extends Serializable { 12 | 13 | // map term => index 14 | val termToIndex = ImmutableBiMap.builder[String, Int]() 15 | .putAll(dict.zipWithIndex.toMap[String, Int]) 16 | .build() 17 | 18 | @transient 19 | lazy val indexToTerm = termToIndex.inverse() 20 | 21 | val count = termToIndex.size() 22 | 23 | def indexOf(term: String) = termToIndex(term) 24 | 25 | def valueOf(index: Int) = indexToTerm(index) 26 | 27 | def tfIdfs(terms: Seq[String], idfs: Map[String, Double]) = { 28 | val filteredTerms = terms.filter(idfs contains) 29 | (filteredTerms.groupBy(identity).map { 30 | case (term, instances) => 31 | (indexOf(term), (instances.size.toDouble / filteredTerms.size.toDouble) * idfs(term)) 32 | }).toSeq.sortBy(_._1) // sort by termId 33 | } 34 | 35 | def vectorize(tfIdfs: Iterable[(Int, Double)]) = { 36 | Vectors.sparse(dict.size, tfIdfs.toSeq) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /spark/chapter9/readme.md: -------------------------------------------------------------------------------- 1 | 2 | --------------------------------------------------------------------------------
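The Reuters Naive Bayes example above ships the parser, tokenizer and tf-idf dictionary, but no driver that wires them together. A minimal driver sketch is given below, assuming the ReutersParser, Tokenizer and Dictionary classes from this chapter are on the classpath in package default; the object name, the input file reuters/reut2-000.sgm and the smoothing parameter lambda = 1.0 are illustrative assumptions, not part of the repository.

// Hypothetical driver sketch wiring together the chapter 9 Naive Bayes pieces.
package default

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.regression.LabeledPoint

object ReutersNaiveBayesDriver extends App {
  val sc = new SparkContext(new SparkConf().setAppName("Reuters Naive Bayes").setMaster("local[*]"))

  // Parse and tokenize the Reuters files fetched by download-reuters.sh (path is a placeholder).
  val termDocs = Tokenizer.tokenizeAll(ReutersParser.parseAll(Seq("reuters/reut2-000.sgm"))).toSeq
  val termDocsRdd = sc.parallelize(termDocs)

  // Build the dictionary and per-term inverse document frequencies.
  val dictionary = new Dictionary(termDocsRdd.flatMap(_.terms).distinct().collect())
  val numDocs = termDocsRdd.count()
  val idfs = termDocsRdd.flatMap(_.terms.distinct.map((_, 1))).reduceByKey(_ + _)
    .map { case (term, df) => (term, math.log(numDocs.toDouble / df)) }
    .collect().toMap

  // Index the labels and turn each document into a labeled tf-idf vector
  // (only the first label of a multi-label document is kept in this sketch).
  val labelIndex = termDocs.flatMap(_.labels).toSet.toSeq.zipWithIndex.toMap
  val training = termDocsRdd.filter(_.labels.nonEmpty).map { termDoc =>
    LabeledPoint(labelIndex(termDoc.labels.head).toDouble,
      dictionary.vectorize(dictionary.tfIdfs(termDoc.terms, idfs)))
  }

  // Train an MLlib multinomial Naive Bayes model; lambda is the additive smoothing term.
  val model = NaiveBayes.train(training, lambda = 1.0)

  sc.stop()
}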