├── .gitignore ├── README.md ├── julia ├── chapter10 │ ├── linearregressionexample │ │ ├── data.txt │ │ ├── linearregression-multivariable.jl │ │ └── readme.md │ ├── logisticregressionexample │ │ ├── dataset1.txt │ │ ├── logisticregression.jl │ │ └── readme.md │ └── readme.md ├── chapter11 │ ├── annexample │ │ ├── checkNNGradients.jl │ │ ├── computeNumericalGradient.jl │ │ ├── dataset1.mat │ │ ├── debugInitializeWeights.jl │ │ ├── displayData.jl │ │ ├── neural-network.jl │ │ ├── nnCostFunction.jl │ │ ├── predict.jl │ │ ├── randInitializeWeights.jl │ │ ├── readme.md │ │ ├── sigmoid.jl │ │ ├── sigmoidGradient.jl │ │ ├── submit.jl │ │ └── weights.mat │ ├── dlexample │ │ ├── autoencoder │ │ │ ├── README.md │ │ │ ├── autoencoder.jl │ │ │ ├── digits.jl │ │ │ ├── getdata-sparseautoencoder.sh │ │ │ ├── getdata-vectorization.sh │ │ │ └── matlab │ │ │ │ ├── loadMNISTImages.m │ │ │ │ └── loadMNISTLabels.m │ │ ├── datautils.jl │ │ ├── plottingutils.jl │ │ └── readme.md │ └── readme.md ├── chapter12 │ ├── readme.md │ └── rlexample │ │ ├── DeepQLearning.jl │ │ ├── dqn-example.jl │ │ ├── dqn.jl │ │ ├── dqnruntest.jl │ │ ├── dqntest1.jl │ │ └── readme.md ├── chapter13 │ ├── ensembleexample │ │ ├── Ensemble.jl │ │ ├── decisiontree.jl │ │ ├── decisiontree_test.jl │ │ ├── dimensionalityreduction.jl │ │ ├── dimensionalityreduction_test.jl │ │ ├── iris.csv │ │ ├── mlbase.jl │ │ ├── mlbase_test.jl │ │ ├── readme.md │ │ ├── transformers.jl │ │ ├── types.jl │ │ └── util.jl │ └── readme.md ├── chapter5 │ ├── decisiontreeexample │ │ ├── DecisionTree.jl │ │ ├── decision_tree_test1.jl │ │ ├── decision_tree_test2.jl │ │ ├── measures.jl │ │ └── readme.md │ ├── randomforstexample │ │ ├── RandomForests.jl │ │ ├── classifier.jl │ │ ├── example.jl │ │ ├── randomforest.jl │ │ ├── readme.md │ │ ├── regressor.jl │ │ ├── sort.jl │ │ ├── split.jl │ │ ├── tree.jl │ │ └── util.jl │ └── readme.md ├── chapter6 │ ├── knnexample │ │ ├── knn.jl │ │ ├── readme.md │ │ ├── sampledata.csv │ │ ├── test.zip │ │ ├── train.zip │ │ └── training.csv │ ├── readme.md │ └── svmexample │ │ ├── examplesvm1.jl │ │ ├── readme.md │ │ ├── reference │ │ ├── libsvm_wrapper.c │ │ ├── svm.cpp │ │ └── svm.h │ │ └── svm.jl ├── chapter7 │ ├── aprioriexample │ │ ├── apriori.jl │ │ ├── aprioritest.jl │ │ ├── common.jl │ │ └── readme.md │ ├── fpgrowthexample │ │ ├── common.jl │ │ ├── fpgrowth.jl │ │ └── readme.md │ └── readme.md ├── chapter8 │ ├── k-meansexample │ │ ├── k-means.jl │ │ └── readme.md │ └── readme.md └── chapter9 │ ├── naivebayesexample │ ├── NaiveBayes.jl │ ├── datastats.jl │ ├── nbexampledata-iris.jl │ ├── nbfunctions.jl │ ├── nbtest1.jl │ ├── nbtest2.jl │ ├── nbtypes.jl │ └── readme.md │ └── readme.md ├── mahout ├── chapter10 │ ├── linearregressionexample │ │ └── readme.md │ ├── logisticregressionexample │ │ ├── pom.xml │ │ ├── readme.md │ │ └── src │ │ │ ├── main │ │ │ └── java │ │ │ │ └── com │ │ │ │ └── packt │ │ │ │ └── pml │ │ │ │ └── mahout │ │ │ │ └── logreg │ │ │ │ ├── LogisticRegreesionBase.java │ │ │ │ ├── LogisticRegressionApp.java │ │ │ │ └── LogisticRegressionBase.java │ │ │ └── test │ │ │ └── java │ │ │ └── com │ │ │ └── packt │ │ │ └── pml │ │ │ └── mahout │ │ │ └── logreg │ │ │ └── LogisticRegressionTest.java │ └── readme.md ├── chapter11 │ ├── annexample │ │ └── readme.md │ ├── dlexample │ │ ├── pom.xml │ │ ├── readme.md │ │ └── src │ │ │ ├── main │ │ │ └── java │ │ │ │ ├── Autoencoder.java │ │ │ │ ├── AutoencoderComputedParams.java │ │ │ │ ├── AutoencoderConfig.java │ │ │ │ ├── AutoencoderFct.java │ │ │ │ ├── 
AutoencoderFctGrd.java │ │ │ │ ├── AutoencoderGradient3.java │ │ │ │ ├── AutoencoderLearner.java │ │ │ │ ├── AutoencoderLinAlgebra.java │ │ │ │ ├── AutoencoderLineSearch.java │ │ │ │ ├── AutoencoderParams.java │ │ │ │ ├── AutoencoderSigmoid.java │ │ │ │ └── two_layers_autoencoders_model.prototxt │ │ │ └── test │ │ │ └── java │ │ │ ├── AutoencoderTest.java │ │ │ ├── ExtractPatchesTest.java │ │ │ ├── ExtractPatchesTuplesTest.java │ │ │ ├── FFTConvolutionTest.java │ │ │ ├── FeatureExtractionTest.java │ │ │ ├── LinAlgebraIOUtilsTest.java │ │ │ ├── LoadSaveModelTest.java │ │ │ ├── MaxPoolerTest.java │ │ │ ├── OneLayerTest.java │ │ │ ├── PreProcessTest.java │ │ │ ├── RankTest.java │ │ │ ├── ThreeLayerTest.java │ │ │ └── TwoLayersTest.java │ └── readme.md ├── chapter12 │ ├── readme.md │ └── rlexample │ │ └── readme.md ├── chapter13 │ ├── ensembleexample │ │ ├── data │ │ │ └── input │ │ │ │ ├── u.data │ │ │ │ ├── u1.base │ │ │ │ └── ua.base │ │ ├── pom.xml │ │ ├── readme.md │ │ └── src │ │ │ └── main │ │ │ └── java │ │ │ └── com │ │ │ └── packt │ │ │ └── pml │ │ │ └── mahout │ │ │ └── ensemble │ │ │ ├── Hadoop.java │ │ │ ├── ItemRecommender.java │ │ │ ├── RecommenderEvaluator.java │ │ │ ├── Recommenders.java │ │ │ ├── SlopeOneBasedRecommender.java │ │ │ └── Utilities.java │ └── readme.md ├── chapter5 │ ├── decisiontreeexample │ │ └── readme.md │ └── randomforestexample │ │ ├── pom.xml │ │ ├── readme.md │ │ └── src │ │ ├── main │ │ └── java │ │ │ └── com │ │ │ └── packt │ │ │ └── pml │ │ │ └── mahout │ │ │ └── randomforest │ │ │ └── RandomForest.java │ │ └── test │ │ └── java │ │ └── com │ │ └── packt │ │ └── pml │ │ └── mahout │ │ └── randomforest │ │ └── RandomForestTest.java ├── chapter6 │ ├── knnexample │ │ ├── main │ │ │ └── java │ │ │ │ └── KNearestNeighbor.java │ │ ├── readme.md │ │ └── test │ │ │ └── java │ │ │ └── WeightedMatrixTest.java │ └── svmexample │ │ └── readme.md ├── chapter7 │ ├── aprioriexample │ │ └── readme.md │ └── fpgrowthexample │ │ ├── pom.xml │ │ ├── readme.md │ │ └── src │ │ ├── main │ │ └── java │ │ │ └── com │ │ │ └── packt │ │ │ └── pml │ │ │ └── mahout │ │ │ └── fpgrowth │ │ │ ├── FrequentPatternMetrics.java │ │ │ └── FrequentPatternMiningJava.java │ │ └── test │ │ └── java │ │ └── com │ │ └── packt │ │ └── pml │ │ └── mahout │ │ └── fpgrowth │ │ └── FPgrowthTest.java ├── chapter8 │ ├── k-meansexample │ │ ├── pom.xml │ │ ├── readme.md │ │ └── src │ │ │ ├── main │ │ │ └── java │ │ │ │ └── com │ │ │ │ └── packt │ │ │ │ └── pml │ │ │ │ └── mahout │ │ │ │ └── kmeans │ │ │ │ ├── DataPreprocessing.java │ │ │ │ ├── InputDriver.java │ │ │ │ └── MahoutClusteringExample.java │ │ │ └── test │ │ │ └── java │ │ │ └── com │ │ │ └── packt │ │ │ └── pml │ │ │ └── mahout │ │ │ └── kmeans │ │ │ └── KMeansTest.java │ └── readme.md └── chapter9 │ ├── naivebayesexample │ ├── pom.xml │ ├── readme.md │ └── src │ │ ├── main │ │ └── java │ │ │ ├── com │ │ │ └── packt │ │ │ │ └── pml │ │ │ │ └── mahout │ │ │ │ └── naivebayes │ │ │ │ └── NaiveBayes.java │ │ │ └── start.sh │ │ └── test │ │ └── java │ │ └── com │ │ └── packt │ │ └── pml │ │ └── mahout │ │ └── naivebayes │ │ └── NaiveBayesTest.java │ └── readme.md ├── python-sckit-learn ├── chapter10 │ ├── linearregressionexample │ │ ├── data │ │ │ ├── winequality-red.csv │ │ │ └── winequality-white.csv │ │ ├── linear-regression-wine-data.py │ │ └── readme.md │ ├── logisticregressionexample │ │ ├── data │ │ │ ├── SMSSpamCollection │ │ │ ├── sms.csv │ │ │ ├── test.tsv │ │ │ └── train.tsv │ │ └── logistic-regression.py │ └── readme.md ├── chapter11 
│ ├── annexample │ │ ├── ann.py │ │ └── readme.md │ ├── dlexample │ │ ├── example-1-data.ods │ │ ├── perceptron-data.ods │ │ ├── perceptron.py │ │ └── readme.md │ └── readme.md ├── chapter12 │ ├── readme.md │ └── rlexample │ │ ├── q-learning.py │ │ └── readme.md ├── chapter13 │ ├── ensembleexample │ │ ├── ensemble.py │ │ ├── ensemble_predict.py │ │ ├── ensemble_train.py │ │ ├── model_library.py │ │ └── readme.md │ └── readme.md ├── chapter5 │ ├── decisiontreeexample │ │ ├── data │ │ │ ├── ad.data │ │ │ └── ad.names │ │ ├── decision-tree.py │ │ └── information-gain.ods │ ├── randomforstexample │ │ ├── random-forests.py │ │ └── readme.md │ └── readme.md ├── chapter6 │ ├── knnexample │ │ ├── KNN.py │ │ ├── iris_data │ │ │ ├── README.md │ │ │ ├── iris.data │ │ │ └── iris.names │ │ ├── knn_example.png │ │ └── readme.md │ ├── readme.md │ └── svmexample │ │ ├── data │ │ ├── stopwords_en.txt │ │ └── titanic.csv │ │ ├── readme.md │ │ ├── svm.py │ │ └── svm_test.py ├── chapter7 │ ├── aprioriexample │ │ ├── INTEGRATED-DATASET.csv │ │ ├── apriori.py │ │ └── readme.md │ └── fpgrowthexample │ │ ├── data │ │ ├── numeric.csv │ │ └── tsk.csv │ │ ├── fp_growth.py │ │ ├── readme.md │ │ └── test-fpgrowth.py ├── chapter8 │ ├── k-meansexample │ │ ├── k-means.py │ │ └── readme.md │ └── readme.md ├── chapter9 │ ├── naivebayesexample │ │ ├── data-types.py │ │ ├── feature-selection.py │ │ ├── naivebayes-classifier.py │ │ ├── read-spam-data.py │ │ └── readme.md │ └── readme.md ├── data │ ├── stopwords_en.txt │ ├── titanic.csv │ └── titanic.png └── readme.md ├── r ├── chapter10 │ ├── linearregressionexample │ │ ├── Rplots.pdf │ │ ├── insurance.csv │ │ ├── linearregression.R │ │ └── readme.md │ ├── logisticregressionexample │ │ ├── dataset1.txt │ │ ├── dataset2.txt │ │ ├── logisticregression.R │ │ └── readme.md │ └── readme.md ├── chapter11 │ ├── annexample │ │ ├── Rplots.pdf │ │ ├── Rplots1.pdf │ │ ├── ann.R │ │ ├── concrete.csv │ │ └── readme.md │ ├── dlexample │ │ ├── autoencoder.R │ │ └── readme.md │ └── readme.md ├── chapter12 │ ├── readme.md │ └── rlexample │ │ ├── Results.pdf │ │ ├── qlaci.zip │ │ ├── qlearning │ │ ├── DESCRIPTION │ │ ├── INDEX │ │ ├── MD5 │ │ ├── Meta │ │ │ ├── Rd.rds │ │ │ ├── data.rds │ │ │ ├── hsearch.rds │ │ │ ├── links.rds │ │ │ ├── nsInfo.rds │ │ │ └── package.rds │ │ ├── NAMESPACE │ │ ├── R │ │ │ ├── qlearning │ │ │ ├── qlearning.rdb │ │ │ └── qlearning.rdx │ │ ├── data │ │ │ └── DataEx.RData │ │ ├── help │ │ │ ├── AnIndex │ │ │ ├── aliases.rds │ │ │ ├── paths.rds │ │ │ ├── qlearning.rdb │ │ │ └── qlearning.rdx │ │ └── html │ │ │ ├── 00Index.html │ │ │ └── R.css │ │ └── readme.md ├── chapter13 │ ├── ensembleexample │ │ ├── bagging-random-forest.R │ │ ├── credit.csv │ │ └── readme.md │ └── readme.md ├── chapter5 │ ├── decisiontreeexample │ │ ├── data │ │ │ ├── credit.csv │ │ │ └── mushrooms.csv │ │ ├── decision-trees.r │ │ └── readme.md │ ├── randomforstexample │ │ ├── data │ │ │ ├── test.csv │ │ │ └── train.csv │ │ ├── output │ │ │ ├── predict1.csv │ │ │ └── predict2.csv │ │ ├── randomforest.R │ │ └── readme.md │ └── readme.md ├── chapter6 │ ├── knnexample │ │ ├── knn.R │ │ ├── readme.md │ │ └── wisc_bc_data.csv │ ├── readme.md │ └── svmexample │ │ ├── letterdata.csv │ │ ├── readme.md │ │ └── svm.R ├── chapter7 │ ├── aprioriexample │ │ ├── Rplots.pdf │ │ ├── association-rules.R │ │ ├── groceries.csv │ │ ├── groceryrules.csv │ │ └── readme.md │ ├── fpgrowthexample │ │ └── readme.md │ └── readme.md ├── chapter8 │ ├── k-meansexample │ │ ├── kmeans-clustering.R │ │ ├── readme.md │ │ └── 
snsdata.csv │ └── readme.md └── chapter9 │ ├── naivebayesexample │ ├── Rplots.pdf │ ├── readme.md │ ├── sms_spam.csv │ └── snaive-bayes.R │ └── readme.md └── spark ├── chapter10 ├── linearregressionexample │ ├── ClickRate.py │ ├── readme.md │ └── startClickRate.sh ├── logisticregressionexample │ ├── dataset │ │ ├── spambase.DOCUMENTATION │ │ ├── spambase.data │ │ └── spambase.names │ ├── readme.md │ └── src │ │ └── main │ │ └── scala │ │ └── default │ │ └── SpamClassification-Logreg.scala └── readme.md ├── chapter11 ├── annexample │ ├── build.sbt │ ├── readme.md │ └── src │ │ └── main │ │ ├── resources │ │ └── log4j.properties │ │ └── scala │ │ └── default │ │ ├── Util.scala │ │ ├── algo │ │ ├── CostGradient.scala │ │ ├── DistCostGradientComputer.scala │ │ ├── GradientDescendOptimizer.scala │ │ ├── LoggingAbility.scala │ │ ├── NaiveCostGradientComputer.scala │ │ └── Predictor.scala │ │ ├── example │ │ └── MNIST.scala │ │ └── model │ │ ├── NeuralNetworkClassifier.scala │ │ ├── NeuralNetworkModel.scala │ │ └── Topology.scala ├── dlexample │ ├── RBM.scala │ └── readme.md └── readme.md ├── chapter12 ├── readme.md └── rlexample │ └── readme.md ├── chapter13 ├── ensembleexample │ ├── data │ │ ├── housing.txt │ │ └── sample_libsvm_data.txt │ ├── readme.md │ ├── result │ │ ├── GBT_clas.txt │ │ └── GBT_regression.txt │ └── src │ │ ├── GradientBoostTree_classification.scala │ │ ├── GradientBoostTree_regression.scala │ │ └── test.txt └── readme.md ├── chapter5 ├── decisiontreeexample │ ├── data │ │ ├── housing.txt │ │ └── sample_libsvm_data.txt │ ├── readme.md │ ├── result │ │ ├── classification.txt │ │ └── regression.txt │ └── src │ │ ├── decisiontree-classification.scala │ │ ├── decisiontree-regression.scala │ │ └── decsiontree-test.txt ├── randomforstexample │ ├── data │ │ ├── housing.txt │ │ └── sample_libsvm_data.txt │ ├── readme.md │ ├── result │ │ ├── RandomForest_regression.txt │ │ └── RandomForests_classification.txt │ └── src │ │ ├── RandomForest_regression.scala │ │ ├── RandomForests_classification.scala │ │ └── test.txt └── readme.md ├── chapter6 ├── knnexample │ ├── example-run │ ├── project │ │ ├── Build.scala │ │ ├── META-INF │ │ │ └── MANIFEST.MF │ │ ├── assembly.sbt │ │ └── plugins.sbt │ ├── readme.md │ └── src │ │ └── Knn-recommender.scala └── svmexample │ ├── build.sbt │ ├── doc │ └── usage.txt │ ├── readme.md │ └── src │ └── main │ └── scala │ ├── KernelSVM.scala │ ├── Kernels.scala │ └── main.scala ├── chapter7 ├── aprioriexample │ ├── pom.xml │ ├── readme.md │ └── src │ │ └── main │ │ └── scala │ │ └── default │ │ ├── Apriori.scala │ │ ├── BloomFilter.scala │ │ ├── FrequentItemSets.scala │ │ ├── NaiveFrequentItemSets.scala │ │ └── TestMain.scala ├── fpgrowthexample │ ├── readme.md │ └── src │ │ └── main │ │ └── scala │ │ └── default │ │ ├── FPGrowth.scala │ │ ├── FPTree.scala │ │ ├── ParallelFPGrowth.scala │ │ ├── Test.scala │ │ └── TreeNode.scala └── readme.md ├── chapter8 ├── k-meansexample │ ├── build.sbt │ ├── input │ │ ├── centroids.txt │ │ └── points.txt │ ├── readme.md │ ├── run.sh │ └── src │ │ └── main │ │ └── scala │ │ └── default │ │ └── KMeans.scala └── readme.md └── chapter9 ├── naivebayesexample ├── build.sbt ├── download-reuters.sh ├── project │ └── plugins.sbt ├── readme.md └── src │ └── main │ └── scala │ └── default │ ├── NaiveBayes.scala │ ├── ReutersParser.scala │ ├── Tokenizer.scala │ └── VectorUtil.scala └── readme.md /.gitignore: -------------------------------------------------------------------------------- 1 | 
mahout/chapter10/logisticregressionexample/pom.xml -------------------------------------------------------------------------------- /julia/chapter10/linearregressionexample/data.txt: -------------------------------------------------------------------------------- 1 | 2104,3,399900 2 | 1600,3,329900 3 | 2400,3,369000 4 | 1416,2,232000 5 | 3000,4,539900 6 | 1985,4,299900 7 | 1534,3,314900 8 | 1427,3,198999 9 | 1380,3,212000 10 | 1494,3,242500 11 | 1940,4,239999 12 | 2000,3,347000 13 | 1890,3,329999 14 | 4478,5,699900 15 | 1268,3,259900 16 | 2300,4,449900 17 | 1320,2,299900 18 | 1236,3,199900 19 | 2609,4,499998 20 | 3031,4,599000 21 | 1767,3,252900 22 | 1888,2,255000 23 | 1604,3,242900 24 | 1962,4,259900 25 | 3890,3,573900 26 | 1100,3,249900 27 | 1458,3,464500 28 | 2526,3,469000 29 | 2200,3,475000 30 | 2637,3,299900 31 | 1839,2,349900 32 | 1000,1,169900 33 | 2040,4,314900 34 | 3137,3,579900 35 | 1811,4,285900 36 | 1437,3,249900 37 | 1239,3,229900 38 | 2132,4,345000 39 | 4215,4,549000 40 | 2162,4,287000 41 | 1664,2,368500 42 | 2238,3,329900 43 | 2567,4,314000 44 | 1200,3,299000 45 | 852,2,179900 46 | 1852,4,299900 47 | 1203,3,239500 48 | -------------------------------------------------------------------------------- /julia/chapter10/linearregressionexample/linearregression-multivariable.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Regression Analysis - Linear Regression example 3 | # Chapter 10 4 | 5 | using Gadfly 6 | 7 | # Normalizes the features in x 8 | # The mean value of each feature is 0 and the standard deviation is 1 9 | # Returns normalized x, μ and σ 10 | function featureNormalize(x) 11 | rows = size(x,1) 12 | cols = size(x,2) 13 | 14 | μ = mean(x,1) 15 | σ = std(x,1) 16 | xNorm = zeros(x) 17 | 18 | # normalize 19 | for i in 1:cols 20 | for j in 1:rows 21 | xNorm[j,i] = (x[j,i] - μ[i]) / σ[i]; 22 | end 23 | end 24 | 25 | (xNorm, μ, σ) 26 | end 27 | 28 | 29 | println("Loading data ... 
") 30 | data = readdlm("data.txt",',') 31 | x = data[:,1:2] 32 | y = data[:, 3] 33 | m = length(y) 34 | 35 | @printf("First 10 examples from the dataset: \n"); 36 | t = [x[1:10,:] y[1:10,:]]' 37 | for i in 1:10 38 | @printf(" x = [%.0f %.0f], y = %.0f \n", t[1,i], t[2,i], t[3,i]); 39 | end 40 | 41 | # Scale features and set them to zero mean 42 | (x, μ, σ) = featureNormalize(x); 43 | 44 | # Add intercept term to x 45 | x = [ones(m,1) x] 46 | 47 | #### Run Gradient Descent 48 | α = 0.001 49 | numIter = 4000 50 | θ = zeros(3,1) 51 | jHist = zeros(numIter, 1) 52 | 53 | for i in 1:numIter 54 | # next theta 55 | θ = θ - (α/m) * (x' * ((x*θ)-y)) 56 | # compute cost 57 | jHist[i] = sum((x*θ-y).^2)/(2m) 58 | end 59 | 60 | # plot convergence graph 61 | pl = plot( 62 | x=collect(1:numIter), 63 | y=jHist, 64 | Guide.xlabel("Iterations"), 65 | Guide.ylabel("Error"), 66 | Guide.title("Convergence Graph"), 67 | Geom.line 68 | ) 69 | draw(SVGJS("jHist.js.svg", 6inch, 6inch), pl) 70 | 71 | # Estimate the price of a 1650 sq-ft, 3 br house 72 | price = [1, (1650-μ[1])/σ[1], (3-μ[2])/σ[2]]' * θ 73 | println("Estimated price for a 1650 sq-ft, 3 br house: $price") 74 | 75 | println("done!") 76 | -------------------------------------------------------------------------------- /julia/chapter10/linearregressionexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter10/logisticregressionexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter10/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter11/annexample/checkNNGradients.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Artificial Neural Network 3 | # Chapter 11 4 | 5 | include("nnCostFunction.jl") 6 | include("computeNumericalGradient.jl") 7 | include("debugInitializeWeights.jl") 8 | 9 | function checkNNGradients(lambda = 0) 10 | input_layer_size = 3 11 | hidden_layer_size = 5 12 | num_labels = 3 13 | m = 5 14 | 15 | # We generate some 'random' test data 16 | Theta1 = debugInitializeWeights(hidden_layer_size, input_layer_size) 17 | Theta2 = debugInitializeWeights(num_labels, hidden_layer_size) 18 | # Reusing debugInitializeWeights to generate X 19 | X = debugInitializeWeights(m, input_layer_size - 1) 20 | y = (1 + mod(1:m, num_labels)')' 21 | 22 | # Unroll parameters 23 | nn_params = [Theta1[:] ; Theta2[:]] 24 | 25 | # Short hand for cost function 26 | costFunc = p -> nnCostFunction(p, input_layer_size, hidden_layer_size, 27 | num_labels, X, y, lambda) 28 | CHECKNNGRADIENTS(lambda) 29 | cost, grad = costFunc(nn_params) 30 | numgrad = computeNumericalGradient(costFunc, nn_params) 31 | 32 | # Visually examine the two gradient computations. The two columns 33 | # you get should be very similar. 34 | show([numgrad grad]) 35 | @printf(""" 36 | 37 | The above two columns you get should be very similar. 38 | (Left-Your Numerical Gradient, Right-Analytical Gradient) 39 | 40 | """) 41 | 42 | # Evaluate the norm of the difference between two solutions. 
43 | # If you have a correct implementation, and assuming you used EPSILON = 0.0001 44 | # in computeNumericalGradient.m, then diff below should be less than 1e-9 45 | diff = norm(numgrad - grad) / norm(numgrad + grad) 46 | 47 | @printf(""" 48 | If your backpropagation implementation is correct, then 49 | the relative difference will be small (less than 1e-9). 50 | 51 | Relative Difference: %g 52 | """, diff) 53 | 54 | end 55 | -------------------------------------------------------------------------------- /julia/chapter11/annexample/computeNumericalGradient.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Artificial Neural Network 3 | # Chapter 11 4 | 5 | function computeNumericalGradient(J, theta) 6 | 7 | # Notes: The following code implements numerical gradient checking, and 8 | # returns the numerical gradient.It sets numgrad(i) to (a numerical 9 | # approximation of) the partial derivative of J with respect to the 10 | # i-th input argument, evaluated at theta. (i.e., numgrad(i) should 11 | # be the (approximately) the partial derivative of J with respect 12 | # to theta(i).) 13 | 14 | numgrad = COMPUTENUMERICALGRADIENT(J, theta) 15 | perturb = zeros(size(theta)) 16 | e = 1e-4 17 | for p in 1:length(theta) 18 | # Set perturbation vector 19 | perturb[p] = e 20 | loss1, _ = J(theta - perturb) 21 | loss2, _ = J(theta + perturb) 22 | # Compute Numerical Gradient 23 | numgrad[p] = (loss2 - loss1) / 2e 24 | perturb[p] = 0 25 | end 26 | return numgrad 27 | end 28 | -------------------------------------------------------------------------------- /julia/chapter11/annexample/dataset1.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/julia/chapter11/annexample/dataset1.mat -------------------------------------------------------------------------------- /julia/chapter11/annexample/debugInitializeWeights.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Artificial Neural Network 3 | # Chapter 11 4 | 5 | function debugInitializeWeights(fan_out, fan_in) 6 | # Note that W should be set to a matrix of size(1 + fan_in, fan_out) as 7 | # the first row of W handles the "bias" terms 8 | 9 | # Set W to zeros 10 | W = DEBUGINITIALIZEWEIGHTS(fan_in, fan_out) 11 | 12 | # Initialize W using "sin", this ensures that W is always of the same 13 | # values and will be useful for debugging 14 | W = reshape(sin(1:length(W)), size(W)) / 10 15 | 16 | return W 17 | end 18 | -------------------------------------------------------------------------------- /julia/chapter11/annexample/displayData.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Artificial Neural Network 3 | # Chapter 11 4 | using Images, ImageView 5 | 6 | function displayData(X, example_width = round(Int, sqrt(size(X, 2)))) 7 | # Compute rows, cols 8 | m, n = size(X) 9 | example_height = round(Int, (n / example_width)) 10 | 11 | # Compute number of items to display 12 | display_rows = round(Int, sqrt(m)) 13 | display_cols = round(Int, ceil(m / display_rows)) 14 | 15 | # Between images padding 16 | pad = 1 17 | 18 | # Setup blank display 19 | display_array = - ones(pad + display_rows * (example_height + pad), 20 | pad + display_cols * (example_width + pad)) 21 | 22 | # Copy each example into 
a patch on the display array 23 | curr_ex = 1 24 | for j in 1:display_rows, i in 1:display_cols 25 | if curr_ex > m 26 | break 27 | end 28 | 29 | # Get the max value of the patch 30 | max_val = maximum(abs(X[curr_ex, :])) 31 | display_array[pad + (j - 1) * (example_height + pad) + (1:example_height), 32 | pad + (i - 1) * (example_width + pad) + (1:example_width)] = 33 | reshape(X[curr_ex, :], (example_height, example_width)) / max_val 34 | curr_ex += 1 35 | end 36 | 37 | # Display Image 38 | img = Image(display_array) 39 | [canvas, img] = DISPLAYDATA(X, example_width) 40 | return (canvas, img) 41 | end 42 | -------------------------------------------------------------------------------- /julia/chapter11/annexample/predict.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Artificial Neural Network 3 | # Chapter 11 4 | 5 | function predict(Theta1, Theta2, X) 6 | # Useful values 7 | m = size(X, 1) 8 | num_labels = size(Theta2, 1) 9 | 10 | # You need to return the following variables correctly 11 | p = PREDICT(Theta1, Theta2, X) 12 | h1 = sigmoid([ones(m, 1) X] * Theta1') 13 | h2 = sigmoid([ones(m, 1) h1] * Theta2') 14 | 15 | for i in 1:m 16 | p[i] = findmax(h2[i, :])[2] 17 | end 18 | return p 19 | end 20 | -------------------------------------------------------------------------------- /julia/chapter11/annexample/randInitializeWeights.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Artificial Neural Network 3 | # Chapter 11 4 | 5 | function randInitializeWeights(L_in, L_out) 6 | W = RANDINITIALIZEWEIGHTS(L_in, L_out) 7 | return W 8 | end 9 | -------------------------------------------------------------------------------- /julia/chapter11/annexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter11/annexample/sigmoid.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Artificial Neural Network 3 | # Chapter 11 4 | @doc """ 5 | SIGMOID Compute sigmoid functoon 6 | J = SIGMOID(z) computes the sigmoid of z. 
7 | """ -> 8 | function sigmoid(z) 9 | g = SIGMOID(z) 10 | return g 11 | end 12 | -------------------------------------------------------------------------------- /julia/chapter11/annexample/sigmoidGradient.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Artificial Neural Network 3 | # Chapter 11 4 | function sigmoidGradient(z) 5 | g = SIGMOIDGRADIENT(z) 6 | return g 7 | end 8 | -------------------------------------------------------------------------------- /julia/chapter11/annexample/submit.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Artificial Neural Network 3 | # Chapter 11 4 | 5 | export submit 6 | 7 | include("../data.jl") 8 | include("../submit.jl") 9 | 10 | include("nnCostFunction.jl") 11 | include("sigmoidGradient.jl") 12 | 13 | function submit() 14 | parts = [ 15 | Part(1, "Feedforward and Cost Function"), 16 | Part(2, "Regularized Cost Function"), 17 | Part(3, "Sigmoid Gradient"), 18 | Part(4, "Neural Network Gradient (Backpropagation)"), 19 | Part(5, "Regularized Gradient") 20 | ] 21 | conf = Conf("neural-network-learning", 22 | "Neural Networks Learning", parts, solver) 23 | 24 | submitWithConf(conf) 25 | end 26 | 27 | function solver(partId) 28 | # Random Test Cases 29 | X = reshape(3 * sin(1:1:30), (3, 10)) 30 | Xm = reshape(sin(1:32), (16, 2)) / 5 31 | ym = (1 + mod(1:16, 4)')' 32 | t1 = sin(reshape(1:2:24, (4, 3))) 33 | t2 = cos(reshape(1:2:40, (4, 5))) 34 | t = [t1[:] ; t2[:]] 35 | if partId == 1 36 | J, _ = nnCostFunction(t, 2, 4, 4, Xm, ym, 0) 37 | return @sprintf("%0.5f", J) 38 | elseif partId == 2 39 | J, _ = nnCostFunction(t, 2, 4, 4, Xm, ym, 1.5) 40 | return @sprintf("%0.5f", J) 41 | elseif partId == 3 42 | return join(map(x -> @sprintf("%0.5f", x), sigmoidGradient(X)), " ") 43 | elseif partId == 4 44 | J, grad = nnCostFunction(t, 2, 4, 4, Xm, ym, 0) 45 | return @sprintf("%0.5f ", J) * join(map(x -> @sprintf("%0.5f", x), grad), " ") 46 | elseif partId == 5 47 | J, grad = nnCostFunction(t, 2, 4, 4, Xm, ym, 1.5) 48 | return @sprintf("%0.5f ", J) * join(map(x -> @sprintf("%0.5f", x), grad), " ") 49 | end 50 | end 51 | -------------------------------------------------------------------------------- /julia/chapter11/annexample/weights.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/julia/chapter11/annexample/weights.mat -------------------------------------------------------------------------------- /julia/chapter11/dlexample/autoencoder/README.md: -------------------------------------------------------------------------------- 1 | # Deep Learning with Julia 2 | 3 | ## The tutorial 4 | The "Unsupervised Feature Learning and Deep Learning" tutorial can be found [here](http://deeplearning.stanford.edu/wiki/index.php/UFLD_TUTORIAL). The tutorial provides a fair amount of starter code in Matlab, which also contains the data we need to follow along with Julia. If you just want the data, you can use the shell scripts provided in this repository which have the form **getdata-*.sh**. Just fill in the name of the exercise you're working on. 5 | 6 | ## Julia scripts 7 | The scripts included are complete solutions; they are NOT starter code. I'm still working through the tutorial. 
But once I'm done, and have learned quite a bit more about Julia, I will include a directory with just starter code. 8 | 9 | -------------------------------------------------------------------------------- /julia/chapter11/dlexample/autoencoder/digits.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Deep learning - Sparse autoencoder example 3 | # Chapter 11 4 | 5 | using SparseAutoencoder 6 | using MAT 7 | 8 | vars = matread("./data/mnist-images.mat") 9 | data = vars["images"] 10 | 11 | visiblesize = 28*28 12 | hiddensize = 196 13 | sparsityparameter = 0.1 14 | lambda = 3e-3 15 | beta = 3.0 16 | patches = data[:,1:10000] 17 | 18 | minf,W1,W2,b1,b2 = autoencode(patches,hiddensize,visiblesize,lambda=lambda,beta=beta,rho=sparsityparameter) 19 | 20 | using HDF5, JLD 21 | @save "./digits-results.jld" 22 | -------------------------------------------------------------------------------- /julia/chapter11/dlexample/autoencoder/getdata-sparseautoencoder.sh: -------------------------------------------------------------------------------- 1 | mkdir -p data || exit 1 2 | cd data 3 | wget http://ufldl.stanford.edu/wiki/resources/sparseae_exercise.zip 4 | unzip sparseae_exercise.zip 5 | cp starter/IMAGES.mat IMAGES.mat 6 | rm -rf starter 7 | rm sparseae_exercise.zip 8 | cd .. 9 | -------------------------------------------------------------------------------- /julia/chapter11/dlexample/autoencoder/getdata-vectorization.sh: -------------------------------------------------------------------------------- 1 | mkdir -p data || exit 1 2 | cd data 3 | wget http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz 4 | gunzip train-images-idx3-ubyte.gz 5 | cd .. 6 | -------------------------------------------------------------------------------- /julia/chapter11/dlexample/autoencoder/matlab/loadMNISTImages.m: -------------------------------------------------------------------------------- 1 | function images = loadMNISTImages(filename) 2 | %loadMNISTImages returns a 28x28x[number of MNIST images] matrix containing 3 | %the raw MNIST images 4 | 5 | fp = fopen(filename, 'rb'); 6 | assert(fp ~= -1, ['Could not open ', filename, '']); 7 | 8 | magic = fread(fp, 1, 'int32', 0, 'ieee-be'); 9 | display(magic) 10 | assert(magic == 2051, ['Bad magic number in ', filename, '']); 11 | 12 | numImages = fread(fp, 1, 'int32', 0, 'ieee-be'); 13 | numRows = fread(fp, 1, 'int32', 0, 'ieee-be'); 14 | numCols = fread(fp, 1, 'int32', 0, 'ieee-be'); 15 | 16 | images = fread(fp, inf, 'unsigned char'); 17 | images = reshape(images, numCols, numRows, numImages); 18 | images = permute(images,[2 1 3]); 19 | 20 | fclose(fp); 21 | 22 | % Reshape to #pixels x #examples 23 | images = reshape(images, size(images, 1) * size(images, 2), size(images, 3)); 24 | % Convert to double and rescale to [0,1] 25 | images = double(images) / 255; 26 | 27 | end 28 | -------------------------------------------------------------------------------- /julia/chapter11/dlexample/autoencoder/matlab/loadMNISTLabels.m: -------------------------------------------------------------------------------- 1 | function labels = loadMNISTLabels(filename) 2 | %loadMNISTLabels returns a [number of MNIST images]x1 matrix containing 3 | %the labels for the MNIST images 4 | 5 | fp = fopen(filename, 'rb'); 6 | assert(fp ~= -1, ['Could not open ', filename, '']); 7 | 8 | magic = fread(fp, 1, 'int32', 0, 'ieee-be'); 9 | assert(magic == 2049, ['Bad magic number in ', filename, '']); 10 | 11 | numLabels = 
fread(fp, 1, 'int32', 0, 'ieee-be'); 12 | 13 | labels = fread(fp, inf, 'unsigned char'); 14 | 15 | assert(size(labels,1) == numLabels, 'Mismatch in label count'); 16 | 17 | fclose(fp); 18 | 19 | end 20 | -------------------------------------------------------------------------------- /julia/chapter11/dlexample/datautils.jl: -------------------------------------------------------------------------------- 1 | module DataUtils 2 | 3 | export sampleimages 4 | function sampleimages(images::Array{Float64,3},patchwidth::Int,patchheight::Int,numsamples::Int; scalevariance=true) 5 | width, height = size(images[:,:,1]) 6 | array::Array{Float64,2} = zeros(patchwidth*patchheight,numsamples) 7 | for index=1:numsamples 8 | image_index = rand(1:size(images,3)) 9 | x = rand(1:width-patchwidth+1) 10 | y = rand(1:height-patchheight+1) 11 | sample = images[x:x+patchwidth-1,y:y+patchheight-1,image_index] 12 | array[:,index] = reshape(sample,patchwidth*patchheight) 13 | array[:,index] -= mean(array[:,index]) #subtract mean 14 | end 15 | 16 | if scalevariance 17 | # rescale images to fit in range 0.1 to 0.9 18 | stddev = std(array) 19 | array = max(min(array,3*stddev),-3*stddev) / (3*stddev) 20 | array = (array + 1.0) * 0.4 + 0.1 21 | end 22 | return array 23 | end 24 | 25 | import Plotly 26 | export displaynetwork_plotly 27 | function displaynetwork_plotly(A,filename,username,userkey) 28 | m,n = size(A) 29 | sz = int(sqrt(m)) 30 | A -= mean(A) 31 | layout = [ 32 | "autosize" => false, 33 | "width" => 500, 34 | "height"=> 500 35 | ] 36 | 37 | gridsize = int(ceil(sqrt(n))) 38 | buffer = 1 39 | griddata = ones(gridsize*(sz+1)+1,gridsize*(sz+1)+1) 40 | index = 1 41 | for i = 1:gridsize 42 | for j = 1:gridsize 43 | if index > n 44 | continue 45 | end 46 | columnlimit = maximum(abs(A[:,index])) 47 | griddata[buffer+(i-1)*(sz+buffer)+(1:sz),buffer+(j-1)*(sz+buffer)+(1:sz)] = reshape(A[:,index],sz,sz)/columnlimit 48 | index += 1 49 | end 50 | end 51 | 52 | Plotly.signin(username, userkey) 53 | data = [ 54 | [ 55 | "z" => griddata, 56 | "colorscale" => "Greys", 57 | "type" => "heatmap" 58 | ] 59 | ] 60 | response = Plotly.plot(data, ["layout" => layout, "filename" => filename, "fileopt" => "overwrite"]) 61 | plot_url = response["url"] 62 | end 63 | 64 | end 65 | -------------------------------------------------------------------------------- /julia/chapter11/dlexample/plottingutils.jl: -------------------------------------------------------------------------------- 1 | module PlottingUtils 2 | 3 | function reshapedata(A) 4 | m,n = size(A) 5 | sz = int(sqrt(m)) 6 | A -= mean(A) 7 | 8 | gridsize = int(ceil(sqrt(n))) 9 | buffer = 1 10 | griddata = ones(gridsize*(sz+1)+1,gridsize*(sz+1)+1) 11 | index = 1 12 | for i = 1:gridsize 13 | for j = 1:gridsize 14 | if index > n 15 | continue 16 | end 17 | columnlimit = maximum(abs(A[:,index])) 18 | griddata[buffer+(i-1)*(sz+buffer)+(1:sz),buffer+(j-1)*(sz+buffer)+(1:sz)] = reshape(A[:,index],sz,sz)/columnlimit 19 | index += 1 20 | end 21 | end 22 | 23 | return griddata 24 | end 25 | 26 | import Plotly 27 | export displaynetwork_plotly 28 | function displaynetwork_plotly(A,filename,username,userkey) 29 | griddata = reshapedata(A) 30 | Plotly.signin(username, userkey) 31 | data = [ 32 | [ 33 | "z" => griddata, 34 | "colorscale" => "Greys", 35 | "type" => "heatmap" 36 | ] 37 | ] 38 | layout = [ 39 | "autosize" => false, 40 | "width" => 500, 41 | "height"=> 500 42 | ] 43 | response = Plotly.plot(data, ["layout" => layout, "filename" => filename, "fileopt" => "overwrite"]) 44 | plot_url 
= response["url"] 45 | end 46 | 47 | import Gadfly 48 | export displaynetwork_gadfly 49 | function displaynetwork_gadfly(A) 50 | griddata = reshapedata(A) 51 | Gadfly.spy(A) 52 | end 53 | 54 | import Winston 55 | export displaynetwork_winston 56 | function displaynetwork_winston(A) 57 | griddata = reshapedata(A) 58 | p = Winston.FramedPlot() 59 | Winston.colormap("grays") 60 | Winston.add(p,Winston.imagesc(griddata)) 61 | display(p) 62 | end 63 | 64 | end 65 | -------------------------------------------------------------------------------- /julia/chapter11/dlexample/readme.md: -------------------------------------------------------------------------------- 1 | Autoencoder example with a base from MATlab code 2 | -------------------------------------------------------------------------------- /julia/chapter11/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter12/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter12/rlexample/DeepQLearning.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Reinforcement learning - Q learning example 3 | # Chapter 12 4 | 5 | module DeepQLearning 6 | 7 | using NNGraph 8 | 9 | export DQN, forward, act, learn 10 | include("dqn.jl") 11 | 12 | end # module 13 | -------------------------------------------------------------------------------- /julia/chapter12/rlexample/dqn-example.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Reinforcement learning - Q learning example 3 | # Chapter 12 4 | 5 | using NNGraph, DeepQLearning 6 | reload("DeepQLearning") 7 | 8 | dqn = DeepQLearning.DQN(10,100,5) 9 | 10 | s0 = randNNMat(10,1) 11 | a = DeepQLearning.forward(dqn, s0) 12 | DeepQLearning.act(dqn,s0) 13 | DeepQLearning.learn(dqn, 0.) 14 | 15 | s1 = randNNMat(10,1) 16 | a = DeepQLearning.forward(dqn, s1) 17 | DeepQLearning.act(dqn,s1) 18 | DeepQLearning.learn(dqn, 0.) 19 | -------------------------------------------------------------------------------- /julia/chapter12/rlexample/dqnruntest.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Reinforcement learning - Q learning example 3 | # Chapter 12 4 | 5 | using DeepQLearning 6 | using Base.Test 7 | 8 | # write your own tests here 9 | @test 1 == 1 10 | -------------------------------------------------------------------------------- /julia/chapter12/rlexample/dqntest1.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Reinforcement learning - Q learning example 3 | # Chapter 12 4 | 5 | using DeepQLearning, NNGraph 6 | # reload("DeepQLearning") 7 | xs = linspace(0,360,100) 8 | ys = round(sin(deg2rad(xs)),3) 9 | deg2rad(xs) 10 | # plot(x=xs,y=ys) 11 | 12 | m = DQN(2,100,2) 13 | alpha=0.0001; t_alpha =0.15 14 | epsilon = 0.2; t_epsilon =0.45 15 | 16 | init = [0. 0.] 17 | s0 = NNMatrix(init'); a0 = 1; r0 = 0. 
18 | t = 0 19 | for epoch = 1:1000 #0000 20 | t += 1 21 | avgReward = 0 22 | m.epsilon = epsilon * 1/t^t_epsilon 23 | m.alpha = alpha * 1/t^t_alpha 24 | for i = 2:length(xs) 25 | x, x2, y = xs[i],xs[i-1], ys[i] 26 | s = [x x2] 27 | s1 = NNMatrix(s') 28 | a1 = act(m,s1) 29 | r1 = (a1==1?-1:1) * sign(y) 30 | avgReward += r1 31 | if i > 2 learn(m,s0,a0,r1,s1) end 32 | s0 = s1; a0 = a1; r0 = r1 33 | end 34 | avgReward = avgReward / (length(xs)-2) 35 | if epoch % 100 == 0 println("$t $epoch avgReward = $(round(avgReward,3)) m.alpha=$(round(m.alpha,6)) m.epsilon=$(round(m.epsilon,6))") end 36 | end 37 | -------------------------------------------------------------------------------- /julia/chapter12/rlexample/readme.md: -------------------------------------------------------------------------------- 1 | This folder has a deep Q learning example code 2 | -------------------------------------------------------------------------------- /julia/chapter13/ensembleexample/Ensemble.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Ensemble learning example 3 | # Chapter 13 4 | 5 | # Ensemble module. 6 | module Ensemble 7 | 8 | # Load source files 9 | include("types.jl") 10 | include("util.jl") 11 | include("transformers.jl") 12 | 13 | end # module 14 | -------------------------------------------------------------------------------- /julia/chapter13/ensembleexample/decisiontree_test.jl: -------------------------------------------------------------------------------- 1 | module TestDecisionTreeWrapper 2 | 3 | include(joinpath("..", "fixture_learners.jl")) 4 | using .FixtureLearners 5 | nfcp = NumericFeatureClassification() 6 | 7 | using FactCheck 8 | 9 | 10 | importall Orchestra.Transformers.DecisionTreeWrapper 11 | using DecisionTree 12 | 13 | facts("DecisionTree learners") do 14 | context("PrunedTree gives same results as its backend") do 15 | # Predict with Orchestra learner 16 | learner = PrunedTree() 17 | orchestra_predictions = fit_and_transform!(learner, nfcp) 18 | 19 | # Predict with original backend learner 20 | srand(1) 21 | model = build_tree(nfcp.train_labels, nfcp.train_instances) 22 | model = prune_tree(model, 1.0) 23 | original_predictions = apply_tree(model, nfcp.test_instances) 24 | 25 | # Verify same predictions 26 | @fact orchestra_predictions => original_predictions 27 | end 28 | 29 | context("RandomForest gives same results as its backend") do 30 | # Predict with Orchestra learner 31 | learner = RandomForest() 32 | orchestra_predictions = fit_and_transform!(learner, nfcp) 33 | 34 | # Predict with original backend learner 35 | srand(1) 36 | model = build_forest( 37 | nfcp.train_labels, 38 | nfcp.train_instances, 39 | size(nfcp.train_instances, 2), 40 | 10, 41 | 0.7 42 | ) 43 | original_predictions = apply_forest(model, nfcp.test_instances) 44 | 45 | # Verify same predictions 46 | @fact orchestra_predictions => original_predictions 47 | end 48 | 49 | context("DecisionStumpAdaboost gives same results as its backend") do 50 | # Predict with Orchestra learner 51 | learner = DecisionStumpAdaboost() 52 | orchestra_predictions = fit_and_transform!(learner, nfcp) 53 | 54 | # Predict with original backend learner 55 | srand(1) 56 | model, coeffs = build_adaboost_stumps( 57 | nfcp.train_labels, 58 | nfcp.train_instances, 59 | 7 60 | ) 61 | original_predictions = apply_adaboost_stumps( 62 | model, coeffs, nfcp.test_instances 63 | ) 64 | 65 | # Verify same predictions 66 | @fact orchestra_predictions => original_predictions 
67 | end 68 | 69 | context("RandomForest handles training-dependent options") do 70 | # Predict with Orchestra learner 71 | learner = RandomForest({:impl_options => {:num_subfeatures => 2}}) 72 | orchestra_predictions = fit_and_transform!(learner, nfcp) 73 | 74 | # Verify RandomForest didn't die 75 | @fact 1 => 1 76 | end 77 | end 78 | 79 | end # module 80 | -------------------------------------------------------------------------------- /julia/chapter13/ensembleexample/dimensionalityreduction.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Ensemble learning example 3 | # Chapter 13 4 | 5 | # Dimensionality Reduction transformers. 6 | module DimensionalityReductionWrapper 7 | 8 | importall Orchestra.Types 9 | importall Orchestra.Util 10 | import DimensionalityReduction: pca 11 | 12 | export PCA, 13 | fit!, 14 | transform! 15 | 16 | # Principal Component Analysis rotation 17 | # on features. 18 | # Features ordered by maximal variance descending. 19 | # 20 | # Fails if zero-variance feature exists. 21 | type PCA <: Transformer 22 | model 23 | options 24 | 25 | function PCA(options=Dict()) 26 | default_options = { 27 | :center => true, 28 | :scale => true 29 | } 30 | new(nothing, nested_dict_merge(default_options, options)) 31 | end 32 | end 33 | 34 | function fit!(p::PCA, instances::Matrix, labels::Vector) 35 | pca_model = pca(instances; p.options...) 36 | p.model = pca_model 37 | end 38 | 39 | function transform!(p::PCA, instances::Matrix) 40 | return instances * p.model.rotation 41 | end 42 | 43 | end # module 44 | -------------------------------------------------------------------------------- /julia/chapter13/ensembleexample/dimensionalityreduction_test.jl: -------------------------------------------------------------------------------- 1 | module TestDimensionalityReductionWrapper 2 | 3 | include(joinpath("..", "fixture_learners.jl")) 4 | using .FixtureLearners 5 | fcp = FeatureClassification() 6 | 7 | using FactCheck 8 | 9 | 10 | importall Orchestra.Transformers.DimensionalityReductionWrapper 11 | 12 | facts("DimensionalityReduction transformers") do 13 | context("PCA transforms features") do 14 | instances = [ 15 | 5 10; 16 | -5 0; 17 | 0 5; 18 | ] 19 | labels = ["x"; "y"; "z"] 20 | options = {:center => false, :scale => false} 21 | pca = PCA(options) 22 | fit!(pca, instances, labels) 23 | transformed = transform!(pca, instances) 24 | 25 | @fact true => maximum(instances - transformed * pca.model.rotation') < 10e-4 26 | end 27 | end 28 | 29 | end # module 30 | -------------------------------------------------------------------------------- /julia/chapter13/ensembleexample/mlbase.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Ensemble learning example 3 | # Chapter 13 4 | 5 | # MLBase transformers. 6 | module MLBaseWrapper 7 | 8 | importall Orchestra.Types 9 | importall Orchestra.Util 10 | 11 | import MLBase: Standardize, estimate, transform 12 | 13 | export StandardScaler, 14 | fit!, 15 | transform! 16 | 17 | # Standardizes each feature using (X - mean) / stddev. 18 | # Will produce NaN if standard deviation is zero. 
19 | type StandardScaler <: Transformer 20 | model 21 | options 22 | 23 | function StandardScaler(options=Dict()) 24 | default_options = { 25 | :center => true, 26 | :scale => true 27 | } 28 | new(nothing, nested_dict_merge(default_options, options)) 29 | end 30 | end 31 | 32 | function fit!(st::StandardScaler, instances::Matrix, labels::Vector) 33 | st_transform = estimate(Standardize, instances'; st.options...) 34 | st.model = { 35 | :standardize_transform => st_transform 36 | } 37 | end 38 | 39 | function transform!(st::StandardScaler, instances::Matrix) 40 | st_transform = st.model[:standardize_transform] 41 | transposed_instances = instances' 42 | return transform(st_transform, transposed_instances)' 43 | end 44 | 45 | end # module 46 | -------------------------------------------------------------------------------- /julia/chapter13/ensembleexample/mlbase_test.jl: -------------------------------------------------------------------------------- 1 | module TestMLBaseWrapper 2 | 3 | include(joinpath("..", "fixture_learners.jl")) 4 | using .FixtureLearners 5 | fcp = FeatureClassification() 6 | 7 | using FactCheck 8 | 9 | 10 | importall Orchestra.Transformers.MLBaseWrapper 11 | 12 | facts("MLBase transformers") do 13 | context("StandardScaler transforms features") do 14 | instances = [ 15 | 5 10; 16 | -5 0; 17 | 0 5; 18 | ] 19 | labels = [ 20 | "x"; 21 | "y"; 22 | "z"; 23 | ] 24 | expected_transformed = [ 25 | 1.0 1.0; 26 | -1.0 -1.0; 27 | 0.0 0.0; 28 | ] 29 | standard_scaler = StandardScaler() 30 | fit!(standard_scaler, instances, labels) 31 | transformed = transform!(standard_scaler, instances) 32 | 33 | @fact transformed => expected_transformed 34 | end 35 | end 36 | 37 | end # module 38 | -------------------------------------------------------------------------------- /julia/chapter13/ensembleexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter13/ensembleexample/transformers.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Ensemble learning example 3 | # Chapter 13 4 | 5 | # Transformer definitions and implementations. 6 | module Transformers 7 | 8 | export Transformer, 9 | Learner, 10 | OneHotEncoder, 11 | Imputer, 12 | Pipeline, 13 | Wrapper, 14 | Identity, 15 | Baseline, 16 | PrunedTree, 17 | RandomForest, 18 | DecisionStumpAdaboost, 19 | StandardScaler, 20 | PCA, 21 | VoteEnsemble, 22 | StackEnsemble, 23 | BestLearner, 24 | SKLLearner, 25 | CRTLearner, 26 | fit!, 27 | transform! 
28 | 29 | # Obtain system details 30 | import Orchestra.System: LIB_SKL_AVAILABLE, LIB_CRT_AVAILABLE 31 | 32 | # Include abstract types as convenience 33 | importall Orchestra.Types 34 | 35 | # Include atomic Orchestra transformers 36 | include(joinpath("orchestra", "baseline.jl")) 37 | importall .BaselineMethods 38 | include(joinpath("orchestra", "transformers.jl")) 39 | importall .OrchestraTransformers 40 | 41 | # Include Julia transformers 42 | include(joinpath("julia", "decisiontree.jl")) 43 | importall .DecisionTreeWrapper 44 | include(joinpath("julia", "mlbase.jl")) 45 | importall .MLBaseWrapper 46 | include(joinpath("julia", "dimensionalityreduction.jl")) 47 | importall .DimensionalityReductionWrapper 48 | 49 | # Include Python transformers 50 | if LIB_SKL_AVAILABLE 51 | include(joinpath("python", "scikit_learn.jl")) 52 | importall .ScikitLearnWrapper 53 | end 54 | 55 | # Include R transformers 56 | if LIB_CRT_AVAILABLE 57 | include(joinpath("r", "caret.jl")) 58 | importall .CaretWrapper 59 | end 60 | 61 | # Include aggregate transformers last, dependent on atomic transformers 62 | include(joinpath("orchestra", "ensemble.jl")) 63 | importall .EnsembleMethods 64 | 65 | end # module 66 | -------------------------------------------------------------------------------- /julia/chapter13/ensembleexample/types.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Ensemble learning example 3 | # Chapter 13 4 | 5 | # types. 6 | module Types 7 | 8 | export Transformer, 9 | Learner, 10 | TestLearner, 11 | fit!, 12 | transform! 13 | 14 | # All transformer types must have implementations 15 | # of function `fit!` and `transform!`. 16 | abstract Transformer 17 | 18 | # Learner abstract type which all machine learners implement. 19 | abstract Learner <: Transformer 20 | 21 | # Test learner. 22 | # Used to separate production learners from test. 23 | abstract TestLearner <: Learner 24 | 25 | # Trains transformer on provided instances and labels. 26 | # 27 | # @param transformer Target transformer. 28 | # @param instances Training instances. 29 | # @param labels Training labels. 30 | function fit!(transformer::Transformer, instances::Matrix, labels::Vector) 31 | error(typeof(transformer), " does not implement fit!") 32 | end 33 | 34 | # Trains transformer on provided instances and labels. 35 | # 36 | # @param transformer Target transformer. 37 | # @param instances Original instances. 38 | # @return Transformed instances. 
39 | function transform!(transformer::Transformer, instances::Matrix) 40 | error(typeof(transformer), " does not implement transform!") 41 | end 42 | 43 | end # module 44 | -------------------------------------------------------------------------------- /julia/chapter13/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter5/decisiontreeexample/decision_tree_test1.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Decision Trees example 3 | # Chapter 5 4 | 5 | using Base.Test 6 | using DecisionTree 7 | 8 | n,m = 10^3, 5 ; 9 | features = rand(n,m); 10 | weights = rand(-1:1,m); 11 | labels = _int(features * weights); 12 | 13 | println("\n##### nfoldCV Classification Tree #####") 14 | accuracy = nfoldCV_tree(labels, features, 0.9, 3) 15 | @test mean(accuracy) > 0.7 16 | 17 | println("\n##### nfoldCV Classification Forest #####") 18 | accuracy = nfoldCV_forest(labels, features, 2, 10, 3) 19 | @test mean(accuracy) > 0.7 20 | 21 | println("\n##### nfoldCV Adaboosted Stumps #####") 22 | nfoldCV_stumps(labels, features, 7, 3) 23 | -------------------------------------------------------------------------------- /julia/chapter5/decisiontreeexample/decision_tree_test2.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Decision Trees example 3 | # Chapter 5 4 | 5 | 6 | using Base.Test 7 | using RDatasets 8 | using DecisionTree 9 | 10 | iris = dataset("datasets", "iris") 11 | features = array(iris[:, 1:4]); 12 | labels = array(iris[:, 5]); 13 | 14 | # train full-tree classifier 15 | model = build_tree(labels, features) 16 | # prune tree: merge leaves having >= 90% combined purity (default: 100%) 17 | model = prune_tree(model, 0.9) 18 | # pretty print of the tree, to a depth of 5 nodes (optional) 19 | print_tree(model, 5) 20 | # apply learned model 21 | apply_tree(model, [5.9,3.0,5.1,1.9]) 22 | # run n-fold cross validation for pruned tree, using 90% purity threshold purning, and 3 CV folds 23 | println("\n##### nfoldCV Classification Tree #####") 24 | accuracy = nfoldCV_tree(labels, features, 0.9, 3) 25 | @test mean(accuracy) > 0.8 26 | 27 | # train random forest classifier, using 2 random features, 10 trees and 0.5 of samples per tree (optional, defaults to 0.7) 28 | model = build_forest(labels, features, 2, 10, 0.5) 29 | # apply learned model 30 | apply_forest(model, [5.9,3.0,5.1,1.9]) 31 | # run n-fold cross validation for forests, using 2 random features, 10 trees, 3 folds, 0.5 of samples per tree (optional, defaults to 0.7) 32 | println("\n##### nfoldCV Classification Forest #####") 33 | accuracy = nfoldCV_forest(labels, features, 2, 10, 3, 0.5) 34 | @test mean(accuracy) > 0.8 35 | 36 | # train adaptive-boosted decision stumps, using 7 iterations 37 | model, coeffs = build_adaboost_stumps(labels, features, 7); 38 | # apply learned model 39 | apply_adaboost_stumps(model, coeffs, [5.9,3.0,5.1,1.9]) 40 | # run n-fold cross validation for boosted stumps, using 7 iterations and 3 folds 41 | println("\n##### nfoldCV Classification Adaboosted Stumps #####") 42 | nfoldCV_stumps(labels, features, 7, 3) 43 | 44 | -------------------------------------------------------------------------------- /julia/chapter5/decisiontreeexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | 
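Stepping back to the `Transformer` protocol defined in chapter 13's `types.jl` above: a transformer only has to provide `fit!` and `transform!` methods. The following is a minimal illustrative sketch, not part of the repository, written in the same pre-1.0 Julia style as the existing wrappers and assuming the same `Orchestra.Types` module path they import; the `MeanCenter` name and its behavior are hypothetical.

```julia
# Hypothetical example (not in the repository): a transformer that subtracts
# the per-feature mean, implementing the fit!/transform! protocol from types.jl.
module MeanCenterExample

importall Orchestra.Types

export MeanCenter, fit!, transform!

type MeanCenter <: Transformer
  model    # holds the per-feature means after fit!
  options

  function MeanCenter(options=Dict())
    new(nothing, options)
  end
end

function fit!(mc::MeanCenter, instances::Matrix, labels::Vector)
  # Learn the column means from the training instances.
  mc.model = mean(instances, 1)
end

function transform!(mc::MeanCenter, instances::Matrix)
  # Subtract the learned means from every row.
  return instances .- mc.model
end

end # module
```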
-------------------------------------------------------------------------------- /julia/chapter5/randomforstexample/RandomForests.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Decision Trees based learning - Random Forests example 3 | # Chapter 5 4 | 5 | module RandomForests 6 | 7 | export 8 | RandomForestClassifier, 9 | RandomForestRegressor, 10 | fit, 11 | predict, 12 | feature_importances, 13 | oob_error 14 | 15 | using DataFrames 16 | 17 | include("util.jl") 18 | include("tree.jl") 19 | include("randomforest.jl") 20 | include("classifier.jl") 21 | include("regressor.jl") 22 | 23 | end # RandomForests module 24 | -------------------------------------------------------------------------------- /julia/chapter5/randomforstexample/example.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Decision Trees based learning - Random Forests example 3 | # Chapter 5 4 | 5 | typealias TabularData Union{AbstractMatrix,DataFrame} 6 | 7 | type Example{T<:TabularData} 8 | x::T # tabular data 9 | y::AbstractVector 10 | n_labels::Int 11 | n_features::Int 12 | sample_weight::Vector{Float64} 13 | 14 | function Example(x::T, y::AbstractVector, sample_weight::Vector{Float64}) 15 | n_labels = length(unique(y)) 16 | n_features = size(x, 2) 17 | new(x, y, n_labels, n_features, sample_weight) 18 | end 19 | 20 | Example(x::T, y::AbstractVector) = Example{T}(x, y, ones(Float64, size(x, 1))) 21 | end 22 | -------------------------------------------------------------------------------- /julia/chapter5/randomforstexample/readme.md: -------------------------------------------------------------------------------- 1 | # RandomForests.jl 2 | 3 | CART-based random forest implementation in Julia. 4 | This package supports: 5 | * Classification model 6 | * Regression model 7 | * Out-of-bag (OOB) error 8 | * Feature importances 9 | * Various configurable parameters 10 | **Please be aware that this package is not yet fully examined implementation. You can use it at your own risk.** 11 | And your bug report or suggestion is welcome! 12 | ```julia 13 | RandomForestClassifier(;n_estimators::Int=10, 14 | max_features::Union(Integer, FloatingPoint, Symbol)=:sqrt, 15 | max_depth=nothing, 16 | min_samples_split::Int=2, 17 | criterion::Symbol=:gini) 18 | ``` 19 | 20 | ```julia 21 | RandomForestRegressor(;n_estimators::Int=10, 22 | max_features::Union(Integer, FloatingPoint, Symbol)=:third, 23 | max_depth=nothing, 24 | min_samples_split::Int=2) 25 | ``` 26 | 27 | * `n_estimators`: the number of weak estimators 28 | * `max_features`: the number of candidate features at each split 29 | * if `Integer` is given, the fixed number of features are used 30 | * if `FloatingPoint` is given, the proportion of given value (0.0, 1.0] are used 31 | * if `Symbol` is given, the number of candidate features is decided by a strategy 32 | * `:sqrt`: `ifloor(sqrt(n_features))` 33 | * `:third`: `div(n_features, 3)` 34 | * `max_depth`: the maximum depth of each tree 35 | * the default argument `nothing` means there is no limitation of the maximum depth 36 | * `min_samples_split`: the minimum number of sub-samples to try to split a node 37 | * `criterion`: the criterion of impurity measure (classification only) 38 | * `:gini`: Gini index 39 | * `:entropy`: Cross entropy 40 | 41 | `RandomForestRegressor` always uses the mean squared error for its impurity measure. 
42 | At the current moment, there is no configurable criteria for regression model. 43 | 44 | 45 | ## Related package 46 | * [DecisionTree.jl] 47 | * DecisionTree.jl is based on the ID3 (Iterative Dichotomiser 3) algorithm while RandomForests.jl uses CART (Classification And Regression Tree). 48 | 49 | ## Acknowledgement 50 | The algorithm and interface are highly inspired by those of [scikit-learn](http://scikit-learn.org). 51 | -------------------------------------------------------------------------------- /julia/chapter5/randomforstexample/sort.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Decision Trees based learning - Random Forests example 3 | # Chapter 5 4 | 5 | import Base: sort! 6 | 7 | const SMALL_THRESHOLD = 40 8 | 9 | function sort!(samples::AbstractVector, feature::AbstractVector, range::UnitRange{Int}) 10 | # inplace sort `samples` and `feature` vector in one shot, along with `feature`, between `range` 11 | len = length(range) 12 | @assert len == length(feature) <= length(samples) 13 | sort!(sub(samples, range), feature, 1, endof(feature)) 14 | end 15 | 16 | function sort!(x::AbstractVector, y::AbstractVector, lo::Int, hi::Int) 17 | # if the range subject to sorting is small, the insertion sort would be faster 18 | if hi - lo <= SMALL_THRESHOLD 19 | isort!(x, y, lo, hi) 20 | else 21 | # quick sort 22 | p = partition(x, y, lo, hi) 23 | sort!(x, y, lo, p - 1) 24 | sort!(x, y, p + 1, hi) 25 | end 26 | return 27 | end 28 | 29 | # insertion sort 30 | function isort!(x::AbstractVector, y::AbstractVector, lo::Int, hi::Int) 31 | @inbounds for i in lo+1:hi 32 | elm = y[i] 33 | tmp = x[i] 34 | j = i 35 | while j > lo && y[j-1] > elm 36 | # shift elements 37 | y[j] = y[j-1] 38 | x[j] = x[j-1] 39 | j -= 1 40 | end 41 | y[j] = elm 42 | x[j] = tmp 43 | end 44 | end 45 | 46 | function median(x::AbstractVector, i::Int, j::Int, k::Int) 47 | if x[i] < x[j] 48 | if x[j] < x[k] 49 | return j 50 | elseif x[k] < x[i] 51 | return i 52 | else 53 | return k 54 | end 55 | else 56 | # implies x[j] <= x[i] 57 | if x[k] <= x[j] 58 | return j 59 | elseif x[i] <= x[k] 60 | return i 61 | else 62 | return k 63 | end 64 | end 65 | end 66 | 67 | function partition(x::AbstractVector, y::AbstractVector, lo::Int, hi::Int) 68 | # choose pivot 69 | pivot_index = median(y, lo, hi, div(lo + hi, 2)) 70 | pivot_value = y[pivot_index] 71 | 72 | # swap elements at pivot_index and hi 73 | y[pivot_index], y[hi] = y[hi], y[pivot_index] 74 | x[pivot_index], x[hi] = x[hi], x[pivot_index] 75 | 76 | p = lo 77 | @inbounds for i in lo:hi-1 78 | if y[i] <= pivot_value 79 | # swap elements at i and p 80 | y[i], y[p] = y[p], y[i] 81 | x[i], x[p] = x[p], x[i] 82 | p += 1 83 | end 84 | end 85 | 86 | # swap elements at p and hi 87 | y[p], y[hi] = y[hi], y[p] 88 | x[p], x[hi] = x[hi], x[p] 89 | 90 | p 91 | end 92 | -------------------------------------------------------------------------------- /julia/chapter5/randomforstexample/util.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Decision Trees based learning - Random Forests example 3 | # Chapter 5 4 | 5 | function Base.vec(x::DataFrame) 6 | n, m = size(x) 7 | n == 1 || error("x must be a single record") 8 | [x[n, j] for j in 1:m] 9 | end 10 | -------------------------------------------------------------------------------- /julia/chapter5/readme.md: -------------------------------------------------------------------------------- 1 
| 2 | -------------------------------------------------------------------------------- /julia/chapter6/knnexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter6/knnexample/test.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/julia/chapter6/knnexample/test.zip -------------------------------------------------------------------------------- /julia/chapter6/knnexample/train.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/julia/chapter6/knnexample/train.zip -------------------------------------------------------------------------------- /julia/chapter6/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter6/svmexample/examplesvm1.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Support Vector Machine example 3 | # Chapter 6 4 | 5 | reposDir = EnvHash()["JuliaRepos"] 6 | 7 | ## Load julia-svm 8 | load("svm.jl") 9 | 10 | n = int(1e3) 11 | p = 20 12 | X = rand(n, p) 13 | y = float(randi((0, 1), n)) 14 | 15 | svp = svmproblem(y, X) 16 | svparam = svmparameter("epsilon_svr", "rbf", int32(3), 17 | 1., 0., 40., 0.001, 18 | 1., 0.5, 19 | 1., int32(1), int32(0)) 20 | model = svmtrain(svp, svparam) 21 | 22 | X2 = rand(10, p) 23 | 24 | pred = svmpredict(model, X2) 25 | -------------------------------------------------------------------------------- /julia/chapter6/svmexample/readme.md: -------------------------------------------------------------------------------- 1 | Julia bindings to libsvm 2 | -------------------------------------------------------------------------------- /julia/chapter7/aprioriexample/apriori.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Association rule based learning - Apriori example 3 | # Chapter 7 4 | 5 | # Find k-freq-itemset in given transactions of items queried together 6 | using StatsBase 7 | 8 | # Find frequent itemsets from transactions 9 | # @T: transaction list 10 | # @minsupp: minimum support 11 | function find_freq_itemset(T, minsupp) 12 | N = length(T) 13 | # Find itemset I from transaction list T 14 | I = Array(Int64,0) 15 | for t in T 16 | for i in t 17 | push!(I,i) 18 | end 19 | end 20 | I = Set(I) 21 | 22 | # Find freq-itemset when k = 1: Fₖ = {i | i ∈ I^σ({i}) ≥ N × minsupp} 23 | k = 1 24 | F = [] 25 | push!(F,map(x->[x],filter(i->σ(i,T) >= N * minsupp, I))) # F₁ 26 | while true 27 | Cₖ = gen_candidate(F[end]) # Generate candidate set Cₖ from Fₖ₋₁ 28 | Fₖ = filter(c->σ(c,T) >= Nbumanzu * minsupp, Cₖ) 29 | if !isempty(Fₖ) 30 | push!(F,Fₖ) # Eliminate infrequent candidates, then set to Fₖ 31 | else break 32 | end 33 | end 34 | F 35 | end 36 | 37 | # Generate freq-itemset from a list of itemsets 38 | # @x: list of itemsets 39 | function gen_candidate(x) 40 | n = length(x) 41 | Cₖ = Array(Array{Int64,1},0) 42 | for a = 1:n, b = 1:n 43 | if a >= b;continue 44 | end 45 | is_candidate = true 46 | sort!(x[a]); sort!(x[b]) 47 | for i in 
1:length(x[1])-1 48 | if x[a][i] == x[b][i]; continue 49 | else is_candidate = false; break 50 | end 51 | end 52 | if is_candidate 53 | push!(Cₖ, sort!([ x[a][1:end-1], x[a][end], x[b][end] ])) 54 | end 55 | end 56 | Cₖ 57 | end 58 | 59 | # Generate rules from frequent itemsets 60 | # @x: list of frequent itemsets 61 | # @T: Transaction list 62 | function gen_rules(x, T) 63 | if length(x) <= 1; return [] # F contains 1-itemsets only, hence no rules generated. 64 | end 65 | x = reduce(append!,x[2:end]) 66 | R = Array(Rule,0) 67 | for f in x # f as each freq-f-itemset fₖ 68 | ap_genrules!(R,f,map(i->Array([i]),f),T) # H₁ itemset is same as f 69 | end 70 | R 71 | end 72 | 73 | function ap_genrules!(R, f, H, T) 74 | k, m = length(f), length(H[1]) 75 | if k > m + 1 76 | H = gen_candidate(H) 77 | H_plus_1 = [] 78 | for h in H 79 | p = setdiff(f,h) 80 | if conf(p, h, T) >= minconf 81 | push!(R, Rule(p,h)) 82 | push!(H_plus_1, h) 83 | end 84 | end 85 | ap_genrules(R, f, H_plus_1, T) 86 | end 87 | end 88 | -------------------------------------------------------------------------------- /julia/chapter7/aprioriexample/aprioritest.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Association rule based learning - Apriori TEST 3 | # Chapter 7 4 | 5 | using Base.Test 6 | include("apriori.jl") 7 | 8 | function _gen_dummy_data!(transactions) 9 | range = [1:10] 10 | for i in 1:length(transactions) 11 | transactions[i] = sample(range, sample(range, 1)[1], replace = false) 12 | end 13 | transactions 14 | end 15 | 16 | minsupp = 0.5 17 | minconf = 0.2 18 | T = Array(Array{Int64,1},10) 19 | T[1] = [1,2] 20 | T[2] = [1,3,4,5] 21 | T[3] = [2,3,4,6] 22 | T[4] = [1,2,3,4] 23 | T[5] = [1,2,3,6] 24 | T[6] = [1,2,3,5] 25 | T[7] = [1,2,3,5,6] 26 | T[8] = [1,3,4,5,6] 27 | T[9] = [1,2,3,4,5,6] 28 | T[10] = [1,2,3,4,5] 29 | 30 | @test_approx_eq σ([2,3,4],T[1:5]) 2 31 | @test_approx_eq supp([2,3],4,T[1:5]) 0.4 32 | @test_approx_eq_eps conf([2,3],4,T[1:5]) 0.67 1e-2 33 | 34 | F₂ = Array(Array{Int64,1},4) 35 | F₂[1] = [3,4] 36 | F₂[2] = [1,3] 37 | F₂[3] = [1,2] 38 | F₂[4] = [2,3] 39 | @test length(gen_candidate(F₂)) == 1 40 | @test gen_candidate(F₂)[1] == [1,2,3] 41 | 42 | F = find_freq_itemset(T[1:5], minsupp) 43 | @test length(F) == 2 44 | @test Set(F[1]) == Set([1],[2],[3],[4]) 45 | @test Set(F[2]) == Set([3,4],[1,3],[1,2],[2,3]) 46 | -------------------------------------------------------------------------------- /julia/chapter7/aprioriexample/common.jl: -------------------------------------------------------------------------------- 1 | # Common types and functions 2 | 3 | type Rule 4 | P::Array{Int64} # Antecedent 5 | Q::Array{Int64} # Consequent 6 | end 7 | 8 | # Support Count: σ(x) = | {tᵢ|x ⊆ tᵢ,tᵢ∈ T}| 9 | function σ(x, T) 10 | ret = 0 11 | for t in T 12 | ⊆(x,t) && (ret += 1) 13 | end 14 | ret 15 | end 16 | 17 | # Support of itemset x -> y, which x does not intersect y. 18 | supp(x,y,T) = σ(∪(x,y),T)/length(T) 19 | 20 | # Confidence of itemset x-> y, which x does not intersect y. 
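# Worked example (illustrative, not from the original sources): for
# T = [[1,2], [1,3], [1,2,3]]
#   σ([1,2], T)      == 2      (two transactions contain both items)
#   supp([1],[2], T) == 2/3    (support of the rule 1 -> 2)
#   conf([2],[1], T) == 2/2    (every transaction holding item 2 also holds item 1)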
21 | conf(x,y,T) = σ(∪(x,y),T)/σ(x,T) 22 | -------------------------------------------------------------------------------- /julia/chapter7/aprioriexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter7/fpgrowthexample/common.jl: -------------------------------------------------------------------------------- 1 | # Common types and functions 2 | 3 | type Rule 4 | P::Array{Int64} # Antecedent 5 | Q::Array{Int64} # Consequent 6 | end 7 | 8 | # Support Count: σ(x) = | {tᵢ|x ⊆ tᵢ,tᵢ∈ T}| 9 | function σ(x, T) 10 | ret = 0 11 | for t in T 12 | ⊆(x,t) && (ret += 1) 13 | end 14 | ret 15 | end 16 | 17 | # Support of itemset x -> y, which x does not intersect y. 18 | supp(x,y,T) = σ(∪(x,y),T)/length(T) 19 | 20 | # Confidence of itemset x-> y, which x does not intersect y. 21 | conf(x,y,T) = σ(∪(x,y),T)/σ(x,T) 22 | -------------------------------------------------------------------------------- /julia/chapter7/fpgrowthexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter7/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /julia/chapter8/k-meansexample/k-means.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Clustering based learning - L-mena clustering 3 | # Chapter 8 4 | 5 | using Images 6 | using ImageView 7 | using Color 8 | 9 | # Run non-interactively, accept either an image or a directory 10 | # If directory, iterate through image filetypes 11 | 12 | # Enter filepath and number of dominant colors wanted, k 13 | function dominant_colors (filename, k) 14 | if isfile(filename) 15 | img = imread(filename,RGB) 16 | init(img, filename, k) 17 | elseif isdir(filename) 18 | files = filter!(abspath(r"\.(?:jpe?g|gif|png|tiff)$")), readdir()) #creates an array of filenames by filtering out only files that in file extensions. 19 | for i in files 20 | dominant_colors(i) 21 | end 22 | else 23 | error("No image found.") 24 | end 25 | end 26 | 27 | type Point 28 | coords::Array # Color associated with pt, a 3D array. 29 | ct:Int # Count 30 | end 31 | 32 | type Cluster 33 | points::Array # Points associated with cluster, C_k 34 | centroid::Point # Center of cluster, assumed mean of pt values 35 | k::Int # Cluster count 36 | end 37 | 38 | function init(img, filename, k) 39 | # Convert color space from sRGB (linear) to CIEXYZ to CIELAB 40 | run(`convert $filename -thumbnail 200x200 $filename`) #convert to thumbnail via ImageMagick CLI 41 | img = convert(Image{LAB}, img) #use Color to convert to LAB automagically 42 | points = getpoints(img) 43 | randclusters(points, k) 44 | kmeans(points, k) 45 | 46 | function getpoints(img) 47 | points = [] 48 | count = 0 49 | for count, color in img[1:width(img)] 50 | for count, color in img[2:height(img)] 51 | count += 1 52 | points.append(Point(color, count)) 53 | end 54 | return points 55 | end 56 | 57 | # Sq. 
euclidean distance 58 | function distance (pt1, pt2) 59 | return mapreduce((pt1.coords[i]-pt2.coords[i])**2, +, 1:length(pt1.coords)) 60 | end 61 | 62 | # Randomly assign pixels to represent intial centroid/clusters 63 | function randclusters(pts::Array, k) 64 | kclusters = [] 65 | for n = 1:k 66 | kclusters.append(Cluster(pts, pts[rand(1:end)], n) 67 | return kclusters 68 | end 69 | 70 | # Recalculate Centroid 71 | function recenter(points, ) 72 | 73 | # K-Means Algorithm (alternate between resigning points to a cluster based on similarity and cluster centroid based on the points assigned) 74 | function kmeans() 75 | # Repeat n number of times 76 | 77 | # Optionally convert returned clusters back to sRGB -------------------------------------------------------------------------------- /julia/chapter8/k-meansexample/readme.md: -------------------------------------------------------------------------------- 1 | Finding dominant colors in an image ala Google's Palette using K-Means clustering in Julia. 2 | Reference from a MIT license code 3 | -------------------------------------------------------------------------------- /julia/chapter8/readme.md: -------------------------------------------------------------------------------- 1 | Finding dominant colors in an image ala Google's Palette using K-Means clustering in Julia. 2 | Reference from a MIT license code 3 | -------------------------------------------------------------------------------- /julia/chapter9/naivebayesexample/NaiveBayes.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Bayesian learning - Naive Bayes example 3 | # Chapter 9 4 | module NaiveBayes 5 | 6 | export NBModel, 7 | MultinomialNB, 8 | GaussianNB, 9 | fit, 10 | predict, 11 | predict_proba 12 | 13 | include("nbtypes.jl") 14 | include("core.jl") 15 | 16 | end 17 | -------------------------------------------------------------------------------- /julia/chapter9/naivebayesexample/datastats.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Bayesian learning - Naive Bayes example 3 | # Chapter 9 4 | 5 | using Base.BLAS 6 | 7 | # type for collecting data statistics incrementally 8 | type DataStats 9 | x_sums::Vector{Float64} # sum(x_i) 10 | cross_sums::Matrix{Float64} # sum(x_i'*x_i) (lower-triangular matrix) 11 | n_obs::Uint64 # number of observations 12 | obs_axis::Int64 # observation axis, e.g. size(X, obs_axis) 13 | # should return number of observations 14 | function DataStats(n_vars, obs_axis=1) 15 | @assert obs_axis == 1 || obs_axis == 2 16 | new(zeros(Float64, n_vars), zeros(Float64, n_vars, n_vars), 0, obs_axis) 17 | end 18 | end 19 | 20 | 21 | function Base.show(io::IO, dstats::DataStats) 22 | print(io, "DataStats(n_vars=$(length(dstats.x_sums))," * 23 | "n_obs=$(dstats.n_obs),obs_axis=$(dstats.obs_axis))") 24 | end 25 | 26 | 27 | # Collect data statistics. 28 | # This method may be called multiple times on different 29 | # data samples to collect aggregative statistics. 30 | function updatestats(dstats::DataStats, X::Matrix{Float64}) 31 | trans = dstats.obs_axis == 1 ? 
'T' : 'N' 32 | axpy!(1.0, sum(X, dstats.obs_axis), dstats.x_sums) 33 | syrk!('L', trans, 1.0, X, 1.0, dstats.cross_sums) 34 | dstats.n_obs += size(X, dstats.obs_axis) 35 | return dstats 36 | end 37 | 38 | function Base.mean(dstats::DataStats) 39 | @assert (dstats.n_obs >= 1) "At least 1 observations is requied" 40 | return dstats.x_sums ./ dstats.n_obs 41 | end 42 | 43 | function Base.cov(dstats::DataStats) 44 | @assert (dstats.n_obs >= 2) "At least 2 observations are requied" 45 | mu = mean(dstats) 46 | C = (dstats.cross_sums - dstats.n_obs * (mu*mu')) / (dstats.n_obs - 1) 47 | Base.LinAlg.copytri!(C, 'L') 48 | return C 49 | end 50 | 51 | 52 | -------------------------------------------------------------------------------- /julia/chapter9/naivebayesexample/nbexampledata-iris.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Bayesian learning - Naive Bayes example 3 | # Chapter 9 4 | 5 | using NaiveBayes 6 | using RDatasets 7 | 8 | iris = dataset("datasets", "iris") 9 | 10 | # observations in columns and variables in rows 11 | X = array(iris[:, 1:4])' 12 | p, n = size(X) 13 | # by default species is a PooledDataArray, 14 | y = [species for species in iris[:, 5]] 15 | 16 | # how much data use for training 17 | train_frac = 0.9 18 | k = int(floor(train_frac * n)) 19 | idxs = randperm(n) 20 | train = idxs[1:k] 21 | test = idxs[k+1:end] 22 | 23 | model = GaussianNB(unique(y), p) 24 | fit(model, X[:, train], y[train]) 25 | 26 | accuracy = countnz(predict(model, X[:,test]) .== y[test]) / countnz(test) 27 | 28 | println("Accuracy: $accuracy") 29 | -------------------------------------------------------------------------------- /julia/chapter9/naivebayesexample/nbtest1.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Bayesian learning - Naive Bayes example 3 | # Chapter 9 4 | 5 | using StatsBase 6 | using NaiveBayes 7 | 8 | function test_multinomial() 9 | print("testing MultinomialNB... ") 10 | m = MultinomialNB([:a, :b, :c], 5) 11 | X = [1 2 5 2; 12 | 5 3 -2 1; 13 | 0 2 1 11; 14 | 6 -1 3 3; 15 | 5 7 7 1] 16 | y = [:a, :b, :a, :c] 17 | 18 | fit(m, X, y) 19 | @assert predict(m, X) == y 20 | println("OK") 21 | end 22 | 23 | function test_gaussian() 24 | print("testing GaussianNB... 
") 25 | n_obs = 100 26 | m = GaussianNB([:a, :b, :c], 5) 27 | X = randn(5, n_obs) 28 | y = sample([:a, :b, :c], n_obs) 29 | 30 | fit(m, X, y) 31 | accuracy = sum(predict(m, X) .== y) / n_obs 32 | println(accuracy) 33 | println("OK") 34 | end 35 | 36 | 37 | -------------------------------------------------------------------------------- /julia/chapter9/naivebayesexample/nbtest2.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Bayesian learning - Naive Bayes example 3 | # Chapter 9 4 | 5 | include("ntests1.jl") 6 | 7 | # normal (variables on columns) 8 | X = rand(40, 10) 9 | ds = DataStats(10) 10 | updatestats(ds, X[1:20, :]) 11 | updatestats(ds, X[21:end, :]) 12 | 13 | @assert all((cov(X) - cov(ds)) .< 0.0001) 14 | 15 | # transposed (variables on rows) 16 | X = rand(40, 10) 17 | ds = DataStats(10, 2) 18 | updatestats(ds, X') 19 | 20 | @assert all((cov(X) - cov(ds)) .< 0.0001) 21 | 22 | println("All OK") 23 | -------------------------------------------------------------------------------- /julia/chapter9/naivebayesexample/nbtypes.jl: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Bayesian learning - Naive Bayes example 3 | # Chapter 9 4 | 5 | using Distributions 6 | 7 | include("datastats.jl") 8 | 9 | # Base type for Naive Bayes models. 10 | # Inherited classes should have at least following fields: 11 | # c_counts::Dict{C, Int64} - count of ocurrences of each class 12 | # n_obs::Int64 - total number of observations 13 | abstract NBModel{C} 14 | 15 | ##################################### 16 | ##### Multinomial Naive Bayes ##### 17 | ##################################### 18 | 19 | type MultinomialNB{C} <: NBModel 20 | c_counts::Dict{C, Int64} # count of ocurrences of each class 21 | x_counts::Dict{C, Vector{Number}} # count/sum of occurrences of each var 22 | x_totals::Vector{Number} # total occurrences of each var 23 | n_obs::Int64 # total number of seen observations 24 | end 25 | 26 | 27 | # Multinomial Naive Bayes classifier 28 | # 29 | # classes : array of objects 30 | # Class names 31 | # n_vars : Int64 32 | # Number of variables in observations 33 | # alpha : Number (optional, default 1) 34 | # Smoothing parameter. E.g. if alpha equals 1, each variable in each class 35 | # is believed to have 1 observation by default 36 | function MultinomialNB{C}(classes::Vector{C}, n_vars::Int64; alpha=1) 37 | c_counts = Dict(classes, ones(Int64, length(classes)) * alpha) 38 | x_counts = Dict{C, Vector{Int64}}() 39 | for c in classes 40 | x_counts[c] = ones(Int64, n_vars) * alpha 41 | end 42 | x_totals = ones(Float64, n_vars) * alpha * length(c_counts) 43 | MultinomialNB{C}(c_counts, x_counts, x_totals, sum(x_totals)) 44 | end 45 | 46 | 47 | function Base.show(io::IO, m::MultinomialNB) 48 | print(io, "MultinomialNB($(m.c_counts))") 49 | end 50 | 51 | 52 | ##################################### 53 | ###### Gaussian Naive Bayes ####### 54 | ##################################### 55 | 56 | type GaussianNB{C} <: NBModel 57 | c_counts::Dict{C, Int64} # count of ocurrences of each class 58 | c_stats::Dict{C, DataStats} # aggregative data statistics 59 | gaussians::Dict{C, MvNormal} # precomputed distribution 60 | # x_counts::Dict{C, Vector{Number}} # ?? count/sum of occurrences of each var 61 | # x_totals::Vector{Number} # ?? 
total occurrences of each var 62 | n_obs::Int64 # total number of seen observations 63 | end 64 | 65 | 66 | function GaussianNB{C}(classes::Vector{C}, n_vars::Int64) 67 | c_counts = Dict(classes, zeros(Int64, length(classes))) 68 | c_stats = Dict(classes, [DataStats(n_vars, 2) for i=1:length(classes)]) 69 | gaussians = Dict{C, MvNormal}() 70 | GaussianNB{C}(c_counts, c_stats, gaussians, 0) 71 | end 72 | 73 | 74 | function Base.show(io::IO, m::GaussianNB) 75 | print(io, "GaussianNB($(m.c_counts))") 76 | end 77 | -------------------------------------------------------------------------------- /julia/chapter9/naivebayesexample/readme.md: -------------------------------------------------------------------------------- 1 | Naive Bayes classifier. Currently 2 types of NB are supported: 2 | 3 | * **MultinomialNB** - assumes variables have a multinomial distribution. Good e.g. for text classification. See `examples/nums.jl` for usage. 4 | * **GaussianNB** - assumes variables have a multivariate normal distribution. Good for real-valued data. See `examples/iris.jl` for usage. 5 | 6 | Since `GaussianNB` models multivariate distribution, it's not really a "naive" classifier (i.e. no independence assumption is made), so the name may change in the future. 7 | -------------------------------------------------------------------------------- /julia/chapter9/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mahout/chapter10/linearregressionexample/readme.md: -------------------------------------------------------------------------------- 1 | There are no direct APIs for implementing Linear Regression Algorithm. There are some alternative ways of solving this problem using Hadoop MapReduce and Python or R packages. 
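For instance, a plain least-squares fit can be computed with the linear algebra classes bundled in `mahout-math`. The sketch below only illustrates that route and is not part of this example: the `LeastSquaresSketch` class name and the toy data are invented, and it assumes the `DenseMatrix` and `QRDecomposition` classes from `mahout-math`.

```java
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.QRDecomposition;

public class LeastSquaresSketch {
    public static void main(String[] args) {
        // Toy data generated from y ≈ 2 + 3x; the first column of A is the intercept term.
        double[][] a = {{1, 1}, {1, 2}, {1, 3}, {1, 4}};
        double[][] y = {{5.1}, {7.9}, {11.2}, {13.8}};

        Matrix A = new DenseMatrix(a);
        Matrix Y = new DenseMatrix(y);

        // Least-squares solution of A * beta = Y via a QR decomposition.
        Matrix beta = new QRDecomposition(A).solve(Y);

        System.out.println("intercept = " + beta.get(0, 0));
        System.out.println("slope     = " + beta.get(1, 0));
    }
}
```

For an over-determined system such as this one, `solve` returns the coefficient matrix that minimizes the squared residual, which is the ordinary least-squares estimate of the intercept and slope.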
2 | 3 | This is a placeholder folder and implementation details will be added as and when supporting APIs are made available with the maout distributions 4 | 5 | Reference to list of algorithms supported by mahout can be found here: 6 | https://mahout.apache.org/users/basics/algorithms.html 7 | 8 | -------------------------------------------------------------------------------- /mahout/chapter10/logisticregressionexample/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.packt.pml.mahout.logreg 6 | logistic-regression 7 | 1.0-SNAPSHOT 8 | jar 9 | 10 | logistic-regression 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | 26 | net.sf.opencsv 27 | opencsv 28 | 2.3 29 | 30 | 31 | org.apache.mahout 32 | mahout-examples 33 | 0.9 34 | jar 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /mahout/chapter10/logisticregressionexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mahout/chapter10/logisticregressionexample/src/main/java/com/packt/pml/mahout/logreg/LogisticRegressionApp.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Practical Machine learning 3 | * Logistic Regression Example 4 | * Chapter 10 5 | */ 6 | package com.packt.pml.mahout.logreg; 7 | 8 | import au.com.bytecode.opencsv.CSVReader; 9 | import au.com.bytecode.opencsv.CSVWriter; 10 | import java.io.FileNotFoundException; 11 | import java.io.FileReader; 12 | import java.io.FileWriter; 13 | import java.io.IOException; 14 | 15 | 16 | /** 17 | * Hello world! 
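 * Prepares the training data for this logistic regression example: reads
 * $WORK_DIR/train/train.csv, appends an "action" column whose value (SELL or BUY)
 * is derived by comparing the value in column index 4 of consecutive rows, and
 * writes the result to $WORK_DIR/train/final.csv.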
18 | * 19 | */ 20 | public class LogisticRegressionApp 21 | { 22 | public static void main( String[] args ) throws FileNotFoundException, IOException 23 | { 24 | 25 | CSVReader reader = new CSVReader(new FileReader("$WORK_DIR/train/train.csv")); 26 | 27 | String [] nextLine; 28 | String [] previousLine; 29 | String [] headernew = new String [reader.readNext().length + 1]; 30 | 31 | CSVWriter writer = new CSVWriter(new FileWriter("$WORK_DIR/train/final.csv"), ','); 32 | 33 | nextLine = reader.readNext(); 34 | 35 | for (int i = 0; i < nextLine.length;i++) 36 | { 37 | headernew[i] = nextLine[i]; 38 | } 39 | 40 | headernew[headernew.length-1] = "action"; 41 | writer.writeNext(headernew); 42 | 43 | previousLine = reader.readNext(); 44 | 45 | 46 | while ((nextLine = reader.readNext()) != null) { 47 | // nextLine[] is an array of values from the line 48 | System.out.println(nextLine[0] + nextLine[1] + "etc..."); 49 | headernew = new String [nextLine.length + 1]; 50 | 51 | for (int i = 0; i < headernew.length-1;i++) 52 | { 53 | headernew[i] = nextLine[i]; 54 | } 55 | 56 | if ( 57 | Double.parseDouble(previousLine[4]) < Double.parseDouble(nextLine[4]) 58 | ) 59 | { 60 | headernew[headernew.length] = "SELL"; 61 | } else { 62 | headernew[headernew.length] = "BUY"; 63 | } 64 | 65 | writer.writeNext(headernew); 66 | 67 | previousLine = nextLine; 68 | 69 | 70 | } 71 | 72 | reader.close(); 73 | writer.close(); 74 | 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /mahout/chapter10/logisticregressionexample/src/test/java/com/packt/pml/mahout/logreg/LogisticRegressionTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Practical Machine learning 3 | * Logistic Regression Example 4 | * Chapter 10 5 | */ 6 | package com.packt.pml.mahout.logreg; 7 | 8 | import junit.framework.Test; 9 | import junit.framework.TestCase; 10 | import junit.framework.TestSuite; 11 | 12 | /** 13 | * Unit test for simple App. 
14 | */ 15 | public class LogisticRegressionTest 16 | extends TestCase 17 | { 18 | /** 19 | * Create the test case 20 | * 21 | * @param testName name of the test case 22 | */ 23 | public LogisticRegressionTest( String testName ) 24 | { 25 | super( testName ); 26 | } 27 | 28 | /** 29 | * @return the suite of tests being tested 30 | */ 31 | public static Test suite() 32 | { 33 | return new TestSuite( LogisticRegressionTest.class ); 34 | } 35 | 36 | /** 37 | * Rigourous Test :-) 38 | */ 39 | public void testLogisticRegression() 40 | { 41 | assertTrue( true ); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /mahout/chapter10/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mahout/chapter11/annexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mahout/chapter11/dlexample/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | EPFL 4 | DeepManuscriptLearning 5 | 0.0.1 6 | 7 | src 8 | 9 | 10 | maven-compiler-plugin 11 | 3.1 12 | 13 | 1.7 14 | 1.7 15 | 16 | 17 | 18 | 19 | 20 | 21 | org.apache.spark 22 | spark-core_2.10 23 | 1.3.0 24 | 25 | 26 | org.apache.spark 27 | spark-mllib_2.10 28 | 1.3.0 29 | 30 | 31 | com.google.protobuf 32 | protobuf-java 33 | 2.6.1 34 | 35 | 36 | junit 37 | junit-dep 38 | 4.8.2 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /mahout/chapter11/dlexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mahout/chapter11/dlexample/src/main/java/AutoencoderComputedParams.java: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Deep learning - Autoencoder example 3 | // Chapter 11 4 | 5 | package main.java; 6 | 7 | import java.io.Serializable; 8 | 9 | public class AutoencoderComputedParams implements Serializable { 10 | 11 | private long numSamples; 12 | private double[] sparsityArray; 13 | 14 | public AutoencoderComputedParams(long numSamples, double[] sparsityArray) { 15 | super(); 16 | this.numSamples = numSamples; 17 | this.sparsityArray = sparsityArray; 18 | } 19 | 20 | public long getNumSamples() { 21 | return numSamples; 22 | } 23 | 24 | public void setNumSamples(long numSamples) { 25 | this.numSamples = numSamples; 26 | } 27 | 28 | public double[] getSparsityArray() { 29 | return sparsityArray; 30 | } 31 | 32 | public void setSparsityArray(double[] sparsityArray) { 33 | this.sparsityArray = sparsityArray; 34 | } 35 | 36 | 37 | 38 | 39 | } 40 | -------------------------------------------------------------------------------- /mahout/chapter11/dlexample/src/main/java/AutoencoderFctGrd.java: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Deep learning - Autoencoder example 3 | // Chapter 11 4 | package main.java; 5 | 6 | import java.io.Serializable; 7 | 8 | import org.apache.spark.mllib.linalg.DenseMatrix; 9 | import org.apache.spark.mllib.linalg.DenseVector; 10 | 11 | public class AutoencoderFctGrd implements Serializable{ 12 | 13 | private DenseMatrix w1; 14 | private 
DenseMatrix w2; 15 | private DenseVector b1; 16 | private DenseVector b2; 17 | private double value; 18 | 19 | public AutoencoderFctGrd(DenseMatrix w1, DenseMatrix w2, DenseVector b1, DenseVector b2,double value) { 20 | this.w1 = w1; 21 | this.w2 = w2; 22 | this.b1 = b1; 23 | this.b2 = b2; 24 | this.value = value; 25 | } 26 | 27 | public DenseMatrix getW1() { 28 | return w1; 29 | } 30 | 31 | public void setW1(DenseMatrix w1) { 32 | this.w1 = w1; 33 | } 34 | 35 | public DenseMatrix getW2() { 36 | return w2; 37 | } 38 | 39 | public void setW2(DenseMatrix w2) { 40 | this.w2 = w2; 41 | } 42 | 43 | public DenseVector getB1() { 44 | return b1; 45 | } 46 | 47 | public void setB1(DenseVector b1) { 48 | this.b1 = b1; 49 | } 50 | 51 | public DenseVector getB2() { 52 | return b2; 53 | } 54 | 55 | public void setB2(DenseVector b2) { 56 | this.b2 = b2; 57 | } 58 | 59 | public double getValue() { 60 | return value; 61 | } 62 | 63 | public void setValue(double value) { 64 | this.value = value; 65 | } 66 | 67 | @Override 68 | public String toString() { 69 | String r = new String(); 70 | //r = "Contains: "+w1.toString()+" "+w2.toString()+" "+b1.toString()+" "+b2.toString()+" "+value+"\n"; 71 | r = "Contains: "+value+"\n"; 72 | return r; 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /mahout/chapter11/dlexample/src/main/java/AutoencoderLearner.java: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Deep learning - Autoencoder example 3 | // Chapter 11 4 | 5 | package main.java; 6 | 7 | import main.java.DeepModelSettings.ConfigBaseLayer; 8 | import main.java.DeepModelSettings.ConfigKMeans; 9 | 10 | import org.apache.spark.api.java.JavaRDD; 11 | import org.apache.spark.mllib.linalg.Vector; 12 | 13 | public class AutoencoderLearner implements Learner{ 14 | 15 | private AutoencoderConfig conf; 16 | 17 | public AutoencoderLearner(ConfigBaseLayer configLayer) { 18 | this.conf = new AutoencoderConfig(configLayer); 19 | } 20 | 21 | @Override 22 | public Vector[] call(JavaRDD data) throws Exception { 23 | return new Autoencoder(conf).train(data); 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /mahout/chapter11/dlexample/src/main/java/AutoencoderParams.java: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Deep learning - Autoencoder example 3 | // Chapter 11 4 | 5 | package main.java; 6 | 7 | import java.io.Serializable; 8 | 9 | import org.apache.spark.mllib.linalg.DenseMatrix; 10 | import org.apache.spark.mllib.linalg.DenseVector; 11 | 12 | public class AutoencoderParams implements Serializable { 13 | 14 | private DenseMatrix w1; 15 | private DenseMatrix w2; 16 | private DenseVector b1; 17 | private DenseVector b2; 18 | 19 | public AutoencoderParams(DenseMatrix w1, DenseMatrix w2, DenseVector b1, DenseVector b2) { 20 | this.w1 = w1; 21 | this.w2 = w2; 22 | this.b1 = b1; 23 | this.b2 = b2; 24 | } 25 | 26 | public DenseMatrix getW1() { 27 | return w1; 28 | } 29 | 30 | public void setW1(DenseMatrix w1) { 31 | this.w1 = w1; 32 | } 33 | 34 | public DenseMatrix getW2() { 35 | return w2; 36 | } 37 | 38 | public void setW2(DenseMatrix w2) { 39 | this.w2 = w2; 40 | } 41 | 42 | public DenseVector getB1() { 43 | return b1; 44 | } 45 | 46 | public void setB1(DenseVector b1) { 47 | this.b1 = b1; 48 | } 49 | 50 | public DenseVector getB2() { 51 | return b2; 52 | } 53 | 54 | 
public void setB2(DenseVector b2) { 55 | this.b2 = b2; 56 | } 57 | 58 | 59 | 60 | } 61 | -------------------------------------------------------------------------------- /mahout/chapter11/dlexample/src/main/java/AutoencoderSigmoid.java: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Deep learning - Autoencoder example 3 | // Chapter 11 4 | 5 | package main.java; 6 | 7 | public class AutoencoderSigmoid { 8 | 9 | 10 | //Thread safe singleton class since Spark is run one thread per Partition 11 | 12 | private static volatile AutoencoderSigmoid instance = null; 13 | 14 | 15 | 16 | private AutoencoderSigmoid(){ 17 | 18 | } 19 | 20 | public static AutoencoderSigmoid getInstance(){ 21 | 22 | if (instance == null){ 23 | synchronized (AutoencoderSigmoid.class) { 24 | if (instance == null){ 25 | instance = new AutoencoderSigmoid(); 26 | } 27 | } 28 | } 29 | return instance; 30 | } 31 | 32 | 33 | public static double getValue(double x){ 34 | int i = (int) Math.round(x*100); 35 | return values[i]; 36 | //alternatively make a 3 point average 37 | } 38 | //to complete with generated values 39 | private static double[] values = new double[]{-1.0,0.0,1.0}; 40 | } 41 | -------------------------------------------------------------------------------- /mahout/chapter11/dlexample/src/main/java/two_layers_autoencoders_model.prototxt: -------------------------------------------------------------------------------- 1 | config_layer { 2 | config_preprocess { 3 | eps_1 : 0.1 4 | eps_2 : 10 5 | } 6 | 7 | config_autoencoders { 8 | number_of_units : 700 9 | rho : 0.5 10 | lambda : 0.0001 11 | beta : 1 12 | numEpochs : 1 13 | numBatches : 2 14 | alpha_init : 0.05 15 | alpha_step : 2.0 16 | alpha_max_steps : 10 17 | } 18 | 19 | config_feature_extractor { 20 | input_dim1: 128 21 | input_dim2: 32 22 | feature_dim1: 32 23 | feature_dim2: 32 24 | } 25 | config_pooler { 26 | pool_size: 2 27 | } 28 | } 29 | 30 | config_layer { 31 | config_autoencoders { 32 | number_of_units : 700 33 | rho : 0.5 34 | lambda : 0.0001 35 | beta : 1 36 | numEpochs : 1 37 | numBatches : 2 38 | alpha_init : 0.05 39 | alpha_step : 2.0 40 | alpha_max_steps : 10 41 | } 42 | config_pooler { 43 | pool_size: 2 44 | } 45 | } -------------------------------------------------------------------------------- /mahout/chapter11/dlexample/src/test/java/RankTest.java: -------------------------------------------------------------------------------- 1 | package test.java; 2 | 3 | import java.io.Serializable; 4 | import java.util.ArrayList; 5 | import java.util.Arrays; 6 | import java.util.Comparator; 7 | import java.util.List; 8 | 9 | import main.java.ComputeSimilarity; 10 | import main.java.MatrixOps; 11 | 12 | import org.apache.commons.lang.ArrayUtils; 13 | import org.apache.spark.api.java.JavaRDD; 14 | import org.apache.spark.api.java.JavaSparkContext; 15 | import org.apache.spark.mllib.linalg.DenseMatrix; 16 | import org.apache.spark.mllib.linalg.Vector; 17 | import org.apache.spark.mllib.linalg.Vectors; 18 | import org.junit.After; 19 | import org.junit.Assert; 20 | import org.junit.Before; 21 | import org.junit.Ignore; 22 | import org.junit.Test; 23 | 24 | public class RankTest implements Serializable { 25 | 26 | /** 27 | * 28 | */ 29 | private static final long serialVersionUID = 8707243339400493968L; 30 | private transient JavaSparkContext sc; 31 | 32 | 33 | /** 34 | * @throws java.lang.Exception 35 | */ 36 | @Before 37 | public void setUp() throws Exception { 38 | sc = new 
JavaSparkContext("local", "FeatureExtractionTest"); 39 | } 40 | 41 | 42 | /** 43 | * @throws java.lang.Exception 44 | */ 45 | @After 46 | public void tearDown() throws Exception { 47 | sc.stop(); 48 | sc = null; 49 | } 50 | 51 | 52 | @Test @Ignore 53 | public void rankTest() { 54 | 55 | // simple example 56 | double[] x1 = {0.35, 0.65, 0.28, 0.12}; 57 | double[] x2 = {0.86, 0.96, 0.34, 0.57}; 58 | double[] query = {0.46, 0.92, 0.78, 0.34}; 59 | 60 | double[] expected_output = {0.955073918586867, 0.897967422096528}; 61 | 62 | Vector queryV = Vectors.dense(query); 63 | 64 | // create a parallel dataset from the local matrix 65 | List matX = new ArrayList(2); 66 | matX.add(Vectors.dense(x1)); 67 | matX.add(Vectors.dense(x2)); 68 | JavaRDD matRDD = sc.parallelize(matX); 69 | 70 | // compute cosine similarities 71 | JavaRDD sims = matRDD.map(new ComputeSimilarity(queryV)); 72 | 73 | final Double[] output = sims.collect().toArray(new Double[2]); 74 | final double[] outputD = ArrayUtils.toPrimitive(output); 75 | 76 | // sort the similarities and the indices 77 | final Integer[] idx = new Integer[2]; 78 | for (int i = 0; i < 2; i++) { 79 | idx[i] = i; 80 | } 81 | Arrays.sort(idx, new Comparator() { 82 | @Override 83 | public int compare(final Integer o1, final Integer o2) { 84 | return Double.compare(outputD[o1], outputD[o2]); 85 | } 86 | }); 87 | System.out.println("Sorted indices"); 88 | for (int i = 0; i < 2; i++) { 89 | System.out.println(idx[i]); 90 | } 91 | 92 | Assert.assertArrayEquals(expected_output, outputD, 1e-6); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /mahout/chapter11/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mahout/chapter12/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mahout/chapter12/rlexample/readme.md: -------------------------------------------------------------------------------- 1 | There are no direct APIs for implementing reinforcement learning. This is a placeholder folder and implementation details will be added as and when supporting APIs are made available with the maout distributions 2 | 3 | Reference to list of algorithms supported by mahout can be found here: 4 | https://mahout.apache.org/users/basics/algorithms.html 5 | -------------------------------------------------------------------------------- /mahout/chapter13/ensembleexample/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.packt.pml.mahout.ensemble 6 | ensemble 7 | 1.0-SNAPSHOT 8 | jar 9 | 10 | ensemble 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | org.apache.hadoop 26 | hadoop-core 27 | 0.20.2 28 | jar 29 | 30 | 31 | org.apache.mahout 32 | mahout-core 33 | 0.9 34 | jar 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /mahout/chapter13/ensembleexample/readme.md: -------------------------------------------------------------------------------- 1 | A ensemble of different distributed recommendation systems using Apache Mahout. 
2 | -------------------------------------------------------------------------------- /mahout/chapter13/ensembleexample/src/main/java/com/packt/pml/mahout/ensemble/ItemRecommender.java: -------------------------------------------------------------------------------- 1 | /** Practical Machine learning 2 | * Ensemble learning 3 | * Chapter 13 4 | **/ 5 | 6 | package com.packt.pml.mahout.ensemble; 7 | 8 | /* 9 | * A item based recommender model. 10 | */ 11 | 12 | import java.io.File; 13 | import java.io.IOException; 14 | 15 | import org.apache.mahout.cf.taste.common.TasteException; 16 | import org.apache.mahout.cf.taste.eval.RecommenderBuilder; 17 | import org.apache.mahout.cf.taste.impl.eval.RMSRecommenderEvaluator; 18 | import org.apache.mahout.cf.taste.impl.model.file.FileDataModel; 19 | import org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender; 20 | import org.apache.mahout.cf.taste.model.DataModel; 21 | import org.apache.mahout.cf.taste.recommender.Recommender; 22 | import org.apache.mahout.cf.taste.similarity.ItemSimilarity; 23 | import org.apache.mahout.cf.taste.impl.similarity.EuclideanDistanceSimilarity; 24 | import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity; 25 | 26 | class ItembasedBuilder implements RecommenderBuilder{ 27 | int k; 28 | ItemSimilarity similarity; 29 | 30 | ItembasedBuilder(int similarityMeasure, DataModel dataModel) throws TasteException{ 31 | 32 | if(similarityMeasure==0) 33 | similarity = new EuclideanDistanceSimilarity(dataModel); 34 | else 35 | similarity = new PearsonCorrelationSimilarity(dataModel); 36 | } 37 | public Recommender buildRecommender(DataModel dataModel) throws TasteException { 38 | return new GenericItemBasedRecommender(dataModel, similarity); 39 | } 40 | 41 | } 42 | 43 | public class ItemRecommender { 44 | 45 | public static void main(String args[]) throws IOException, TasteException{ 46 | 47 | DataModel model = new FileDataModel(new File("data/input/u1.base")); 48 | RMSRecommenderEvaluator evaluator = new RMSRecommenderEvaluator(); 49 | ItembasedBuilder builder; 50 | double score; 51 | builder = new ItembasedBuilder(1,model); 52 | score = evaluator.evaluate(builder, null, model, 0.8, 0.7); 53 | System.out.println(score); 54 | } 55 | } -------------------------------------------------------------------------------- /mahout/chapter13/ensembleexample/src/main/java/com/packt/pml/mahout/ensemble/SlopeOneBasedRecommender.java: -------------------------------------------------------------------------------- 1 | /** Practical Machine learning 2 | * Ensemble learning 3 | * Chapter 13 4 | **/ 5 | 6 | package com.packt.pml.mahout.ensemble; 7 | 8 | /* 9 | * A slope one based recommender model. 
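 * Slope One predicts a user's unknown rating for an item from the average
 * rating differences observed between that item and the items the user has
 * already rated; it is a deliberately simple and cheap collaborative filter.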
10 | */ 11 | 12 | import java.io.File; 13 | import java.io.IOException; 14 | 15 | import org.apache.mahout.cf.taste.common.TasteException; 16 | import org.apache.mahout.cf.taste.eval.RecommenderBuilder; 17 | import org.apache.mahout.cf.taste.impl.eval.RMSRecommenderEvaluator; 18 | import org.apache.mahout.cf.taste.impl.model.file.FileDataModel; 19 | import org.apache.mahout.cf.taste.impl.recommender.slopeone.SlopeOneRecommender; 20 | import org.apache.mahout.cf.taste.model.DataModel; 21 | import org.apache.mahout.cf.taste.recommender.Recommender; 22 | 23 | 24 | public class SlopeOneBasedRecommender { 25 | 26 | public static void main(String args[]) throws IOException, TasteException{ 27 | 28 | DataModel model = new FileDataModel(new File("data/input/u.data")); 29 | RMSRecommenderEvaluator evaluator = new RMSRecommenderEvaluator(); 30 | 31 | RecommenderBuilder builder = new RecommenderBuilder() { 32 | public Recommender buildRecommender(DataModel model)throws TasteException { 33 | SlopeOneRecommender slope = new SlopeOneRecommender(model); 34 | System.out.println(slope.recommend(199, 3).toString()); 35 | 36 | return slope; 37 | } 38 | }; 39 | 40 | double score = evaluator.evaluate(builder, null, model, 0.8, 0.7); 41 | System.out.println(score); 42 | } 43 | } 44 | 45 | /*OutPut 46 | * Score = 0.9507197266125407 47 | * [RecommendedItem[item:1175, value:7.0], RecommendedItem[item:1158, value:6.0], RecommendedItem[item:1026, value:5.7245636]] 48 | */ -------------------------------------------------------------------------------- /mahout/chapter13/ensembleexample/src/main/java/com/packt/pml/mahout/ensemble/Utilities.java: -------------------------------------------------------------------------------- 1 | /** Practical Machine learning 2 | * Ensemble learning 3 | * Chapter 13 4 | **/ 5 | 6 | package com.packt.pml.mahout.ensemble; 7 | 8 | /* 9 | * A collection of utility functions for working with the ensemble 10 | * 11 | */ 12 | import java.util.ArrayList; 13 | import java.util.HashMap; 14 | import java.util.Iterator; 15 | import java.util.List; 16 | import java.util.Set; 17 | 18 | class Utilities{ 19 | HashMap> hm; 20 | public Utilities() { 21 | // TODO Auto-generated constructor stub 22 | hm = new HashMap>(); 23 | } 24 | 25 | public void insert(Long item, Float value){ 26 | 27 | if(!hm.containsKey(item)) 28 | hm.put(item, new ArrayList()); 29 | hm.get(item).add(value); 30 | } 31 | 32 | public void show(){ 33 | System.out.println(hm); 34 | } 35 | 36 | public HashMap getAverage(){ 37 | HashMap result = new HashMap(); 38 | Set items = hm.keySet(); 39 | Iterator it = items.iterator(); 40 | float sum,avg; 41 | while(it.hasNext()){ 42 | Long key = it.next(); 43 | List values = hm.get(key); 44 | Iterator itv = values.iterator(); 45 | sum=0; 46 | while(itv.hasNext()) 47 | sum = sum + itv.next(); 48 | avg = sum/values.size(); 49 | result.put(key, new Float(avg)); 50 | } 51 | 52 | return result; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /mahout/chapter13/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mahout/chapter5/decisiontreeexample/readme.md: -------------------------------------------------------------------------------- 1 | Please refer to the RandomForest implementation as a reference. The DecisionFOrest API is used for this purpose, but with the following specific implementation lines.. 
2 | 3 | int numberOfTrees = 1; 4 | Data data = loadData(...); 5 | DecisionForest forest = buildForest(numberOfTrees, data); 6 | 7 | String path = "saved-trees/" + numberOfTrees + "-trees.txt"; 8 | DataOutputStream dos = new DataOutputStream(new FileOutputStream(path)); 9 | 10 | forest.write(dos); 11 | -------------------------------------------------------------------------------- /mahout/chapter5/randomforestexample/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.packt.pml.mahout.randomforest 6 | random-forest 7 | 1.0-SNAPSHOT 8 | jar 9 | 10 | random-forest 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | org.apache.mahout 26 | mahout-core 27 | 0.8-SNAPSHOT 28 | jar 29 | 30 | 31 | org.uncommons.maths 32 | uncommons-maths 33 | 1.2.2 34 | jar 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /mahout/chapter5/randomforestexample/readme.md: -------------------------------------------------------------------------------- 1 | This folder has an example implementation for random forest overed as a part of chapter 5 using Apache Mahout 0.9 distribution. 2 | -------------------------------------------------------------------------------- /mahout/chapter5/randomforestexample/src/test/java/com/packt/pml/mahout/randomforest/RandomForestTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Practical Machine learning 3 | * Random Forest Example 4 | * Chapter 05 5 | * @author sunilag 6 | */ 7 | package com.packt.pml.mahout.randomforest; 8 | 9 | import junit.framework.Test; 10 | import junit.framework.TestCase; 11 | import junit.framework.TestSuite; 12 | 13 | /** 14 | * Unit test for simple App. 15 | */ 16 | public class RandomForestTest 17 | extends TestCase 18 | { 19 | /** 20 | * Create the test case 21 | * 22 | * @param testName name of the test case 23 | */ 24 | public RandomForestTest( String testName ) 25 | { 26 | super( testName ); 27 | } 28 | 29 | /** 30 | * @return the suite of tests being tested 31 | */ 32 | public static Test suite() 33 | { 34 | return new TestSuite( RandomForestTest.class ); 35 | } 36 | 37 | /** 38 | * Rigourous Test :-) 39 | */ 40 | public void testRandomForest() 41 | { 42 | assertTrue( true ); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /mahout/chapter6/knnexample/readme.md: -------------------------------------------------------------------------------- 1 | There are no direct APIs for implementing KNN algorithm. 
A derived implementation is presented in this folder and implementation details will be added as and when supporting APIs are made available with the maout distributions 2 | 3 | Reference to list of algorithms supported by mahout can be found here: 4 | https://mahout.apache.org/users/basics/algorithms.html 5 | -------------------------------------------------------------------------------- /mahout/chapter6/knnexample/test/java/WeightedMatrixTest.java: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // K Nearest Neighbor example 3 | // Chapter 6 4 | 5 | package test.java; 6 | 7 | import main.java.KNearestNeighbor; 8 | 9 | import org.apache.spark.mllib.linalg.Matrix; 10 | import org.apache.spark.mllib.linalg.Vector; 11 | import org.apache.spark.mllib.linalg.Vectors; 12 | 13 | public class WeightedMatrixTest { 14 | 15 | public static void main(String[] args) { 16 | double testArray[][] = {{1,2,3,4,5,6},{1,2,3,4,5,7},{2,4,6,8,10,12},{2,3,4,5,6,7},{5,5,5,5,5,5}}; 17 | System.out.println(testArray.length); 18 | Vector[] vectors = new Vector[5]; 19 | 20 | for (int i = 0; i < 5; i++) { 21 | vectors[i] = Vectors.dense(testArray[i]); 22 | System.out.println("Vector " + i +": " + vectors[i].toArray()[0] + " " + vectors[i].toArray()[1] + " " + vectors[i].toArray()[2] + " " + vectors[i].toArray()[3] + " " + vectors[i].toArray()[4] + " " + vectors[i].toArray()[5] + " "); 23 | } 24 | KNearestNeighbor w = new KNearestNeighbor(vectors, 3,0,1,1); 25 | Matrix m = w.getWeightedMatrix(); 26 | System.out.println("Matrix:"); 27 | int size = m.numCols(); 28 | double[] mA = m.toArray(); 29 | for(int i=0; i 3 | 4.0.0 4 | 5 | com.packt.pml.mahout.fpgrowth 6 | fp-growth 7 | 1.0-SNAPSHOT 8 | jar 9 | 10 | fp-growth 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | org.apache.hadoop 26 | hadoop-core 27 | 0.20.2 28 | jar 29 | 30 | 31 | org.apache.mahout 32 | mahout-core 33 | 0.9 34 | jar 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /mahout/chapter7/fpgrowthexample/readme.md: -------------------------------------------------------------------------------- 1 | This folder has an example implementation for fp-growth algorithm using Apache Mahout 0.9 distribution covered as a part of chapter 7 association rule based learning methods 2 | -------------------------------------------------------------------------------- /mahout/chapter7/fpgrowthexample/src/test/java/com/packt/pml/mahout/fpgrowth/FPgrowthTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Practical Machine Learning 3 | * Aoocition rule based learning - FP growth example 4 | * Chapter 07 5 | **/ 6 | package com.packt.pml.mahout.fpgrowth; 7 | 8 | import junit.framework.Test; 9 | import junit.framework.TestCase; 10 | import junit.framework.TestSuite; 11 | 12 | /** 13 | * Unit test for simple App. 
14 | */ 15 | public class FPgrowthTest 16 | extends TestCase 17 | { 18 | /** 19 | * Create the test case 20 | * 21 | * @param testName name of the test case 22 | */ 23 | public FPgrowthTest( String testName ) 24 | { 25 | super( testName ); 26 | } 27 | 28 | /** 29 | * @return the suite of tests being tested 30 | */ 31 | public static Test suite() 32 | { 33 | return new TestSuite( FPgrowthTest.class ); 34 | } 35 | 36 | /** 37 | * Rigourous Test :-) 38 | */ 39 | public void testApp() 40 | { 41 | assertTrue( true ); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /mahout/chapter8/k-meansexample/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.packt.pml.mahout.clustering 6 | k-means 7 | 1.0-SNAPSHOT 8 | jar 9 | 10 | k-means 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | org.apache.hadoop 26 | hadoop-core 27 | 0.20.2 28 | jar 29 | 30 | 31 | org.apache.mahout 32 | mahout-core 33 | 0.9 34 | jar 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /mahout/chapter8/k-meansexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mahout/chapter8/k-meansexample/src/main/java/com/packt/pml/mahout/kmeans/DataPreprocessing.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Practical Machine learning 3 | * Clustering based learning - K-means clustering Example 4 | * Chapter 8 5 | */ 6 | package com.packt.pml.mahout.kmeans; 7 | 8 | import java.io.IOException; 9 | 10 | import org.apache.hadoop.conf.Configuration; 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.fs.FileSystem; 13 | 14 | import chapter7.src.InputDriver; 15 | 16 | public class DataPreprocessing { 17 | 18 | public static void main(String args[]) throws ClassNotFoundException, IOException, InterruptedException 19 | { 20 | 21 | Configuration conf = new Configuration(); 22 | conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml")); 23 | conf.addResource(new Path("/usr/local/hadoop/conf/hdfs-site.xml")); 24 | 25 | //create the file system object and pass the configuration object 26 | FileSystem fileSystem = FileSystem.get(conf); 27 | //We then create the input and output Path Objects. 
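        // The steps below: point the job at the raw clustering input directory,
        // remove any previously generated sequence-file output, and run InputDriver
        // to encode each record as a RandomAccessSparseVector for Mahout's k-means job.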
28 | 29 | 30 | //define the input and sequence file directory 31 | String inputPath="chapter7/clustering_input"; 32 | String inputSeq="clustering_seq"; 33 | 34 | Path inputDir = new Path(inputPath); 35 | Path inputSeqDir = new Path(inputSeq); 36 | 37 | if (fileSystem.exists(inputSeqDir)) { 38 | System.out.println("Output already exists"); 39 | fileSystem.delete(inputSeqDir, true); 40 | System.out.println("deleted output directory"); 41 | } 42 | 43 | //The last step is to encode the vectors using the //RandomAccessSparseVector 44 | InputDriver.runJob(inputDir, inputSeqDir, 45 | "org.apache.mahout.math.RandomAccessSparseVector",conf); 46 | 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /mahout/chapter8/k-meansexample/src/test/java/com/packt/pml/mahout/kmeans/KMeansTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Practical Machine learning 3 | * Clustering based learning - K-means clustering Example 4 | * Chapter 8 5 | */ 6 | package com.packt.pml.mahout.kmeans; 7 | 8 | import junit.framework.Test; 9 | import junit.framework.TestCase; 10 | import junit.framework.TestSuite; 11 | 12 | /** 13 | * Unit test for simple App. 14 | */ 15 | public class KMeansTest 16 | extends TestCase 17 | { 18 | /** 19 | * Create the test case 20 | * 21 | * @param testName name of the test case 22 | */ 23 | public KMeansTest( String testName ) 24 | { 25 | super( testName ); 26 | } 27 | 28 | /** 29 | * @return the suite of tests being tested 30 | */ 31 | public static Test suite() 32 | { 33 | return new TestSuite( KMeansTest.class ); 34 | } 35 | 36 | /** 37 | * Rigourous Test :-) 38 | */ 39 | public void testApp() 40 | { 41 | assertTrue( true ); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /mahout/chapter8/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mahout/chapter9/naivebayesexample/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.packt.pml.mahout.naivebayes 6 | naive-bayes 7 | 1.0-SNAPSHOT 8 | jar 9 | 10 | naive-bayes 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | org.apache.mahout 26 | mahout-core 27 | 0.9 28 | jar 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /mahout/chapter9/naivebayesexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mahout/chapter9/naivebayesexample/src/main/java/start.sh: -------------------------------------------------------------------------------- 1 | export WORK_DIR=/chapter9/mahout/naive-bayes 2 | mkdir $WORK_DIR 3 | cd $WORK_DIR 4 | wget http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz 5 | tar –xvzf 20news-bydate.tar.gz 6 | mkdir ${WORK_DIR}/20news-all 7 | mkdir ${WORK_DIR}/20news-seq 8 | cp -R ${WORK_DIR}/20news-bydate*/*/* ${WORK_DIR}/20news-all 9 | 10 | 11 | mahout seqdirectory -i ${WORK_DIR}/20news-all -o ${WORK_DIR}/20news-seq -ow 12 | 13 | mahout seq2sparse -i ${WORK_DIR}/20news-seq -o ${WORK_DIR}/20news-vectors -lnorm -nv -wt tfidf -ow 14 | 15 | mahout split -i ${WORK_DIR}/20news-vectors/tfidf-vectors --trainingOutput 
${WORK_DIR}/20news-train-vectors --testOutput ${WORK_DIR}/20news-test-vectors --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential -------------------------------------------------------------------------------- /mahout/chapter9/naivebayesexample/src/test/java/com/packt/pml/mahout/naivebayes/NaiveBayesTest.java: -------------------------------------------------------------------------------- 1 | package com.packt.pml.mahout.naivebayes; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 9 | */ 10 | public class NaiveBayesTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public NaiveBayesTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( NaiveBayesTest.class ); 29 | } 30 | 31 | /** 32 | * Rigourous Test :-) 33 | */ 34 | public void testNaiveBayes() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /mahout/chapter9/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter10/linearregressionexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter10/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter11/annexample/ann.py: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Deep learning - Artificial Neural Networks example 3 | # Chapter 11 4 | 5 | from pybrain.supervised.trainers import BackpropTrainer 6 | from pybrain.tools.shortcuts import buildNetwork 7 | from pybrain.datasets import SupervisedDataSet 8 | from pybrain.structure import SigmoidLayer 9 | 10 | network = buildNetwork(2, 5, 1, hiddenclass=SigmoidLayer) 11 | 12 | data_set = SupervisedDataSet(2, 1) 13 | data_set.addSample((0, 0), [0]) 14 | data_set.addSample((0, 1), [1]) 15 | data_set.addSample((1, 0), [1]) 16 | data_set.addSample((1, 1), [0]) 17 | 18 | trainer = BackpropTrainer(module=network, dataset=data_set, momentum=0.00, learningrate=0.10, weightdecay=0.0, 19 | lrdecay=1.0) 20 | 21 | error = 1 22 | epochsToTrain = 0 23 | while error > 0.0001: 24 | epochsToTrain += 1 25 | error = trainer.train() 26 | 27 | results = network.activateOnDataset(data_set) 28 | for i in range(len(results)): 29 | print data_set['input'][i][0], 'xor', data_set['input'][i][1], '=', int(results[i] > 0.5) 30 | 31 | """ 32 | 0.0 xor 0.0 = 0 33 | 0.0 xor 1.0 = 1 34 | 1.0 xor 0.0 = 1 35 | 1.0 xor 1.0 = 0 36 | """ 37 | 38 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter11/annexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter11/dlexample/example-1-data.ods: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/python-sckit-learn/chapter11/dlexample/example-1-data.ods -------------------------------------------------------------------------------- /python-sckit-learn/chapter11/dlexample/perceptron-data.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/python-sckit-learn/chapter11/dlexample/perceptron-data.ods -------------------------------------------------------------------------------- /python-sckit-learn/chapter11/dlexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter11/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter12/readme.md: -------------------------------------------------------------------------------- 1 | Reference from Andrew Ng example for reinforcement learning: 2 | http://cs229.stanford.edu/notes/cs229-notes12.pdf 3 | 4 | scikit-learn provides excellent tools for supervised and unsupervised learning but explicitly does not deal with reinforcement learning. 5 | This example implementation is intended to compliment the functionality of scikit-learn. 6 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter12/rlexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter13/ensembleexample/ensemble_predict.py: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Ensemble learning 3 | # Chapter 13 4 | 5 | """ 6 | ========================================================== 7 | Prediction utility for trained EnsembleSelectionClassifier 8 | ========================================================== 9 | 10 | Get predictions from trained EnsembleSelectionClassifier given 11 | svm format data file. 12 | 13 | Can output predicted classes or probabilities from the full 14 | ensemble or just the best model. 15 | 16 | Expects to find a trained ensemble in the sqlite db specified. 
17 | 18 | usage: ensemble_predict.py [-h] [-s {best,ens}] [-p] db_file data_file 19 | 20 | Get EnsembleSelectionClassifier predictions 21 | 22 | positional arguments: 23 | db_file sqlite db file containing model 24 | data_file testing data in svm format 25 | 26 | optional arguments: 27 | -h, --help show this help message and exit 28 | -s {best,ens} choose source of prediction ["best", "ens"] 29 | -p predict probabilities 30 | """ 31 | from __future__ import print_function 32 | 33 | import numpy as np 34 | 35 | from argparse import ArgumentParser 36 | 37 | from sklearn.datasets import load_svmlight_file 38 | 39 | from ensemble import EnsembleSelectionClassifier 40 | 41 | 42 | def parse_args(): 43 | desc = 'Get EnsembleSelectionClassifier predictions' 44 | parser = ArgumentParser(description=desc) 45 | 46 | parser.add_argument('db_file', help='sqlite db file containing model') 47 | parser.add_argument('data_file', help='testing data in svm format') 48 | 49 | help_fmt = 'choose source of prediction ["best", "ens"] (default "ens")' 50 | parser.add_argument('-s', dest='pred_src', 51 | choices=['best', 'ens'], 52 | help=help_fmt, default='ens') 53 | 54 | parser.add_argument('-p', dest='return_probs', 55 | action='store_true', default=False, 56 | help='predict probabilities') 57 | 58 | return parser.parse_args() 59 | 60 | 61 | if (__name__ == '__main__'): 62 | res = parse_args() 63 | 64 | X, _ = load_svmlight_file(res.data_file) 65 | X = X.toarray() 66 | 67 | ens = EnsembleSelectionClassifier(db_file=res.db_file, models=None) 68 | 69 | if (res.pred_src == 'best'): 70 | preds = ens.best_model_predict_proba(X) 71 | else: 72 | preds = ens.predict_proba(X) 73 | 74 | if (not res.return_probs): 75 | preds = np.argmax(preds, axis=1) 76 | 77 | for p in preds: 78 | if (res.return_probs): 79 | mesg = " ".join(["%.5f" % v for v in p]) 80 | else: 81 | mesg = p 82 | 83 | print(mesg) 84 | 85 | # Original Author: David C. Lambert [dcl -at- panix -dot- com] 86 | # Copyright(c) 2013 87 | # License: Simple BSD -------------------------------------------------------------------------------- /python-sckit-learn/chapter13/ensembleexample/readme.md: -------------------------------------------------------------------------------- 1 | ######An implementation of [Caruana et al's Ensemble Selection algorithm] (http://www.cs.cornell.edu/~caruana/ctp/ct.papers/caruana.icml04.icdm06long.pdf) [1][2] in Python, based on [scikit-learn](http://scikit-learn.org). 
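A typical invocation of the `ensemble_predict.py` utility above, assuming an ensemble already trained into an sqlite db and a test set in svmlight format (both file names below are placeholders):

```
# full-ensemble class probabilities
python ensemble_predict.py -s ens -p ensemble.db test.svm

# hard class predictions from the single best model only
python ensemble_predict.py -s best ensemble.db test.svm
```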
2 | 3 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter13/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter5/decisiontreeexample/data/ad.names: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/python-sckit-learn/chapter5/decisiontreeexample/data/ad.names -------------------------------------------------------------------------------- /python-sckit-learn/chapter5/decisiontreeexample/information-gain.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/python-sckit-learn/chapter5/decisiontreeexample/information-gain.ods -------------------------------------------------------------------------------- /python-sckit-learn/chapter5/randomforstexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter5/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter6/knnexample/iris_data/README.md: -------------------------------------------------------------------------------- 1 | # Iris Dataset 2 | 3 | This Dataset has been obtained from [UCI ML Repository](http://archive.ics.uci.edu/ml/datasets/Iris). 4 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter6/knnexample/knn_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/python-sckit-learn/chapter6/knnexample/knn_example.png -------------------------------------------------------------------------------- /python-sckit-learn/chapter6/knnexample/readme.md: -------------------------------------------------------------------------------- 1 | # kNN using the scikit-learn package 2 | A basic k-Nearest Neighbour implementation in python using the scikitlearn package. 3 | 4 | # Usage 5 | Look for code in `demo.py`. 
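A minimal sketch of the scikit-learn kNN workflow this example follows (a hypothetical snippet for illustration, not the contents of `demo.py`; it assumes the bundled iris data and the pre-0.18 `sklearn.cross_validation` module used elsewhere in this repository):

```python
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.25, random_state=0)

# fit a 5-nearest-neighbour classifier and report held-out accuracy
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))
```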
6 | 7 | # Dependencies 8 | - Numpy 9 | - Matplotlib 10 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter6/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter6/svmexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter6/svmexample/svm.py: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Support Vector Machines example 3 | # Chapter 6 4 | 5 | # Example: ExImage Recognition with Support Vector Machines 6 | 7 | import sklearn as sk 8 | import numpy as np 9 | import matplotlib 10 | import matplotlib.pyplot as plt 11 | 12 | # print 'IPython version:', IPython.__version__ 13 | # print 'numpy version:', np.__version__ 14 | # print 'scikit-learn version:', sk.__version__ 15 | # print 'matplotlib version:', matplotlib.__version__ 16 | from sklearn.datasets import fetch_olivetti_faces 17 | 18 | # fetch the faces data 19 | faces = fetch_olivetti_faces() 20 | 21 | # print faces.DESCR 22 | 23 | def print_faces(images, target, top_n): 24 | # set up the figure size in inches 25 | fig = plt.figure(figsize=(12, 12)) 26 | fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05) 27 | for i in range(top_n): 28 | # plot the images in a matrix of 20x20 29 | p = fig.add_subplot(20, 20, i + 1, xticks=[], yticks=[]) 30 | p.imshow(images[i], cmap=plt.cm.bone) 31 | 32 | # label the image with the target value 33 | p.text(0, 14, str(target[i])) 34 | p.text(0, 60, str(i)) 35 | 36 | print_faces(faces.images, faces.target, 20) 37 | print_faces(faces.images, faces.target, 400) 38 | 39 | # Build training and testing sets 40 | from sklearn.svm import SVC 41 | svc_1 = SVC(kernel='linear') 42 | from sklearn.cross_validation import train_test_split 43 | 44 | X_train, X_test, y_train, y_test = train_test_split( 45 | faces.data, faces.target, test_size=0.25, random_state=0) 46 | 47 | # Perform 5-fold cross-validation 48 | from sklearn.cross_validation import cross_val_score, KFold 49 | from scipy.stats import sem 50 | 51 | def evaluate_cross_validation(clf, X, y, K): 52 | # create a k-fold croos validation iterator 53 | cv = KFold(len(y), K, shuffle=True, random_state=0) 54 | # by default the score used is the one returned by score method of the estimator (accuracy) 55 | scores = cross_val_score(clf, X, y, cv=cv) 56 | print scores 57 | print ("Mean score: {0:.3f} (+/-{1:.3f})").format( 58 | np.mean(scores), sem(scores)) 59 | 60 | evaluate_cross_validation(svc_1, X_train, y_train, 5) 61 | 62 | # measure precision and recall on the evaluation set, for each class. 
63 | train_and_evaluate(svc_1, X_train, X_test, y_train, y_test) -------------------------------------------------------------------------------- /python-sckit-learn/chapter6/svmexample/svm_test.py: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Support Vector Machines example 3 | # Chapter 6 4 | 5 | #SVM test code 6 | import sklearn as sk 7 | import numpy as np 8 | import matplotlib 9 | import matplotlib.pyplot as plt 10 | 11 | # the index ranges of images of people with glasses 12 | glasses = [ 13 | (10, 19), (30, 32), (37, 38), (50, 59), (63, 64), 14 | (69, 69), (120, 121), (124, 129), (130, 139), (160, 161), 15 | (164, 169), (180, 182), (185, 185), (189, 189), (190, 192), 16 | (194, 194), (196, 199), (260, 269), (270, 279), (300, 309), 17 | (330, 339), (358, 359), (360, 369) 18 | ] 19 | 20 | def create_target(segments): 21 | # create a new y array of target size initialized with zeros 22 | y = np.zeros(faces.target.shape[0]) 23 | # put 1 in the specified segments 24 | for (start, end) in segments: 25 | y[start:end + 1] = 1 26 | return y 27 | 28 | 29 | target_glasses = create_target(glasses) 30 | 31 | X_train, X_test, y_train, y_test = train_test_split( 32 | faces.data, target_glasses, test_size=0.25, random_state=0) 33 | 34 | svc_2 = SVC(kernel='linear') 35 | evaluate_cross_validation(svc_2, X_train, y_train, 5) 36 | train_and_evaluate(svc_2, X_train, X_test, y_train, y_test) 37 | 38 | X_test = faces.data[30:40] 39 | y_test = target_glasses[30:40] 40 | 41 | print y_test.shape[0] 42 | 43 | select = np.ones(target_glasses.shape[0]) 44 | select[30:40] = 0 45 | X_train = faces.data[select == 1] 46 | y_train = target_glasses[select == 1] 47 | 48 | print y_train.shape[0] 49 | 50 | svc_3 = SVC(kernel='linear') 51 | train_and_evaluate(svc_3, X_train, X_test, y_train, y_test) 52 | y_pred = svc_3.predict(X_test) 53 | 54 | eval_faces = [np.reshape(a, (64, 64)) for a in X_test] 55 | print_faces(eval_faces, y_pred, 10) -------------------------------------------------------------------------------- /python-sckit-learn/chapter7/aprioriexample/readme.md: -------------------------------------------------------------------------------- 1 | This folder conatins python Implementation of Apriori Algorithm 2 | 3 | The dataset is a copy of the “Online directory of certified businesses with a detailed profile” file from the Small Business Services (SBS) 4 | dataset in the `NYC Open Data Sets `_ -------------------------------------------------------------------------------- /python-sckit-learn/chapter7/fpgrowthexample/data/numeric.csv: -------------------------------------------------------------------------------- 1 | 1,2,3,4,5 2 | 1,2,4 3 | 2,3,5 4 | 3,4,5 5 | 1,2,3,4 6 | 1,2,3 -------------------------------------------------------------------------------- /python-sckit-learn/chapter7/fpgrowthexample/data/tsk.csv: -------------------------------------------------------------------------------- 1 | a,b 2 | b,c,d 3 | a,c,d,e 4 | a,d,e 5 | a,b,c 6 | a,b,c,d 7 | a 8 | a,b,c 9 | a,b,d 10 | b,c,e 11 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter7/fpgrowthexample/readme.md: -------------------------------------------------------------------------------- 1 | This folder provides an implementation for fpgrowth using vanilla python libraries and not scikit-learn packages 2 | -------------------------------------------------------------------------------- 
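The transaction files under `data/` shown above (`numeric.csv`, `tsk.csv`) are plain comma-separated baskets, one transaction per line. A small sketch of how such a file could be read into the list-of-itemsets form an FP-growth miner typically consumes (the real loader lives in `common.py`/`fpgrowth.py` and may differ):

```python
import csv

def load_transactions(path):
    # one transaction per row; keep the non-empty items
    with open(path) as f:
        return [[item for item in row if item] for row in csv.reader(f)]

transactions = load_transactions("data/tsk.csv")
print(transactions[0])  # first basket in tsk.csv, i.e. ['a', 'b']
```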
/python-sckit-learn/chapter8/k-meansexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter8/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter9/naivebayesexample/data-types.py: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Bayesian learning - Naive Bayes example 3 | # Chapter 9 4 | 5 | from collections import namedtuple 6 | 7 | """ 8 | This module defines the datatypes used by the other modules. 9 | """ 10 | 11 | Dataset = namedtuple("Dataset", ["data", "target"] ) 12 | 13 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter9/naivebayesexample/feature-selection.py: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Bayesian learning - Naive Bayes example 3 | # Chapter 9 4 | 5 | from datatypes import Dataset 6 | 7 | from sklearn.feature_selection import SelectKBest, f_classif 8 | from sklearn.lda import LDA 9 | from sklearn.qda import QDA 10 | from sklearn.decomposition import PCA 11 | 12 | def univariate_feature_selection(ds, n): 13 | """ 14 | Selects 'n' features in the dataset. Returns the Reduced Dataset 15 | n (int), ds (Dataset) -> Dataset 16 | """ 17 | 18 | selector = SelectKBest(f_classif, n) 19 | selector.fit(ds.data, ds.target) 20 | features = selector.get_support(indices=True) 21 | return Dataset(selector.transform(ds.data), ds.target) 22 | 23 | def lda(ds, n): 24 | ''' 25 | Outputs the projection of the data in the best 26 | discriminant dimension. 27 | Maximum of 2 dimensions for our binary case (values of n greater than this will be ignored by sklearn) 28 | ''' 29 | selector = LDA(n_components=n) 30 | selector.fit(ds.data, ds.target) 31 | new_data = selector.transform(ds.data) 32 | return Dataset(new_data, ds.target) 33 | 34 | def pca(ds,n): 35 | ''' 36 | Uses the PCA classifier to reduces the dimensionality by choosing the n lastest elements 37 | of the transform. 38 | ''' 39 | selector = PCA() 40 | selector.fit(ds.data, ds.target) 41 | new_data = selector.transform(ds.data)[:, :-n] 42 | return Dataset(new_data, ds.target) 43 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter9/naivebayesexample/readme.md: -------------------------------------------------------------------------------- 1 | Naive Bayes Classifier implementation with scikit-learn with the spambase dataset. 
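A minimal sketch of the kind of scikit-learn workflow this example uses (assuming a local comma-separated `spambase.data` file with the spam label in the last column, which is how the UCI spambase set ships; the scripts in this folder may organize the steps differently):

```python
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import train_test_split

data = np.loadtxt("spambase.data", delimiter=",")
X, y = data[:, :-1], data[:, -1]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)

# fit Gaussian Naive Bayes and report held-out accuracy
clf = GaussianNB()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
```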
2 | 3 | -------------------------------------------------------------------------------- /python-sckit-learn/chapter9/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python-sckit-learn/data/titanic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/python-sckit-learn/data/titanic.png -------------------------------------------------------------------------------- /python-sckit-learn/readme.md: -------------------------------------------------------------------------------- 1 | # Introduction to Machine Learning with Scikit-Learn 2 | 3 | **Code & Data for Introduction to Machine Learning with Scikit-Learn** 4 | 5 | [](http://scikit-learn.org/stable/tutorial/machine_learning_map/) 6 | 7 | ## Installing Scikit-Learn with pip 8 | 9 | See the full [installation instructions](http://scikit-learn.org/stable/install.html) for more details; these are provided for convenience only. 10 | 11 | Scikit-Learn requires: 12 | 13 | - Python >= 2.6 or >= 3.3 14 | - Numpy >= 1.6.1 15 | - SciPy >= 0.9 16 | 17 | Once you have installed `pip` (the python package manager): 18 | 19 | ### Mac OS X 20 | 21 | This should be super easy: 22 | 23 | pip install -U numpy scipy scikit-learn 24 | 25 | Now just wait! Also, you have no excuse not to do this in a virtualenv. 26 | 27 | ### Windows 28 | 29 | Install [numpy](http://numpy.scipy.org/) and [scipy](http://www.scipy.org/) with their official installers. You can then use PyPi to install scikit-learn: 30 | 31 | pip install -U scikit-learn 32 | 33 | If you're having trouble, consider one of the unofficial windows installers or anacondas (see the Scikit-Learn page for more). 34 | 35 | ### Ubuntu Linux 36 | 37 | Unfortunately there are no official binary packages for Linux. First install the build dependencies: 38 | 39 | sudo apt-get install build-essential python-dev python-setuptools \ 40 | python-numpy python-scipy \ 41 | libatlas-dev libatlas3gf-base 42 | 43 | Then you can build (hopefully) Scikit-learn with pip: 44 | 45 | pip install --user --install-option="--prefix=" -U scikit-learn 46 | 47 | Keep in mind however, that there are other dependencies and might be issues with ATLAS and BLAS - see the official installation for more. 
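Whichever platform you are on, a quick sanity check that the stack is importable and which versions you ended up with (not part of the original instructions, just a convenience):

    python -c "import numpy, scipy, sklearn; print(numpy.__version__, scipy.__version__, sklearn.__version__)"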
48 | -------------------------------------------------------------------------------- /r/chapter10/linearregressionexample/Rplots.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter10/linearregressionexample/Rplots.pdf -------------------------------------------------------------------------------- /r/chapter10/linearregressionexample/linearregression.R: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Regression Analysis - Linear Regression 3 | # Chapter 10 4 | 5 | 6 | ins <- read.csv("insurance.csv", stringsAsFactors = TRUE) 7 | 8 | str(ins) 9 | summary(ins$charges) 10 | hist(ins$charges) 11 | 12 | table(ins$region) 13 | 14 | cor(ins[c("age", "bmi", "children", "charges")]) 15 | 16 | pairs(ins[c("age", "bmi", "children", "charges")]) 17 | 18 | library(psych) 19 | 20 | pairs.panels(ins[c("age", "bmi", "children", "charges")]) 21 | 22 | 23 | ins_model <- lm(charges ~ age + children + bmi + sex + smoker + region, data = ins) 24 | 25 | ins_model <- lm(charges ~ ., data = ins) 26 | 27 | summary(ins_model) 28 | 29 | 30 | ins$age2 <- ins$age^2 31 | 32 | ins$bmi30 <- ifelse(ins$bmi >= 30, 1, 0) 33 | 34 | ins_model2 <- lm(charges ~ age + age2 + children + bmi + sex + bmi30*smoker + region, data = ins) 35 | 36 | summary(ins_model2) 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /r/chapter10/linearregressionexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter10/logisticregressionexample/dataset1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter10/logisticregressionexample/dataset1.txt -------------------------------------------------------------------------------- /r/chapter10/logisticregressionexample/dataset2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter10/logisticregressionexample/dataset2.txt -------------------------------------------------------------------------------- /r/chapter10/logisticregressionexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter10/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter11/annexample/Rplots.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter11/annexample/Rplots.pdf -------------------------------------------------------------------------------- /r/chapter11/annexample/Rplots1.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter11/annexample/Rplots1.pdf -------------------------------------------------------------------------------- /r/chapter11/annexample/ann.R: -------------------------------------------------------------------------------- 1 | # Practical Machine Learning 2 | # Neural Networks (to predict the strength of concrete) 3 | # Chapter 11 4 | 5 | conc <- read.csv("concrete.csv") 6 | str(conc) 7 | 8 | 9 | normalize <- function(x) { 10 | return((x - min(x))/(max(x) - min(x))) 11 | } 12 | 13 | 14 | conc_norm <- as.data.frame(lapply(conc, normalize)) 15 | 16 | summary(conc_norm$strength) 17 | summary(conc$strength) 18 | 19 | 20 | conc_train <- conc_norm[1:773, ] 21 | conc_test <- conc_norm[774:1030, ] 22 | 23 | library(neuralnet) 24 | 25 | # deault: 1 hidden nodes 26 | conc_model <- neuralnet(strength ~ cement + slag + ash + water + superplastic + coarseagg + fineagg + age, data = conc_train) 27 | plot(conc_model) 28 | 29 | model_results <- compute(conc_model, conc_test[1:8]) 30 | 31 | pred_strength <- model_results$net.result 32 | 33 | cor(pred_strength, conc_test$strength) 34 | 35 | 36 | # 5 hidden layers 37 | conc_model2 <- neuralnet(strength ~ cement + slag + ash + water + superplastic + coarseagg + fineagg + age, data = conc_train, hidden = 5) 38 | plot(conc_model2) 39 | 40 | model_results2 <- compute(conc_model2, conc_test[1:8]) 41 | 42 | pred_strength2 <- model_results2$net.result 43 | 44 | cor(pred_strength2, conc_test$strength) 45 | 46 | -------------------------------------------------------------------------------- /r/chapter11/annexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter11/dlexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter11/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter12/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter12/rlexample/Results.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/Results.pdf -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlaci.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlaci.zip -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: qlearning 2 | Type: Package 3 | Title: Q-Learning: A Data Analysis Method for Constructing Adaptive 4 | Interventions 5 | Version: 2.0 6 | Date: 2012-01-16 7 | Author: Min Qian, Inbal Nahum-Shani, Ashkan Ertefaie, Amarpreet Kaur and Susan A. 
Murphy 8 | Maintainer: Min Qian 9 | Description: The qlearning package uses q-learning method to analyze data from a SMART trial, to inform the construction of high-quality adaptive interventions. See Nahum-Shani et al. (2010) Q-Learning: A Data Analysis Method for Constructing Adaptive Interventions for more details. 10 | Depends: R(>= 2.12.0) 11 | License: GPL (version 2 or later) 12 | Packaged: 2012-01-16 16:35:40 UTC; min 13 | Built: R 2.14.1; ; 2012-01-16 16:35:51 UTC; windows 14 | -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/INDEX: -------------------------------------------------------------------------------- 1 | DataEx An example data set collected from a SMART 2 | trial 3 | qlearning Q-Learning: A Data Analysis Method for 4 | Constructing Adaptive Interventions 5 | -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/MD5: -------------------------------------------------------------------------------- 1 | be2a63cdb1381190e83b83026e2a3e56 *DESCRIPTION 2 | 1db07346d2729caed9473132bd1ef715 *INDEX 3 | fdfc9b6890d9cefb85bd7af2188d7ab4 *Meta/Rd.rds 4 | f2f59fc5c8c7f3884203e050db360fb5 *Meta/data.rds 5 | 09602eeaeca4b9942ea7487e10034e9b *Meta/hsearch.rds 6 | 311171ce88715c62d1ba8ce79ec59b7d *Meta/links.rds 7 | c43efce1f9e580a609ff991e5bf452d6 *Meta/nsInfo.rds 8 | 44bee8c0ebb6a5b7d963b96dc99190bf *Meta/package.rds 9 | df390c53434517b304ac5db487184641 *NAMESPACE 10 | 240d28d145138a75831809e31a480bad *R/qlearning 11 | b877002c67ec87e7354dff3fb8a40c61 *R/qlearning.rdb 12 | 7efa81702a1b5f6229018ce17df72e2b *R/qlearning.rdx 13 | dc2d1ebe7a40fbd6d8359c3fd99686ae *data/DataEx.RData 14 | 13a2ace3051dd4135315513fedac1985 *help/AnIndex 15 | ddcd8f1047bf7890c0bfdf0d141837b7 *help/aliases.rds 16 | 4d3736ed70bd147d49eeb12c91202084 *help/paths.rds 17 | df801dd385d4c58da7fa648c46f222ca *help/qlearning.rdb 18 | b8290f01c2be1ce797d6d647c3e09a43 *help/qlearning.rdx 19 | 8b319481a8c2dc47aad8462c0e98f856 *html/00Index.html 20 | 444535b9cb76ddff1bab1e1865a3fb14 *html/R.css 21 | -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/Meta/Rd.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/Meta/Rd.rds -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/Meta/data.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/Meta/data.rds -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/Meta/hsearch.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/Meta/hsearch.rds -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/Meta/links.rds: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/Meta/links.rds -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/Meta/nsInfo.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/Meta/nsInfo.rds -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/Meta/package.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/Meta/package.rds -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/NAMESPACE: -------------------------------------------------------------------------------- 1 | exportPattern("^[[:alpha:]]+") 2 | -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/R/qlearning: -------------------------------------------------------------------------------- 1 | local({ 2 | info <- loadingNamespaceInfo() 3 | ns <- .Internal(getRegisteredNamespace(as.name(info$pkgname))) 4 | if (is.null(ns)) 5 | stop("cannot find namespace environment"); 6 | barepackage <- sub("([^-]+)_.*", "\\1", info$pkgname) 7 | dbbase <- file.path(info$libname, info$pkgname, "R", barepackage) 8 | lazyLoad(dbbase, ns, filter = function(n) n != ".__NAMESPACE__.") 9 | }) 10 | -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/R/qlearning.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/R/qlearning.rdb -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/R/qlearning.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/R/qlearning.rdx -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/data/DataEx.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/data/DataEx.RData -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/help/AnIndex: -------------------------------------------------------------------------------- 1 | DataEx DataEx 2 | qlearning qlearning 3 | -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/help/aliases.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/help/aliases.rds 
-------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/help/paths.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/help/paths.rds -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/help/qlearning.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/help/qlearning.rdb -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/help/qlearning.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter12/rlexample/qlearning/help/qlearning.rdx -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/html/00Index.html: -------------------------------------------------------------------------------- 1 | 2 | R: Q-Learning: A Data Analysis Method for Constructing Adaptive 3 | Interventions 4 | 5 | 6 | 7 | Q-Learning: A Data Analysis Method for Constructing Adaptive 8 | Interventions 9 | 10 | 11 | 12 | 13 | 14 | 15 | Documentation for package ‘qlearning’ version 2.0 16 | 17 | DESCRIPTION file. 18 | 19 | 20 | Help Pages 21 | 22 | 23 | 24 | DataEx 25 | An example data set collected from a SMART trial 26 | qlearning 27 | Q-Learning: A Data Analysis Method for Constructing Adaptive Interventions 28 | 29 | 30 | -------------------------------------------------------------------------------- /r/chapter12/rlexample/qlearning/html/R.css: -------------------------------------------------------------------------------- 1 | BODY{ background: white; 2 | color: black } 3 | 4 | A:link{ background: white; 5 | color: blue } 6 | A:visited{ background: white; 7 | color: rgb(50%, 0%, 50%) } 8 | 9 | H1{ background: white; 10 | color: rgb(55%, 55%, 55%); 11 | font-family: monospace; 12 | font-size: x-large; 13 | text-align: center } 14 | 15 | H2{ background: white; 16 | color: rgb(40%, 40%, 40%); 17 | font-family: monospace; 18 | font-size: large; 19 | text-align: center } 20 | 21 | H3{ background: white; 22 | color: rgb(40%, 40%, 40%); 23 | font-family: monospace; 24 | font-size: large } 25 | 26 | H4{ background: white; 27 | color: rgb(40%, 40%, 40%); 28 | font-family: monospace; 29 | font-style: italic; 30 | font-size: large } 31 | 32 | H5{ background: white; 33 | color: rgb(40%, 40%, 40%); 34 | font-family: monospace } 35 | 36 | H6{ background: white; 37 | color: rgb(40%, 40%, 40%); 38 | font-family: monospace; 39 | font-style: italic } 40 | 41 | IMG.toplogo{ vertical-align: middle } 42 | 43 | IMG.arrow{ width: 30px; 44 | height: 30px; 45 | border: 0 } 46 | 47 | span.acronym{font-size: small} 48 | span.env{font-family: monospace} 49 | span.file{font-family: monospace} 50 | span.option{font-family: monospace} 51 | span.pkg{font-weight: bold} 52 | span.samp{font-family: monospace} 53 | 54 | div.vignettes a:hover { 55 | background: rgb(85%, 85%, 85%); 56 | } 57 | 58 | -------------------------------------------------------------------------------- /r/chapter12/rlexample/readme.md: 
-------------------------------------------------------------------------------- 1 | This example implementation for Q-learning uses an R library built by Penn State University. 2 | The details of this library can be found at the link below: 3 | https://methodology.psu.edu/downloads/qlearning 4 | 5 | The qlaci library zip can be downloaded at the link below: 6 | https://methodology.psu.edu/downloads/qlaci 7 | 8 | This library for reinforcement learning techniques is evolving. This code will be updated as and when new implementations are published in the CRAN repository. 9 | 10 | Reference from the above link: 11 | The qlaci (Q-learning with adaptive confidence intervals) R package can be used with data from a sequential, multiple assignment, randomized trial (SMART) to design an adaptive intervention. The qlaci R package requires R 2.15, available for free download. This is the recommended platform for running this package. At the time of release, R version 3.0.x was released recently. We will only support installation of qlaci on R 2.15.x. 12 | -------------------------------------------------------------------------------- /r/chapter13/ensembleexample/bagging-random-forest.R: -------------------------------------------------------------------------------- 1 | # Practical Machine learning 2 | # Bagging & Random Forest example 3 | # Chapter 13 4 | 5 | credit <- read.csv("credit.csv") 6 | 7 | library(caret) 8 | 9 | m <- train(default ~ ., data = credit, method = "C5.0") 10 | p <- predict(m, credit) 11 | 12 | table(p, credit$default) 13 | 14 | head(predict(m, credit)) 15 | head(predict(m, credit, type = "prob")) 16 | 17 | ctrl <- trainControl(method = "cv", number = 10, selectionFunction = "oneSE") 18 | 19 | grid <- expand.grid(.model = "tree", .trials = c(1, 5, 10, 15, 20, 25, 30, 35), .winnow = "FALSE") 20 | 21 | grid 22 | 23 | m <- train(default ~ ., data = credit, method = "C5.0", metric = "Kappa", trControl = ctrl, tuneGrid = grid) 24 | m 25 | 26 | 27 | library(ipred) 28 | 29 | mybag <- bagging(default ~ ., data = credit, nbagg = 25) 30 | 31 | credit_pred <- predict(mybag, credit) 32 | table(credit_pred, credit$default) 33 | 34 | library(caret) 35 | ctrl <- trainControl(method = "cv", number = 10) 36 | train(default ~ ., data = credit, method = "treebag", trControl = ctrl) 37 | 38 | 39 | 40 | # Bagging 41 | 42 | str(svmBag) 43 | svmBag$fit 44 | 45 | bagctrl <- bagControl(fit = svmBag$fit, predict = svmBag$pred, aggregate = svmBag$aggregate) 46 | 47 | svmbag <- train(default ~ ., data = credit, "bag", trControl = ctrl, bagControl = bagctrl) 48 | svmbag 49 | 50 | 51 | # Random Forest 52 | 53 | library(randomForest) 54 | rf <- randomForest(default ~ ., data = credit) 55 | rf 56 | 57 | library(caret) 58 | ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 10) 59 | 60 | grid_rf <- expand.grid(.mtry = c(2, 4, 8, 16)) 61 | m_rf <- train(default ~ ., data = credit, method = "rf", metric = "Kappa", trControl = ctrl, tuneGrid = grid_rf) 62 | 63 | 64 | grid_c50 <- expand.grid(.model = "tree", .trials = c(10, 20, 30, 40), .winnow = "FALSE") 65 | m_c50 <- train(default ~ ., data = credit, method = "C5.0", metric = "Kappa", trControl = ctrl, tuneGrid = grid_c50) 66 | 67 | m_rf 68 | m_c50 69 | 70 | 71 | -------------------------------------------------------------------------------- /r/chapter13/ensembleexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | --------------------------------------------------------------------------------
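A natural follow-up to the `bagging-random-forest.R` script above is to compare the two tuned models on their resampling results instead of eyeballing the printed objects; a hedged sketch using caret's `resamples()`, assuming `m_rf` and `m_c50` from that script are still in the workspace:

```r
library(caret)

# collect the cross-validation results of both tuned models
results <- resamples(list(rf = m_rf, c50 = m_c50))

summary(results)   # Accuracy and Kappa distributions side by side
bwplot(results)    # visual comparison of the two learners
```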
/r/chapter13/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter5/decisiontreeexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter5/randomforstexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter5/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter6/knnexample/knn.R: -------------------------------------------------------------------------------- 1 | # Practical Machine Learning 2 | # K- Nearest Neighbor algorithm 3 | # Chapter 6 4 | 5 | wbcd <- read.csv("wisc_bc_data.csv", stringsAsFactors = FALSE) 6 | wbcd <- wbcd[-1] 7 | table(wbcd$diagnosis) 8 | wbcd$diagnosis <- factor(wbcd$diagnosis, levels = c("B", "M"), labels = c("Benign", "Malignant")) 9 | round(prop.table(table(wbcd$diagnosis))*100, digits = 1) 10 | summary(wbcd[c("radius_mean", "area_mean", "smoothness_mean")]) 11 | 12 | normalize <- function(x) { 13 | return((x - min(x))/(max(x) - min(x))) 14 | } 15 | 16 | wbcd_n <- as.data.frame(lapply(wbcd[2:31], normalize)) 17 | 18 | summary(wbcd_n$area_mean) 19 | 20 | wbcd_train <- wbcd_n[1:469, ] 21 | wbcd_test <- wbcd_n[470:569, ] 22 | 23 | wbcd_train_labels <- wbcd[1:469, 1] 24 | wbcd_test_labels <- wbcd[470:569, 1] 25 | 26 | library("class") 27 | 28 | # call the knn function of the class package 29 | wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test, cl = wbcd_train_labels, k = 21) 30 | 31 | # need gmodels for CrossTable 32 | library(gmodels) 33 | 34 | CrossTable(x = wbcd_test_labels, y = wbcd_test_pred, prop.chisq = FALSE) 35 | 36 | # z score 37 | wbcd_z <- as.data.frame(scale(wbcd[-1])) 38 | 39 | 40 | summary(wbcd_z$area_mean) 41 | 42 | wbcd_train <- wbcd_z[1:469, ] 43 | wbcd_test <- wbcd_z[470:569, ] 44 | wbcd_train_labels <- wbcd[1:469, 1] 45 | wbcd_test_labels <- wbcd[470:569, 1] 46 | 47 | wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test, cl = wbcd_train_labels, k = 21) 48 | 49 | CrossTable(x = wbcd_test_labels, y = wbcd_test_pred, prop.chisq = FALSE) 50 | -------------------------------------------------------------------------------- /r/chapter6/knnexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter6/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter6/svmexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter6/svmexample/svm.R: -------------------------------------------------------------------------------- 1 | # Practical Machine Learning 2 | # Support Vector Machines (SVM) 3 | # Chapter 6 4 | 5 | 6 | let <- read.csv("letterdata.csv") 7 | str(let) 8 | 9 | let_train <- let[1:16000, ] 10 | let_test <- let[16001:20000, ] 11 | 12 | 13 | 14 | # linear kernel (vanilla) 15 | 16 
| library(kernlab) 17 | let_classifier <- ksvm(letter ~ ., data = let_train, kernel = "vanilladot") 18 | let_classifier 19 | 20 | let_pred <- predict(let_classifier, let_test) 21 | 22 | head(let_pred) 23 | table(let_pred, let_test$letter) 24 | 25 | agreement <- let_pred == let_test$letter 26 | table(agreement) 27 | prop.table(table(agreement)) 28 | 29 | 30 | 31 | 32 | # RBF kernel 33 | 34 | let_classifier2 <- ksvm(letter ~ ., data = let_train, kernel = "rbfdot") 35 | let_pred2 <- predict(let_classifier2, let_test) 36 | 37 | agreement2 <- let_pred2 == let_test$letter 38 | table(agreement2) 39 | prop.table(table(agreement2)) 40 | 41 | -------------------------------------------------------------------------------- /r/chapter7/aprioriexample/Rplots.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter7/aprioriexample/Rplots.pdf -------------------------------------------------------------------------------- /r/chapter7/aprioriexample/association-rules.R: -------------------------------------------------------------------------------- 1 | # Practical Machine Learning 2 | # Association Rules - Apriori 3 | # Chapter 7 4 | 5 | library(arules) 6 | 7 | g <- read.transactions("groceries.csv", sep = ",") 8 | 9 | summary(g) 10 | 11 | inspect(g[1:5]) 12 | 13 | itemFrequency(g[, 1:3]) 14 | itemFrequencyPlot(g, support = 0.1) 15 | itemFrequencyPlot(g, topN = 20) 16 | 17 | image(g[1:5]) 18 | image(sample(g, 100)) 19 | 20 | apriori(g) 21 | 22 | grules <- apriori(g, parameter = list(support = 0.006, confidence = 0.25, minlen = 2)) 23 | 24 | grules 25 | summary(grules) 26 | 27 | inspect(grules[1:3]) 28 | inspect(sort(grules, by = "lift")[1:5]) 29 | 30 | berryrules <- subset(grules, items %in% "berries") 31 | inspect(berryrules) 32 | 33 | write(grules, file = "groceryrules.csv", sep = ",", quote = TRUE, row.names = FALSE) 34 | 35 | grules_df <- as(grules, "data.frame") 36 | 37 | str(grules_df) 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /r/chapter7/aprioriexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter7/fpgrowthexample/readme.md: -------------------------------------------------------------------------------- 1 | There are no explicit libraries available in R for an FP-growth implementation. The folder is left as a placeholder for implementing it in future.
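Until such an implementation lands here, frequent itemsets can still be mined natively in R with the ECLAT algorithm from the `arules` package used by the apriori example; a hedged sketch, assuming the same `groceries.csv` transactions file:

```r
library(arules)

g <- read.transactions("groceries.csv", sep = ",")

# eclat() mines frequent itemsets without Apriori-style candidate generation,
# similar in spirit to what an FP-growth implementation would return
itemsets <- eclat(g, parameter = list(support = 0.006, minlen = 2))

inspect(sort(itemsets, by = "support")[1:5])
```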
2 | -------------------------------------------------------------------------------- /r/chapter7/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter8/k-meansexample/kmeans-clustering.R: -------------------------------------------------------------------------------- 1 | # Practical machine learning 2 | # K Means Clustering 3 | # Chapter 8 4 | 5 | 6 | teens <- read.csv("snsdata.csv") 7 | 8 | str(teens) 9 | 10 | table(teens$gender) 11 | table(teens$gender, useNA = "ifany") 12 | 13 | summary(teens$age) 14 | 15 | teens$age <- ifelse(teens$age >= 13 & teens$age < 20, teens$age, NA) 16 | 17 | summary(teens$age) 18 | 19 | teens$female <- ifelse(teens$gender == "F" & !is.na(teens$gender), 1, 0) 20 | teens$no_gender <- ifelse(is.na(teens$gender), 1, 0) 21 | 22 | table(teens$gender, useNA = "ifany") 23 | table(teens$female, useNA = "ifany") 24 | table(teens$no_gender, useNA = "ifany") 25 | 26 | mean(teens$age) 27 | mean(teens$age, na.rm = TRUE) 28 | 29 | aggregate(data = teens, age ~ gradyear, mean, na.rm = TRUE) 30 | 31 | 32 | ave_age <- mean(teens$age, na.rm = TRUE) 33 | ave_age 34 | 35 | teens$age <- ifelse(is.na(teens$age), ave_age, teens$age) 36 | 37 | summary(teens$age) 38 | 39 | interests <- teens[5:40] 40 | 41 | interests_z <- as.data.frame(lapply(interests, scale)) 42 | 43 | teen_clusters <- kmeans(interests_z, 5) 44 | teen_clusters$size 45 | 46 | teen_clusters$centers 47 | 48 | teens$cluster <- teen_clusters$cluster 49 | 50 | teens[1:5, c("cluster", "gender", "age", "friends")] 51 | 52 | aggregate(data = teens, age ~ cluster, mean) 53 | aggregate(data = teens, female ~ cluster, mean) 54 | aggregate(data = teens, friends ~ cluster, mean) 55 | -------------------------------------------------------------------------------- /r/chapter8/k-meansexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter8/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter9/naivebayesexample/Rplots.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktCode/Practical-Machine-Learning/453bb7125fe1657f56610878c6cdf353d92834bb/r/chapter9/naivebayesexample/Rplots.pdf -------------------------------------------------------------------------------- /r/chapter9/naivebayesexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /r/chapter9/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter10/linearregressionexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter10/linearregressionexample/startClickRate.sh: -------------------------------------------------------------------------------- 1 | cd /home/sunilag/spark-1.4.1-bin-hadoop2.6/bin 2 | ./spark-submit /home/sunilag/Spark_Linear_Regression/ClickRate.py 3 | 
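# Note: the two paths above are specific to the original author's machine.
# A more portable invocation (hypothetical paths, assuming SPARK_HOME points
# at a Spark 1.4.x install) would look something like:
#   $SPARK_HOME/bin/spark-submit --master local[*] /path/to/ClickRate.py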
-------------------------------------------------------------------------------- /spark/chapter10/logisticregressionexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter10/logisticregressionexample/src/main/scala/default/SpamClassification-Logreg.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Logistic Regression example 3 | // Chapter 10 4 | 5 | package default 6 | 7 | import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS 8 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics 9 | import org.apache.spark.mllib.linalg.Vectors 10 | import org.apache.spark.mllib.optimization.L1Updater 11 | import org.apache.spark.mllib.regression.LabeledPoint 12 | import org.apache.spark.{SparkConf, SparkContext} 13 | 14 | /** 15 | * @author Oleksiy Dyagilev 16 | */ 17 | object SpamClassification extends App { 18 | 19 | runSpark() 20 | 21 | def runSpark() { 22 | val conf = new SparkConf().setAppName("Spam classification").setMaster("local[*]") 23 | val sc = new SparkContext(conf) 24 | val file = sc.textFile("./dataset/spambase.data") 25 | val examples = file.map { line => 26 | val parts = line.split(",").map(_.toDouble) 27 | LabeledPoint(parts.last, Vectors.dense(parts.init)) 28 | } 29 | 30 | val splits = examples.randomSplit(Array(0.8, 0.2)) 31 | val training = splits(0).cache() 32 | val test = splits(1).cache() 33 | 34 | val numTraining = training.count() 35 | val numTest = test.count() 36 | println(s"Training: $numTraining, test: $numTest.") 37 | 38 | examples.unpersist(blocking = false) 39 | 40 | val algorithm = new LogisticRegressionWithLBFGS() 41 | 42 | // new SquaredL2Updater() 43 | val updater = new L1Updater() 44 | 45 | algorithm.optimizer 46 | .setNumIterations(1000) 47 | .setUpdater(updater) 48 | // .setRegParam(0.0) 49 | 50 | val model = algorithm.run(training).clearThreshold() 51 | 52 | val prediction = model.predict(test.map(_.features)) 53 | val predictionAndLabel = prediction.zip(test.map(_.label)) 54 | 55 | val metrics = new BinaryClassificationMetrics(predictionAndLabel) 56 | 57 | println(s"Test areaUnderPR = ${metrics.areaUnderPR()}.") 58 | println(s"Test areaUnderROC = ${metrics.areaUnderROC()}.") 59 | 60 | sc.stop() 61 | 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /spark/chapter10/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter11/annexample/build.sbt: -------------------------------------------------------------------------------- 1 | name := "NeuralNetwork" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies ++= Seq( 8 | "org.apache.spark" % "spark-core_2.10" % "1.2.1", 9 | "org.apache.spark" % "spark-mllib_2.10" % "1.2.1", 10 | "org.scalanlp" % "breeze_2.10" % "0.10", 11 | "org.scalanlp" % "breeze-natives_2.10" % "0.10", 12 | "org.scalatest" % "scalatest_2.10" % "2.2.4" % "test" 13 | ) -------------------------------------------------------------------------------- /spark/chapter11/annexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
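For reference, the Scala examples under `spark/` are plain sbt projects; a hedged sketch of how one of them might be built and submitted (assumes sbt and a matching Spark distribution are installed; `<MainClass>` and the jar name are placeholders that follow from each example's `build.sbt`, e.g. `default.SpamClassification` for the logistic regression example above):

```
sbt package
spark-submit --class <MainClass> target/scala-2.10/<artifact>_2.10-1.0.jar
```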
/spark/chapter11/annexample/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015 Meihua Wu 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # Set everything to be logged to the console 18 | log4j.rootCategory=WARN, console 19 | log4j.appender.console=org.apache.log4j.ConsoleAppender 20 | log4j.appender.console.target=System.err 21 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 22 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 23 | 24 | # Settings to quiet third party logs that are too verbose 25 | log4j.logger.org.eclipse.jetty=WARN 26 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR 27 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 28 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 29 | 30 | 31 | # 32 | log4j.logger.rotationsymmetry.neuralnetwork.algo.GradientDescendOptimizer=TRACE -------------------------------------------------------------------------------- /spark/chapter11/annexample/src/main/scala/default/Util.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Neural Network example 3 | // Chapter 11 4 | 5 | package default 6 | 7 | import breeze.linalg.{DenseVector=>BDV, DenseMatrix=>BDM} 8 | import breeze.numerics.{sigmoid} 9 | import org.apache.spark.mllib.linalg.{Vectors, Vector} 10 | import scala.math.abs 11 | 12 | object Util { 13 | 14 | def toBreeze(v: Vector): BDV[Double] = { 15 | new BDV(v.toArray) 16 | } 17 | 18 | def addBias(v: BDV[Double]) : BDV[Double] ={ 19 | BDV.vertcat(BDV(1d), v) 20 | } 21 | 22 | def removeBias(v: BDV[Double]): BDV[Double] ={ 23 | v(1 to -1) 24 | } 25 | 26 | def sigmoidGradient(v: BDV[Double]) : BDV[Double] = { 27 | val s = sigmoid(v) 28 | s :* ((-s) + 1d) 29 | } 30 | 31 | def doubleEqual(v1: Double, v2: Double, p: Double = 1e-4): Boolean = { 32 | abs(v1-v2) <= p 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /spark/chapter11/annexample/src/main/scala/default/algo/CostGradient.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Neural Network example 3 | // Chapter 11 4 | 5 | package default.algo 6 | 7 | import breeze.linalg.DenseMatrix 8 | 9 | case class CostGradient(val cost: Double, val thetaGradient: List[DenseMatrix[Double]], val n: Int) 10 | -------------------------------------------------------------------------------- /spark/chapter11/annexample/src/main/scala/default/algo/GradientDescendOptimizer.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Neural Network example 3 | // Chapter 11 4 | 5 | package default.algo 6 | 7 | import rotationsymmetry.neuralnetwork.model.{NeuralNetworkModel, 
Topology} 8 | import org.apache.spark.mllib.regression.LabeledPoint 9 | import org.apache.spark.rdd.RDD 10 | import breeze.linalg.{DenseMatrix=>BDM, DenseVector=>BDV} 11 | 12 | import scala.util.Random 13 | 14 | 15 | object GradientDescendOptimizer extends LoggingAbility{ 16 | 17 | def optimize(data: RDD[LabeledPoint], 18 | neuralNetworkModel: NeuralNetworkModel, 19 | topology: Topology, 20 | initTheta: List[BDM[Double]], 21 | rate: Double, 22 | lambda: Double, 23 | normalFactor: Double, 24 | maxIter: Int, 25 | batchProp: Double = 1, 26 | batchSeed: Integer): GradientDescendSolution ={ 27 | 28 | 29 | var theta = initTheta 30 | 31 | val costHistory: Array[Double] = new Array[Double](maxIter) 32 | 33 | var i: Integer = 0 34 | while (i < maxIter){ 35 | val costGradient = DistCostGradientComputer.compute(data, 36 | theta, 37 | neuralNetworkModel, 38 | lambda, 39 | normalFactor, 40 | batchProp, 41 | batchSeed + i) 42 | 43 | costHistory(i) = costGradient.cost 44 | 45 | val unrolledThetaVector: BDV[Double] = new BDV(Topology.unrollTheta(theta)) 46 | val unrolledGradientVector: BDV[Double] = new BDV(Topology.unrollTheta(costGradient.thetaGradient)) 47 | 48 | val updatedUnrolledThetaVector: BDV[Double] = unrolledThetaVector - (unrolledGradientVector * rate) 49 | 50 | theta = topology.generateThetaFrom(updatedUnrolledThetaVector.toArray) 51 | i = i + 1 52 | logger.trace("Iteration: " + i + "/" + maxIter + " Cost: " + costGradient.cost) 53 | } 54 | 55 | GradientDescendSolution(costHistory.toList, theta) 56 | 57 | } 58 | } 59 | 60 | case class GradientDescendSolution(val costHistory: List[Double], val theta: List[BDM[Double]] ) 61 | -------------------------------------------------------------------------------- /spark/chapter11/annexample/src/main/scala/default/algo/LoggingAbility.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Neural Network example 3 | // Chapter 11 4 | 5 | package default.algo 6 | 7 | import org.apache.log4j.Logger 8 | 9 | 10 | trait LoggingAbility { 11 | val loggerName = this.getClass.getName 12 | lazy val logger = Logger.getLogger(loggerName.split("\\$").head) 13 | 14 | } 15 | -------------------------------------------------------------------------------- /spark/chapter11/annexample/src/main/scala/default/algo/NaiveCostGradientComputer.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Neural Network example 3 | // Chapter 11 4 | 5 | package default.algo 6 | 7 | import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} 8 | import breeze.numerics.sigmoid 9 | import rotationsymmetry.neuralnetwork.model.{NeuralNetworkModel, Topology} 10 | import org.apache.spark.mllib.regression.LabeledPoint 11 | import rotationsymmetry.neuralnetwork.Util 12 | 13 | 14 | object NaiveCostGradientComputer { 15 | 16 | def compute_cost(data: List[LabeledPoint], theta: List[BDM[Double]], neuralNetworkModel: NeuralNetworkModel): Double = { 17 | val costList = data map {d => 18 | val acc = theta.foldLeft(Util.toBreeze(d.features))( 19 | (a, th)=> sigmoid (th * Util.addBias(a)) 20 | ) 21 | neuralNetworkModel.cost(acc, d.label) 22 | } 23 | costList.sum / data.size.toDouble 24 | } 25 | 26 | def compute_gradient(data: List[LabeledPoint], 27 | theta: List[BDM[Double]], 28 | neuralNetworkModel: NeuralNetworkModel, 29 | eps: Double): BDV[Double] ={ 30 | 31 | val topology = Topology(theta) 32 | 33 | val thetaUnrolled: Array[Double] = 
Topology.unrollTheta(theta) 34 | 35 | val thetaUnrolledWithEps: List[Array[Double]] = addEps(thetaUnrolled, eps) 36 | 37 | val thetaWithEps = thetaUnrolledWithEps map (topology.generateThetaFrom(_)) 38 | 39 | val costWithEps: List[Double] = thetaWithEps map (th => 40 | compute_cost(data, th, neuralNetworkModel) 41 | ) 42 | 43 | val costAtOrigin = compute_cost(data, theta, neuralNetworkModel) 44 | 45 | val diff = (BDV(costWithEps.toArray) - costAtOrigin) 46 | 47 | diff / eps 48 | 49 | } 50 | 51 | def addEps(thetaUnrolled: Array[Double], eps: Double): List[Array[Double]] ={ 52 | val out = for (i <- 0 until thetaUnrolled.length) yield { 53 | val tmp = thetaUnrolled.clone() 54 | tmp(i) = tmp(i) + eps 55 | tmp 56 | } 57 | out.toList 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /spark/chapter11/annexample/src/main/scala/default/algo/Predictor.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Neural Network example 3 | // Chapter 11 4 | 5 | package default.algo 6 | 7 | import breeze.numerics.sigmoid 8 | import rotationsymmetry.neuralnetwork.model.NeuralNetworkModel 9 | import org.apache.spark.mllib.linalg.Vector 10 | import org.apache.spark.rdd.RDD 11 | import breeze.linalg.{DenseMatrix=>BDM} 12 | import rotationsymmetry.neuralnetwork.Util 13 | 14 | 15 | object Predictor { 16 | def predict(features: RDD[Vector], neuralNetworkModel: NeuralNetworkModel, theta: List[BDM[Double]]): RDD[Double] ={ 17 | features.map(x=>{ 18 | val xVec = Util.toBreeze(x) 19 | val outputActivation = theta.foldLeft(xVec)((a, th)=> sigmoid( th * Util.addBias(a))) 20 | neuralNetworkModel.predict(outputActivation) 21 | }) 22 | } 23 | 24 | def predict(features: Array[Vector], neuralNetworkModel: NeuralNetworkModel, theta: List[BDM[Double]]): Array[Double] ={ 25 | features.map(x=>{ 26 | val xVec = Util.toBreeze(x) 27 | val outputActivation = theta.foldLeft(xVec)((a, th)=> sigmoid( th * Util.addBias(a))) 28 | neuralNetworkModel.predict(outputActivation) 29 | }) 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /spark/chapter11/annexample/src/main/scala/default/example/MNIST.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Neural Network example 3 | // Chapter 11 4 | 5 | package default.example 6 | 7 | import org.apache.spark.mllib.linalg.Vectors 8 | import org.apache.spark.mllib.regression.LabeledPoint 9 | import org.apache.spark.mllib.util.MLUtils._ 10 | import org.apache.spark.{SparkContext, SparkConf} 11 | 12 | object MNIST { 13 | 14 | 15 | def processData(): Unit = { 16 | val conf = new SparkConf().setAppName("Simple Application").setMaster("local") 17 | val sc = new SparkContext(conf) 18 | val xData=sc.textFile("x.txt") 19 | val xValue = xData.map(line => line.trim().split(" ").map(_.toDouble)) 20 | 21 | val yData=sc.textFile("y.txt") 22 | val yValue = yData.map(line => { 23 | val yInt = line.trim().toInt 24 | yInt match { 25 | case 10 => 0 26 | case _ => yInt 27 | } 28 | }) 29 | 30 | val data = yValue.zip(xValue).map( 31 | line => LabeledPoint(line._1, Vectors.dense(line._2)) 32 | ) 33 | 34 | saveAsLibSVMFile(data, "data.libsvm") 35 | sc.stop() 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /spark/chapter11/annexample/src/main/scala/default/model/NeuralNetworkClassifier.scala: 
-------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Neural Network example 3 | // Chapter 11 4 | 5 | package default.model 6 | 7 | import breeze.linalg.{argmax, DenseVector, sum} 8 | import breeze.numerics.log 9 | 10 | 11 | 12 | class NeuralNetworkClassifier(nGroup: Int) extends NeuralNetworkModel(){ 13 | 14 | override def cost(activations: DenseVector[Double], y: Double): Double ={ 15 | 16 | handelException(activations, y) 17 | 18 | val tmp_act: DenseVector[Double] = ((- activations) + 1d) 19 | 20 | val yInt: Int = y.floor.toInt 21 | 22 | tmp_act(yInt) = activations(yInt) 23 | 24 | sum(-log(tmp_act)) 25 | 26 | } 27 | 28 | override def delta(activations: DenseVector[Double], y: Double): DenseVector[Double] = { 29 | 30 | handelException(activations, y) 31 | 32 | val tmp_act: DenseVector[Double] = activations.copy 33 | 34 | val yInt: Int = y.floor.toInt 35 | 36 | tmp_act(yInt) = tmp_act(yInt) - 1d 37 | 38 | tmp_act 39 | } 40 | 41 | override def predict(activations: DenseVector[Double]): Double ={ 42 | require(activations.length == nGroup, "Number of output activations is not equal to number of group.") 43 | 44 | val groupWithMaxActivation = argmax(activations) 45 | groupWithMaxActivation.toDouble 46 | } 47 | 48 | 49 | 50 | private def handelException(activations: DenseVector[Double], y: Double): Unit ={ 51 | 52 | require(activations.length == nGroup, "Number of output activations is not equal to number of group.") 53 | 54 | require(0 <= y && y < nGroup, "y is out of range: " + "y=" + y + "; nGroup=" + nGroup) 55 | 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /spark/chapter11/annexample/src/main/scala/default/model/NeuralNetworkModel.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Neural Network example 3 | // Chapter 11 4 | 5 | package default.model 6 | 7 | import breeze.linalg.{DenseVector => BDV} 8 | 9 | 10 | abstract class NeuralNetworkModel extends Serializable { 11 | def cost(activations: BDV[Double], y: Double): Double 12 | def delta(activations: BDV[Double], y: Double): BDV[Double] 13 | def predict(activations: BDV[Double]): Double 14 | 15 | } 16 | -------------------------------------------------------------------------------- /spark/chapter11/annexample/src/main/scala/default/model/Topology.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Neural Network example 3 | // Chapter 11 4 | 5 | package default.model 6 | 7 | import breeze.linalg.DenseMatrix 8 | import breeze.numerics._ 9 | 10 | import scala.util.Random 11 | 12 | 13 | class Topology(val self: List[Int]) { 14 | require(self.size >= 2, "Neural Network is less than 2 layers.") 15 | 16 | require(self.forall(_ >0), "The number of activations in a layout should be positive.") 17 | 18 | 19 | private[this] val rowDim = self.tail 20 | 21 | private[this] val colDim = self.dropRight(1) map (_ + 1) 22 | 23 | private[this]val dimPair = (rowDim zip colDim) map (rc => RowColPair(rc._1, rc._2)) 24 | 25 | private[this] val start = dimPair.scanLeft(0) ( 26 | (s: Int, pair: RowColPair) => s + pair.row * pair.col 27 | ) 28 | 29 | def generateThetaFrom(values: Array[Double]): List[DenseMatrix[Double]] ={ 30 | require(values.length == start.last, "input is of incorrect length.") 31 | 32 | val dimPair_start = dimPair zip start.dropRight(1) 33 | 34 | dimPair_start.map( 
35 | _ match { 36 | case (pair: RowColPair, s: Int) =>{ 37 | new DenseMatrix(pair.row, pair.col, values.slice(s, s + pair.row * pair.col)) 38 | } 39 | } 40 | ) 41 | } 42 | 43 | def generateThetaFrom(rand: Random): List[DenseMatrix[Double]] ={ 44 | 45 | val dimPair_start = dimPair zip start.dropRight(1) 46 | 47 | dimPair_start.map( 48 | _ match { 49 | case (pair: RowColPair, s: Int) =>{ 50 | val eps = sqrt(6d / (pair.row + pair.col - 1)) 51 | 52 | val value: Array[Double] = (for (i <- 0 until pair.row * pair.col) yield (rand.nextDouble() - 0.5) * 2 * eps).toArray 53 | new DenseMatrix(pair.row, pair.col, value) 54 | } 55 | } 56 | ) 57 | } 58 | 59 | private[this] case class RowColPair(val row: Int, val col: Int) 60 | 61 | } 62 | 63 | object Topology { 64 | def apply(theta: List[DenseMatrix[Double]]): Topology ={ 65 | new Topology((theta.head.cols - 1) +: theta.map(_.rows)) 66 | } 67 | 68 | def unrollTheta(theta: List[DenseMatrix[Double]]): Array[Double] = { 69 | val unrolledMatrixList: List[List[Double]] = theta map (_.toArray.toList) 70 | unrolledMatrixList reduce(_ ::: _) toArray 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /spark/chapter11/dlexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter11/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter12/readme.md: -------------------------------------------------------------------------------- 1 | There are no specific libraries available in MLlib for reinforcement learning techniques like Q-learning / TD learning. 2 | -------------------------------------------------------------------------------- /spark/chapter12/rlexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter13/ensembleexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter13/ensembleexample/result/GBT_clas.txt: -------------------------------------------------------------------------------- 1 | Test Error = 0.0 2 | Learned classification GBT model: 3 | TreeEnsembleModel classifier with 3 trees 4 | 5 | Tree 0: 6 | If (feature 434 <= 0.0) 7 | If (feature 100 <= 165.0) 8 | Predict: -1.0 9 | Else (feature 100 > 165.0) 10 | Predict: 1.0 11 | Else (feature 434 > 0.0) 12 | Predict: 1.0 13 | Tree 1: 14 | If (feature 434 <= 0.0) 15 | If (feature 568 <= 253.0) 16 | If (feature 211 <= 250.0) 17 | Predict: 0.47681168808847024 18 | Else (feature 211 > 250.0) 19 | Predict: 0.4768116880884703 20 | Else (feature 568 > 253.0) 21 | Predict: -0.4768116880884694 22 | Else (feature 434 > 0.0) 23 | If (feature 351 <= 58.0) 24 | Predict: -0.4768116880884702 25 | Else (feature 351 > 58.0) 26 | Predict: -0.47681168808847035 27 | Tree 2: 28 | If (feature 434 <= 0.0) 29 | If (feature 295 <= 253.0) 30 | If (feature 152 <= 253.0) 31 | Predict: -0.5183379293761909 32 | Else (feature 152 > 253.0) 33 | Predict: -0.5183379293761909 34 | Else (feature 295 > 253.0) 35 | Predict: 0.5183379293761909 36 | Else (feature 434 > 0.0) 37 | If (feature 157 <= 252.0) 38 | Predict:
0.5183379293761909 39 | Else (feature 157 > 252.0) 40 | If (feature 156 <= 85.0) 41 | Predict: 0.5183379293761909 42 | Else (feature 156 > 85.0) 43 | Predict: 0.5183379293761909 44 | -------------------------------------------------------------------------------- /spark/chapter13/ensembleexample/src/GradientBoostTree_classification.scala: -------------------------------------------------------------------------------- 1 | // https://spark.apache.org/docs/1.2.0/mllib-ensembles.html 2 | // perform classification using Gradient-Boosted Trees with log loss. 3 | // The test error is calculated to measure the algorithm accuracy. 4 | 5 | 6 | 7 | import org.apache.spark.mllib.tree.GradientBoostedTrees 8 | import org.apache.spark.mllib.tree.configuration.BoostingStrategy 9 | import org.apache.spark.mllib.util.MLUtils 10 | 11 | // Load and parse the data file. 12 | val data = MLUtils.loadLibSVMFile(sc, "/home/yyan/Desktop/data/sample_libsvm_data.txt") 13 | // Split the data into training and test sets (30% held out for testing) 14 | val splits = data.randomSplit(Array(0.7, 0.3)) 15 | val (trainingData, testData) = (splits(0), splits(1)) 16 | 17 | // Train a GradientBoostedTrees model. 18 | // The defaultParams for Classification use LogLoss by default. 19 | val boostingStrategy = BoostingStrategy.defaultParams("Classification") 20 | boostingStrategy.numIterations = 3 // Note: Use more iterations in practice. 21 | 22 | // It is numClasses rather than numClassesForclassification 23 | // chech in API // https://spark.apache.org/docs/1.2.0/api/scala/index.html#org.apache.spark.mllib.tree.configuration.Strategy 24 | boostingStrategy.treeStrategy.numClasses= 2 25 | 26 | boostingStrategy.treeStrategy.maxDepth = 5 27 | // Empty categoricalFeaturesInfo indicates all features are continuous. 28 | boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]() 29 | 30 | val model = GradientBoostedTrees.train(trainingData, boostingStrategy) 31 | 32 | // Evaluate model on test instances and compute test error 33 | val labelAndPreds = testData.map { point => 34 | val prediction = model.predict(point.features) 35 | (point.label, prediction) 36 | } 37 | val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() 38 | println("Test Error = " + testErr) 39 | println("Learned classification GBT model:\n" + model.toDebugString) -------------------------------------------------------------------------------- /spark/chapter13/ensembleexample/src/GradientBoostTree_regression.scala: -------------------------------------------------------------------------------- 1 | // https://spark.apache.org/docs/1.2.0/mllib-ensembles.html 2 | // Gradient-Boosted Trees with Squared Error as the loss. 3 | // The Mean Squared Error (MSE) is computed at the end to evaluate goodness of fit. 4 | 5 | 6 | import org.apache.spark.mllib.tree.GradientBoostedTrees 7 | import org.apache.spark.mllib.tree.configuration.BoostingStrategy 8 | import org.apache.spark.mllib.util.MLUtils 9 | 10 | // Load and parse the data file. 11 | val data = MLUtils.loadLibSVMFile(sc, "/home/yyan/Desktop/data/housing.txt") 12 | // Split the data into training and test sets (30% held out for testing) 13 | val splits = data.randomSplit(Array(0.7, 0.3)) 14 | val (trainingData, testData) = (splits(0), splits(1)) 15 | 16 | // Train a GradientBoostedTrees model. 17 | // The defaultParams for Regression use SquaredError by default. 
18 | val boostingStrategy = BoostingStrategy.defaultParams("Regression") 19 | boostingStrategy.numIterations = 3 // Note: Use more iterations in practice. 20 | boostingStrategy.treeStrategy.maxDepth = 5 21 | // Empty categoricalFeaturesInfo indicates all features are continuous. 22 | boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]() 23 | 24 | val model = GradientBoostedTrees.train(trainingData, boostingStrategy) 25 | 26 | // Evaluate model on test instances and compute test error 27 | val labelsAndPredictions = testData.map { point => 28 | val prediction = model.predict(point.features) 29 | (point.label, prediction) 30 | } 31 | val testMSE = labelsAndPredictions.map{ case(v, p) => math.pow((v - p), 2)}.mean() 32 | println("Test Mean Squared Error = " + testMSE) 33 | println("Learned regression GBT model:\n" + model.toDebugString) -------------------------------------------------------------------------------- /spark/chapter13/ensembleexample/src/test.txt: -------------------------------------------------------------------------------- 1 | // source: https://spark.apache.org/docs/1.2.0/mllib-decision-tree.html 2 | // Decision Tree with Gini impurity as an impurity 3 | // The test error is calculated to measure the algorithm accuracy. 4 | 5 | import org.apache.spark.mllib.tree.DecisionTree 6 | import org.apache.spark.mllib.util.MLUtils 7 | 8 | // Load and parse the data file. 9 | val data = MLUtils.loadLibSVMFile(sc, "/home/yyan/Desktop/data/svm.txt") 10 | // Split the data into training and test sets (30% held out for testing) 11 | val splits = data.randomSplit(Array(0.5, 0.5)) 12 | val (trainingData, testData) = (splits(0), splits(1)) 13 | 14 | // Train a DecisionTree model. 15 | // Empty categoricalFeaturesInfo indicates all features are continuous. 
16 | val numClasses = 2 17 | val categoricalFeaturesInfo = Map[Int, Int]() 18 | val impurity = "gini" 19 | val maxDepth = 10 20 | val maxBins = 32 21 | val minInstancesPerNode =5 22 | 23 | val model = DecisionTree.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo, 24 | impurity, maxDepth, maxBins, minInstancesPerNode) 25 | 26 | // Evaluate model on test instances and compute test error 27 | val labelAndPreds = testData.map { point => 28 | val prediction = model.predict(point.features) 29 | (point.label, prediction) 30 | } 31 | val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() 32 | println("Test Error = " + testErr) 33 | println("Learned classification tree model:\n" + model.toDebugString) -------------------------------------------------------------------------------- /spark/chapter13/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter5/decisiontreeexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter5/decisiontreeexample/result/classification.txt: -------------------------------------------------------------------------------- 1 | Test Error = 0.03125 2 | 3 | Learned classification tree model: 4 | DecisionTreeModel classifier of depth 2 with 5 nodes 5 | If (feature 434 <= 0.0) 6 | If (feature 99 <= 0.0) 7 | Predict: 0.0 8 | Else (feature 99 > 0.0) 9 | Predict: 1.0 10 | Else (feature 434 > 0.0) 11 | Predict: 1.0 12 | -------------------------------------------------------------------------------- /spark/chapter5/decisiontreeexample/result/regression.txt: -------------------------------------------------------------------------------- 1 | Test Mean Squared Error = 22.935061460324544 2 | 3 | Learned regression tree model: 4 | DecisionTreeModel regressor of depth 4 with 29 nodes 5 | If (feature 5 <= 0.269592) 6 | If (feature 12 <= -0.279249) 7 | If (feature 5 <= 0.140832) 8 | If (feature 7 <= -0.95335) 9 | Predict: 38.95 10 | Else (feature 7 > -0.95335) 11 | Predict: 21.594078947368423 12 | Else (feature 5 > 0.140832) 13 | If (feature 4 <= -0.427984) 14 | Predict: 28.96785714285715 15 | Else (feature 4 > -0.427984) 16 | Predict: 23.650000000000006 17 | Else (feature 12 > -0.279249) 18 | If (feature 0 <= -0.987141) 19 | If (feature 7 <= -0.857305) 20 | Predict: 15.514285714285714 21 | Else (feature 7 > -0.857305) 22 | Predict: 19.866666666666667 23 | Else (feature 0 > -0.987141) 24 | If (feature 0 <= -0.78628) 25 | Predict: 14.6 26 | Else (feature 0 > -0.78628) 27 | Predict: 11.236842105263158 28 | Else (feature 5 > 0.269592) 29 | If (feature 5 <= 0.491857) 30 | If (feature 12 <= -0.566225) 31 | If (feature 7 <= -0.857305) 32 | Predict: 45.65 33 | Else (feature 7 > -0.857305) 34 | Predict: 33.66923076923077 35 | Else (feature 12 > -0.566225) 36 | If (feature 2 <= -0.579179) 37 | Predict: 27.580000000000002 38 | Else (feature 2 > -0.579179) 39 | Predict: 15.0 40 | Else (feature 5 > 0.491857) 41 | If (feature 11 <= 0.798729) 42 | Predict: 21.9 43 | Else (feature 11 > 0.798729) 44 | If (feature 10 <= 0.106383) 45 | Predict: 46.4470588235294 46 | Else (feature 10 > 0.106383) 47 | Predict: 40.125 48 | -------------------------------------------------------------------------------- /spark/chapter5/decisiontreeexample/src/decisiontree-classification.scala: 
-------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Decision Tree example 3 | // Chapter 5 4 | 5 | // source: https://spark.apache.org/docs/1.2.0/mllib-decision-tree.html 6 | // Decision Tree with Gini impurity as an impurity 7 | // The test error is calculated to measure the algorithm accuracy. 8 | 9 | import org.apache.spark.mllib.tree.DecisionTree 10 | import org.apache.spark.mllib.util.MLUtils 11 | 12 | // Load and parse the data file. 13 | val data = MLUtils.loadLibSVMFile(sc, "/home/yyan/Desktop/data/sample_libsvm_data.txt") 14 | // Split the data into training and test sets (30% held out for testing) 15 | val splits = data.randomSplit(Array(0.7, 0.3)) 16 | val (trainingData, testData) = (splits(0), splits(1)) 17 | 18 | // Train a DecisionTree model. 19 | // Empty categoricalFeaturesInfo indicates all features are continuous. 20 | val numClasses = 2 21 | val categoricalFeaturesInfo = Map[Int, Int]() 22 | val impurity = "gini" 23 | val maxDepth = 4 // rather than 5, easy to test 24 | val maxBins = 32 25 | 26 | val model = DecisionTree.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo, 27 | impurity, maxDepth, maxBins) 28 | 29 | // Evaluate model on test instances and compute test error 30 | val labelAndPreds = testData.map { point => 31 | val prediction = model.predict(point.features) 32 | (point.label, prediction) 33 | } 34 | val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() 35 | println("Test Error = " + testErr) 36 | println("Learned classification tree model:\n" + model.toDebugString) -------------------------------------------------------------------------------- /spark/chapter5/decisiontreeexample/src/decisiontree-regression.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Decision Tree example 3 | // Chapter 5 4 | 5 | // source: https://spark.apache.org/docs/1.2.0/mllib-decision-tree.html 6 | // decision tree with variance as an impurity measure and a maximum tree depth of 4 7 | // The Mean Squared Error (MSE) is computed at the end to evaluate goodness of fit. 8 | 9 | import org.apache.spark.mllib.tree.DecisionTree 10 | import org.apache.spark.mllib.util.MLUtils 11 | 12 | // Load and parse the data file. 13 | val data = MLUtils.loadLibSVMFile(sc, "/home/yyan/Desktop/data/housing.txt") 14 | // Split the data into training and test sets (30% held out for testing) 15 | val splits = data.randomSplit(Array(0.7, 0.3)) 16 | val (trainingData, testData) = (splits(0), splits(1)) 17 | 18 | // Train a DecisionTree model. 19 | // Empty categoricalFeaturesInfo indicates all features are continuous. 
20 | val categoricalFeaturesInfo = Map[Int, Int]() 21 | val impurity = "variance" 22 | val maxDepth = 4 23 | val maxBins = 32 24 | 25 | val model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo, impurity, 26 | maxDepth, maxBins) 27 | 28 | // Evaluate model on test instances and compute test error 29 | val labelsAndPredictions = testData.map { point => 30 | val prediction = model.predict(point.features) 31 | (point.label, prediction) 32 | } 33 | val testMSE = labelsAndPredictions.map{ case(v, p) => math.pow((v - p), 2)}.mean() 34 | println("Test Mean Squared Error = " + testMSE) 35 | println("Learned regression tree model:\n" + model.toDebugString) -------------------------------------------------------------------------------- /spark/chapter5/decisiontreeexample/src/decsiontree-test.txt: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Decision Tree example 3 | // Chapter 5 4 | 5 | // source: https://spark.apache.org/docs/1.2.0/mllib-decision-tree.html 6 | // Decision Tree with Gini impurity as an impurity 7 | // The test error is calculated to measure the algorithm accuracy. 8 | 9 | import org.apache.spark.mllib.tree.DecisionTree 10 | import org.apache.spark.mllib.util.MLUtils 11 | 12 | // Load and parse the data file. 13 | val data = MLUtils.loadLibSVMFile(sc, "data/svm.txt") 14 | // Split the data into training and test sets (30% held out for testing) 15 | val splits = data.randomSplit(Array(0.5, 0.5)) 16 | val (trainingData, testData) = (splits(0), splits(1)) 17 | 18 | // Train a DecisionTree model. 19 | // Empty categoricalFeaturesInfo indicates all features are continuous. 20 | val numClasses = 2 21 | val categoricalFeaturesInfo = Map[Int, Int]() 22 | val impurity = "gini" 23 | val maxDepth = 10 24 | val maxBins = 32 25 | val minInstancesPerNode =5 26 | 27 | val model = DecisionTree.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo, 28 | impurity, maxDepth, maxBins, minInstancesPerNode) 29 | 30 | // Evaluate model on test instances and compute test error 31 | val labelAndPreds = testData.map { point => 32 | val prediction = model.predict(point.features) 33 | (point.label, prediction) 34 | } 35 | val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() 36 | println("Test Error = " + testErr) 37 | println("Learned classification tree model:\n" + model.toDebugString) -------------------------------------------------------------------------------- /spark/chapter5/randomforstexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter5/randomforstexample/result/RandomForests_classification.txt: -------------------------------------------------------------------------------- 1 | Test Error = 0.034482758620689655 2 | 3 | Learned classification forest model: 4 | TreeEnsembleModel classifier with 3 trees 5 | 6 | Tree 0: 7 | If (feature 434 <= 0.0) 8 | If (feature 632 <= 0.0) 9 | Predict: 1.0 10 | Else (feature 632 > 0.0) 11 | Predict: 0.0 12 | Else (feature 434 > 0.0) 13 | Predict: 1.0 14 | Tree 1: 15 | If (feature 399 <= 15.0) 16 | If (feature 356 <= 0.0) 17 | If (feature 126 <= 254.0) 18 | Predict: 1.0 19 | Else (feature 126 > 254.0) 20 | Predict: 0.0 21 | Else (feature 356 > 0.0) 22 | Predict: 0.0 23 | Else (feature 399 > 15.0) 24 | Predict: 0.0 25 | Tree 2: 26 | If (feature 517 <= 41.0) 27 | Predict: 0.0 28 | 
Else (feature 517 > 41.0) 29 | If (feature 548 <= 251.0) 30 | Predict: 1.0 31 | Else (feature 548 > 251.0) 32 | Predict: 0.0 33 | -------------------------------------------------------------------------------- /spark/chapter5/randomforstexample/src/RandomForest_regression.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Decision Tree based learning - Random Forest example 3 | // Chapter 5 4 | 5 | // source https://spark.apache.org/docs/1.2.0/mllib-ensembles.html 6 | // Random Forest. 7 | // The Mean Squared Error (MSE) is computed at the end to evaluate goodness of fit. 8 | 9 | 10 | import org.apache.spark.mllib.tree.RandomForest 11 | import org.apache.spark.mllib.util.MLUtils 12 | 13 | // Load and parse the data file. 14 | val data = MLUtils.loadLibSVMFile(sc, "/home/yyan/Desktop/data/housing.txt") 15 | // Split the data into training and test sets (30% held out for testing) 16 | val splits = data.randomSplit(Array(0.7, 0.3)) 17 | val (trainingData, testData) = (splits(0), splits(1)) 18 | 19 | // Train a RandomForest model. 20 | // Empty categoricalFeaturesInfo indicates all features are continuous. 21 | val numClasses = 2 22 | val categoricalFeaturesInfo = Map[Int, Int]() 23 | val numTrees = 3 // Use more in practice. 24 | val featureSubsetStrategy = "auto" // Let the algorithm choose. 25 | val impurity = "variance" 26 | val maxDepth = 4 27 | val maxBins = 32 28 | 29 | val model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo, 30 | numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins) 31 | 32 | // Evaluate model on test instances and compute test error 33 | val labelsAndPredictions = testData.map { point => 34 | val prediction = model.predict(point.features) 35 | (point.label, prediction) 36 | } 37 | val testMSE = labelsAndPredictions.map{ case(v, p) => math.pow((v - p), 2)}.mean() 38 | println("Test Mean Squared Error = " + testMSE) 39 | println("Learned regression forest model:\n" + model.toDebugString) -------------------------------------------------------------------------------- /spark/chapter5/randomforstexample/src/RandomForests_classification.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Decision Tree based learning - Random Forest example 3 | // Chapter 5 4 | 5 | // source https://spark.apache.org/docs/1.2.0/mllib-ensembles.html 6 | // Random Forest. 7 | // The test error is calculated to measure the algorithm accuracy. 8 | 9 | import org.apache.spark.mllib.tree.RandomForest 10 | import org.apache.spark.mllib.util.MLUtils 11 | 12 | // Load and parse the data file. 13 | val data = MLUtils.loadLibSVMFile(sc, "/home/yyan/Desktop/data/sample_libsvm_data.txt") 14 | // Split the data into training and test sets (30% held out for testing) 15 | val splits = data.randomSplit(Array(0.7, 0.3)) 16 | val (trainingData, testData) = (splits(0), splits(1)) 17 | 18 | // Train a RandomForest model. 19 | // Empty categoricalFeaturesInfo indicates all features are continuous. 20 | val numClasses = 2 21 | val categoricalFeaturesInfo = Map[Int, Int]() 22 | val numTrees = 3 // Use more in practice. 23 | val featureSubsetStrategy = "auto" // Let the algorithm choose.
24 | val impurity = "gini" 25 | val maxDepth = 4 26 | val maxBins = 32 27 | 28 | val model = RandomForest.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo, 29 | numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins) 30 | 31 | // Evaluate model on test instances and compute test error 32 | val labelAndPreds = testData.map { point => 33 | val prediction = model.predict(point.features) 34 | (point.label, prediction) 35 | } 36 | val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() 37 | println("Test Error = " + testErr) 38 | println("Learned classification forest model:\n" + model.toDebugString) -------------------------------------------------------------------------------- /spark/chapter5/randomforstexample/src/test.txt: -------------------------------------------------------------------------------- 1 | // source: https://spark.apache.org/docs/1.2.0/mllib-decision-tree.html 2 | // Decision Tree with Gini impurity as an impurity 3 | // The test error is calculated to measure the algorithm accuracy. 4 | 5 | import org.apache.spark.mllib.tree.DecisionTree 6 | import org.apache.spark.mllib.util.MLUtils 7 | 8 | // Load and parse the data file. 9 | val data = MLUtils.loadLibSVMFile(sc, "/home/yyan/Desktop/data/svm.txt") 10 | // Split the data into training and test sets (30% held out for testing) 11 | val splits = data.randomSplit(Array(0.5, 0.5)) 12 | val (trainingData, testData) = (splits(0), splits(1)) 13 | 14 | // Train a DecisionTree model. 15 | // Empty categoricalFeaturesInfo indicates all features are continuous. 16 | val numClasses = 2 17 | val categoricalFeaturesInfo = Map[Int, Int]() 18 | val impurity = "gini" 19 | val maxDepth = 10 20 | val maxBins = 32 21 | val minInstancesPerNode =5 22 | 23 | val model = DecisionTree.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo, 24 | impurity, maxDepth, maxBins, minInstancesPerNode) 25 | 26 | // Evaluate model on test instances and compute test error 27 | val labelAndPreds = testData.map { point => 28 | val prediction = model.predict(point.features) 29 | (point.label, prediction) 30 | } 31 | val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() 32 | println("Test Error = " + testErr) 33 | println("Learned classification tree model:\n" + model.toDebugString) -------------------------------------------------------------------------------- /spark/chapter5/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter6/knnexample/example-run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | RCM_PATH="$(pwd)/target/scala-2.10/SparkRecommender-assembly-0.1.jar" 3 | LIB_PATH="$(pwd)/lib/" 4 | /usr/bin/spark-submit --driver-memory 2G --executor-memory 6G \ 5 | --driver-class-path $LIB_PATH --class Boot $RCM_PATH $@ 6 | -------------------------------------------------------------------------------- /spark/chapter6/knnexample/project/Build.scala: -------------------------------------------------------------------------------- 1 | import sbt.Keys._ 2 | import sbt._ 3 | 4 | object MyBuild extends Build { 5 | 6 | lazy val copyDependencies = TaskKey[Unit]("copy-dependencies") 7 | 8 | def copyDepTask = copyDependencies <<= (update, crossTarget, scalaVersion) map { 9 | (updateReport, out, scalaVer) => 10 | updateReport.allFiles foreach { srcPath => 11 | val destPath = 
out / "lib" / srcPath.getName 12 | IO.copyFile(srcPath, destPath, preserveLastModified=true) 13 | } 14 | } 15 | 16 | lazy val root = Project( 17 | "root", 18 | file("."), 19 | settings = Defaults.defaultSettings ++ Seq( 20 | copyDepTask 21 | ) 22 | ) 23 | } -------------------------------------------------------------------------------- /spark/chapter6/knnexample/project/META-INF/MANIFEST.MF: -------------------------------------------------------------------------------- 1 | Manifest-Version: 1.0 2 | 3 | -------------------------------------------------------------------------------- /spark/chapter6/knnexample/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /spark/chapter6/knnexample/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | logLevel := Level.Warn -------------------------------------------------------------------------------- /spark/chapter6/knnexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter6/svmexample/build.sbt: -------------------------------------------------------------------------------- 1 | name := "Spark Kernel SVM" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | resolvers += "Spark Packages Repo" at "http://dl.bintray.com/spark-packages/maven" 8 | 9 | libraryDependencies ++= Seq( 10 | "org.apache.spark" %% "spark-core" % "1.3.1", 11 | "org.apache.spark" %% "spark-mllib" % "1.3.1", 12 | "amplab" % "spark-indexedrdd" % "0.1" 13 | ) 14 | -------------------------------------------------------------------------------- /spark/chapter6/svmexample/doc/usage.txt: -------------------------------------------------------------------------------- 1 | Build: 2 | First cd into .../Spark_kernel_svm then: 3 | $ sbt package 4 | 5 | Run: 6 | First cd into .../Spark_kernel_svm then: 7 | $ (your spark dir)/bin/spark-submit \ 8 | --packages amplab:spark-indexedrdd:0.1 \ 9 | target/scala-2.10/spark-kernel-svm_2.10-1.0.jar -------------------------------------------------------------------------------- /spark/chapter6/svmexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter6/svmexample/src/main/scala/Kernels.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Support Vector machine example 3 | // Chapter 6 4 | 5 | /* 6 | * Some Kernel functions. 
7 | */ 8 | import org.apache.spark.mllib.linalg.Vector 9 | import org.apache.spark.mllib.linalg.Vectors 10 | 11 | /** Rbf Kernel, parametrized by gamma */ 12 | class RbfKernelFunc(gamma_s: Double) extends java.io.Serializable{ 13 | var gamma: Double = gamma_s 14 | def evaluate(x_1: Vector, x_2: Vector): Double = { 15 | math.exp(-1 * gamma * math.pow(Vectors.sqdist(x_1, x_2),2)) 16 | } 17 | } -------------------------------------------------------------------------------- /spark/chapter6/svmexample/src/main/scala/main.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Support Vector machine example 3 | // Chapter 6 4 | 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.SparkContext._ 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.rdd._ 9 | 10 | import org.apache.spark.mllib.util.MLUtils 11 | 12 | import java.io._ 13 | import java.lang.System 14 | 15 | object TestKernelSVM { 16 | def main(args: Array[String]) { 17 | 18 | if (args.length != 1 ) { 19 | println("Usage: /path/to/spark/bin/spark-submit --packages amplab:spark-indexedrdd:0.1" + 20 | "target/scala-2.10/spark-kernel-svm_2.10-1.0.jar ") 21 | sys.exit(1) 22 | } 23 | 24 | val logFile = "README.md" // Should be some file on your system 25 | val conf = new SparkConf().setAppName("KernelSVM Test") 26 | val sc = new SparkContext(conf) 27 | 28 | val data = MLUtils.loadLibSVMFile(sc, args(0)) 29 | 30 | val splits = data.randomSplit(Array(0.8,0.2)) 31 | val training = splits(0) 32 | val test = splits(1).collect() 33 | 34 | val m = training.count() 35 | 36 | var pack_size = 100 37 | 38 | val iterations = List((0.5*m).toLong,m.toLong,(1.5*m).toLong,(2*m).toLong) 39 | var num_iter = 0 40 | 41 | val pw = new PrintWriter(new File("result.txt" )) 42 | 43 | for (num_iter <- iterations) { 44 | val t1 = System.currentTimeMillis 45 | val svm = new KernelSVM(training, 1.0/m, "rbf", 1.0) 46 | svm.train(num_iter,pack_size) 47 | val t2 = System.currentTimeMillis 48 | val runtime = (t2 - t1)/1000 49 | 50 | var ss = m.toString + " " + num_iter.toString + " " + pack_size.toString + " " + svm.getAccuracy(test).toString + " " + runtime.toString + "\n" 51 | pw.write(ss) 52 | } 53 | 54 | pw.close 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /spark/chapter7/aprioriexample/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | spark 3 | spark-apriori 4 | 4.0.0 5 | Spark A Priori 6 | jar 7 | 1.0 8 | 9 | 10 | scala-tools.org 11 | Scala-tools Maven2 Repository 12 | http://scala-tools.org/repo-releases 13 | 14 | 15 | 16 | 17 | scala-tools.org 18 | Scala-tools Maven2 Repository 19 | http://scala-tools.org/repo-releases 20 | 21 | 22 | 23 | 24 | org.scala-lang 25 | scala-library 26 | 2.11.5 27 | 28 | 29 | org.apache.spark 30 | spark-core_2.10 31 | 1.2.1 32 | 33 | 34 | 35 | src/main/scala 36 | 37 | 38 | org.apache.maven.plugins 39 | maven-compiler-plugin 40 | 41 | 1.7 42 | 1.7 43 | 44 | 45 | 46 | net.alchim31.maven 47 | scala-maven-plugin 48 | 3.1.6 49 | 50 | 51 | 52 | compile 53 | testCompile 54 | 55 | 56 | 57 | 58 | 59 | 60 | -nobootcp 61 | 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /spark/chapter7/aprioriexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
/spark/chapter7/aprioriexample/src/main/scala/default/BloomFilter.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Association rule based learning - Apriori example 3 | // Chapter 7 4 | 5 | package default 6 | 7 | import scala.collection.BitSet 8 | 9 | class BloomFilter[T]( 10 | buckets: Int, 11 | multiplier: Int, 12 | increment: Int, 13 | private val bitset: BitSet = BitSet()) extends Set[T] { 14 | import BloomFilter._ 15 | 16 | override def contains(elem: T): Boolean = 17 | bitset.contains((reHash(multiplier, increment)(elem) % buckets)) 18 | 19 | override def +(elem: T): Set[T] = 20 | new BloomFilter( 21 | buckets, 22 | multiplier, 23 | increment, 24 | bitset + (reHash(multiplier, increment)(elem) % buckets) 25 | ) 26 | 27 | override def -(elem: T): Set[T] = ??? 28 | 29 | override def iterator: Iterator[T] = ??? 30 | } 31 | 32 | object BloomFilter extends App { 33 | def ??? : Nothing = throw new UnsupportedOperationException() 34 | 35 | def apply[T](buckets: Int, multiplier: Int = 12568, increment: Int = 76509)(elems: T*) = { 36 | new BloomFilter[T]( 37 | buckets, 38 | multiplier, 39 | increment, 40 | BitSet(elems.map { elem: T => reHash(multiplier, increment)(elem) % buckets }: _*) 41 | ) 42 | } 43 | 44 | def reHash( 45 | multiplier: Int, 46 | increment: Int 47 | )(a: Any): Int = a.hashCode() * multiplier + increment 48 | } 49 | -------------------------------------------------------------------------------- /spark/chapter7/aprioriexample/src/main/scala/default/FrequentItemSets.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Association rule based learning - Apriori example 3 | // Chapter 7 4 | 5 | package default 6 | 7 | import scala.collection.SortedSet 8 | 9 | trait FrequentItemSets { 10 | /** 11 | * Collection type for frequent item sets. 12 | * 13 | * @tparam T item type. 14 | */ 15 | type ItemSet[T] = SortedSet[T] 16 | } 17 | -------------------------------------------------------------------------------- /spark/chapter7/aprioriexample/src/main/scala/default/NaiveFrequentItemSets.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Association rule based learning - Apriori example 3 | // Chapter 7 4 | 5 | package default 6 | 7 | import scala.collection.Map 8 | import scala.reflect.ClassTag 9 | 10 | import org.apache.spark.SparkContext 11 | import org.apache.spark.SparkContext.rddToPairRDDFunctions 12 | import org.apache.spark.rdd.RDD 13 | 14 | /** 15 | * Utility to compute frequent item sets for association rule mining. 16 | * 17 | * @see [[http://en.wikipedia.org/wiki/Association_rule_learning]] 18 | */ 19 | object NaiveFrequentItemSets extends FrequentItemSets { 20 | /** 21 | * Computes frequent item sets up to the specified size. 22 | * 23 | * @param sparkContext context in which to run. This is used to broadcast shared memory. 24 | * @param baskets to analyze. 25 | * @param supportThreshold minimum # of times an item set must occur to be considered frequent. 26 | * @param maxSize maximum item set size. 27 | * @tparam T item type. 28 | * @return map of frequent item sets and their counts. 29 | */ 30 | def apply[T: Ordering: ClassTag](sparkContext: SparkContext)( 31 | baskets: RDD[ItemSet[T]], 32 | supportThreshold: Int, 33 | maxSize: Int): Map[ItemSet[T], Int] = { 34 | // Count item subsets from size 1 up to maxSize. 
35 | baskets.flatMap { basket: ItemSet[T] => 36 | (1 to maxSize).map(basket.subsets(_)).reduce(_ ++ _).map((_, 1)) 37 | } 38 | .reduceByKey(_ + _) 39 | // Filter by support threshold. 40 | .filter { case (itemSet: ItemSet[T], count: Int) => count >= supportThreshold } 41 | .collectAsMap() 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /spark/chapter7/aprioriexample/src/main/scala/default/TestMain.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Association rule based learning - Apriori example 3 | // Chapter 7 4 | 5 | package default 6 | 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.SparkConf 10 | import scala.collection.SortedSet 11 | 12 | /** 13 | * Delete me and write real tests. 14 | */ 15 | object TestMain extends App with FrequentItemSets { 16 | 17 | val conf: SparkConf = new SparkConf().setMaster("local").setAppName("Simple Application") 18 | val sparkContext: SparkContext = new SparkContext(conf) 19 | 20 | val filePath: String = "/home/shashir/data.txt" 21 | 22 | val data: RDD[ItemSet[String]] = sparkContext.textFile(filePath, 2).map { line: String => 23 | SortedSet(line.split(" "): _*) 24 | }.cache() 25 | 26 | APriori(sparkContext)(data, 400, 3).foreach(println) 27 | } 28 | -------------------------------------------------------------------------------- /spark/chapter7/fpgrowthexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter7/fpgrowthexample/src/main/scala/default/Test.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Association rule based learning - FPGrowth example 3 | // Chapter 7 4 | 5 | package default 6 | 7 | object Test { 8 | def main (args: Array[String]) { 9 | val data = Array(Array("f", "c", "a", "d", "g", "i", "m", "p"), Array("a", "b", "c", "f", "l", "m", "o"), Array("b", "f", "h", "j", "o"), Array("b", "c", "k", "s", "p"), Array("a", "f", "c", "e", "l", "p", "m", "n")) 10 | FPTree(data, 3) 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /spark/chapter7/fpgrowthexample/src/main/scala/default/TreeNode.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Association rule based learning - FPGrowth example 3 | // Chapter 7 4 | 5 | package default 6 | 7 | import scala.collection.mutable.ArrayBuffer 8 | 9 | /** 10 | * TreeNode.scala 11 | * Description: This is the definition of TreeNode of FP-Tree 12 | * Author: Lin, Chen 13 | * E-mail: chlin.ecnu@gmail.com 14 | * Version: 1.0 15 | */ 16 | 17 | class TreeNode (val name: String = null, var count: Long = 0, var parent: TreeNode = null, val children: ArrayBuffer[TreeNode] = new ArrayBuffer[TreeNode](), var nextHomonym: TreeNode = null){ 18 | def findChild(name: String): TreeNode = { 19 | children.find(_.name == name) match { 20 | case Some(node) => node 21 | case None => null 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /spark/chapter7/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
/spark/chapter8/k-meansexample/build.sbt: -------------------------------------------------------------------------------- 1 | name := "Spark K-Means" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.0.1" 8 | 9 | resolvers += "Akka Repository" at "http://repo.akka.io/releases/" 10 | 11 | -------------------------------------------------------------------------------- /spark/chapter8/k-meansexample/input/centroids.txt: -------------------------------------------------------------------------------- 1 | 1 2024854310.3956 6731000548.2243 2 | 2 721344868.9113 8856893681.4588 3 | 3 8857754499.5514 5251661686.7686 4 | 4 1077585262.2174 2699556533.0996 5 | 5 4180772568.7333 8490011787.7244 6 | 6 9695729913.2825 2127828538.6125 7 | 7 4056302822.5753 3828270338.5277 8 | 8 9588172531.2373 9453674798.3649 9 | 9 1838580091.0374 1032953856.2467 10 | 10 9081427080.8788 245825688.9429 11 | -------------------------------------------------------------------------------- /spark/chapter8/k-meansexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter8/k-meansexample/run.sh: -------------------------------------------------------------------------------- 1 | sbt package 2 | rm -rf final 3 | spark-submit \ 4 | --class "com.jgalilee.spark.kmeans.JobDriver" \ 5 | --master local[4] \ 6 | ./target/scala-2.10/spark-k-means_2.10-1.0.jar \ 7 | input/points.txt input/centroids.txt final 10 0.0 3 8 | cat final/p* | sort 9 | -------------------------------------------------------------------------------- /spark/chapter8/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter9/naivebayesexample/build.sbt: -------------------------------------------------------------------------------- 1 | name := "blog-spark-naive-bayes-reuters" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | val sparkVersion = "1.0.0" 8 | 9 | libraryDependencies <<= scalaVersion { 10 | scala_version => Seq( 11 | // Spark and Mllib 12 | "org.apache.spark" %% "spark-core" % sparkVersion, 13 | "org.apache.spark" %% "spark-mllib" % sparkVersion, 14 | // Lucene 15 | "org.apache.lucene" % "lucene-core" % "4.8.1", 16 | // for Porter Stemmer 17 | "org.apache.lucene" % "lucene-analyzers-common" % "4.8.1", 18 | // Guava for the dictionary 19 | "com.google.guava" % "guava" % "17.0", 20 | // article extractor 21 | "com.gravity" %% "goose" % "2.1.23" 22 | ) 23 | } 24 | 25 | // used for goose 26 | resolvers += Resolver.mavenLocal 27 | -------------------------------------------------------------------------------- /spark/chapter9/naivebayesexample/download-reuters.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | REUTERS_FILE="reuters21578.tar.gz" 4 | if [ ! 
-f $REUTERS_FILE ] 5 | then 6 | wget http://www.daviddlewis.com/resources/testcollections/reuters21578/$REUTERS_FILE 7 | fi 8 | mkdir -p reuters 9 | (cd reuters; tar xvfz ../$REUTERS_FILE) 10 | -------------------------------------------------------------------------------- /spark/chapter9/naivebayesexample/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | logLevel := Level.Warn 2 | 3 | addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") 4 | 5 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.5.0") 6 | -------------------------------------------------------------------------------- /spark/chapter9/naivebayesexample/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /spark/chapter9/naivebayesexample/src/main/scala/default/ReutersParser.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Bayesian learning - Naive Bayes example 3 | // Chapter 9 4 | 5 | package default 6 | 7 | 8 | import scala.xml.pull.{EvText, EvElemEnd, EvElemStart, XMLEventReader} 9 | import scala.io.Source 10 | import scala.collection.mutable 11 | 12 | object ReutersParser { 13 | def PopularCategories = Seq("money", "fx", "crude", "grain", "trade", "interest", "wheat", "ship", "corn", "oil", "dlr", "gas", "oilseed", "supply", "sugar", "gnp", "coffee", "veg", "gold", "nat", "soybean", "bop", "livestock", "cpi") 14 | 15 | def parseAll(xmlFiles: Iterable[String]) = xmlFiles flatMap parse 16 | 17 | def parse(xmlFile: String) = { 18 | val docs = mutable.ArrayBuffer.empty[Document] 19 | val xml = new XMLEventReader(Source.fromFile(xmlFile, "latin1")) 20 | var currentDoc: Document = null 21 | var inTopics = false 22 | var inLabel = false 23 | var inBody = false 24 | for (event <- xml) { 25 | event match { 26 | case EvElemStart(_, "REUTERS", attrs, _) => 27 | currentDoc = Document(attrs.get("NEWID").get.head.text) 28 | 29 | case EvElemEnd(_, "REUTERS") => 30 | if (currentDoc.labels.nonEmpty) { 31 | docs += currentDoc 32 | } 33 | 34 | case EvElemStart(_, "TOPICS", _, _) => inTopics = true 35 | 36 | case EvElemEnd(_, "TOPICS") => inTopics = false 37 | 38 | case EvElemStart(_, "D", _, _) => inLabel = true 39 | 40 | case EvElemEnd(_, "D") => inLabel = false 41 | 42 | case EvElemStart(_, "BODY", _, _) => inBody = true 43 | 44 | case EvElemEnd(_, "BODY") => inBody = false 45 | 46 | case EvText(text) => 47 | if (text.trim.nonEmpty) { 48 | if (inTopics && inLabel && PopularCategories.contains(text)) { 49 | currentDoc = currentDoc.copy(labels = currentDoc.labels + text) 50 | } else if (inBody) { 51 | currentDoc = currentDoc.copy(body = currentDoc.body + text.trim) 52 | } 53 | } 54 | 55 | case _ => 56 | } 57 | } 58 | docs 59 | } 60 | } 61 | 62 | case class Document(docId: String, body: String = "", labels: Set[String] = Set.empty) 63 | -------------------------------------------------------------------------------- /spark/chapter9/naivebayesexample/src/main/scala/default/Tokenizer.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Bayesian learning - Naive Bayes example 3 | // Chapter 9 4 | 5 | package default 6 | 7 | 8 | import java.io.StringReader 9 | import org.apache.lucene.analysis.en.EnglishAnalyzer 10 | import org.apache.lucene.util.Version 11 | import 
org.apache.lucene.analysis.tokenattributes.CharTermAttribute 12 | import scala.collection.mutable 13 | 14 | object Tokenizer { 15 | val LuceneVersion = Version.LUCENE_48 16 | 17 | def tokenizeAll(docs: Iterable[Document]) = docs.map(tokenize) 18 | 19 | def tokenize(doc: Document): TermDoc = TermDoc(doc.docId, doc.labels, tokenize(doc.body)) 20 | 21 | def tokenize(content: String): Seq[String] = { 22 | val tReader = new StringReader(content) 23 | val analyzer = new EnglishAnalyzer(LuceneVersion) 24 | val tStream = analyzer.tokenStream("contents", tReader) 25 | val term = tStream.addAttribute(classOf[CharTermAttribute]) 26 | tStream.reset() 27 | 28 | val result = mutable.ArrayBuffer.empty[String] 29 | while(tStream.incrementToken()) { 30 | val termValue = term.toString 31 | if (!(termValue matches ".*[\\d\\.].*")) { 32 | result += term.toString 33 | } 34 | } 35 | result 36 | } 37 | } 38 | 39 | case class TermDoc(doc: String, labels: Set[String], terms: Seq[String]) -------------------------------------------------------------------------------- /spark/chapter9/naivebayesexample/src/main/scala/default/VectorUtil.scala: -------------------------------------------------------------------------------- 1 | // Practical Machine learning 2 | // Bayesian learning - Naive Bayes example 3 | // Chapter 9 4 | 5 | package default 6 | 7 | import com.google.common.collect.ImmutableBiMap 8 | import scala.collection.JavaConversions._ 9 | import org.apache.spark.mllib.linalg.Vectors 10 | 11 | class Dictionary(dict: Seq[String]) extends Serializable { 12 | 13 | // map term => index 14 | val termToIndex = ImmutableBiMap.builder[String, Int]() 15 | .putAll(dict.zipWithIndex.toMap[String, Int]) 16 | .build() 17 | 18 | @transient 19 | lazy val indexToTerm = termToIndex.inverse() 20 | 21 | val count = termToIndex.size() 22 | 23 | def indexOf(term: String) = termToIndex(term) 24 | 25 | def valueOf(index: Int) = indexToTerm(index) 26 | 27 | def tfIdfs(terms: Seq[String], idfs: Map[String, Double]) = { 28 | val filteredTerms = terms.filter(idfs contains) 29 | (filteredTerms.groupBy(identity).map { 30 | case (term, instances) => 31 | (indexOf(term), (instances.size.toDouble / filteredTerms.size.toDouble) * idfs(term)) 32 | }).toSeq.sortBy(_._1) // sort by termId 33 | } 34 | 35 | def vectorize(tfIdfs: Iterable[(Int, Double)]) = { 36 | Vectors.sparse(dict.size, tfIdfs.toSeq) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /spark/chapter9/readme.md: -------------------------------------------------------------------------------- 1 | 2 | --------------------------------------------------------------------------------
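The Reuters Naive Bayes example above ships the parser, tokenizer and tf-idf dictionary, but no driver that wires them together. A minimal driver sketch is given below, assuming the ReutersParser, Tokenizer and Dictionary classes from this chapter are on the classpath in package default; the object name, the input file reuters/reut2-000.sgm and the smoothing parameter lambda = 1.0 are illustrative assumptions, not part of the repository.

// Hypothetical driver sketch wiring together the chapter 9 Naive Bayes pieces.
package default

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.regression.LabeledPoint

object ReutersNaiveBayesDriver extends App {
  val sc = new SparkContext(new SparkConf().setAppName("Reuters Naive Bayes").setMaster("local[*]"))

  // Parse and tokenize the Reuters files fetched by download-reuters.sh (path is a placeholder).
  val termDocs = Tokenizer.tokenizeAll(ReutersParser.parseAll(Seq("reuters/reut2-000.sgm"))).toSeq
  val termDocsRdd = sc.parallelize(termDocs)

  // Build the dictionary and per-term inverse document frequencies.
  val dictionary = new Dictionary(termDocsRdd.flatMap(_.terms).distinct().collect())
  val numDocs = termDocsRdd.count()
  val idfs = termDocsRdd.flatMap(_.terms.distinct.map((_, 1))).reduceByKey(_ + _)
    .map { case (term, df) => (term, math.log(numDocs.toDouble / df)) }
    .collect().toMap

  // Index the labels and turn each document into a labeled tf-idf vector
  // (only the first label of a multi-label document is kept in this sketch).
  val labelIndex = termDocs.flatMap(_.labels).toSet.toSeq.zipWithIndex.toMap
  val training = termDocsRdd.filter(_.labels.nonEmpty).map { termDoc =>
    LabeledPoint(labelIndex(termDoc.labels.head).toDouble,
      dictionary.vectorize(dictionary.tfIdfs(termDoc.terms, idfs)))
  }

  // Train an MLlib multinomial Naive Bayes model; lambda is the additive smoothing term.
  val model = NaiveBayes.train(training, lambda = 1.0)

  sc.stop()
}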