├── CSVtoLibSVM_converter.R
├── JavaRandomForestClassificationExample.java
├── JavaRandomForestRegressionExample.java
├── Letterdata_libsvm.data
├── LinearRegression_revisited.py
├── README.md
├── letterdata.data
├── pom.xml
└── winutils.exe

/CSVtoLibSVM_converter.R:
--------------------------------------------------------------------------------
install.packages("e1071")   # install the e1071 package
install.packages("SparseM") # install the SparseM package
library(e1071)
library(SparseM)            # load the libraries

train <- read.csv("C:/Users/rezkar/Downloads/letterdata.csv") # load the CSV dataset into memory
dim(train)                  # inspect the dimensions (16 features plus the letter label)

# Encode the letter labels as 0-25: MLlib's trainClassifier expects class labels
# in {0, ..., numClasses - 1}, so subtract 1 from R's 1-based factor codes.
train$letter <- as.numeric(as.factor(train$letter)) - 1
y <- train[, 17]             # put the labels (column 17) in a separate vector
x <- as.matrix(train[, -17]) # feature matrix; drop the label column so it is not leaked into the features
xs <- as.matrix.csr(x)       # convert to compressed sparse row format
write.matrix.csr(xs, y = y, file = "C:/Users/rezkar/Downloads/Letterdata.data") # write the output LibSVM-format file
--------------------------------------------------------------------------------
/JavaRandomForestClassificationExample.java:
--------------------------------------------------------------------------------
package com.example.RandomForest;

import java.util.HashMap;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.mllib.evaluation.MulticlassMetrics;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.tree.RandomForest;
import org.apache.spark.mllib.tree.model.RandomForestModel;
import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.sql.SparkSession;

import com.example.SparkSession.UtilityForSparkSession;

import scala.Tuple2;

public class JavaRandomForestClassificationExample {
  static SparkSession spark = UtilityForSparkSession.mySession();

  public static void main(String[] args) {
    // Load and parse the LibSVM-format data file.
    String datapath = "input/Letterdata_libsvm.data";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(spark.sparkContext(), datapath).toJavaRDD();

    // Split the data into training and test sets (30% held out for testing).
    JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3}, 12345);
    JavaRDD<LabeledPoint> trainingData = splits[0];
    JavaRDD<LabeledPoint> testData = splits[1];

    // Train a RandomForest model. An empty categoricalFeaturesInfo map indicates all features are continuous.
    Integer numClasses = 26; // one class per letter, A-Z
    HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
    Integer numTrees = 10; // Use more in practice.
    String featureSubsetStrategy = "auto"; // Let the algorithm choose the feature subset strategy.
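    // "auto" resolves as follows in MLlib: with a single tree, all features are considered at
    // each split; with more than one tree, it uses sqrt(numFeatures) for classification and one
    // third of the features for regression. "all", "sqrt", "log2", and "onethird" can also be
    // set explicitly instead.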
    String impurity = "gini";
    Integer maxDepth = 30;
    Integer maxBins = 40;
    Integer seed = 12345;

    final RandomForestModel model = RandomForest.trainClassifier(trainingData, numClasses,
        categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins,
        seed);

    // Evaluation-1: evaluate the model on test instances and compute the test error.
    JavaPairRDD<Double, Double> predictionAndLabel =
        testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
          @Override
          public Tuple2<Double, Double> call(LabeledPoint p) {
            return new Tuple2<>(model.predict(p.features()), p.label());
          }
        });

    Double testErr =
        1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
          @Override
          public Boolean call(Tuple2<Double, Double> pl) {
            return !pl._1().equals(pl._2()); // a prediction counts as an error when it differs from the label
          }
        }).count() / testData.count();
    System.out.println("Test Error: " + testErr);
    System.out.println("Learned classification forest model:\n" + model.toDebugString());

    // Evaluation-2: evaluate the model on test instances and compute the related performance statistics.
    JavaRDD<Tuple2<Object, Object>> predictionAndLabels = testData.map(
        new Function<LabeledPoint, Tuple2<Object, Object>>() {
          public Tuple2<Object, Object> call(LabeledPoint p) {
            Double prediction = model.predict(p.features());
            return new Tuple2<Object, Object>(prediction, p.label());
          }
        }
    );

    // Get the evaluation metrics.
    MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());
    System.out.println(metrics.confusionMatrix());
    double precision = metrics.precision(metrics.labels()[0]);
    double recall = metrics.recall(metrics.labels()[0]);
    double f_measure = metrics.fMeasure();
    double query_label = 8.0; // inspect the per-class rates for the class encoded as 8.0
    double TP = metrics.truePositiveRate(query_label);
    double FP = metrics.falsePositiveRate(query_label);
    double WTP = metrics.weightedTruePositiveRate();
    double WFP = metrics.weightedFalsePositiveRate();
    System.out.println("Precision = " + precision);
    System.out.println("Recall = " + recall);
    System.out.println("F-measure = " + f_measure);
    System.out.println("True Positive Rate = " + TP);
    System.out.println("False Positive Rate = " + FP);
    System.out.println("Weighted True Positive Rate = " + WTP);
    System.out.println("Weighted False Positive Rate = " + WFP);

    // Save and load the model
    //model.save(spark.sparkContext(), "target/tmp/myRandomForestClassificationModel");
    //RandomForestModel sameModel = RandomForestModel.load(spark.sparkContext(), "target/tmp/myRandomForestClassificationModel");

    spark.stop();
  }
}
--------------------------------------------------------------------------------
/JavaRandomForestRegressionExample.java:
--------------------------------------------------------------------------------
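// A regression counterpart to the classification example above: the label is a continuous target
// (the commented-out path below points at the YearPredictionMSD dataset, where it is a song's
// release year), so the forest is built with trainRegressor() and the "variance" impurity
// instead of trainClassifier() and "gini".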
package com.example.RandomForest;

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.HashMap;
import java.util.Map;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.mllib.evaluation.MulticlassMetrics;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.tree.RandomForest;
import org.apache.spark.mllib.tree.model.RandomForestModel;
import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import com.example.SparkSession.UtilityForSparkSession;

import scala.Tuple2;

public class JavaRandomForestRegressionExample {
  static SparkSession spark = UtilityForSparkSession.mySession();

  public static void main(String[] args) throws FileNotFoundException {
    // String datapath = "C:/Users/rezkar/Downloads/KEGG/YearPredictionMSD/YearPredictionMSD";
    String datapath = args[0];
    Dataset<Row> df = spark.read().format("libsvm").option("header", "true").load(datapath);
    df.show();

    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(spark.sparkContext(), datapath).toJavaRDD();
    // Split the data into training and test sets (89.98147% held out for training, the rest for testing).
    JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.8998147, 0.1001853});
    JavaRDD<LabeledPoint> trainingData = splits[0];
    JavaRDD<LabeledPoint> testData = splits[1];

    // Set the parameters. The empty categoricalFeaturesInfo map indicates all features are continuous.
    Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
    Integer numTrees = 20; // Use more in practice.
    String featureSubsetStrategy = "auto"; // Let the algorithm choose.
    String impurity = "variance";
    Integer maxDepth = 20;
    Integer maxBins = 20;
    Integer seed = 12345;

    // Train a RandomForest regression model.
    final RandomForestModel model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo,
        numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins, seed);

    // Evaluate the model on test instances and compute the test error.
    JavaPairRDD<Double, Double> predictionAndLabel =
        testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
          @Override
          public Tuple2<Double, Double> call(LabeledPoint p) {
            return new Tuple2<>(model.predict(p.features()), p.label());
          }
        });

    // Redirect standard output to a file so the metrics below land in output.txt.
    PrintStream out = new PrintStream(new FileOutputStream("output.txt"));
    System.setOut(out);

    Double testMSE =
        predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
          @Override
          public Double call(Tuple2<Double, Double> pl) {
            Double diff = pl._1() - pl._2();
            return diff * diff; // squared residual for one test instance
          }
        }).reduce(new Function2<Double, Double, Double>() {
          @Override
          public Double call(Double a, Double b) {
            return a + b;
          }
        }) / testData.count();

    System.out.println("Test Mean Squared Error: " + testMSE);
    //System.out.println("Learned regression forest model:\n" + model.toDebugString());

    // Evaluation-2: evaluate the model on test instances and compute the related performance statistics.
    JavaRDD<Tuple2<Object, Object>> predictionAndLabels = testData.map(
        new Function<LabeledPoint, Tuple2<Object, Object>>() {
          public Tuple2<Object, Object> call(LabeledPoint p) {
            Double prediction = model.predict(p.features());
            return new Tuple2<Object, Object>(prediction, p.label());
          }
        }
    );

    // Get the evaluation metrics.
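    // Note: MulticlassMetrics treats every distinct (prediction, label) value as a discrete class.
    // Since a regression forest predicts real-valued averages over trees, exact matches with the
    // year labels are rare, so the rates below are only a rough sanity check; the MSE above is the
    // primary measure for this model.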
    MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());
    //System.out.println(metrics.confusionMatrix());
    double precision = metrics.precision(metrics.labels()[0]);
    double recall = metrics.recall(metrics.labels()[0]);
    double f_measure = metrics.fMeasure();
    double query_label = 2001; // inspect the per-class rates for the year 2001
    double TP = metrics.truePositiveRate(query_label);
    double FP = metrics.falsePositiveRate(query_label);
    double WTP = metrics.weightedTruePositiveRate();
    double WFP = metrics.weightedFalsePositiveRate();
    System.out.println("Precision = " + precision);
    System.out.println("Recall = " + recall);
    System.out.println("F-measure = " + f_measure);
    System.out.println("True Positive Rate = " + TP);
    System.out.println("False Positive Rate = " + FP);
    System.out.println("Weighted True Positive Rate = " + WTP);
    System.out.println("Weighted False Positive Rate = " + WFP);

    // Save and load the model
    //model.save(spark.sparkContext(), "target/tmp/myRandomForestRegressionModel");
    //RandomForestModel sameModel = RandomForestModel.load(spark.sparkContext(), "target/tmp/myRandomForestRegressionModel");

    spark.stop();
  }
}
--------------------------------------------------------------------------------
/LinearRegression_revisited.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

import os
from tensorflow.python.framework import ops
import warnings

warnings.filterwarnings("ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
ops.reset_default_graph()

def read_boston_data():
    boston = load_boston()
    features = np.array(boston.data)
    labels = np.array(boston.target)
    return features, labels

def feature_normalize(dataset):
    # standardize each feature: subtract the mean and divide by the standard deviation
    mu = np.mean(dataset, axis=0)
    sigma = np.std(dataset, axis=0)
    return (dataset - mu) / sigma

def append_bias_reshape(features, labels):
    # prepend a column of ones (the bias term) and reshape the labels to a column vector
    n_training_samples = features.shape[0]
    n_dim = features.shape[1]
    f = np.reshape(np.c_[np.ones(n_training_samples), features], [n_training_samples, n_dim + 1])
    l = np.reshape(labels, [n_training_samples, 1])
    return f, l

features, labels = read_boston_data()
normalized_features = feature_normalize(features)
data, label = append_bias_reshape(normalized_features, labels)
n_dim = data.shape[1]

# Train-test split
train_x, test_x, train_y, test_y = train_test_split(data, label, test_size=0.25, random_state=100)

learning_rate = 0.01
training_epochs = 100000
log_loss = np.empty(shape=[0], dtype=float)  # start empty; one loss value is appended per epoch

X = tf.placeholder(tf.float32, [None, n_dim])  # takes any number of rows but n_dim columns
Y = tf.placeholder(tf.float32, [None, 1])      # takes any number of rows but only 1 continuous column
W = tf.Variable(tf.ones([n_dim, 1]))           # the weight vector

init_op = tf.global_variables_initializer()

# Linear regression operations: the first line multiplies the feature matrix by the weight
# matrix and can be used for prediction. The second line is the cost (loss) function, the
# mean squared error of the regression line. Finally, the third line performs one step of
# gradient descent optimization to minimize the cost function.
y_ = tf.matmul(X, W)
cost_op = tf.reduce_mean(tf.square(y_ - Y))
training_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_op)
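# For reference, the update computed by GradientDescentOptimizer here is plain
# full-batch gradient descent on the mean squared error:
#   J(W)  = (1/n) * ||X W - Y||^2
#   dJ/dW = (2/n) * X^T (X W - Y)
#   W    <- W - learning_rate * dJ/dW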
sess = tf.Session()
sess.run(init_op)

for epoch in range(training_epochs):
    sess.run(training_step, feed_dict={X: train_x, Y: train_y})
    log_loss = np.append(log_loss, sess.run(cost_op, feed_dict={X: train_x, Y: train_y}))

plt.plot(range(len(log_loss)), log_loss)
plt.axis([0, training_epochs, 0, np.max(log_loss)])
plt.show()

pred_y = sess.run(y_, feed_dict={X: test_x})
mse = tf.reduce_mean(tf.square(pred_y - test_y))
print("MSE: %.4f" % sess.run(mse))

fig, ax = plt.subplots()
ax.scatter(test_y, pred_y)
ax.plot([test_y.min(), test_y.max()], [test_y.min(), test_y.max()], 'k--', lw=3)  # the y = x reference line
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

sess.close()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# RandomForestSpark
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.examples</groupId>
  <artifactId>MillionSongsDatabase</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>MillionSongsDatabase</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <jdk.version>1.8</jdk.version>
    <spark.version>2.0.0</spark.version>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.bahir</groupId>
      <artifactId>spark-streaming-twitter_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-mllib_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-hive_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-graphx_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-yarn_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-network-shuffle_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka_2.10</artifactId>
      <version>1.6.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-flume_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>com.databricks</groupId>
      <artifactId>spark-csv_2.11</artifactId>
      <version>1.3.0</version>
    </dependency>
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.38</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-eclipse-plugin</artifactId>
        <version>2.9</version>
        <configuration>
          <downloadSources>true</downloadSources>
          <downloadJavadocs>false</downloadJavadocs>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.5.1</version>
        <configuration>
          <source>${jdk.version}</source>
          <target>${jdk.version}</target>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>2.4.3</version>
        <configuration>
          <shadedArtifactAttached>true</shadedArtifactAttached>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-assembly-plugin</artifactId>
        <version>2.4.1</version>
        <configuration>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
          <archive>
            <manifest>
              <mainClass>com.example.RandomForest.SongPredictionusingLinear</mainClass>
            </manifest>
          </archive>
          <property>
            <name>oozie.launcher.mapreduce.job.user.classpath.first</name>
            <value>true</value>
          </property>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
--------------------------------------------------------------------------------
/winutils.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rezacsedu/RandomForestSpark/f485e4d4d2de80ff05114cce1f8f1179b0d306c5/winutils.exe
--------------------------------------------------------------------------------