├── LICENSE ├── README.md ├── pom.xml └── src └── main ├── java └── com │ └── packt │ └── sfjd │ ├── ch10 │ ├── BikeRentalPrediction.java │ ├── Flight.java │ ├── FlightDelay.java │ ├── JavaALSExample.java │ ├── JavaEstimatorTransformerParamExample.java │ └── Rating.java │ ├── ch11 │ ├── AbsFunc1.java │ ├── AbsFunc2.java │ ├── AbsFunc3.java │ ├── AbsFunc4.java │ ├── AbsFunc5.java │ ├── AbsFunc6.java │ ├── AbsFunc7.java │ ├── AbsFunc8.java │ ├── PropertyGraphExample.java │ └── PropertyGraphExampleFromEdges.java │ ├── ch2 │ ├── AInnerClassVsLambda.java │ ├── Car.java │ ├── ClosureDemo.java │ ├── ClosureExample.java │ ├── CollectorsExamples.java │ ├── CreateStreamExample.java │ ├── Interface1.java │ ├── Interface2.java │ ├── InterfaceImpl.java │ ├── IntermediateOpExample.java │ ├── LambdaExamples.java │ ├── LexicalScoping.java │ ├── MethodReferenceExample.java │ ├── MyFileNameFilter.java │ ├── MyFilterImpl.java │ ├── MyInterface.java │ ├── MyInterfaceDemo.java │ ├── MyInterfaceImpl.java │ ├── ShortCircuitOperationExample.java │ ├── TerminalOpExample.java │ ├── WordCountInJava.java │ └── generics │ │ ├── FirstExample.java │ │ ├── MyGeneric.java │ │ └── MyGenericsDemo.java │ ├── ch4 │ ├── ActionExamples.java │ ├── ActionsExamplesOld.java │ ├── AggeregateExample.java │ ├── JavaWordCount.java │ ├── PersistExample.java │ ├── SparkWordCount.java │ ├── SparkWordCount_1_7.java │ ├── WordCount.java │ └── transformations │ │ ├── Test.java │ │ ├── TestMain.java │ │ └── Transformations.java │ ├── ch5 │ ├── CSVFileOperations.java │ ├── CassandraExample.java │ ├── DelimitedFileOperations.java │ ├── Employee.java │ ├── HdfsExample.java │ ├── JsonFileOperations.java │ ├── LFSExample.java │ ├── Movie.java │ ├── Person.java │ ├── PersonDetails.java │ ├── S3Example.java │ ├── TextFileOperations.java │ └── XMLFileOperations.java │ ├── ch7 │ ├── AdvanceActionExamples.java │ ├── BroadcastVariable.java │ ├── CustomPartitioner.java │ ├── CustomPartitionerExample.java │ ├── ListAccumulator.java │ ├── MapSideJoinBroadcast.java │ ├── PartitionIndexInformation.java │ ├── Partitioning.java │ ├── TestAccumulator.java │ └── Transformations.java │ ├── ch8 │ ├── Average.java │ ├── AverageUDAF.java │ ├── CalcDaysUDF.java │ ├── ContextCreation.java │ ├── DatasetOperations.java │ ├── DfExample.java │ ├── DsExample.java │ ├── Employee.java │ ├── SparkSessionExample.java │ ├── SparkSessionHeloWorld.java │ ├── TypeSafeUDAF.java │ └── UDFExample.java │ └── ch9 │ ├── Calculator.java │ ├── FileStreamingEx.java │ ├── FlightDetails.java │ ├── KafkaExample.java │ ├── StateFulProcessingExample.java │ ├── StateLessProcessingExample.java │ ├── StructuredStreamingExample.java │ ├── TweetText.java │ ├── WindowBatchInterval.java │ ├── WordCountRecoverableEx.java │ ├── WordCountSocketEx.java │ ├── WordCountSocketJava8Ex.java │ ├── WordCountSocketStateful.java │ └── WordCountTransformOpEx.java └── resources ├── Apology_by_Plato.txt ├── Employee.txt ├── breakfast_menu.xml ├── dept.txt ├── log4j.properties ├── logFileWithException.log ├── movies.csv ├── numSeries.txt ├── people.tsv └── pep_json.json /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, 
and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Apache Spark for Java Developers 5 | This is the code repository for [Apache Spark for Java Developers](https://www.packtpub.com/big-data-and-business-intelligence/apache-spark-java-developers?utm_source=github&utm_medium=repository&utm_campaign=9781787126497), published by [Packt](https://www.packtpub.com/?utm_source=github). It contains all the supporting project files necessary to work through the book from start to finish. 6 | ## About the Book 7 | Apache Spark is the buzzword in the big data industry right now, especially with the increasing need for real-time streaming and data processing. While Spark is built on Scala, the Spark Java API exposes all the Spark features available in the Scala version for Java developers. This book will show you how you can implement various functionalities of the Apache Spark framework in Java, without stepping out of your comfort zone. 8 | 9 | The book starts with an introduction to the Apache Spark 2.x ecosystem, followed by explaining how to install and configure Spark, and refreshes the Java concepts that will be useful to you when consuming Apache Spark's APIs. You will explore RDD and its associated common Action and Transformation Java APIs, set up a production-like clustered environment, and work with Spark SQL. Moving on, you will perform near-real-time processing with Spark streaming, Machine Learning analytics with Spark MLlib, and graph processing with GraphX, all using various Java packages. 10 | 11 | By the end of the book, you will have a solid foundation in implementing components in the Spark framework in Java to build fast, real-time applications. 12 | 13 | ## Instructions and Navigation 14 | All of the code is organized into chapter-wise packages; each package name is "ch" followed by the chapter number, for example, ch2. 15 | 16 | Chapter-wise code files are placed inside the following folder: "\src\main\java\com\packt\sfjd" 17 | 18 | 19 | The code will look like the following: 20 | 21 | ``` 22 | SparkConf conf = new SparkConf().setMaster("local").setAppName("Local File system Example"); 23 | JavaSparkContext jsc = new JavaSparkContext(conf); 24 | ``` 25 | 26 | 27 | 28 | If you want to set up Spark on your local machine, then you can follow the instructions mentioned in Chapter 3, Let Us Spark. 
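For orientation, the snippet above can be extended into a minimal, self-contained local driver. This is only a sketch of what a local-mode job from this repository looks like; the class name and the input path (one of the bundled resource files) are illustrative choices, not something prescribed by the book:

```java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class LocalFileSystemExample {
    public static void main(String[] args) {
        // Run Spark in local mode inside a single JVM; the application name is arbitrary.
        SparkConf conf = new SparkConf().setMaster("local").setAppName("Local File system Example");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        // Read a text file from the local file system (placeholder path pointing at a
        // resource shipped with this repository) and print how many lines it contains.
        JavaRDD<String> lines = jsc.textFile("src/main/resources/Apology_by_Plato.txt");
        System.out.println("Number of lines: " + lines.count());

        jsc.stop();
    }
}
```

The same class can be submitted to a cluster by changing the master URL; packaging is handled by the `maven-assembly-plugin` configuration in the `pom.xml`.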
29 | 30 | ## Related Products 31 | * [Apache Spark for Data Science Cookbook](https://www.packtpub.com/big-data-and-business-intelligence/apache-spark-data-science-cookbook?utm_source=github&utm_medium=repository&utm_campaign=9781785880100) 32 | 33 | * [Mastering Apache Spark 2.x - Second Edition](https://www.packtpub.com/big-data-and-business-intelligence/mastering-apache-spark-2x-second-edition?utm_source=github&utm_medium=repository&utm_campaign=9781786462749) 34 | 35 | * [Apache Spark 2.x Cookbook](https://www.packtpub.com/big-data-and-business-intelligence/apache-spark-2x-cookbook?utm_source=github&utm_medium=repository&utm_campaign=9781787127265) 36 | ### Download a free PDF 37 | 38 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
39 | https://packt.link/free-ebook/9781787126497
-------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | com.packt.spark.dev 5 | SparkForJavaDevelopers 6 | 0.0.1-SNAPSHOT 7 | 8 | 2.1.1 9 | 2.11 10 | 1.8 11 | 2.6.5 12 | UTF-8 13 | 14 | 15 | 16 | 17 | 18 | org.apache.hadoop 19 | hadoop-aws 20 | 2.7.1 21 | 22 | 23 | 24 | org.apache.spark 25 | spark-core_${scala.binary.version} 26 | ${spark.version} 27 | 28 | 29 | 30 | 31 | org.apache.spark 32 | spark-sql_${scala.binary.version} 33 | ${spark.version} 34 | 35 | 36 | com.databricks 37 | spark-xml_${scala.binary.version} 38 | 0.3.3 39 | 40 | 41 | org.apache.spark 42 | spark-streaming_${scala.binary.version} 43 | ${spark.version} 44 | 45 | 46 | org.apache.spark 47 | spark-mllib_${scala.binary.version} 48 | ${spark.version} 49 | 50 | 51 | org.apache.spark 52 | spark-streaming-kafka-0-10_${scala.binary.version} 53 | ${spark.version} 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | com.databricks 62 | spark-xml_${scala.binary.version} 63 | 0.4.1 64 | 65 | 66 | 67 | net.sf.saxon 68 | Saxon-HE 69 | 9.4 70 | 71 | 72 | com.datastax.spark 73 | spark-cassandra-connector_2.11 74 | 2.0.0-M1 75 | 77 | 78 | 79 | 80 | 81 | com.fasterxml.jackson.core 82 | jackson-annotations 83 | ${jackson.version} 84 | 85 | 86 | com.fasterxml.jackson.core 87 | jackson-core 88 | ${jackson.version} 89 | 90 | 91 | com.fasterxml.jackson.core 92 | jackson-databind 93 | ${jackson.version} 94 | 95 | 96 | com.sun 97 | tools 98 | ${java-version} 99 | system 100 | C:\\Program Files\\Java\\jdk1.8.0_65\\lib\\tools.jar 101 | 102 | 103 | 104 | 105 | 106 | 107 | org.apache.maven.plugins 108 | maven-compiler-plugin 109 | 3.1 110 | 111 | ${java-version} 112 | ${java-version} 113 | 114 | 115 | 116 | maven-assembly-plugin 117 | 118 | 119 | jar-with-dependencies 120 | 121 | 122 | 123 | 124 | make-assembly 125 | package 126 | 127 | single 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch10/BikeRentalPrediction.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch10; 2 | 3 | import org.apache.log4j.Level; 4 | import org.apache.log4j.LogManager; 5 | import org.apache.log4j.Logger; 6 | import org.apache.spark.ml.Pipeline; 7 | import org.apache.spark.ml.PipelineModel; 8 | import org.apache.spark.ml.PipelineStage; 9 | import org.apache.spark.ml.evaluation.RegressionEvaluator; 10 | import org.apache.spark.ml.feature.VectorAssembler; 11 | import org.apache.spark.ml.feature.VectorIndexer; 12 | import org.apache.spark.ml.param.ParamMap; 13 | import org.apache.spark.ml.regression.GBTRegressor; 14 | import org.apache.spark.ml.tuning.CrossValidator; 15 | import org.apache.spark.ml.tuning.ParamGridBuilder; 16 | import org.apache.spark.sql.Dataset; 17 | import org.apache.spark.sql.Row; 18 | import org.apache.spark.sql.SparkSession; 19 | 20 | 21 | import org.apache.spark.sql.types.DataTypes; 22 | 23 | 24 | import static org.apache.spark.sql.functions.col; 25 | 26 | //https://docs.cloud.databricks.com/docs/latest/sample_applications/index.html#Sample%20ML/MLPipeline%20Bike%20Dataset.html 27 | 28 | public class BikeRentalPrediction { 29 | 30 | public static void main(String[] args) { 31 | System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop"); 32 | SparkSession sparkSession = SparkSession 33 | .builder() 34 | .master("local") 35 | 
.config("spark.sql.warehouse.dir", 36 | "file:///E:/sumitK/Hadoop/warehouse") 37 | .appName("BikeRentalPrediction").getOrCreate(); 38 | Logger rootLogger = LogManager.getRootLogger(); 39 | rootLogger.setLevel(Level.WARN); 40 | //We use the sqlContext.read method to read the data and set a few options: 41 | // 'format': specifies the Spark CSV data source 42 | // 'header': set to true to indicate that the first line of the CSV data file is a header 43 | // The file is called 'hour.csv'. 44 | Dataset ds=sparkSession.read() 45 | .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat") 46 | .option("header", "true") 47 | .load("E:\\sumitK\\Hadoop\\Bike-Sharing-Dataset\\hour.csv"); 48 | 49 | ds.cache(); 50 | 51 | ds.select("season").show();; 52 | 53 | ds.show(); 54 | 55 | System.out.println("Our dataset has rows :: "+ ds.count()); 56 | 57 | Dataset df = ds.drop("instant").drop("dteday").drop("casual").drop("registered"); 58 | df.printSchema(); 59 | //col("...") is preferable to df.col("...") 60 | Dataset dformatted = df.select(col("season").cast(DataTypes.IntegerType), 61 | col("yr").cast(DataTypes.IntegerType), 62 | col("mnth").cast(DataTypes.IntegerType), 63 | col("hr").cast(DataTypes.IntegerType), 64 | col("holiday").cast(DataTypes.IntegerType), 65 | col("weekday").cast(DataTypes.IntegerType), 66 | col("workingday").cast(DataTypes.IntegerType), 67 | col("weathersit").cast(DataTypes.IntegerType), 68 | col("temp").cast(DataTypes.IntegerType), 69 | col("atemp").cast(DataTypes.IntegerType), 70 | col("hum").cast(DataTypes.IntegerType), 71 | col("windspeed").cast(DataTypes.IntegerType), 72 | col("cnt").cast(DataTypes.IntegerType)); 73 | 74 | 75 | dformatted.printSchema(); 76 | Dataset[] data= dformatted.randomSplit(new double[]{0.7,0.3}); 77 | System.out.println("We have training examples count :: "+ data[0].count()+" and test examples count ::"+data[1].count()); 78 | 79 | /// 80 | //removing 'cnt' cloumn and then forming str array 81 | String[] featuresCols = dformatted.drop("cnt").columns(); 82 | 83 | for(String str:featuresCols){ 84 | System.out.println(str+" :: "); 85 | } 86 | 87 | //This concatenates all feature columns into a single feature vector in a new column "rawFeatures". 88 | VectorAssembler vectorAssembler = new VectorAssembler().setInputCols(featuresCols).setOutputCol("rawFeatures"); 89 | //This identifies categorical features and indexes them. 90 | VectorIndexer vectorIndexer= new VectorIndexer().setInputCol("rawFeatures").setOutputCol("features").setMaxCategories(4); 91 | 92 | //Takes the "features" column and learns to predict "cnt" 93 | GBTRegressor gbt = new GBTRegressor().setLabelCol("cnt"); 94 | 95 | // Define a grid of hyperparameters to test: 96 | // - maxDepth: max depth of each decision tree in the GBT ensemble 97 | // - maxIter: iterations, i.e., number of trees in each GBT ensemble 98 | // In this example notebook, we keep these values small. In practice, to get the highest accuracy, you would likely want to try deeper trees (10 or higher) and more trees in the ensemble (>100). 99 | ParamMap[] paramGrid = new ParamGridBuilder().addGrid(gbt.maxDepth(),new int[]{2, 5}).addGrid(gbt.maxIter(),new int[] {10, 100}).build(); 100 | // We define an evaluation metric. This tells CrossValidator how well we are doing by comparing the true labels with predictions. 
101 | RegressionEvaluator evaluator = new RegressionEvaluator().setMetricName("rmse").setLabelCol(gbt.getLabelCol()).setPredictionCol(gbt.getPredictionCol()); 102 | 103 | // # Declare the CrossValidator, which runs model tuning for us. 104 | CrossValidator cv = new CrossValidator().setEstimator(gbt).setEvaluator(evaluator).setEstimatorParamMaps(paramGrid); 105 | 106 | Pipeline pipeline = new Pipeline().setStages(new PipelineStage[]{vectorAssembler,vectorIndexer,cv}); 107 | 108 | PipelineModel pipelineModel=pipeline.fit(data[0]); 109 | 110 | Dataset predictions = pipelineModel.transform(data[1]); 111 | 112 | predictions.show(); 113 | //predictions.select("cnt", "prediction", *featuresCols); 114 | } 115 | 116 | } 117 | 118 | 119 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch10/Flight.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch10; 2 | 3 | import java.io.Serializable; 4 | 5 | public class Flight implements Serializable { 6 | /** 7 | |-- CRSArrTime: integer (nullable = true) 8 | |-- CRSDepTime: integer (nullable = true) 9 | |-- CRSElapsedTime: integer (nullable = true) 10 | |-- actualElapsedTime: integer (nullable = true) 11 | |-- airTime: integer (nullable = true) 12 | |-- arrDelay: double (nullable = true) 13 | |-- arrTime: integer (nullable = true) 14 | |-- dayOfWeek: string (nullable = true) 15 | |-- dayofMonth: string (nullable = true) 16 | |-- depDelay: integer (nullable = true) 17 | |-- depTime: integer (nullable = true) 18 | |-- distance: integer (nullable = true) 19 | |-- month: string (nullable = true) 20 | |-- origin: string (nullable = true) 21 | |-- uniqueCarrier: string (nullable = true) 22 | */ 23 | private static final long serialVersionUID = 1L; 24 | private String Month; 25 | private String DayofMonth; 26 | private String DayOfWeek; 27 | private Integer DepTime; 28 | private Integer CRSDepTime; 29 | private Integer ArrTime; 30 | private Integer CRSArrTime; 31 | private String UniqueCarrier; 32 | private Integer ActualElapsedTime; 33 | private Integer CRSElapsedTime; 34 | private Integer AirTime; 35 | private Double ArrDelay; 36 | private Integer DepDelay; 37 | private String Origin; 38 | private Integer Distance; 39 | 40 | 41 | 42 | public Flight(String month, String dayofMonth, String dayOfWeek, 43 | Integer depTime, Integer cRSDepTime, Integer arrTime, 44 | Integer cRSArrTime, String uniqueCarrier, 45 | Integer actualElapsedTime, Integer cRSElapsedTime, Integer airTime, 46 | Double arrDelay, Integer depDelay, String origin, Integer distance) { 47 | super(); 48 | Month = month; 49 | DayofMonth = dayofMonth; 50 | DayOfWeek = dayOfWeek; 51 | DepTime = depTime; 52 | CRSDepTime = cRSDepTime; 53 | ArrTime = arrTime; 54 | CRSArrTime = cRSArrTime; 55 | UniqueCarrier = uniqueCarrier; 56 | ActualElapsedTime = actualElapsedTime; 57 | CRSElapsedTime = cRSElapsedTime; 58 | AirTime = airTime; 59 | ArrDelay = arrDelay; 60 | DepDelay = depDelay; 61 | Origin = origin; 62 | Distance = distance; 63 | } 64 | 65 | 66 | @Override 67 | public String toString() { 68 | return "Flight [Month=" + Month + ", DayofMonth=" + DayofMonth 69 | + ", DayOfWeek=" + DayOfWeek + ", DepTime=" + DepTime 70 | + ", CRSDepTime=" + CRSDepTime + ", ArrTime=" + ArrTime 71 | + ", CRSArrTime=" + CRSArrTime + ", UniqueCarrier=" 72 | + UniqueCarrier + ", ActualElapsedTime=" + ActualElapsedTime 73 | + ", CRSElapsedTime=" + CRSElapsedTime + ", AirTime=" + AirTime 74 | + ", ArrDelay=" + 
ArrDelay + ", DepDelay=" + DepDelay 75 | + ", Origin=" + Origin + ", Distance=" + Distance + "]"; 76 | } 77 | 78 | 79 | public String getMonth() { 80 | return Month; 81 | } 82 | public void setMonth(String month) { 83 | Month = month; 84 | } 85 | public String getDayofMonth() { 86 | return DayofMonth; 87 | } 88 | public void setDayofMonth(String dayofMonth) { 89 | DayofMonth = dayofMonth; 90 | } 91 | public String getDayOfWeek() { 92 | return DayOfWeek; 93 | } 94 | public void setDayOfWeek(String dayOfWeek) { 95 | DayOfWeek = dayOfWeek; 96 | } 97 | public Integer getDepTime() { 98 | return DepTime; 99 | } 100 | public void setDepTime(Integer depTime) { 101 | DepTime = depTime; 102 | } 103 | public Integer getCRSDepTime() { 104 | return CRSDepTime; 105 | } 106 | public void setCRSDepTime(Integer cRSDepTime) { 107 | CRSDepTime = cRSDepTime; 108 | } 109 | public Integer getArrTime() { 110 | return ArrTime; 111 | } 112 | public void setArrTime(Integer arrTime) { 113 | ArrTime = arrTime; 114 | } 115 | public Integer getCRSArrTime() { 116 | return CRSArrTime; 117 | } 118 | public void setCRSArrTime(Integer cRSArrTime) { 119 | CRSArrTime = cRSArrTime; 120 | } 121 | public String getUniqueCarrier() { 122 | return UniqueCarrier; 123 | } 124 | public void setUniqueCarrier(String uniqueCarrier) { 125 | UniqueCarrier = uniqueCarrier; 126 | } 127 | public Integer getActualElapsedTime() { 128 | return ActualElapsedTime; 129 | } 130 | public void setActualElapsedTime(Integer actualElapsedTime) { 131 | ActualElapsedTime = actualElapsedTime; 132 | } 133 | public Integer getCRSElapsedTime() { 134 | return CRSElapsedTime; 135 | } 136 | public void setCRSElapsedTime(Integer cRSElapsedTime) { 137 | CRSElapsedTime = cRSElapsedTime; 138 | } 139 | public Integer getAirTime() { 140 | return AirTime; 141 | } 142 | public void setAirTime(Integer airTime) { 143 | AirTime = airTime; 144 | } 145 | public Double getArrDelay() { 146 | return ArrDelay; 147 | } 148 | public void setArrDelay(Double arrDelay) { 149 | ArrDelay = arrDelay; 150 | } 151 | public Integer getDepDelay() { 152 | return DepDelay; 153 | } 154 | public void setDepDelay(Integer depDelay) { 155 | DepDelay = depDelay; 156 | } 157 | public String getOrigin() { 158 | return Origin; 159 | } 160 | public void setOrigin(String origin) { 161 | Origin = origin; 162 | } 163 | public Integer getDistance() { 164 | return Distance; 165 | } 166 | public void setDistance(Integer distance) { 167 | Distance = distance; 168 | } 169 | 170 | 171 | } 172 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch10/JavaALSExample.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch10; 2 | 3 | import org.apache.log4j.Level; 4 | import org.apache.log4j.LogManager; 5 | import org.apache.log4j.Logger; 6 | import org.apache.spark.sql.Dataset; 7 | import org.apache.spark.sql.Row; 8 | import org.apache.spark.sql.SparkSession; 9 | 10 | 11 | import org.apache.spark.api.java.JavaRDD; 12 | import org.apache.spark.api.java.function.Function; 13 | import org.apache.spark.ml.evaluation.RegressionEvaluator; 14 | import org.apache.spark.ml.recommendation.ALS; 15 | import org.apache.spark.ml.recommendation.ALSModel; 16 | 17 | 18 | //examples/src/main/java/org/apache/spark/examples/ml/JavaALSExample.java 19 | // examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala 20 | 21 | public class JavaALSExample { 22 | 23 | public static void main(String[] args) { 24 
| System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop"); 25 | Logger rootLogger = LogManager.getRootLogger(); 26 | rootLogger.setLevel(Level.WARN); 27 | SparkSession spark = SparkSession 28 | .builder() 29 | .master("local") 30 | .config("spark.sql.warehouse.dir","file:///E:/sumitK/Hadoop/warehouse") 31 | .appName("JavaALSExample") 32 | .getOrCreate(); 33 | 34 | // $example on$ 35 | JavaRDD ratingsRDD = spark 36 | .read().textFile("E:\\sumitK\\Hadoop\\movieLens-latest-small\\ratings.csv").javaRDD().filter(str-> !str.contains("userId")) 37 | .map(new Function() { 38 | public Rating call(String str) { 39 | return Rating.parseRating(str); 40 | } 41 | }); 42 | 43 | /* Dataset ratingDS = spark.read() 44 | .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat") 45 | .option("header", "true") 46 | .load("E:\\sumitK\\Hadoop\\movieLens-latest-small\\ratings.csv");*/ 47 | 48 | 49 | 50 | Dataset ratings = spark.createDataFrame(ratingsRDD, Rating.class); 51 | ratings.show(); 52 | Dataset[] splits = ratings.randomSplit(new double[]{0.8, 0.2}); 53 | Dataset training = splits[0]; 54 | Dataset test = splits[1]; 55 | System.out.println("The no of training rows are :"+training.count()+" and the row count of test are :"+test.count()); 56 | 57 | // Build the recommendation model using ALS on the training data 58 | ALS als = new ALS() 59 | .setMaxIter(5) 60 | .setRegParam(0.01) 61 | .setUserCol("userId") 62 | .setItemCol("movieId") 63 | .setRatingCol("rating"); 64 | ALSModel model = als.fit(training); 65 | 66 | // Evaluate the model by computing the RMSE on the test data 67 | Dataset predictions = model.transform(test); 68 | predictions.show(); 69 | 70 | RegressionEvaluator evaluator = new RegressionEvaluator() 71 | .setMetricName("rmse") 72 | .setLabelCol("rating") 73 | .setPredictionCol("prediction"); 74 | Double rmse = evaluator.evaluate(predictions); 75 | System.out.println("Root-mean-square error = " + rmse); 76 | // $example off$ 77 | spark.stop(); 78 | } 79 | 80 | } 81 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch10/JavaEstimatorTransformerParamExample.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch10; 2 | 3 | //$example on$ 4 | import java.util.Arrays; 5 | import java.util.List; 6 | 7 | import org.apache.log4j.Level; 8 | import org.apache.log4j.LogManager; 9 | import org.apache.log4j.Logger; 10 | import org.apache.spark.ml.classification.LogisticRegression; 11 | import org.apache.spark.ml.classification.LogisticRegressionModel; 12 | import org.apache.spark.ml.linalg.VectorUDT; 13 | import org.apache.spark.ml.linalg.Vectors; 14 | import org.apache.spark.ml.param.ParamMap; 15 | import org.apache.spark.sql.Dataset; 16 | import org.apache.spark.sql.Row; 17 | import org.apache.spark.sql.RowFactory; 18 | import org.apache.spark.sql.types.DataTypes; 19 | import org.apache.spark.sql.types.Metadata; 20 | import org.apache.spark.sql.types.StructField; 21 | import org.apache.spark.sql.types.StructType; 22 | //$example off$ 23 | import org.apache.spark.sql.SparkSession; 24 | 25 | public class JavaEstimatorTransformerParamExample { 26 | 27 | public static void main(String[] args) { 28 | SparkSession spark = SparkSession 29 | .builder().master("local").config("spark.sql.warehouse.dir", "file:///C:/Users/sumit.kumar/Downloads/bin/warehouse") 30 | .appName("JavaEstimatorTransformerParamExample") 31 | .getOrCreate(); 32 | Logger rootLogger = 
LogManager.getRootLogger(); 33 | rootLogger.setLevel(Level.WARN); 34 | // $example on$ 35 | // Prepare training data. 36 | List dataTraining = Arrays.asList( 37 | RowFactory.create(1.0, Vectors.dense(0.0, 1.1, 0.1)), 38 | RowFactory.create(0.0, Vectors.dense(2.0, 1.0, -1.0)), 39 | RowFactory.create(0.0, Vectors.dense(2.0, 1.3, 1.0)), 40 | RowFactory.create(1.0, Vectors.dense(0.0, 1.2, -0.5)) 41 | ); 42 | StructType schema = new StructType(new StructField[]{ 43 | new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), 44 | new StructField("features", new VectorUDT(), false, Metadata.empty()) 45 | }); 46 | Dataset training = spark.createDataFrame(dataTraining, schema); 47 | 48 | // Create a LogisticRegression instance. This instance is an Estimator. 49 | LogisticRegression lr = new LogisticRegression(); 50 | // Print out the parameters, documentation, and any default values. 51 | System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n"); 52 | 53 | // We may set parameters using setter methods. 54 | lr.setMaxIter(10).setRegParam(0.01); 55 | 56 | // Learn a LogisticRegression model. This uses the parameters stored in lr. 57 | LogisticRegressionModel model1 = lr.fit(training); 58 | // Since model1 is a Model (i.e., a Transformer produced by an Estimator), 59 | // we can view the parameters it used during fit(). 60 | // This prints the parameter (name: value) pairs, where names are unique IDs for this 61 | // LogisticRegression instance. 62 | System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap()); 63 | 64 | // We may alternatively specify parameters using a ParamMap. 65 | ParamMap paramMap = new ParamMap() 66 | .put(lr.maxIter().w(20)) // Specify 1 Param. 67 | .put(lr.maxIter(), 30) // This overwrites the original maxIter. 68 | .put(lr.regParam().w(0.1), lr.threshold().w(0.55)); // Specify multiple Params. 69 | 70 | // One can also combine ParamMaps. 71 | ParamMap paramMap2 = new ParamMap() 72 | .put(lr.probabilityCol().w("myProbability")); // Change output column name 73 | ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2); 74 | 75 | // Now learn a new model using the paramMapCombined parameters. 76 | // paramMapCombined overrides all parameters set earlier via lr.set* methods. 77 | LogisticRegressionModel model2 = lr.fit(training, paramMapCombined); 78 | System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap()); 79 | 80 | // Prepare test documents. 81 | List dataTest = Arrays.asList( 82 | RowFactory.create(1.0, Vectors.dense(-1.0, 1.5, 1.3)), 83 | RowFactory.create(0.0, Vectors.dense(3.0, 2.0, -0.1)), 84 | RowFactory.create(1.0, Vectors.dense(0.0, 2.2, -1.5)) 85 | ); 86 | Dataset test = spark.createDataFrame(dataTest, schema); 87 | 88 | // Make predictions on test documents using the Transformer.transform() method. 89 | // LogisticRegression.transform will only use the 'features' column. 90 | // Note that model2.transform() outputs a 'myProbability' column instead of the usual 91 | // 'probability' column since we renamed the lr.probabilityCol parameter previously. 
92 | Dataset results = model2.transform(test); 93 | Dataset rows = results.select("features", "label", "myProbability", "prediction"); 94 | for (Row r: rows.collectAsList()) { 95 | System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2) 96 | + ", prediction=" + r.get(3)); 97 | } 98 | // $example off$ 99 | 100 | spark.stop(); 101 | } 102 | 103 | } 104 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch10/Rating.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch10; 2 | 3 | import java.io.Serializable; 4 | 5 | public class Rating implements Serializable{ 6 | 7 | 8 | /** 9 | * 10 | */ 11 | private static final long serialVersionUID = 1L; 12 | private int userId; 13 | private int movieId; 14 | private float rating; 15 | private long timestamp; 16 | 17 | public Rating() {} 18 | 19 | public Rating(int userId, int movieId, float rating, long timestamp) { 20 | this.userId = userId; 21 | this.movieId = movieId; 22 | this.rating = rating; 23 | this.timestamp = timestamp; 24 | } 25 | 26 | public int getUserId() { 27 | return userId; 28 | } 29 | 30 | public int getMovieId() { 31 | return movieId; 32 | } 33 | 34 | public float getRating() { 35 | return rating; 36 | } 37 | 38 | public long getTimestamp() { 39 | return timestamp; 40 | } 41 | 42 | public static Rating parseRating(String str) { 43 | String[] fields = str.split(","); 44 | if (fields.length != 4) { 45 | throw new IllegalArgumentException("Each line must contain 4 fields"); 46 | } 47 | int userId = Integer.parseInt(fields[0]); 48 | int movieId = Integer.parseInt(fields[1]); 49 | float rating = Float.parseFloat(fields[2]); 50 | long timestamp = Long.parseLong(fields[3]); 51 | return new Rating(userId, movieId, rating, timestamp); 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch11/AbsFunc1.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch11; 2 | 3 | import java.io.Serializable; 4 | 5 | import org.apache.spark.graphx.EdgeTriplet; 6 | 7 | import scala.runtime.AbstractFunction1; 8 | 9 | public class AbsFunc1 extends AbstractFunction1, Object> implements Serializable{ 10 | 11 | 12 | @Override 13 | public Object apply(EdgeTriplet arg0) { 14 | return arg0.attr().equals("Friend"); 15 | } 16 | 17 | } -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch11/AbsFunc2.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch11; 2 | 3 | import java.io.Serializable; 4 | 5 | import scala.runtime.AbstractFunction2; 6 | 7 | public class AbsFunc2 extends AbstractFunction2 implements Serializable{ 8 | 9 | @Override 10 | public Object apply(Object arg0, String arg1) { 11 | 12 | return true; 13 | } 14 | 15 | } -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch11/AbsFunc3.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch11; 2 | 3 | import java.io.Serializable; 4 | 5 | 6 | public class AbsFunc3 extends scala.runtime.AbstractFunction2 implements Serializable{ 7 | 8 | @Override 9 | public String apply(Object arg0, String arg1) { 10 | 11 | return "Vertex:"+arg1; 12 | } 13 | 14 | } 
-------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch11/AbsFunc4.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch11; 2 | 3 | import java.io.Serializable; 4 | 5 | import org.apache.spark.graphx.EdgeContext; 6 | import org.apache.spark.graphx.EdgeTriplet; 7 | 8 | import scala.runtime.AbstractFunction1; 9 | import scala.runtime.BoxedUnit; 10 | 11 | public class AbsFunc4 extends AbstractFunction1, BoxedUnit> implements Serializable{ 12 | 13 | @Override 14 | public BoxedUnit apply(EdgeContext arg0) { 15 | 16 | 17 | arg0.sendToDst(1); 18 | return BoxedUnit.UNIT; 19 | } 20 | 21 | } 22 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch11/AbsFunc5.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch11; 2 | 3 | import java.io.Serializable; 4 | 5 | 6 | public class AbsFunc5 extends scala.runtime.AbstractFunction2 implements Serializable{ 7 | 8 | @Override 9 | public Integer apply(Integer i1, Integer i2) { 10 | 11 | return i1+i2; 12 | } 13 | 14 | } -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch11/AbsFunc6.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch11; 2 | 3 | import java.io.Serializable; 4 | 5 | import scala.Option; 6 | import scala.runtime.AbstractFunction3; 7 | 8 | public class AbsFunc6 extends AbstractFunction3, String> implements Serializable { 9 | 10 | @Override 11 | public String apply(Object o, String s1, Option s2) { 12 | 13 | if (s2.isEmpty()) { 14 | return s1 ; 15 | } else { 16 | return s1 + " " + s2.get(); 17 | } 18 | 19 | } 20 | 21 | } 22 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch11/AbsFunc7.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch11; 2 | 3 | import java.io.Serializable; 4 | 5 | import org.apache.spark.graphx.Edge; 6 | import org.apache.spark.graphx.EdgeTriplet; 7 | 8 | import scala.runtime.AbstractFunction1; 9 | 10 | public class AbsFunc7 extends AbstractFunction1,Integer> implements Serializable{ 11 | 12 | @Override 13 | public Integer apply(Edge edge) { 14 | return edge.attr().length(); 15 | } 16 | 17 | 18 | 19 | } -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch11/AbsFunc8.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch11; 2 | 3 | import java.io.Serializable; 4 | 5 | import org.apache.spark.graphx.Edge; 6 | import org.apache.spark.graphx.EdgeTriplet; 7 | 8 | import scala.runtime.AbstractFunction1; 9 | 10 | public class AbsFunc8 extends AbstractFunction1,Integer> implements Serializable{ 11 | 12 | @Override 13 | public Integer apply(EdgeTriplet triplet) { 14 | return triplet.attr().length(); 15 | } 16 | 17 | 18 | 19 | } -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch11/PropertyGraphExample.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch11; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import org.apache.spark.SparkConf; 7 | import 
org.apache.spark.api.java.JavaRDD; 8 | import org.apache.spark.api.java.JavaSparkContext; 9 | import org.apache.spark.graphx.Edge; 10 | import org.apache.spark.graphx.Graph; 11 | import org.apache.spark.graphx.GraphOps; 12 | import org.apache.spark.graphx.PartitionStrategy; 13 | import org.apache.spark.graphx.TripletFields; 14 | import org.apache.spark.graphx.VertexRDD; 15 | import org.apache.spark.storage.StorageLevel; 16 | 17 | import scala.Predef.$eq$colon$eq; 18 | import scala.Tuple2; 19 | import scala.reflect.ClassTag; 20 | 21 | public class PropertyGraphExample { 22 | public static void main(String[] args) { 23 | 24 | System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils"); 25 | SparkConf conf = new SparkConf().setMaster("local").setAppName("graph"); 26 | JavaSparkContext javaSparkContext = new JavaSparkContext(conf); 27 | ClassTag stringTag = scala.reflect.ClassTag$.MODULE$.apply(String.class); 28 | ClassTag intTag = scala.reflect.ClassTag$.MODULE$.apply(Integer.class); 29 | 30 | 31 | 32 | //$eq$colon$eq scala$Predef$$singleton_$eq$colon$eq = scala.Predef$.MODULE$.scala$Predef$$singleton_$eq$colon$eq; 33 | $eq$colon$eq tpEquals = scala.Predef.$eq$colon$eq$.MODULE$.tpEquals(); 34 | List> vertices = new ArrayList<>(); 35 | 36 | vertices.add(new Tuple2(1l, "James")); 37 | vertices.add(new Tuple2(2l, "Robert")); 38 | vertices.add(new Tuple2(3l, "Charlie")); 39 | vertices.add(new Tuple2(4l, "Roger")); 40 | vertices.add(new Tuple2(5l, "Tony")); 41 | 42 | List> edges = new ArrayList<>(); 43 | 44 | edges.add(new Edge(2, 1, "Friend")); 45 | edges.add(new Edge(3, 2, "Advisor")); 46 | edges.add(new Edge(3, 1, "Friend")); 47 | /*edges.add(new Edge(1, 2, "Friend")); 48 | edges.add(new Edge(2, 3, "Advisor")); 49 | edges.add(new Edge(1, 3, "Friend"));*/ 50 | edges.add(new Edge(4, 3, "colleague")); 51 | edges.add(new Edge(4, 5, "Relative")); 52 | edges.add(new Edge(5, 2, "BusinessPartners")); 53 | 54 | JavaRDD> verticesRDD = javaSparkContext.parallelize(vertices); 55 | JavaRDD> edgesRDD = javaSparkContext.parallelize(edges); 56 | 57 | Graph graph = Graph.apply(verticesRDD.rdd(), edgesRDD.rdd(), "", StorageLevel.MEMORY_ONLY(), 58 | StorageLevel.MEMORY_ONLY(), stringTag, stringTag); 59 | 60 | 61 | 62 | 63 | graph.vertices().toJavaRDD().collect().forEach(System.out::println); 64 | /*System.out.println("-------------------------------"); 65 | graph.edges().toJavaRDD().collect().forEach(System.out::println);*/ 66 | 67 | //Graph operations 68 | 69 | //mapvertices 70 | 71 | /*Graph mapVertices = graph.mapVertices(new AbsFunc3(), stringTag, tpEquals); 72 | mapVertices.vertices().toJavaRDD().collect().forEach(System.out::println);*/ 73 | 74 | //mapEdges 75 | 76 | /*Graph mapEdges = graph.mapEdges(new AbsFunc7(), scala.reflect.ClassTag$.MODULE$.apply(Integer.class)); 77 | mapEdges.edges().toJavaRDD().collect().forEach(System.out::println);*/ 78 | 79 | //mapTriplets 80 | //Graph mapTriplets = graph.mapTriplets(new AbsFunc8(), scala.reflect.ClassTag$.MODULE$.apply(Integer.class)); 81 | //mapTriplets.triplets().toJavaRDD().collect().forEach(System.out::println); 82 | 83 | //Other way - loose indices 84 | //JavaRDD map = graph.vertices().toJavaRDD().map(x->"Vertex:"+x); 85 | 86 | //Triplets 87 | 88 | //Reverse 89 | /* Graph reversedGraph = graph.reverse(); 90 | reversedGraph.triplets().toJavaRDD().collect().forEach(System.out::println);*/ 91 | 92 | 93 | //Subgraph 94 | /* Graph subgraph = graph.subgraph(new AbsFunc1(), new AbsFunc2()); 95 | 
subgraph.triplets().toJavaRDD().collect().forEach(System.out::println);*/ 96 | 97 | //Aggregate Messages 98 | 99 | /*VertexRDD aggregateMessages = graph.aggregateMessages(new AbsFunc4(), new AbsFunc5(), TripletFields.All, intTag); 100 | 101 | aggregateMessages.toJavaRDD().collect().forEach(System.out::println);*/ 102 | 103 | 104 | 105 | //Join 106 | // List> dataToJoin = new ArrayList<>(); 107 | // 108 | // dataToJoin.add(new Tuple2(1l,"Wilson")); 109 | // dataToJoin.add(new Tuple2(2l,"Harmon")); 110 | // dataToJoin.add(new Tuple2(3l,"Johnson")); 111 | // dataToJoin.add(new Tuple2(4l,"Peterson")); 112 | // dataToJoin.add(new Tuple2(5l,"Adams")); 113 | // 114 | // JavaRDD> dataToJoinRdd = javaSparkContext.parallelize(dataToJoin); 115 | // 116 | // Graph outerJoinVertices = graph.outerJoinVertices(dataToJoinRdd.rdd(), new AbsFunc6(), scala.reflect.ClassTag$.MODULE$.apply(String.class), scala.reflect.ClassTag$.MODULE$.apply(String.class), scala.Predef.$eq$colon$eq$.MODULE$.tpEquals()); 117 | // outerJoinVertices.vertices().toJavaRDD().collect().forEach(System.out::println); 118 | 119 | 120 | //Graph-Anaytics 121 | 122 | //PageRank 123 | /*Graph graphWithStaticRanking = graph.ops().staticPageRank(1,0.20); 124 | graphWithStaticRanking.vertices().toJavaRDD().collect().forEach(System.out::println); 125 | */ 126 | //graph.ops().pageRank(0.00001,0.20).vertices().toJavaRDD().collect().forEach(System.out::println);; 127 | 128 | //Triangle count 129 | graph.partitionBy(PartitionStrategy.CanonicalRandomVertexCut$.MODULE$); 130 | 131 | Graph triangleCountedGraph = graph.ops().triangleCount(); 132 | triangleCountedGraph.vertices().toJavaRDD().collect().forEach(System.out::println); 133 | 134 | //Connected components 135 | /*Graph connectedComponentsGraph = graph.ops().connectedComponents(); 136 | connectedComponentsGraph.vertices().toJavaRDD().collect().forEach(System.out::println);;*/ 137 | 138 | /*scala.collection.immutable.Set set = new scala.collection.immutable.HashSet(); 139 | List list =new ArrayList<>(); 140 | 141 | JavaConverters.collectionAsScalaIterableConverter(list).asScala().toSeq();*/ 142 | // ShortestPaths.run 143 | 144 | 145 | 146 | } 147 | 148 | } 149 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch11/PropertyGraphExampleFromEdges.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch11; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import org.apache.spark.SparkConf; 7 | import org.apache.spark.api.java.JavaRDD; 8 | import org.apache.spark.api.java.JavaSparkContext; 9 | import org.apache.spark.graphx.Edge; 10 | import org.apache.spark.graphx.EdgeTriplet; 11 | import org.apache.spark.graphx.Graph; 12 | import org.apache.spark.storage.StorageLevel; 13 | 14 | import scala.Function1; 15 | import scala.reflect.ClassTag; 16 | import scala.runtime.AbstractFunction1; 17 | 18 | public class PropertyGraphExampleFromEdges { 19 | public static void main(String[] args) { 20 | System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils"); 21 | SparkConf conf = new SparkConf().setMaster("local").setAppName("graph"); 22 | JavaSparkContext javaSparkContext = new JavaSparkContext(conf); 23 | ClassTag stringTag = scala.reflect.ClassTag$.MODULE$.apply(String.class); 24 | 25 | 26 | List> edges = new ArrayList<>(); 27 | 28 | edges.add(new Edge(1, 2, "Friend")); 29 | edges.add(new Edge(2, 3, "Advisor")); 30 | edges.add(new Edge(1, 3, "Friend")); 31 | 
edges.add(new Edge(4, 3, "colleague")); 32 | edges.add(new Edge(4, 5, "Relative")); 33 | edges.add(new Edge(2, 5, "BusinessPartners")); 34 | 35 | 36 | JavaRDD> edgeRDD = javaSparkContext.parallelize(edges); 37 | 38 | 39 | Graph graph = Graph.fromEdges(edgeRDD.rdd(), "",StorageLevel.MEMORY_ONLY(), StorageLevel.MEMORY_ONLY(), stringTag, stringTag); 40 | 41 | 42 | graph.vertices().toJavaRDD().collect().forEach(System.out::println); 43 | 44 | 45 | 46 | // graph.aggregateMessages(sendMsg, mergeMsg, tripletFields, evidence$11) 47 | 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/AInnerClassVsLambda.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch2; 2 | 3 | import java.io.File; 4 | import java.io.FilenameFilter; 5 | 6 | public class AInnerClassVsLambda { 7 | 8 | public static void main(String[] args) { 9 | 10 | File sourceDir= new File("/home/user"); 11 | sourceDir.list(new FilenameFilter() { 12 | 13 | @Override 14 | public boolean accept(File dir, String name) { 15 | 16 | return name.endsWith("txt"); 17 | } 18 | }); 19 | 20 | 21 | sourceDir.list((dir,name)->name.endsWith("txt")); 22 | 23 | // Lexical scoping-wont work ---System.out.println(dir); 24 | 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/Car.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch2; 2 | 3 | public interface Car { 4 | 5 | void shape(); 6 | void price(); 7 | void color(); 8 | } 9 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/ClosureDemo.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch2; 2 | 3 | import java.util.Arrays; 4 | import java.util.List; 5 | import java.util.function.Function; 6 | 7 | public class ClosureDemo { 8 | public static void main(String[] args) { 9 | List list = Arrays.asList(1, 2, 3, 4, 5); 10 | Function closure = ClosureExample.closure(); 11 | list.stream().map(closure).forEach(n -> System.out.print(n+" ")); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/ClosureExample.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch2; 2 | 3 | import java.util.function.Function; 4 | 5 | public class ClosureExample { 6 | public static Function closure() { 7 | int a=3; 8 | 9 | Function function = t->{ 10 | //a++; 11 | return t*a; 12 | }; 13 | 14 | return function; 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/CollectorsExamples.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch2; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | import java.util.Map; 6 | import java.util.Set; 7 | import java.util.TreeSet; 8 | import java.util.function.Supplier; 9 | import java.util.stream.Collectors; 10 | import java.util.stream.Stream; 11 | 12 | public class CollectorsExamples { 13 | 14 | public static void main(String[] args) { 15 | 16 | Supplier> streamSupplier =()->Stream.of( new String[]{"The","Stream","from","an","array","of","The","Strings"} ) ; 17 | 18 | //String 
Concatenation using non parameterized joining 19 | String concatedString = streamSupplier.get().collect(Collectors.joining()); 20 | System.out.println("The result of String Concatnation using non parameterized joining :: "); 21 | System.out.println(concatedString); 22 | 23 | //String Concatenation using joining with delimiter parameter 24 | String delimitedString = streamSupplier.get().collect(Collectors.joining(",")); 25 | System.out.println("The result of String Concatenation using joining with delimeter parameter :: "); 26 | System.out.println(delimitedString); 27 | 28 | //String Concatenation using joining with delimiter parameter 29 | String concatString = streamSupplier.get().collect(Collectors.joining(",","[","]")); 30 | System.out.println("The result of String Concatenation using joining with delimeter parameter :: "); 31 | System.out.println(concatString); 32 | 33 | //Collection Collectors 34 | List listCollected =streamSupplier.get().collect(Collectors.toList()); 35 | System.out.println("The list collected value of Stream are :: "+listCollected); 36 | 37 | Set setCollected=streamSupplier.get().collect(Collectors.toSet()); 38 | System.out.println("The set collected value of Stream are :: "+setCollected); 39 | 40 | Set orderedSetCollected=streamSupplier.get().collect(Collectors.toCollection(TreeSet::new)); 41 | System.out.println("The ordered set collected value of Stream are :: "+orderedSetCollected); 42 | 43 | //Map Collectors 44 | Map mapCollected=orderedSetCollected.stream().collect(Collectors.toMap(x->x.toString(),x->x.toString().length() )); 45 | System.out.println("The generated Map values are :: "+mapCollected); 46 | 47 | //Map Collectors with duplicate key handling 48 | Map> mapWithDupVals=streamSupplier.get().collect(Collectors.toMap(x->x.toString(), //KeyMapper 49 | x -> {List tmp = new ArrayList <> (); tmp.add(x.toString().length()); return tmp;}, //ValueMapper 50 | (L1, L2) -> { L1.addAll(L2); return L1;} //MergeFunction 51 | )); 52 | System.out.println("The generated Map values with duplicate values::"+mapWithDupVals); 53 | 54 | //Grouping Collectors 55 | Map> groupExample= streamSupplier.get().collect(Collectors.groupingBy(x->x.toString().length())); 56 | System.out.println("Grouping stream elements on the basis of its length :: "+groupExample); 57 | 58 | //Partition Collectors 59 | Map> partitionExample=streamSupplier.get().collect(Collectors.partitioningBy( x->x.toString().length() > 5 )); 60 | System.out.println("Patitioning of elements on the basis of its length :: "+partitionExample); 61 | 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/CreateStreamExample.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch2; 2 | 3 | import java.io.IOException; 4 | import java.math.BigInteger; 5 | import java.nio.charset.Charset; 6 | import java.nio.file.Files; 7 | import java.nio.file.Path; 8 | import java.nio.file.Paths; 9 | import java.util.ArrayList; 10 | import java.util.Arrays; 11 | import java.util.HashMap; 12 | import java.util.List; 13 | import java.util.Map; 14 | import java.util.concurrent.atomic.AtomicInteger; 15 | import java.util.stream.DoubleStream; 16 | import java.util.stream.IntStream; 17 | import java.util.stream.LongStream; 18 | import java.util.stream.Stream; 19 | 20 | public class CreateStreamExample { 21 | 22 | public static void main(String[] args) throws IOException { 23 | 24 | //Creating Streams 
using user/programmatically specified elements 25 | Stream Userstream = Stream.of("Creating","Streams","from","Specific","elements"); 26 | Userstream.forEach(p -> System.out.println(p)); 27 | 28 | 29 | //Creating Streams using array of objects 30 | Stream ArrayStream = Stream.of( new String[]{"Stream","from","an","array","of","objects"} ); 31 | ArrayStream.forEach(p -> System.out.println(p)); 32 | 33 | 34 | //Creating Streams from an array 35 | String[] StringArray=new String[]{"We","can","convert","an","array","to","a","Stream","using","Arrays","as","well"}; 36 | Stream StringStream=Arrays.stream(StringArray); 37 | StringStream.forEach(p -> System.out.println(p)); 38 | 39 | //Creating Streams from Collection 40 | List myCollection = new ArrayList<>(); 41 | for(int i=0; i<10; i++){ 42 | myCollection.add(Math.random()); 43 | } 44 | //sequential stream 45 | Stream sequentialStream = myCollection.stream(); 46 | sequentialStream.forEach(p -> System.out.println(p)); 47 | 48 | //parallel stream 49 | Stream parallelStream = myCollection.parallelStream(); 50 | parallelStream.forEach(p -> System.out.println(p)); 51 | 52 | 53 | //Stream from Hashmap 54 | Map mapData = new HashMap<>(); 55 | mapData.put("This", 1900); 56 | mapData.put("is", 2000); 57 | mapData.put("HashMap", 2100); 58 | 59 | mapData.entrySet() 60 | .stream() 61 | .forEach(p -> System.out.println(p)); 62 | 63 | mapData.keySet() 64 | .stream() 65 | .forEach(p-> System.out.println(p)); 66 | 67 | //primitive streams 68 | IntStream.range(1, 4) 69 | .forEach(p -> System.out.println(p)); 70 | 71 | LongStream.rangeClosed(1, 4) 72 | .forEach(p -> System.out.println(p)); 73 | 74 | DoubleStream.of(1.0,2.0,3.0,4.0) 75 | .forEach(p -> System.out.println(p)); 76 | 77 | //Infinite Streams using generate() 78 | Stream sequentialDoubleStream = Stream.generate(Math :: random); 79 | 80 | Stream sequentialIntegerStream = Stream.generate(new AtomicInteger () :: getAndIncrement); 81 | 82 | //Infinite Streams using iterate() 83 | Stream sequentialIntegerStream1 = Stream.iterate (Integer.MIN_VALUE, i -> i++); 84 | 85 | Stream sequentialBigIntegerStream = Stream.iterate(BigInteger.ZERO, i -> i.add (BigInteger.TEN)); 86 | 87 | //Streams from File 88 | Stream streamOfStrings = Files.lines(Paths.get("Apology_by_Plato.txt")); 89 | Stream streamWithCharset = Files.lines(Paths.get("Apology_by_Plato.txt"), Charset.forName("UTF-8")); 90 | 91 | 92 | } 93 | 94 | } 95 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/Interface1.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch2; 2 | 3 | //@FunctionalInterface 4 | public interface Interface1 { 5 | 6 | default void hello(){ 7 | System.out.println("Hello from Interface1"); 8 | } 9 | 10 | //void method1(); 11 | 12 | } 13 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/Interface2.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch2; 2 | 3 | public interface Interface2 { 4 | default void hello(){ 5 | System.out.println("Hello from Interface1"); 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/InterfaceImpl.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch2; 2 | 3 | public class InterfaceImpl implements 
Interface1,Interface2{ 4 | @Override 5 | public void hello() { 6 | // TODO Auto-generated method stub 7 | Interface1.super.hello(); 8 | Interface2.super.hello(); 9 | } 10 | 11 | } 12 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/IntermediateOpExample.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch2; 2 | 3 | import java.util.Arrays; 4 | import java.util.Comparator; 5 | import java.util.List; 6 | import java.util.function.Supplier; 7 | import java.util.stream.IntStream; 8 | import java.util.stream.Stream; 9 | 10 | public class IntermediateOpExample { 11 | 12 | public static void main(String[] args) { 13 | 14 | //Filter Operation 15 | IntStream.rangeClosed(1, 10) 16 | .filter(s -> s>4) 17 | .forEach(p -> System.out.println(p)); 18 | 19 | //Map Operation 20 | Supplier> streamSupplier =()->Stream.of( new String[]{"Stream","from","an","array","of","objects"} ) ; 21 | int sumOfLength=streamSupplier.get().map(x -> x.toString().length()).peek(x->System.out.println(Integer.parseInt(x.toString()))) 22 | .mapToInt(x->x.intValue()).sum(); 23 | 24 | int incrementVal=6; 25 | IntStream.rangeClosed(1, 10) 26 | .filter(s -> s>4) 27 | .map(x -> x+incrementVal) 28 | .forEach(p -> System.out.println(p)); 29 | 30 | Stream.of(new String[]{"Let me see what i get this time"}).map(x -> x.split("\\s+")).forEach(System.out::println); 31 | 32 | //Sorted 33 | // Stream ArrayStream = Stream.of( new String[]{"stream","from","an","array","of","objects"} ); 34 | 35 | //http://stackoverflow.com/questions/23860533/copy-a-stream-to-avoid-stream-has-already-been-operated-upon-or-closed-java-8 36 | // Supplier> streamSupplier =()-> Stream.of( new String[]{"stream","from","an","array","of","objects"} ); 37 | 38 | //Natural Sorting 39 | streamSupplier.get().sorted().forEach(System.out::println); 40 | 41 | //Comparing elements with reverse order 42 | streamSupplier.get().sorted(Comparator.reverseOrder()).forEach(System.out::println); 43 | 44 | 45 | //Sorting the element in reverse order based on their length 46 | streamSupplier.get().sorted(Comparator.comparing(x -> x.toString().length()).reversed()).forEach(System.out::println); 47 | 48 | //Sorting on multiple fields 49 | streamSupplier.get().sorted(Comparator.comparing(x -> x.toString().length()).thenComparing(x->x.toString())).forEach(System.out::println); 50 | 51 | 52 | //Distinct filters all the multiple records having same length 53 | streamSupplier.get().mapToInt(x-> x.toString().length()).distinct().forEach(System.out::println); 54 | 55 | //Limiting the size of the stream 56 | streamSupplier.get().limit(2).forEach(System.out::println); 57 | 58 | //flatMap 59 | Stream> streamList = Stream.of( 60 | Arrays.asList("FistList-FirstElement"), 61 | Arrays.asList("SecondList-FirstElement", "SecondList-SecondElement"), 62 | Arrays.asList("ThirdList-FirstElement")); 63 | //The streamList is of the form List 64 | Stream flatStream = streamList 65 | .flatMap(strList -> strList.stream()); 66 | // But after applying flatMap operaton it translates into Strem 67 | flatStream.forEach(System.out::println); 68 | 69 | // 70 | 71 | // Stream.of(1, 2, 3) 72 | // .flatMap(x -> IntStream.range(0, x)) 73 | // .forEach(System.out::println); 74 | 75 | System.out.println( " the count of stream is "+ 76 | 77 | //String[] sr=(String[]) 78 | 79 | Stream.of(new String[]{"Let,me,see,what,i,get,this,time","ok,now,what"}) //Stream 80 | .peek(x->System.out.println( "the 
length of the sream is"+ x.length())) 81 | .map(x -> x.split(",")) //Stream> 82 | .peek(x->System.out.println(x.length)) 83 | .count()); 84 | 85 | 86 | // .collect(Collectors.toList()) 87 | // .flatMap(x -> Arrays.stream(x)).forEach(System.out::println); 88 | 89 | 90 | /*.forEach(x -> { 91 | for(String sr:x){ 92 | System.out.println(x.length); 93 | System.out.println(sr); 94 | } 95 | }); 96 | */ 97 | // .peek(x-> System.out.println(x)) 98 | // .flatMap(Arrays::stream) 99 | // .forEach(System.out::println); 100 | 101 | } 102 | 103 | } 104 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/LambdaExamples.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch2; 2 | 3 | import java.util.Arrays; 4 | import java.util.List; 5 | 6 | public class LambdaExamples { 7 | public static void main(String[] args) { 8 | List list = Arrays.asList(1,2,3,4,5); 9 | 10 | list.forEach(n-> System.out.println(n)); 11 | 12 | list.stream().map(n -> n*2 ).forEach(n-> System.out.println(n));; 13 | list.stream().map(n->{ 14 | return n*2; 15 | }).forEach(System.out::println); 16 | 17 | } 18 | 19 | 20 | 21 | 22 | 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/LexicalScoping.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch2; 2 | 3 | public class LexicalScoping { 4 | int a = 1; 5 | // a has class level scope. So It will be available to be accessed 6 | // throughout the class 7 | 8 | public void sumandPrint() { 9 | int b = 1; 10 | int c = a + b; 11 | // b and c are local variables of method. These will be accessible 12 | // inside the method only 13 | } 14 | // b and c are no longer accessible 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/MethodReferenceExample.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch2; 2 | 3 | import java.util.Arrays; 4 | import java.util.HashSet; 5 | import java.util.List; 6 | import java.util.Optional; 7 | import java.util.TreeSet; 8 | import java.util.function.Supplier; 9 | import java.util.stream.Collectors; 10 | import java.util.stream.IntStream; 11 | import java.util.stream.Stream; 12 | 13 | 14 | public class MethodReferenceExample { 15 | 16 | public static boolean isOdd(Integer n) { return n % 2 != 0; }; 17 | public static boolean isEven(Integer n) { return n % 2 == 0; }; 18 | 19 | 20 | public static void main(String[] args) { 21 | Supplier> streamSupplier =()->Stream.of( new String[]{"Stream","from","an","array","of","objects"} ) ; 22 | 23 | //1.Static Method Reference 24 | IntStream.range(1, 8).filter(MethodReferenceExample::isOdd).forEach(x->System.out.println(x)); 25 | 26 | //Instance Method Reference 27 | IntStream.range(1, 8).filter(x-> x%2==0).forEach(System.out::println); 28 | 29 | //Constructor Reference 30 | TreeSet hset= streamSupplier.get().collect(Collectors.toCollection(TreeSet::new)); 31 | 32 | 33 | //4. 
Instance method Reference of an arbitrary object of a particular type 34 | System.out.println(" The sum of lengths are ::"+ streamSupplier.get().map(x->x.length()).reduce(Integer::sum)); 35 | 36 | 37 | } 38 | 39 | 40 | } 41 | 42 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/MyFileNameFilter.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch2; 2 | 3 | import java.io.File; 4 | import java.io.FilenameFilter; 5 | 6 | 7 | public class MyFileNameFilter implements FilenameFilter { 8 | @Override 9 | public boolean accept(File dir, String name) { 10 | 11 | return name.endsWith("java"); 12 | } 13 | 14 | } 15 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/MyFilterImpl.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch2; 2 | 3 | import java.io.File; 4 | 5 | public class MyFilterImpl { 6 | public static void main(String[] args) { 7 | File dir = new File("src/main/java"); 8 | //dir.list(new MyFileNameFilter()); 9 | dir.list((dirname,name)->name.endsWith("java")); 10 | 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/MyInterface.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch2; 2 | 3 | public interface MyInterface { 4 | 5 | default String hello() { 6 | return "Inside static method in interface"; 7 | } 8 | 9 | void absmethod(); 10 | } 11 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/MyInterfaceDemo.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch2; 2 | 3 | public class MyInterfaceDemo { 4 | public static void main(String[] args) { 5 | System.out.println(); 6 | MyInterfaceImpl obj =new MyInterfaceImpl(); 7 | obj.hello(); // wont-complie 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/MyInterfaceImpl.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch2; 2 | 3 | public class MyInterfaceImpl implements MyInterface{ 4 | 5 | @Override 6 | public void absmethod() { 7 | System.out.println("Abstract method implementaion in class"); 8 | } 9 | 10 | } 11 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/ShortCircuitOperationExample.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch2; 2 | 3 | public class ShortCircuitOperationExample { 4 | 5 | public static void main(String[] args) { 6 | // TODO Auto-generated method stub 7 | /* 8 | boolean matched = memberNames.stream() 9 | .anyMatch((s) -> s.startsWith("A")); 10 | 11 | System.out.println(matched); 12 | 13 | String firstMatchedName = memberNames.stream() 14 | .filter((s) -> s.startsWith("L")) 15 | .findFirst().get(); 16 | 17 | System.out.println(firstMatchedName);*/ 18 | 19 | } 20 | 21 | } 22 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/TerminalOpExample.java: -------------------------------------------------------------------------------- 1 | package 
com.packt.sfjd.ch2; 2 | 3 | import java.util.Arrays; 4 | import java.util.Optional; 5 | import java.util.function.Supplier; 6 | import java.util.stream.Collectors; 7 | import java.util.stream.IntStream; 8 | import java.util.stream.Stream; 9 | 10 | public class TerminalOpExample { 11 | 12 | public static void main(String[] args) { 13 | // forEach 14 | Supplier> streamSupplier =()->Stream.of( new String[]{"Stream","from","an","array","of","objects"} ) ; 15 | //Sequential For each 16 | streamSupplier.get().sequential().forEach(P->System.out.println("Sequential output :: "+P)); 17 | //Parallel For each 18 | streamSupplier.get().parallel().forEach(P->System.out.println("Parallel output :: "+P)); 19 | 20 | //sum 21 | // System.out.println(streamSupplier.get().map(x -> x.toString().length()).peek(System.out::println).sum()); 22 | 23 | System.out.println("Number of alphabets present in the stream ::"+streamSupplier.get().mapToInt(x -> x.length()).sum()); //Notice here had we used MAP , we would have had to another map function to convert the in Int. 24 | 25 | //reduce 26 | Optional simpleSum= streamSupplier.get().map(x->x.length()).reduce((x,y)-> x+y); 27 | 28 | System.out.println( "The value with simpe reduce is ::"+simpleSum.get()); 29 | 30 | Integer defaulValSum= streamSupplier.get().map(x->x.length()).reduce(0,(x,y)-> x+y); 31 | System.out.println( "The value with default reduce is ::"+defaulValSum); 32 | 33 | Integer valSum= streamSupplier.get().reduce(0,(x,y)-> x+y.length(),(acc1,acc2)->acc1+acc2); 34 | System.out.println("The value with with cobine reduce is ::"+valSum); 35 | 36 | //collect 37 | 38 | StringBuilder concat = streamSupplier.get() 39 | .collect(() -> new StringBuilder(), 40 | (sbuilder, str) -> sbuilder.append(str), 41 | (sbuilder1, sbuiler2) -> sbuilder1.append(sbuiler2)); 42 | 43 | 44 | StringBuilder concatM = streamSupplier.get() 45 | .collect(StringBuilder::new, 46 | StringBuilder::append, 47 | StringBuilder::append); 48 | 49 | String concatC = streamSupplier.get().collect(Collectors.joining()); 50 | 51 | //Match 52 | boolean matchesAll =streamSupplier.get().allMatch(x->x.toString().length() > 1); 53 | System.out.println("All the elemetns have lenght greater than 1 ::"+matchesAll); 54 | 55 | boolean noneMatches =streamSupplier.get().noneMatch(x->x.toString().length() > 1); 56 | System.out.println("None of the elemetns have lenght greater than 1 ::"+noneMatches); 57 | 58 | boolean anyMatches =streamSupplier.get().peek(x->System.out.println("Element being iterated is :: "+x)).anyMatch(x->x.toString().length() == 2); 59 | System.out.println("The short circuit terminal operation finished with return value :: "+anyMatches); 60 | 61 | //Finding Element 62 | System.out.println("In a paralled stream from 5-100 finding any element :: "+IntStream.range(5, 100).parallel().findAny()); 63 | 64 | System.out.println("In a paralled stream from 8-100 finding the first element :: "+IntStream.range(8, 100).parallel().findFirst()); 65 | 66 | //Count 67 | long elementCount=streamSupplier.get().count(); 68 | System.out.println("The number of elements in the stream are :: "+elementCount); 69 | 70 | 71 | 72 | //System.out.println( joinWithReduce(Stream . of ( "foo" , "bar" , "baz" ) )); 73 | 74 | //System.out.println( joinWithCollect(Stream . of ( "foo" , "bar" , "baz" ) )); 75 | 76 | } 77 | 78 | static String joinWithReduce ( Stream < String > stream ) { // BAD 79 | return stream.reduce( new StringBuilder (), StringBuilder :: append , StringBuilder :: append ). 
toString (); } 80 | 81 | static String joinWithCollect ( Stream < String > stream ) { // OK 82 | return stream.collect ( StringBuilder :: new , StringBuilder :: append , StringBuilder :: append ). toString (); } 83 | 84 | 85 | } 86 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/WordCountInJava.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch2; 2 | 3 | import java.io.IOException; 4 | import java.nio.file.Files; 5 | import java.nio.file.Paths; 6 | import java.util.ArrayList; 7 | import java.util.Arrays; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.TreeMap; 11 | import java.util.stream.Collectors; 12 | import java.util.stream.Stream; 13 | 14 | import static java.util.function.Function.identity; 15 | import static java.util.stream.Collectors.counting; 16 | import static java.util.stream.Collectors.groupingBy; 17 | 18 | public class WordCountInJava { 19 | public static final String REGEX = "\\s+"; 20 | public static final String NEW_LINE_CHAR = "\n"; 21 | public static final String imagineLyrics="Imagine there's no heaven \n" 22 | + "It's easy if you try \n" 23 | + "No hell below us \n" 24 | + "Above us only sky \n" 25 | + "Imagine all the people living for today"; 26 | 27 | public static void main(String[] args) { 28 | 29 | try { 30 | //TreeMap count = Files.lines(Paths.get(args[0]), StandardCharsets.UTF_8) 31 | TreeMap count = Stream.of( imagineLyrics.split(NEW_LINE_CHAR)) 32 | .map(line -> line.split(REGEX)) 33 | .flatMap(Arrays::stream) 34 | .collect(groupingBy(identity(), TreeMap::new, counting())); 35 | 36 | // Using Lambda Expression 37 | Stream.of(count).forEach(x -> System.out.println(x)); 38 | //Using Method Reference 39 | Stream.of(count).forEach(System.out::println); 40 | 41 | 42 | Stream mapResult=Stream.of( imagineLyrics.split(NEW_LINE_CHAR)) 43 | .map(line -> line.split(REGEX)).flatMap(Arrays::stream); 44 | 45 | //sort and suffle phase 46 | Map> sortedData=mapResult.collect(Collectors.toMap(x->x.toString(), x->{ 47 | List temp=new ArrayList<>(); temp.add(1); return temp; }, 48 | (L1,L2)-> {L1.addAll(L2);return L1;})); 49 | //Reduce Phase 50 | /*Map wordCount=sortedData.entrySet().stream().collect(Collectors.toMap( 51 | e -> e.getKey(), 52 | e -> Integer.parseInt(e.getValue()) 53 | ));*/ 54 | 55 | } catch (Exception e) { 56 | // TODO Auto-generated catch block 57 | e.printStackTrace(); 58 | } 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/generics/FirstExample.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch2.generics; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | public class FirstExample { 7 | public static void main(String[] args) { 8 | 9 | List list1 =new ArrayList(); 10 | List list2 =new ArrayList(); 11 | 12 | List list =new ArrayList<>(); 13 | 14 | list.add(1); 15 | list.add(2); 16 | list.add("hello"); 17 | 18 | Integer object = (Integer)list.get(0); 19 | 20 | System.out.println(object); 21 | 22 | List listGeneric =new ArrayList<>(); 23 | 24 | listGeneric.add(1); 25 | listGeneric.add(2); 26 | //list1.add("hello"); - wont work 27 | Integer intObject = listGeneric.get(0); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/generics/MyGeneric.java: 
-------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch2.generics; 2 | 3 | public class MyGeneric { 4 | 5 | T input; 6 | 7 | public MyGeneric(T input) { 8 | this.input=input; 9 | 10 | } 11 | 12 | public T getInput() 13 | { 14 | return input; 15 | } 16 | 17 | 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch2/generics/MyGenericsDemo.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch2.generics; 2 | 3 | public class MyGenericsDemo { 4 | 5 | public static void main(String[] args) { 6 | MyGeneric m1 =new MyGeneric(1); 7 | System.out.println(m1.getInput()); 8 | 9 | MyGeneric m2 =new MyGeneric("hello"); 10 | System.out.println(m2.getInput()); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch4/ActionsExamplesOld.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch4; 2 | 3 | import java.io.Serializable; 4 | import java.util.Arrays; 5 | import java.util.Comparator; 6 | import java.util.List; 7 | import java.util.Map; 8 | import java.util.function.BiConsumer; 9 | 10 | import org.apache.log4j.Level; 11 | import org.apache.log4j.LogManager; 12 | import org.apache.log4j.Logger; 13 | import org.apache.spark.SparkConf; 14 | import org.apache.spark.api.java.JavaDoubleRDD; 15 | import org.apache.spark.api.java.JavaPairRDD; 16 | import org.apache.spark.api.java.JavaRDD; 17 | import org.apache.spark.api.java.JavaSparkContext; 18 | import org.apache.spark.api.java.function.Function2; 19 | import org.apache.spark.api.java.function.PairFunction; 20 | import org.apache.spark.sql.SparkSession; 21 | import org.spark_project.guava.collect.Lists; 22 | 23 | import scala.Tuple2; 24 | 25 | public class ActionsExamplesOld implements Serializable{ 26 | 27 | /** 28 | * 29 | */ 30 | private static final long serialVersionUID = 1L; 31 | 32 | public static void main(String[] args) { 33 | System.setProperty("hadoop.home.dir", "C:\\Users\\sumit.kumar\\Downloads"); 34 | String logFile = "src/main/resources/numSeries.txt"; // Should be some file on your system 35 | Logger rootLogger = LogManager.getRootLogger(); 36 | rootLogger.setLevel(Level.WARN); 37 | /* SparkSession spark = SparkSession 38 | .builder().master("local") 39 | .appName("JavaPageRank") 40 | .config("spark.sql.warehouse.dir", "file:///C:/Users/sumit.kumar/Downloads/bin/warehouse") 41 | .getOrCreate(); 42 | */ 43 | SparkConf conf = new SparkConf().setMaster("local").setAppName("ApacheSparkForJavaDevelopers"); 44 | // SparkContext context =new SparkContext(conf); 45 | // RDD textFile = context.textFile("abc", 1); 46 | 47 | JavaSparkContext spark = new JavaSparkContext(conf); 48 | 49 | 50 | 51 | JavaRDD lines = spark.textFile(logFile); 52 | //JavaRDD lines = spark.textFile(logFile).toJavaRDD().cache(); 53 | JavaDoubleRDD intMap= lines.mapToDouble(a-> Integer.parseInt(a)).cache(); 54 | JavaPairRDD intDivMap= intMap.mapToPair(new PairFunction() { 55 | 56 | /** 57 | * 58 | */ 59 | private static final long serialVersionUID = 1L; 60 | 61 | @Override 62 | public Tuple2 call(Double t) throws Exception { 63 | 64 | return new Tuple2(t, t%2); 65 | } 66 | }); 67 | 68 | // isEmpty 69 | JavaRDD intRDD = spark.parallelize(Arrays.asList(1,2,3)); 70 | boolean isRDDEmpty= intRDD.filter(a-> a.equals(5)).isEmpty(); 71 | System.out.println("The RDD 
is empty ::"+isRDDEmpty); 72 | 73 | //Collect 74 | List collectedList= lines.collect(); 75 | 76 | //count() 77 | long countVal=lines.count(); 78 | //CountByKey: 79 | Map countByKeyMap= intDivMap.countByKey(); 80 | 81 | countByKeyMap.forEach(new BiConsumer() { 82 | 83 | @Override 84 | public void accept( Double L, Long U ) { 85 | System.out.println("The key val is 1 ::"+L); 86 | System.out.println("The Long is 1 ::"+U); 87 | } 88 | }); 89 | 90 | 91 | Map, Long> countByValMap= intDivMap.countByValue(); 92 | 93 | countByValMap.forEach(new BiConsumer, Long>() { 94 | 95 | @Override 96 | public void accept( Tuple2 L, Long U ) { 97 | System.out.println("The touple val is 1 ::"+L._1()); 98 | System.out.println("The touple val is 2 ::"+L._2()); 99 | System.out.println("The Long is 1 ::"+U); 100 | } 101 | }); 102 | 103 | 104 | //countByValue() 105 | Map countByVal=lines.countByValue(); 106 | // max 107 | intMap.max(); 108 | 109 | /* Comparator comp =new Comparator() { 110 | 111 | @Override 112 | public int compare(Double a, Double b) { 113 | // TODO Auto-generated method stub 114 | return a.compareTo(b); 115 | } 116 | };*/ 117 | 118 | intMap.max(new doubleComparator()); 119 | 120 | /* intMap.max(new Comparator() { 121 | 122 | @Override 123 | public int compare(Double a, Double b) { 124 | // TODO Auto-generated method stub 125 | return a.compareTo(b); 126 | } 127 | }); 128 | */ 129 | intMap.max(Comparator.naturalOrder()); 130 | intMap.max(Comparator.reverseOrder()); 131 | //////check this 132 | // intMap.max(Comparator.comparing(a->a)); 133 | 134 | //min 135 | intMap.min(); 136 | intMap.min(Comparator.naturalOrder()); 137 | 138 | // First: 139 | System.out.println("The first element of RDD is"+ intMap.first()); 140 | 141 | 142 | 143 | //take() 144 | List takeTwo=lines.take(2); 145 | takeTwo.forEach(x->System.out.println("The take elements are :: "+x)); 146 | 147 | // TakeOrdered: 148 | List takeOrderedTwo= lines.takeOrdered(2); 149 | takeOrderedTwo.forEach(x->System.out.println("The takeOrdered elements are :: "+x)); 150 | 151 | 152 | // takeOrdered(int num, java.util.Comparator comp) 153 | List takeCustomOrderedTwo= lines.takeOrdered(2, Comparator.reverseOrder()); 154 | takeCustomOrderedTwo.forEach(x->System.out.println("The takeOrdered elements with custom Comparator are :: "+x)); 155 | 156 | 157 | 158 | //TakeSample: 159 | intRDD.takeSample(true, 3).forEach(x-> System.out.println("The take sample vals for true are :"+x)); 160 | intRDD.takeSample(false, 3).forEach(x-> System.out.println("The take sample vals for false are :"+x)); 161 | intRDD.takeSample(true, 3,9).forEach(x-> System.out.println("The take sample vals with seed are :"+x)); 162 | 163 | //top() 164 | List topFive=lines.top(5); 165 | topFive.forEach(x->System.out.println("The value of top are ::"+x)); 166 | 167 | // top(int num, java.util.Comparator comp) 168 | // lines.top(3, Comparator.comparing(x->Integer.parseInt(x))); 169 | 170 | //reduce 171 | Function2 reduceSumFunc = (a, b) -> (Integer.parseInt(a) + Integer.parseInt(b)); 172 | Double sumInt=intMap.reduce((a,b)->a+b); 173 | 174 | 175 | /* Integer sumInt=lines.reduce(new Function2( 176 | ) { 177 | @Override 178 | public Integer call(String a, String b) throws Exception { 179 | // TODO Auto-generated method stub 180 | return Integer.parseInt(a) + Integer.parseInt(b); 181 | } 182 | });*/ 183 | 184 | 185 | //fold() 186 | Double foldInt=intMap.fold((double) 0, (a,b)-> a+b); 187 | 188 | // 189 | //Aggeregate: 190 | // ForEach: 191 | lines.foreach(s->System.out.println(s)); 192 | 193 | 
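// Aggregate (a sketch filling in the "//Aggeregate:" placeholder above; see
// AggeregateExample.java in this chapter for a fuller treatment). aggregate() takes a
// zero value, a per-partition accumulator and a cross-partition combiner, for example:
// Double aggSum = intMap.aggregate(0.0, (acc, v) -> acc + v, (a, b) -> a + b);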
// saveAsTextFile 194 | // saveAsObjectFile(String path) 195 | JavaRDD rdd = spark.parallelize(Lists.newArrayList("1", "2")); 196 | rdd.mapToPair(p -> new Tuple2<>(p, p)).saveAsObjectFile("objFileDir"); 197 | JavaPairRDD pairRDD 198 | = JavaPairRDD.fromJavaRDD(spark.objectFile("objFileDir")); 199 | pairRDD.collect().forEach(System.out::println); 200 | 201 | } 202 | 203 | static class doubleComparator implements Comparator,Serializable{ 204 | 205 | /** 206 | * 207 | */ 208 | private static final long serialVersionUID = 1L; 209 | 210 | @Override 211 | public int compare(Double a, Double b) { 212 | // TODO Auto-generated method stub 213 | return a.compareTo(b); 214 | } 215 | } 216 | 217 | } 218 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch4/AggeregateExample.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch4; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | 7 | import org.apache.spark.api.java.JavaPairRDD; 8 | import org.apache.spark.api.java.JavaRDD; 9 | import org.apache.spark.api.java.JavaSparkContext; 10 | import org.apache.spark.api.java.function.Function2; 11 | 12 | import scala.Tuple2; 13 | 14 | public class AggeregateExample { 15 | 16 | public static void main(String[] args) { 17 | String master; 18 | if (args.length > 0) { 19 | master = args[0]; 20 | } else { 21 | master = "local"; 22 | } 23 | 24 | JavaSparkContext sc = new JavaSparkContext( 25 | master, "AggeregateExample"); 26 | JavaRDD rdd = sc.parallelize(Arrays.asList(1, 2, 3,4,5),3); 27 | System.out.println("The no of partitions are ::"+rdd.getNumPartitions()); 28 | //TODO print elements with partition with index mappationwithindex() 29 | Function2 agg=new Function2() { 30 | @Override 31 | public String call(String v1, Integer v2) throws Exception { 32 | return v1+v2; 33 | } 34 | } ; 35 | 36 | Function2 combineAgg=new Function2() { 37 | @Override 38 | public String call(String v1, String v2) throws Exception { 39 | return v1+v2; 40 | } 41 | }; 42 | 43 | 44 | //String result= rdd.aggregate("X", agg, combineAgg); 45 | String result= rdd.aggregate("X", (x,y)->x+y, (x,z)->x+z); 46 | System.out.println("The aggerate value is ::"+result); 47 | 48 | 49 | int res= rdd.aggregate(3, (x,y)-> x>y?x:y, (w,z)->w>z?w:z); 50 | System.out.println("the res is ::"+res); 51 | 52 | List> listS = new ArrayList>(); 53 | listS.add(new Tuple2("a", 1)); 54 | listS.add(new Tuple2("b", 2)); 55 | listS.add(new Tuple2("c", 3)); 56 | listS.add(new Tuple2("a", 4)); 57 | 58 | // 59 | JavaPairRDD R = sc.parallelizePairs(listS); 60 | List> es= R.aggregateByKey(1, (x,y)->x+y, (x,y)->x+y).collect(); 61 | 62 | for (Tuple2 tuple2 : es) { 63 | System.out.println("the key is"+tuple2._1()+" and the val is ::"+tuple2._2()); 64 | } 65 | 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch4/JavaWordCount.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch4; 2 | 3 | import scala.Tuple2; 4 | 5 | import org.apache.spark.api.java.JavaPairRDD; 6 | import org.apache.spark.api.java.JavaRDD; 7 | import org.apache.spark.api.java.function.FlatMapFunction; 8 | import org.apache.spark.api.java.function.Function2; 9 | import org.apache.spark.api.java.function.PairFunction; 10 | import org.apache.spark.sql.SparkSession; 11 | 12 | import java.util.Arrays; 13 
| import java.util.Iterator; 14 | import java.util.List; 15 | import java.util.regex.Pattern; 16 | 17 | public final class JavaWordCount { 18 | private static final Pattern SPACE = Pattern.compile(" "); 19 | 20 | public static void main(String[] args) throws Exception { 21 | 22 | if (args.length < 1) { 23 | System.err.println("Usage: JavaWordCount "); 24 | System.exit(1); 25 | } 26 | 27 | SparkSession spark = SparkSession 28 | .builder() 29 | .appName("JavaWordCount") 30 | .getOrCreate(); 31 | 32 | JavaRDD lines = spark.read().textFile(args[0]).javaRDD(); 33 | 34 | JavaRDD words = lines.flatMap(new FlatMapFunction() { 35 | @Override 36 | public Iterator call(String s) { 37 | return Arrays.asList(SPACE.split(s)).iterator(); 38 | } 39 | }); 40 | 41 | JavaPairRDD ones = words.mapToPair( 42 | new PairFunction() { 43 | @Override 44 | public Tuple2 call(String s) { 45 | return new Tuple2<>(s, 1); 46 | } 47 | }); 48 | 49 | JavaPairRDD counts = ones.reduceByKey( 50 | new Function2() { 51 | @Override 52 | public Integer call(Integer i1, Integer i2) { 53 | return i1 + i2; 54 | } 55 | }); 56 | 57 | List> output = counts.collect(); 58 | for (Tuple2 tuple : output) { 59 | System.out.println(tuple._1() + ": " + tuple._2()); 60 | } 61 | spark.stop(); 62 | } 63 | } -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch4/PersistExample.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package com.packt.sfjd.ch4; 5 | 6 | import java.util.Arrays; 7 | 8 | import org.apache.log4j.Level; 9 | import org.apache.log4j.LogManager; 10 | import org.apache.log4j.Logger; 11 | import org.apache.spark.SparkConf; 12 | import org.apache.spark.api.java.JavaRDD; 13 | import org.apache.spark.api.java.JavaSparkContext; 14 | import org.apache.spark.api.java.function.VoidFunction; 15 | import org.apache.spark.sql.SparkSession; 16 | import org.apache.spark.sql.catalog.Function; 17 | import org.apache.spark.storage.StorageLevel; 18 | 19 | /** 20 | * @author sumit.kumar 21 | * 22 | */ 23 | public class PersistExample { 24 | 25 | /** 26 | * @param args 27 | */ 28 | public static void main(String[] args) { 29 | //C:\Users\sumit.kumar\Downloads\bin\warehouse 30 | //System.setProperty("hadoop.home.dir", "C:\\Users\\sumit.kumar\\Downloads"); 31 | String logFile = "src/main/resources/Apology_by_Plato.txt"; // Should be some file on your system 32 | Logger rootLogger = LogManager.getRootLogger(); 33 | rootLogger.setLevel(Level.WARN); 34 | SparkConf conf = new SparkConf().setMaster("local").setAppName("ActionExamples").set("spark.hadoop.validateOutputSpecs", "false"); 35 | JavaSparkContext sparkContext = new JavaSparkContext(conf); 36 | JavaRDD rdd = sparkContext.parallelize(Arrays.asList(1, 2, 3,4,5),3).cache(); 37 | JavaRDD evenRDD= rdd.filter(new org.apache.spark.api.java.function.Function() { 38 | @Override 39 | public Boolean call(Integer v1) throws Exception { 40 | return ((v1%2)==0)?true:false; 41 | } 42 | }); 43 | 44 | evenRDD.persist(StorageLevel.MEMORY_AND_DISK()); 45 | evenRDD.foreach(new VoidFunction() { 46 | @Override 47 | public void call(Integer t) throws Exception { 48 | System.out.println("The value of RDD are :"+t); 49 | } 50 | }); 51 | //unpersisting the RDD 52 | evenRDD.unpersist(); 53 | rdd.unpersist(); 54 | 55 | /* JavaRDD lines = spark.read().textFile(logFile).javaRDD().cache(); 56 | System.out.println("DEBUG: \n"+ lines.toDebugString()); 57 | long word= lines.count(); 58 | JavaRDD 
distinctLines=lines.distinct(); 59 | System.out.println("DEBUG: \n"+ distinctLines.toDebugString()); 60 | JavaRDD finalRdd=lines.subtract(distinctLines); 61 | 62 | 63 | System.out.println("DEBUG: \n"+ finalRdd.toDebugString()); 64 | System.out.println("The count is "+word); 65 | System.out.println("The count is "+distinctLines.count()); 66 | System.out.println("The count is "+finalRdd.count()); 67 | 68 | finalRdd.foreach(new VoidFunction() { 69 | 70 | @Override 71 | public void call(String t) throws Exception { 72 | // TODO Auto-generated method stub 73 | System.out.println(t); 74 | } 75 | }); 76 | */ /*SparkConf conf = new SparkConf().setAppName("Simple Application"); 77 | JavaSparkContext sc = new JavaSparkContext(conf); 78 | StorageLevel newLevel; 79 | JavaRDD logData = sc.textFile(logFile).cache(); 80 | 81 | long numAs = logData.filter(new Function(logFile, logFile, logFile, logFile, false) { 82 | public Boolean call(String s) { return s.contains("a"); } 83 | }).count(); 84 | 85 | long numBs = logData.filter(new Function(logFile, logFile, logFile, logFile, false) { 86 | public Boolean call(String s) { return s.contains("b"); } 87 | }).count(); 88 | 89 | System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs); 90 | 91 | sc.stop();*/ 92 | 93 | } 94 | 95 | } 96 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch4/SparkWordCount.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch4; 2 | 3 | import java.io.File; 4 | import java.util.Arrays; 5 | 6 | import org.apache.commons.io.FileUtils; 7 | import org.apache.spark.api.java.JavaPairRDD; 8 | import org.apache.spark.api.java.JavaRDD; 9 | import org.apache.spark.api.java.JavaSparkContext; 10 | 11 | import scala.Tuple2; 12 | //http://stackoverflow.com/questions/19620642/failed-to-locate-the-winutils-binary-in-the-hadoop-binary-path 13 | 14 | //http://www.javaworld.com/article/2972863/big-data/open-source-java-projects-apache-spark.html 15 | 16 | public class SparkWordCount { 17 | public static void main(String[] args) throws Exception { 18 | System.out.println(System.getProperty("hadoop.home.dir")); 19 | String inputPath = args[0]; 20 | String outputPath = args[1]; 21 | FileUtils.deleteQuietly(new File(outputPath)); 22 | 23 | JavaSparkContext sc = new JavaSparkContext("local", "sparkwordcount"); 24 | 25 | JavaRDD rdd = sc.textFile(inputPath); 26 | 27 | JavaPairRDD counts = rdd 28 | .flatMap(x -> Arrays.asList(x.split(" ")).iterator()) 29 | .mapToPair(x -> new Tuple2((String) x, 1)) 30 | .reduceByKey((x, y) -> x + y); 31 | 32 | counts.saveAsTextFile(outputPath); 33 | sc.close(); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch4/SparkWordCount_1_7.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch4; 2 | 3 | import java.io.File; 4 | import java.util.Arrays; 5 | import java.util.Iterator; 6 | 7 | import org.apache.commons.io.FileUtils; 8 | import org.apache.spark.api.java.JavaPairRDD; 9 | import org.apache.spark.api.java.JavaRDD; 10 | import org.apache.spark.api.java.JavaSparkContext; 11 | import org.apache.spark.api.java.function.FlatMapFunction; 12 | import org.apache.spark.api.java.function.Function2; 13 | import org.apache.spark.api.java.function.PairFunction; 14 | 15 | import scala.Tuple2; 16 | 17 | public class SparkWordCount_1_7 { 18 | public static 
void main(String[] args) throws Exception { 19 | System.out.println(System.getProperty("hadoop.home.dir")); 20 | String inputPath = args[0]; 21 | String outputPath = args[1]; 22 | FileUtils.deleteQuietly(new File(outputPath)); 23 | 24 | JavaSparkContext sc = new JavaSparkContext("local", "sparkwordcount1.7"); 25 | 26 | JavaRDD rdd = sc.textFile(inputPath); 27 | 28 | JavaPairRDD counts = rdd 29 | .flatMap(new FlatMapFunction() { 30 | public Iterator call(String x) { 31 | return (Iterator) Arrays.asList(x.split(" ")); 32 | } 33 | }).mapToPair(new PairFunction() { 34 | public Tuple2 call(String x) { 35 | return new Tuple2(x, 1); 36 | } 37 | }).reduceByKey(new Function2() { 38 | public Integer call(Integer x, Integer y) { 39 | return x + y; 40 | } 41 | }); 42 | counts.saveAsTextFile(outputPath); 43 | sc.close(); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch4/WordCount.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch4; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaPairRDD; 5 | import org.apache.spark.api.java.JavaRDD; 6 | import org.apache.spark.api.java.JavaSparkContext; 7 | import org.apache.spark.api.java.function.FlatMapFunction; 8 | import org.apache.spark.api.java.function.Function2; 9 | import org.apache.spark.api.java.function.PairFunction; 10 | 11 | import scala.Tuple2; 12 | 13 | import java.util.Arrays; 14 | import java.util.Iterator; 15 | 16 | /** 17 | * Sample Spark application that counts the words in a text file 18 | */ 19 | public class WordCount 20 | { 21 | 22 | public static void wordCountJava7( String filename ) 23 | { 24 | // Define a configuration to use to interact with Spark 25 | SparkConf conf = new SparkConf().setMaster("local").setAppName("Work Count App"); 26 | 27 | // Create a Java version of the Spark Context from the configuration 28 | JavaSparkContext sc = new JavaSparkContext(conf); 29 | 30 | // Load the input data, which is a text file read from the command line 31 | JavaRDD input = sc.textFile( filename ); 32 | 33 | // Java 7 and earlier 34 | JavaRDD words = input.flatMap( 35 | new FlatMapFunction() { 36 | public Iterator call(String s) { 37 | return (Iterator) Arrays.asList(s.split(" ")); 38 | } 39 | } ); 40 | 41 | // Java 7 and earlier: transform the collection of words into pairs (word and 1) 42 | JavaPairRDD counts = words.mapToPair( 43 | new PairFunction(){ 44 | public Tuple2 call(String s){ 45 | return new Tuple2(s, 1); 46 | } 47 | } ); 48 | 49 | // Java 7 and earlier: count the words 50 | JavaPairRDD reducedCounts = counts.reduceByKey( 51 | new Function2(){ 52 | public Integer call(Integer x, Integer y){ return x + y; } 53 | } ); 54 | 55 | // Save the word count back out to a text file, causing evaluation. 
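// Heads-up before the action below triggers evaluation: the Java 7 flatMap above casts
// Arrays.asList(s.split(" ")) straight to Iterator, which compiles but throws a
// ClassCastException at runtime, since a List is not an Iterator. The same cast appears
// in SparkWordCount_1_7.java. The fix is to return Arrays.asList(s.split(" ")).iterator(),
// exactly as wordCountJava8 below does.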
56 | reducedCounts.saveAsTextFile( "output" ); 57 | } 58 | 59 | public static void wordCountJava8( String filename ) 60 | { 61 | // Define a configuration to use to interact with Spark 62 | SparkConf conf = new SparkConf().setMaster("local").setAppName("Work Count App"); 63 | 64 | // Create a Java version of the Spark Context from the configuration 65 | JavaSparkContext sc = new JavaSparkContext(conf); 66 | 67 | // Load the input data, which is a text file read from the command line 68 | JavaRDD input = sc.textFile( filename ); 69 | 70 | // Java 8 with lambdas: split the input string into words 71 | // TODO here a change has happened 72 | JavaRDD words = input.flatMap( s -> Arrays.asList( s.split( " " ) ).iterator() ); 73 | 74 | // Java 8 with lambdas: transform the collection of words into pairs (word and 1) and then count them 75 | JavaPairRDD counts = words.mapToPair( t -> new Tuple2( t, 1 ) ).reduceByKey( (x, y) -> (int)x + (int)y ); 76 | 77 | // Save the word count back out to a text file, causing evaluation. 78 | counts.saveAsTextFile( "output" ); 79 | } 80 | 81 | public static void main( String[] args ) 82 | { 83 | if( args.length == 0 ) 84 | { 85 | System.out.println( "Usage: WordCount " ); 86 | System.exit( 0 ); 87 | } 88 | 89 | wordCountJava8( args[ 0 ] ); 90 | } 91 | } -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch4/transformations/Test.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch4.transformations; 2 | 3 | import java.io.Serializable; 4 | 5 | public class Test implements Serializable{//implements Comparable,Serializable{ 6 | 7 | Test(int age) 8 | { 9 | this.age=age; 10 | } 11 | private int age; 12 | 13 | 14 | public int getAge() { 15 | return age; 16 | } 17 | 18 | 19 | public void setAge(int age) { 20 | this.age = age; 21 | } 22 | 23 | 24 | // @Override 25 | // public int compareTo(Test o) { 26 | // 27 | // return this.getAge()-o.getAge(); 28 | // } 29 | 30 | 31 | @Override 32 | public String toString() { 33 | return "Test [age=" + age + "]"; 34 | } 35 | 36 | 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch4/transformations/TestMain.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch4.transformations; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Collections; 5 | import java.util.List; 6 | 7 | public class TestMain { 8 | public static void main(String[] args) { 9 | List list =new ArrayList<>(); 10 | list.add(new Test(5)); 11 | list.add(new Test(3)); 12 | list.add(new Test(6)); 13 | 14 | //Collections.sort(list); 15 | list.forEach(t -> System.out.println(t.getAge())); 16 | 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch4/transformations/Transformations.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch4.transformations; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.Iterator; 6 | import java.util.List; 7 | import java.util.stream.Collector; 8 | import java.util.stream.Collectors; 9 | 10 | import org.apache.spark.Partitioner; 11 | import org.apache.spark.SparkConf; 12 | import org.apache.spark.SparkContext; 13 | import org.apache.spark.api.java.JavaPairRDD; 14 | import org.apache.spark.api.java.JavaRDD; 
15 | import org.apache.spark.api.java.JavaSparkContext; 16 | import org.apache.spark.api.java.function.Function; 17 | import org.apache.spark.api.java.function.Function2; 18 | import org.apache.spark.rdd.RDD; 19 | 20 | import scala.Tuple2; 21 | 22 | public class Transformations { 23 | public static void main(String[] args) { 24 | SparkConf conf = new SparkConf().setMaster("local").setAppName("ApacheSparkForJavaDevelopers"); 25 | // SparkContext context =new SparkContext(conf); 26 | // RDD textFile = context.textFile("abc", 1); 27 | 28 | JavaSparkContext javaSparkContext = new JavaSparkContext(conf); 29 | 30 | List intList = Arrays.asList(1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10); 31 | 32 | JavaRDD intRDD = javaSparkContext.parallelize(intList, 2); 33 | // intRDD.repartition(2); 34 | 35 | // Map Transformation 36 | JavaRDD mappedRDD = intRDD.map(x -> x + 1); 37 | 38 | // Map with partitions 39 | JavaRDD mapPartitions = intRDD.mapPartitions(iterator -> { 40 | int sum = 0; 41 | while (iterator.hasNext()) { 42 | sum += iterator.next(); 43 | } 44 | return Arrays.asList(sum).iterator(); 45 | }); 46 | 47 | // map partitions with index 48 | JavaRDD mapPartitionsWithIndex = intRDD 49 | .mapPartitionsWithIndex(new Function2, Iterator>() { 50 | 51 | /** 52 | * 53 | */ 54 | private static final long serialVersionUID = 739746028261776589L; 55 | 56 | @Override 57 | public Iterator call(Integer index, Iterator iterator) throws Exception { 58 | int sum = 0; 59 | while (iterator.hasNext()) { 60 | sum += iterator.next(); 61 | } 62 | return Arrays.asList(index + ":" + sum).iterator(); 63 | } 64 | }, true); 65 | 66 | // filter RDD 67 | JavaRDD filter = intRDD.filter(x -> (x % 2 == 0)); 68 | 69 | JavaRDD stringRDD = javaSparkContext.parallelize(Arrays.asList("Hello Spark", "Hello Java")); 70 | 71 | // flat map 72 | 73 | JavaRDD flatMap = stringRDD.flatMap(t -> Arrays.asList(t.split(" ")).iterator()); 74 | // map to pair 75 | 76 | JavaPairRDD pairRDD = intRDD.mapToPair( 77 | i -> (i % 2 == 0) ? 
new Tuple2("even", i) : new Tuple2("odd", i)); 78 | 79 | // flat map to pair 80 | 81 | JavaPairRDD flatMapToPair = stringRDD.flatMapToPair(s -> Arrays.asList(s.split(" ")).stream() 82 | .map(token -> new Tuple2(token, token.length())).collect(Collectors.toList()) 83 | .iterator()); 84 | // List> list =new ArrayList<>(); 85 | // for (String token : s.split(" ")) { 86 | // list.add(new Tuple2(token, token.length())); 87 | // 88 | // } 89 | // return list.iterator(); 90 | 91 | // sample 92 | JavaRDD sample = intRDD.sample(true, 2); 93 | 94 | // union 95 | JavaRDD intRDD2 = javaSparkContext.parallelize(Arrays.asList(1, 2, 3)); 96 | JavaRDD union = intRDD.union(intRDD2); 97 | 98 | // intersection 99 | JavaRDD intersection = intRDD.intersection(intRDD2); 100 | JavaRDD subtract = intRDD.subtract(intRDD2); 101 | 102 | // distinct 103 | JavaRDD rddwithdupElements = javaSparkContext 104 | .parallelize(Arrays.asList(1, 1, 2, 4, 5, 6, 8, 8, 9, 10, 11, 11)); 105 | JavaRDD distinct = rddwithdupElements.distinct(); 106 | 107 | pairRDD.repartition(2); 108 | 109 | // groupbykey 110 | JavaPairRDD> groupByKey = pairRDD.groupByKey(); 111 | 112 | // reducebykey 113 | JavaPairRDD reduceByKey = pairRDD.reduceByKey(new Function2() { 114 | @Override 115 | public Integer call(Integer v1, Integer v2) throws Exception { 116 | return v1 + v2; 117 | } 118 | }); 119 | 120 | // sort by key 121 | JavaPairRDD sortByKey = pairRDD.sortByKey(); 122 | 123 | JavaPairRDD aggregateByKey = pairRDD.aggregateByKey(0, 124 | new Function2() { 125 | 126 | /** 127 | * 128 | */ 129 | private static final long serialVersionUID = -9193256894160862119L; 130 | 131 | @Override 132 | public Integer call(Integer v1, Integer v2) throws Exception { 133 | return v1 + v2; 134 | } 135 | }, new Function2() { 136 | 137 | @Override 138 | public Integer call(Integer v1, Integer v2) throws Exception { 139 | 140 | return v1 + v2; 141 | } 142 | }); 143 | 144 | JavaPairRDD combineByKey = pairRDD.combineByKey(new Function() { 145 | 146 | /** 147 | * 148 | */ 149 | private static final long serialVersionUID = -1965754276530922495L; 150 | 151 | @Override 152 | public Integer call(Integer v1) throws Exception { 153 | 154 | return v1; 155 | } 156 | }, new Function2() { 157 | 158 | /** 159 | * 160 | */ 161 | private static final long serialVersionUID = -9193256894160862119L; 162 | 163 | @Override 164 | public Integer call(Integer v1, Integer v2) throws Exception { 165 | return v1 + v2; 166 | } 167 | }, new Function2() { 168 | 169 | @Override 170 | public Integer call(Integer v1, Integer v2) throws Exception { 171 | 172 | return v1 + v2; 173 | } 174 | }); 175 | 176 | JavaPairRDD foldByKey = pairRDD.foldByKey(0, new Function2() { 177 | 178 | @Override 179 | public Integer call(Integer v1, Integer v2) throws Exception { 180 | 181 | return v1 + v2; 182 | } 183 | }); 184 | 185 | 186 | // System.out.println(intRDD.collect()); 187 | // System.out.println(intRDD2.collect()); 188 | 189 | JavaRDD rddStrings = javaSparkContext.parallelize(Arrays.asList("A","B","C")); 190 | JavaRDD rddIntegers = javaSparkContext.parallelize(Arrays.asList(1,4,5)); 191 | 192 | rddStrings.cartesian(rddIntegers); 193 | 194 | 195 | //sort bykey 196 | JavaPairRDD unsortedPairRDD = javaSparkContext.parallelizePairs( 197 | Arrays.asList(new Tuple2("B", 2), new Tuple2("C", 5), 198 | new Tuple2("D", 7), new Tuple2("A", 8))); 199 | 200 | unsortedPairRDD.sortByKey(false); 201 | 202 | 203 | 204 | // join 205 | 206 | 207 | JavaPairRDD pairRDD1 = javaSparkContext.parallelizePairs( 208 | Arrays.asList(new 
Tuple2("B", "A"), new Tuple2("C", "D"), 209 | new Tuple2("D", "E"), new Tuple2("A", "B"))); 210 | JavaPairRDD pairRDD2 = javaSparkContext.parallelizePairs( 211 | Arrays.asList(new Tuple2("B", 2), new Tuple2("C", 5), 212 | new Tuple2("D", 7), new Tuple2("A", 8))); 213 | JavaPairRDD> joinedRDD = pairRDD1.join(pairRDD2); 214 | pairRDD1.leftOuterJoin(pairRDD2); 215 | pairRDD1.rightOuterJoin(pairRDD2); 216 | pairRDD1.fullOuterJoin(pairRDD2); 217 | System.out.println(joinedRDD.collect()); 218 | 219 | //cogroup 220 | JavaPairRDD pairRDD3 = javaSparkContext.parallelizePairs( 221 | Arrays.asList(new Tuple2("B", "A"), new Tuple2("B", "D"), 222 | new Tuple2("A", "E"), new Tuple2("A", "B"))); 223 | JavaPairRDD pairRDD4 = javaSparkContext.parallelizePairs( 224 | Arrays.asList(new Tuple2("B", 2), new Tuple2("B", 5), 225 | new Tuple2("A", 7), new Tuple2("A", 8))); 226 | JavaPairRDD, Iterable>> cogroup = pairRDD3.cogroup(pairRDD4); 227 | JavaPairRDD, Iterable>> groupWith = pairRDD3.groupWith(pairRDD4); 228 | 229 | System.out.println(cogroup.collect()); 230 | 231 | 232 | 233 | } 234 | } 235 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch5/CSVFileOperations.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch5; 2 | 3 | 4 | 5 | import org.apache.log4j.Level; 6 | import org.apache.log4j.LogManager; 7 | import org.apache.log4j.Logger; 8 | import org.apache.spark.api.java.JavaRDD; 9 | import org.apache.spark.sql.Dataset; 10 | import org.apache.spark.sql.Row; 11 | import org.apache.spark.sql.SparkSession; 12 | import org.apache.spark.sql.types.DataTypes; 13 | import org.apache.spark.sql.types.Metadata; 14 | import org.apache.spark.sql.types.StructField; 15 | import org.apache.spark.sql.types.StructType; 16 | //29,"City of Lost Children, The (Cité des enfants perdus, La) (1995)",Adventure|Drama|Fantasy|Mystery|Sci-Fi 17 | //40,"Cry, the Beloved Country (1995)",Drama 18 | // 19 | public class CSVFileOperations { 20 | 21 | public static void main(String[] args) { 22 | System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop"); 23 | 24 | SparkSession sparkSession = SparkSession 25 | .builder() 26 | .master("local") 27 | .config("spark.sql.warehouse.dir","file:///E:/sumitK/Hadoop/warehouse") 28 | .appName("JavaALSExample") 29 | .getOrCreate(); 30 | Logger rootLogger = LogManager.getRootLogger(); 31 | rootLogger.setLevel(Level.WARN); 32 | 33 | JavaRDD moviesRDD = sparkSession 34 | .read().textFile("C:/Users/sumit.kumar/git/learning/src/main/resources/movies.csv") 35 | .javaRDD().filter( str-> !(null==str)) 36 | .filter(str-> !(str.length()==0)) 37 | .filter(str-> !str.contains("movieId")) 38 | .map(str -> Movie.parseRating(str)); 39 | 40 | moviesRDD.foreach(m -> System.out.println(m)); 41 | 42 | Dataset csv_read = sparkSession.read().format("com.databricks.spark.csv") 43 | .option("header", "true") 44 | .option("inferSchema", "true") 45 | .load("C:/Users/sumit.kumar/git/learning/src/main/resources/movies.csv"); 46 | 47 | csv_read.printSchema(); 48 | 49 | csv_read.show(); 50 | 51 | 52 | StructType customSchema = new StructType(new StructField[] { 53 | new StructField("movieId", DataTypes.LongType, true, Metadata.empty()), 54 | new StructField("title", DataTypes.StringType, true, Metadata.empty()), 55 | new StructField("genres", DataTypes.StringType, true, Metadata.empty()) 56 | }); 57 | 58 | Dataset csv_custom_read = sparkSession.read().format("com.databricks.spark.csv") 59 | 
.option("header", "true") 60 | .schema(customSchema) 61 | .load("C:/Users/sumit.kumar/git/learning/src/main/resources/movies.csv"); 62 | 63 | csv_custom_read.printSchema(); 64 | 65 | csv_custom_read.show(); 66 | 67 | 68 | csv_custom_read.write() 69 | .format("com.databricks.spark.csv") 70 | .option("header", "true") 71 | .option("codec", "org.apache.hadoop.io.compress.GzipCodec") 72 | .save("C:/Users/sumit.kumar/git/learning/src/main/resources/newMovies.csv"); 73 | 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch5/CassandraExample.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch5; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | 6 | import org.apache.spark.SparkConf; 7 | import org.apache.spark.api.java.JavaRDD; 8 | import org.apache.spark.api.java.JavaSparkContext; 9 | import org.apache.spark.sql.Dataset; 10 | import org.apache.spark.sql.Row; 11 | import org.apache.spark.sql.SQLContext;import org.apache.spark.sql.SparkSession; 12 | 13 | import com.datastax.spark.connector.japi.CassandraJavaUtil; 14 | import com.datastax.spark.connector.japi.rdd.CassandraTableScanJavaRDD; 15 | 16 | public class CassandraExample { 17 | @SuppressWarnings("deprecation") 18 | public static void main(String[] args) { 19 | System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils"); 20 | SparkConf conf =new SparkConf().setMaster("local").setAppName("Cassandra Example"); 21 | conf.set("spark.cassandra.connection.host", "127.0.0.1"); 22 | //conf.set("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse"); 23 | 24 | JavaSparkContext jsc=new JavaSparkContext(conf); 25 | JavaRDD cassandraTable = CassandraJavaUtil.javaFunctions(jsc).cassandraTable("my_keyspace", "emp",CassandraJavaUtil.mapRowTo(Employee.class)); 26 | 27 | JavaRDD selectEmpDept = CassandraJavaUtil.javaFunctions(jsc).cassandraTable("my_keyspace", "emp",CassandraJavaUtil.mapColumnTo(String.class)).select("emp_dept","emp_name"); 28 | 29 | 30 | cassandraTable.collect().forEach(System.out::println); 31 | //selectEmpDept.collect().forEach(System.out::println); 32 | 33 | 34 | CassandraJavaUtil.javaFunctions(cassandraTable) 35 | .writerBuilder("my_keyspace", "emp1", CassandraJavaUtil.mapToRow(Employee.class)).saveToCassandra(); 36 | 37 | /*SQLContext sqlContext = new SQLContext(jsc); 38 | 39 | Map map =new HashMap<>(); 40 | map.put("table" , "emp"); 41 | map.put("keyspace", "my_keyspace"); 42 | 43 | Dataset df = sqlContext.read().format("org.apache.spark.sql.cassandra") 44 | .options(map) 45 | .load(); 46 | 47 | df.show();*/ 48 | 49 | 50 | 51 | } 52 | 53 | 54 | 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch5/DelimitedFileOperations.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch5; 2 | 3 | public class DelimitedFileOperations { 4 | 5 | public static void main(String[] args) { 6 | // TODO Auto-generated method stub 7 | 8 | } 9 | 10 | } 11 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch5/Employee.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch5; 2 | 3 | import java.io.Serializable; 4 | 5 | public class Employee implements Serializable{ 6 | private Integer empid; 7 | private String emp_name; 8 | 
private String emp_dept; 9 | /*public Employee(Integer empid, String emp_name, String emp_dept) { 10 | super(); 11 | this.empid = empid; 12 | this.emp_name = emp_name; 13 | this.emp_dept = emp_dept; 14 | }*/ 15 | 16 | public String toString() { 17 | return "Employee [empid=" + empid + ", emp_name=" + emp_name + ", emp_dept=" + emp_dept + "]"; 18 | } 19 | 20 | 21 | public Integer getEmpid() { 22 | return empid; 23 | } 24 | 25 | 26 | public void setEmpid(Integer empid) { 27 | this.empid = empid; 28 | } 29 | public String getEmp_name() { 30 | return emp_name; 31 | } 32 | public void setEmp_name(String emp_name) { 33 | this.emp_name = emp_name; 34 | } 35 | public String getEmp_dept() { 36 | return emp_dept; 37 | } 38 | public void setEmp_dept(String emp_dept) { 39 | this.emp_dept = emp_dept; 40 | } 41 | 42 | } -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch5/HdfsExample.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch5; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaRDD; 5 | import org.apache.spark.api.java.JavaSparkContext; 6 | 7 | public class HdfsExample { 8 | public static void main(String[] args) { 9 | 10 | SparkConf conf =new SparkConf().setMaster("local").setAppName("S3 Example"); 11 | JavaSparkContext jsc=new JavaSparkContext(conf); 12 | jsc.hadoopConfiguration().setLong("dfs.blocksize",2); 13 | //jsc.hadoopConfiguration().setLong("fs.local.block.size",2); 14 | 15 | JavaRDD hadoopRdd = jsc.textFile("hdfs://ch3lxesgdi02.corp.equinix.com:8020/user/gse/packt/ch01/test1",2); 16 | 17 | System.out.println(hadoopRdd.getNumPartitions()); 18 | //hadoopRdd.saveAsTextFile("hdfs://ch3lxesgdi02.corp.equinix.com:8020/user/gse/packt/ch01/testout"); 19 | 20 | 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch5/JsonFileOperations.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch5; 2 | 3 | import org.apache.log4j.Level; 4 | import org.apache.log4j.LogManager; 5 | import org.apache.log4j.Logger; 6 | import org.apache.spark.api.java.JavaRDD; 7 | import org.apache.spark.rdd.RDD; 8 | import org.apache.spark.sql.Dataset; 9 | import org.apache.spark.sql.Row; 10 | import org.apache.spark.sql.SparkSession; 11 | import org.apache.spark.sql.types.DataTypes; 12 | import org.apache.spark.sql.types.Metadata; 13 | import org.apache.spark.sql.types.StructField; 14 | import org.apache.spark.sql.types.StructType; 15 | 16 | import com.fasterxml.jackson.databind.ObjectMapper; 17 | 18 | public class JsonFileOperations { 19 | 20 | public static void main(String[] args) { 21 | System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop"); 22 | Logger rootLogger = LogManager.getRootLogger(); 23 | rootLogger.setLevel(Level.WARN); 24 | SparkSession sparkSession = SparkSession 25 | .builder() 26 | .master("local") 27 | .config("spark.sql.warehouse.dir","file:///E:/sumitK/Hadoop/warehouse") 28 | .appName("JavaALSExample") 29 | .getOrCreate(); 30 | 31 | RDD textFile = sparkSession.sparkContext().textFile("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_json.json",2); 32 | 33 | JavaRDD mapParser = textFile.toJavaRDD().map(v1 -> new ObjectMapper().readValue(v1, PersonDetails.class)); 34 | 35 | mapParser.foreach(t -> System.out.println(t)); 36 | 37 | Dataset anotherPeople = sparkSession.read().json(textFile); 38 
| 39 | anotherPeople.printSchema(); 40 | anotherPeople.show(); 41 | 42 | 43 | Dataset json_rec = sparkSession.read().json("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_json.json"); 44 | json_rec.printSchema(); 45 | 46 | json_rec.show(); 47 | 48 | StructType schema = new StructType( new StructField[] { 49 | DataTypes.createStructField("cid", DataTypes.IntegerType, true), 50 | DataTypes.createStructField("county", DataTypes.StringType, true), 51 | DataTypes.createStructField("firstName", DataTypes.StringType, true), 52 | DataTypes.createStructField("sex", DataTypes.StringType, true), 53 | DataTypes.createStructField("year", DataTypes.StringType, true), 54 | DataTypes.createStructField("dateOfBirth", DataTypes.TimestampType, true) }); 55 | 56 | /* StructType pep = new StructType(new StructField[] { 57 | new StructField("Count", DataTypes.StringType, true, Metadata.empty()), 58 | new StructField("County", DataTypes.StringType, true, Metadata.empty()), 59 | new StructField("First Name", DataTypes.StringType, true, Metadata.empty()), 60 | new StructField("Sex", DataTypes.StringType, true, Metadata.empty()), 61 | new StructField("Year", DataTypes.StringType, true, Metadata.empty()), 62 | new StructField("timestamp", DataTypes.TimestampType, true, Metadata.empty()) });*/ 63 | 64 | Dataset person_mod = sparkSession.read().schema(schema).json(textFile); 65 | 66 | person_mod.printSchema(); 67 | person_mod.show(); 68 | 69 | person_mod.write().format("json").mode("overwrite").save("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_out.json"); 70 | 71 | } 72 | 73 | } 74 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch5/LFSExample.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch5; 2 | 3 | import java.util.Arrays; 4 | 5 | import org.apache.spark.SparkConf; 6 | import org.apache.spark.api.java.JavaPairRDD; 7 | import org.apache.spark.api.java.JavaRDD; 8 | import org.apache.spark.api.java.JavaSparkContext; 9 | 10 | import scala.Tuple2; 11 | 12 | public class LFSExample { 13 | public static void main(String[] args) { 14 | System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils"); 15 | SparkConf conf =new SparkConf().setMaster("local").setAppName("Local File System Example"); 16 | 17 | 18 | JavaSparkContext jsc=new JavaSparkContext(conf); 19 | // jsc.hadoopConfiguration().setLong("dfs.block.size",20000); 20 | 21 | // jsc.hadoopConfiguration().setLong("fs.local.block.size",20000); 22 | 23 | // 24 | // JavaRDD localFile=jsc.textFile("C:\\Users\\sgulati\\Documents\\Result\\test\\a.txt"); 25 | // localFile.flatMap(x -> Arrays.asList(x.split(" ")).iterator()).mapToPair(x -> new Tuple2((String) x, 1)) 26 | // .reduceByKey((x, y) -> x + y).saveAsTextFile("C:\\Users\\sgulati\\Documents\\Result\\out_path"); 27 | 28 | 29 | // JavaRDD localFile1 = jsc.textFile("C:\\Users\\sgulati\\Documents\\Result\\test\\a.txt,C:\\Users\\sgulati\\Documents\\Result\\test\\b.txt"); 30 | // 31 | // System.out.println(localFile1.getNumPartitions()); 32 | // localFile1.flatMap(x -> Arrays.asList(x.split(" ")).iterator()).mapToPair(x -> new Tuple2((String) x, 1)) 33 | // .reduceByKey((x, y) -> x + y).saveAsTextFile("C:\\Users\\sgulati\\Documents\\Result\\out_path1"); 34 | 35 | JavaRDD localFile2 =jsc.textFile("C:\\Users\\sgulati\\Documents\\Result\\test\\*"); 36 | System.out.println(localFile2.getNumPartitions()); 37 | // localFile2.flatMap(x -> Arrays.asList(x.split(" 
")).iterator()).mapToPair(x -> new Tuple2((String) x, 1)) 38 | // .reduceByKey((x, y) -> x + y).saveAsTextFile("C:\\Users\\sgulati\\Documents\\Result\\out_path2"); 39 | //// 40 | // JavaRDD localFile3 =jsc.textFile("C:\\Users\\sgulati\\Documents\\Result\\test\\*,C:\\Users\\sgulati\\Documents\\Result\\test5\\*"); 41 | // 42 | // localFile3.flatMap(x -> Arrays.asList(x.split(" ")).iterator()).mapToPair(x -> new Tuple2((String) x, 1)) 43 | // .reduceByKey((x, y) -> x + y).saveAsTextFile("C:\\Users\\sgulati\\Documents\\Result\\out_path3"); 44 | // 45 | // JavaPairRDD localFileWhole = jsc.wholeTextFiles("C:\\Users\\sgulati\\Documents\\Result\\test\\a.txt,C:\\Users\\sgulati\\Documents\\Result\\test\\b.txt"); 46 | // System.out.println(localFileWhole.collect()); 47 | 48 | jsc.close(); 49 | 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch5/Movie.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch5; 2 | 3 | import java.io.Serializable; 4 | import java.util.stream.Stream; 5 | 6 | public class Movie implements Serializable { 7 | private Integer movieId; 8 | private String title; 9 | private String genre; 10 | 11 | public Movie() {}; 12 | public Movie(Integer movieId, String title, String genere) { 13 | super(); 14 | this.movieId = movieId; 15 | this.title = title; 16 | this.genre = genere; 17 | } 18 | public Integer getMovieId() { 19 | return movieId; 20 | } 21 | public void setMovieId(Integer movieId) { 22 | this.movieId = movieId; 23 | } 24 | public String getTitle() { 25 | return title; 26 | } 27 | public void setTitle(String title) { 28 | this.title = title; 29 | } 30 | public String getGenere() { 31 | return genre; 32 | } 33 | public void setGenere(String genere) { 34 | this.genre = genere; 35 | } 36 | public static Movie parseRating(String str) { 37 | String[] fields = str.split(","); 38 | if (fields.length != 3) { 39 | System.out.println("The elements are ::" ); 40 | Stream.of(fields).forEach(System.out::println); 41 | throw new IllegalArgumentException("Each line must contain 3 fields while the current line has ::"+fields.length); 42 | } 43 | Integer movieId = Integer.parseInt(fields[0]); 44 | String title = fields[1].trim(); 45 | String genere = fields[2].trim(); 46 | return new Movie(movieId,title,genere); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch5/Person.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch5; 2 | 3 | import java.io.Serializable; 4 | 5 | public class Person implements Serializable{ 6 | private String Name; 7 | private Integer Age; 8 | private String occupation; 9 | public String getOccupation() { 10 | return occupation; 11 | } 12 | public void setOccupation(String occupation) { 13 | this.occupation = occupation; 14 | } 15 | public String getName() { 16 | return Name; 17 | } 18 | public void setName(String name) { 19 | Name = name; 20 | } 21 | public Integer getAge() { 22 | return Age; 23 | } 24 | public void setAge(Integer age) { 25 | Age = age; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch5/PersonDetails.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch5; 2 | 3 | import java.io.Serializable; 4 | import java.sql.Timestamp; 5 
| 6 | public class PersonDetails implements Serializable { 7 | private Integer cid; 8 | private String county; 9 | private String firstName; 10 | private String sex; 11 | private String year; 12 | private Timestamp dateOfBirth; 13 | public Integer getCid() { 14 | return cid; 15 | } 16 | public void setCid(Integer cid) { 17 | this.cid = cid; 18 | } 19 | public String getCounty() { 20 | return county; 21 | } 22 | public void setCounty(String county) { 23 | this.county = county; 24 | } 25 | public String getFirstName() { 26 | return firstName; 27 | } 28 | public void setFirstName(String firstName) { 29 | this.firstName = firstName; 30 | } 31 | public String getSex() { 32 | return sex; 33 | } 34 | public void setSex(String sex) { 35 | this.sex = sex; 36 | } 37 | public String getYear() { 38 | return year; 39 | } 40 | public void setYear(String year) { 41 | this.year = year; 42 | } 43 | public Timestamp getDateOfBirth() { 44 | return dateOfBirth; 45 | } 46 | public void setDateOfBirth(Timestamp dateOfBirth) { 47 | this.dateOfBirth = dateOfBirth; 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch5/S3Example.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch5; 2 | 3 | import java.util.Arrays; 4 | 5 | import org.apache.spark.SparkConf; 6 | import org.apache.spark.api.java.JavaRDD; 7 | import org.apache.spark.api.java.JavaSparkContext; 8 | 9 | import scala.Tuple2; 10 | 11 | public class S3Example { 12 | 13 | 14 | public static void main(String[] args) { 15 | System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils"); 16 | SparkConf conf =new SparkConf().setMaster("local").setAppName("S3 Example"); 17 | JavaSparkContext jsc=new JavaSparkContext(conf); 18 | //jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", "Your awsAccessKeyId"); 19 | //jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", "your awsSecretAccessKey"); 20 | 21 | 22 | System.out.println(System.getenv("AWS_ACCESS_KEY_ID")); 23 | JavaRDD textFile = jsc.textFile("s3a://"+"trust"+"/"+"MOCK_DATA.csv"); 24 | 25 | // textFile.flatMap(x -> Arrays.asList(x.split(",")).iterator()).mapToPair(x -> new Tuple2((String) x, 1)) 26 | // .reduceByKey((x, y) -> x + y).saveAsTextFile("s3n://"+"trust"+"/"+"out.txt"); 27 | 28 | textFile.flatMap(x -> Arrays.asList(x.split(",")).iterator()).mapToPair(x -> new Tuple2((String) x, 1)) 29 | .reduceByKey((x, y) -> x + y).saveAsTextFile("s3a://"+"trust"+"/"+"out.txt"); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch5/TextFileOperations.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch5; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import org.apache.spark.SparkConf; 7 | import org.apache.spark.api.java.JavaRDD; 8 | import org.apache.spark.api.java.JavaSparkContext; 9 | 10 | public class TextFileOperations { 11 | 12 | public static void main(String[] args) { 13 | System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop"); 14 | SparkConf conf = new SparkConf().setMaster("local").setAppName("ActionExamples").set("spark.hadoop.validateOutputSpecs", "false"); 15 | JavaSparkContext sparkContext = new JavaSparkContext(conf); 16 | 17 | JavaRDD textFile = sparkContext.textFile("C:/Users/sumit.kumar/git/learning/src/main/resources/people.tsv"); 18 | 19 | 20 | JavaRDD people 
=textFile.map( 21 | line -> { 22 | String[] parts = line.split("~"); 23 | Person person = new Person(); 24 | person.setName(parts[0]); 25 | person.setAge(Integer.parseInt(parts[1].trim())); 26 | person.setOccupation(parts[2]); 27 | return person; 28 | }); 29 | 30 | 31 | people.foreach(p -> System.out.println(p)); 32 | 33 | 34 | 35 | JavaRDD peoplePart = textFile.mapPartitions(p -> { 36 | ArrayList personList=new ArrayList(); 37 | while (p.hasNext()){ 38 | String[] parts = p.next().split("~"); 39 | Person person = new Person(); 40 | person.setName(parts[0]); 41 | person.setAge(Integer.parseInt(parts[1].trim())); 42 | person.setOccupation(parts[2]); 43 | personList.add(person); 44 | } 45 | return personList.iterator(); 46 | }); 47 | 48 | peoplePart.foreach(p -> System.out.println(p)); 49 | 50 | 51 | people.saveAsTextFile("C:/Users/sumit.kumar/git/learning/src/main/resources/peopleSimple"); 52 | people.repartition(1).saveAsTextFile("C:/Users/sumit.kumar/git/learning/src/main/resources/peopleRepart"); 53 | people.coalesce(1).saveAsTextFile("C:/Users/sumit.kumar/git/learning/src/main/resources/peopleCoalesce"); 54 | } 55 | 56 | 57 | 58 | } 59 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch5/XMLFileOperations.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch5; 2 | 3 | import java.util.HashMap; 4 | 5 | import org.apache.log4j.Level; 6 | import org.apache.log4j.LogManager; 7 | import org.apache.log4j.Logger; 8 | import org.apache.spark.sql.Dataset; 9 | import org.apache.spark.sql.Row; 10 | import org.apache.spark.sql.SparkSession; 11 | 12 | public class XMLFileOperations { 13 | 14 | public static void main(String[] args) { 15 | System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop"); 16 | 17 | SparkSession sparkSession = SparkSession 18 | .builder() 19 | .master("local") 20 | .config("spark.sql.warehouse.dir","file:///E:/sumitK/Hadoop/warehouse") 21 | .appName("JavaALSExample") 22 | .getOrCreate(); 23 | Logger rootLogger = LogManager.getRootLogger(); 24 | rootLogger.setLevel(Level.WARN); 25 | 26 | 27 | HashMap params = new HashMap(); 28 | params.put("rowTag", "food"); 29 | params.put("failFast", "true"); 30 | Dataset docDF = sparkSession.read() 31 | .format("com.databricks.spark.xml") 32 | .options(params) 33 | .load("C:/Users/sumit.kumar/git/learning/src/main/resources/breakfast_menu.xml"); 34 | 35 | docDF.printSchema(); 36 | docDF.show(); 37 | 38 | docDF.write().format("com.databricks.spark.xml") 39 | .option("rootTag", "food") 40 | .option("rowTag", "food") 41 | .save("C:/Users/sumit.kumar/git/learning/src/main/resources/newMenu.xml"); 42 | 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch7/BroadcastVariable.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch7; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaSparkContext; 5 | import org.apache.spark.broadcast.Broadcast; 6 | import org.apache.spark.sql.SparkSession; 7 | 8 | public class BroadcastVariable { 9 | 10 | public static void main(String[] args) { 11 | 12 | 13 | // SparkConf conf = new SparkConf().setMaster("local").setAppName("BroadCasting"); 14 | // JavaSparkContext jsc = new JavaSparkContext(conf); 15 | // 16 | // Broadcast broadcastVar = jsc.broadcast("Hello Spark"); 17 | // 18 | SparkSession sparkSession = 
SparkSession.builder().master("local").appName("My App") 19 | .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate(); 20 | 21 | Broadcast broadcastVar= sparkSession.sparkContext().broadcast("Hello Spark", scala.reflect.ClassTag$.MODULE$.apply(String.class)); 22 | System.out.println(broadcastVar.getValue()); 23 | 24 | broadcastVar.unpersist(); 25 | // broadcastVar.unpersist(true); 26 | broadcastVar.destroy(); 27 | 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch7/CustomPartitioner.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch7; 2 | 3 | import org.apache.spark.Partitioner; 4 | 5 | public class CustomPartitioner extends Partitioner{ 6 | 7 | /** 8 | * 9 | */ 10 | private static final long serialVersionUID = -7397874438301367044L; 11 | 12 | final int maxPartitions=2; 13 | 14 | @Override 15 | public int getPartition(Object key) { 16 | 17 | return (((String) key).length()%maxPartitions); 18 | 19 | 20 | } 21 | 22 | @Override 23 | public int numPartitions() { 24 | 25 | return maxPartitions; 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch7/CustomPartitionerExample.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch7; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | 7 | import org.apache.spark.SparkConf; 8 | import org.apache.spark.api.java.JavaPairRDD; 9 | import org.apache.spark.api.java.JavaRDD; 10 | import org.apache.spark.api.java.JavaSparkContext; 11 | 12 | import scala.Tuple2; 13 | 14 | public class CustomPartitionerExample { 15 | public static void main(String[] args) { 16 | System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils"); 17 | SparkConf conf = new SparkConf().setMaster("local").setAppName("Partitioning"); 18 | JavaSparkContext jsc = new JavaSparkContext(conf); 19 | 20 | JavaPairRDD pairRdd = jsc.parallelizePairs( 21 | Arrays.asList(new Tuple2("India", "Asia"),new Tuple2("Germany", "Europe"), 22 | new Tuple2("Japan", "Asia"),new Tuple2("France", "Europe")) 23 | ,3); 24 | 25 | 26 | JavaPairRDD customPartitioned = pairRdd.partitionBy(new CustomPartitioner()); 27 | 28 | System.out.println(customPartitioned.getNumPartitions()); 29 | 30 | 31 | JavaRDD mapPartitionsWithIndex = customPartitioned.mapPartitionsWithIndex((index, tupleIterator) -> { 32 | 33 | List list=new ArrayList<>(); 34 | 35 | while(tupleIterator.hasNext()){ 36 | list.add("Partition number:"+index+",key:"+tupleIterator.next()._1()); 37 | } 38 | 39 | return list.iterator(); 40 | }, true); 41 | 42 | System.out.println(mapPartitionsWithIndex.collect()); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch7/ListAccumulator.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch7; 2 | 3 | import java.util.concurrent.CopyOnWriteArrayList; 4 | 5 | import org.apache.spark.util.AccumulatorV2; 6 | 7 | public class ListAccumulator extends AccumulatorV2> { 8 | 9 | private static final long serialVersionUID = 1L; 10 | private CopyOnWriteArrayList accList = null; 11 | 12 | public ListAccumulator() { 13 | accList = new CopyOnWriteArrayList(); 14 | 15 | } 16 | 17 | public 
ListAccumulator(CopyOnWriteArrayList value) { 18 | if (value.size() != 0) { 19 | accList = new CopyOnWriteArrayList(value); 20 | } 21 | } 22 | 23 | @Override 24 | public void add(String arg) { 25 | if (!arg.isEmpty()) 26 | accList.add(Integer.parseInt(arg)); 27 | 28 | } 29 | 30 | @Override 31 | public AccumulatorV2> copy() { 32 | return new ListAccumulator(value()); 33 | } 34 | 35 | @Override 36 | public boolean isZero() { 37 | return accList.size() == 0 ? true : false; 38 | } 39 | 40 | @Override 41 | public void merge(AccumulatorV2> other) { 42 | add(other.value()); 43 | 44 | } 45 | 46 | private void add(CopyOnWriteArrayList value) { 47 | value().addAll(value); 48 | } 49 | 50 | @Override 51 | public void reset() { 52 | accList = new CopyOnWriteArrayList(); 53 | } 54 | 55 | @Override 56 | public CopyOnWriteArrayList value() { 57 | return accList; 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch7/MapSideJoinBroadcast.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch7; 2 | 3 | import java.util.Arrays; 4 | import java.util.Map; 5 | 6 | import org.apache.spark.api.java.JavaPairRDD; 7 | import org.apache.spark.api.java.JavaRDD; 8 | import org.apache.spark.api.java.JavaSparkContext; 9 | import org.apache.spark.broadcast.Broadcast; 10 | import org.apache.spark.sql.SparkSession; 11 | 12 | import scala.Tuple2; 13 | import scala.Tuple3; 14 | 15 | public class MapSideJoinBroadcast { 16 | 17 | public static void main(String[] args) { 18 | 19 | SparkSession sparkSession = SparkSession.builder().master("local").appName("My App") 20 | .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate(); 21 | 22 | JavaSparkContext jsc = new JavaSparkContext(sparkSession.sparkContext()); 23 | 24 | JavaPairRDD userIdToCityId = jsc.parallelizePairs( 25 | Arrays.asList(new Tuple2("1", "101"), new Tuple2("2", "102"), 26 | new Tuple2("3", "107"), new Tuple2("4", "103"), 27 | new Tuple2("11", "101"), new Tuple2("12", "102"), 28 | new Tuple2("13", "107"), new Tuple2("14", "103"))); 29 | 30 | JavaPairRDD cityIdToCityName = jsc.parallelizePairs( 31 | Arrays.asList(new Tuple2("101", "India"), new Tuple2("102", "UK"), 32 | new Tuple2("103", "Germany"), new Tuple2("107", "USA"))); 33 | 34 | Broadcast> citiesBroadcasted = jsc.broadcast(cityIdToCityName.collectAsMap()); 35 | 36 | JavaRDD> joined = userIdToCityId.map( 37 | v1 -> new Tuple3(v1._1(), v1._2(), citiesBroadcasted.value().get(v1._2()))); 38 | 39 | System.out.println(joined.collect()); 40 | 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch7/PartitionIndexInformation.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch7; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | 7 | import org.apache.spark.HashPartitioner; 8 | import org.apache.spark.api.java.JavaPairRDD; 9 | import org.apache.spark.api.java.JavaRDD; 10 | import org.apache.spark.api.java.JavaSparkContext; 11 | import org.apache.spark.sql.SparkSession; 12 | 13 | import scala.Tuple2; 14 | 15 | public class PartitionIndexInformation { 16 | public static void main(String[] args) { 17 | 18 | SparkSession sparkSession = SparkSession.builder().master("local").appName("My App") 19 | .config("spark.sql.warehouse.dir", 
"file:////C:/Users/sgulati/spark-warehouse").getOrCreate(); 20 | 21 | JavaSparkContext jsc = new JavaSparkContext(sparkSession.sparkContext()); 22 | 23 | 24 | JavaPairRDD pairRdd = jsc.parallelizePairs( 25 | Arrays.asList(new Tuple2(1, "A"),new Tuple2(2, "B"), 26 | new Tuple2(3, "C"),new Tuple2(4, "D"), 27 | new Tuple2(5, "E"),new Tuple2(6, "F"), 28 | new Tuple2(7, "G"),new Tuple2(8, "H"))); 29 | 30 | JavaPairRDD partitionBy = pairRdd.partitionBy(new HashPartitioner(2)); 31 | 32 | JavaPairRDD mapValues = partitionBy.mapToPair(t -> new Tuple2(t._1()+1,t._2())); 33 | 34 | 35 | JavaRDD mapPartitionsWithIndex =partitionBy.mapPartitionsWithIndex((index, iterator) -> { 36 | List list =new ArrayList<>(); 37 | 38 | while(iterator.hasNext()){ 39 | Tuple2 next = iterator.next(); 40 | list.add(index+":"+next._1()+":"+next._2()); 41 | } 42 | 43 | return list.iterator(); 44 | },false); 45 | 46 | JavaRDD mapPartitionsWithIndex1 =mapValues.mapPartitionsWithIndex((index, iterator) -> { 47 | List list =new ArrayList<>(); 48 | 49 | while(iterator.hasNext()){ 50 | Tuple2 next = iterator.next(); 51 | list.add(index+":"+next._1()+":"+next._2()); 52 | } 53 | 54 | return list.iterator(); 55 | },false); 56 | 57 | System.out.println(mapPartitionsWithIndex.collect()); 58 | System.out.println(mapPartitionsWithIndex1.collect()); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch7/Partitioning.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch7; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | 7 | import org.apache.spark.HashPartitioner; 8 | import org.apache.spark.RangePartitioner; 9 | import org.apache.spark.SparkConf; 10 | import org.apache.spark.api.java.JavaPairRDD; 11 | import org.apache.spark.api.java.JavaRDD; 12 | import org.apache.spark.api.java.JavaSparkContext; 13 | import org.apache.spark.rdd.RDD; 14 | 15 | import scala.Tuple2; 16 | 17 | public class Partitioning { 18 | public static void main(String[] args) { 19 | System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils"); 20 | SparkConf conf = new SparkConf().setMaster("local").setAppName("Partitioning"); 21 | JavaSparkContext jsc = new JavaSparkContext(conf); 22 | 23 | JavaPairRDD pairRdd = jsc.parallelizePairs( 24 | Arrays.asList(new Tuple2(1, "A"),new Tuple2(2, "B"), 25 | new Tuple2(3, "C"),new Tuple2(4, "D"), 26 | new Tuple2(5, "E"),new Tuple2(6, "F"), 27 | new Tuple2(7, "G"),new Tuple2(8, "H")),3); 28 | 29 | 30 | 31 | 32 | RDD> rdd = JavaPairRDD.toRDD(pairRdd); 33 | 34 | System.out.println(pairRdd.getNumPartitions()); 35 | // JavaPairRDD hashPartitioned = pairRdd.partitionBy(new HashPartitioner(2)); 36 | // 37 | // System.out.println(hashPartitioned.getNumPartitions()); 38 | 39 | 40 | 41 | RangePartitioner rangePartitioner = new RangePartitioner(4, rdd, true, scala.math.Ordering.Int$.MODULE$ , scala.reflect.ClassTag$.MODULE$.apply(Integer.class)); 42 | 43 | JavaPairRDD rangePartitioned = pairRdd.partitionBy(rangePartitioner); 44 | 45 | 46 | JavaRDD mapPartitionsWithIndex = rangePartitioned.mapPartitionsWithIndex((index, tupleIterator) -> { 47 | 48 | List list=new ArrayList<>(); 49 | 50 | while(tupleIterator.hasNext()){ 51 | list.add("Partition number:"+index+",key:"+tupleIterator.next()._1()); 52 | } 53 | 54 | return list.iterator(); 55 | }, true); 56 | 57 | System.out.println(mapPartitionsWithIndex.collect()); 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 
66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch7/TestAccumulator.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch7; 2 | 3 | import org.apache.log4j.Level; 4 | import org.apache.log4j.LogManager; 5 | import org.apache.log4j.Logger; 6 | import org.apache.spark.SparkConf; 7 | import org.apache.spark.api.java.JavaRDD; 8 | import org.apache.spark.api.java.JavaSparkContext; 9 | import org.apache.spark.api.java.function.VoidFunction; 10 | import org.apache.spark.sql.SparkSession; 11 | import org.apache.spark.util.CollectionAccumulator; 12 | import org.apache.spark.util.LongAccumulator; 13 | 14 | public class TestAccumulator { 15 | 16 | public static void main(String[] args) { 17 | System.setProperty("hadoop.home.dir", "E:\\hadoop"); 18 | SparkConf conf = new SparkConf().setMaster("local").setAppName("ActionExamples").set("spark.hadoop.validateOutputSpecs", "false"); 19 | JavaSparkContext sparkContext = new JavaSparkContext(conf); 20 | // Logger rootLogger = LogManager.getRootLogger(); 21 | // rootLogger.setLevel(Level.WARN); 22 | 23 | 24 | 25 | LongAccumulator longAccumulator = sparkContext.sc().longAccumulator("ExceptionCounter"); 26 | 27 | JavaRDD textFile = sparkContext.textFile("src/main/resources/logFileWithException.log"); 28 | textFile.foreach(new VoidFunction() { 29 | @Override 30 | public void call(String line) throws Exception { 31 | if(line.contains("Exception")){ 32 | longAccumulator.add(1); 33 | System.out.println("The intermediate value in loop "+longAccumulator.value()); 34 | 35 | } 36 | } 37 | }); 38 | System.out.println("The final value of Accumulator : "+longAccumulator.value()); 39 | 40 | 41 | CollectionAccumulator collectionAccumulator = sparkContext.sc().collectionAccumulator(); 42 | textFile.foreach(new VoidFunction() { 43 | @Override 44 | public void call(String line) throws Exception { 45 | if(line.contains("Exception")){ 46 | collectionAccumulator.add(1L); 47 | System.out.println("The intermediate value in loop "+collectionAccumulator.value()); 48 | 49 | } 50 | } 51 | }); 52 | System.out.println("The final value of Accumulator : "+collectionAccumulator.value()); 53 | 54 | ListAccumulator listAccumulator=new ListAccumulator(); 55 | 56 | sparkContext.sc().register(listAccumulator, "ListAccumulator"); 57 | 58 | textFile.foreach(new VoidFunction() { 59 | @Override 60 | public void call(String line) throws Exception { 61 | if(line.contains("Exception")){ 62 | listAccumulator.add("1"); 63 | System.out.println("The intermediate value in loop "+listAccumulator.value()); 64 | 65 | } 66 | } 67 | }); 68 | System.out.println("The final value of Accumulator : "+listAccumulator.value()); 69 | 70 | } 71 | 72 | } 73 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch7/Transformations.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch7; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | import java.util.stream.Collectors; 7 | 8 | import org.apache.spark.HashPartitioner; 9 | import org.apache.spark.api.java.JavaPairRDD; 10 | import org.apache.spark.api.java.JavaRDD; 11 | import org.apache.spark.api.java.JavaSparkContext; 12 | import org.apache.spark.api.java.function.Function; 13 | import org.apache.spark.sql.SparkSession; 14 | 15 | import 
net.sf.saxon.expr.flwor.Tuple; 16 | import scala.Tuple2; 17 | 18 | public class Transformations { 19 | public static void main(String[] args) { 20 | 21 | SparkSession sparkSession = SparkSession.builder().master("local").appName("My App") 22 | .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate(); 23 | 24 | JavaSparkContext jsc = new JavaSparkContext(sparkSession.sparkContext()); 25 | 26 | JavaRDD intRDD = jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 2); 27 | 28 | 29 | JavaRDD mapPartitions = intRDD.mapPartitions(iterator -> { 30 | List intList = new ArrayList<>(); 31 | while (iterator.hasNext()) { 32 | intList.add(iterator.next() + 1); 33 | } 34 | 35 | return intList.iterator(); 36 | }); 37 | 38 | intRDD.mapPartitionsWithIndex((index, iterator) -> { 39 | List list = new ArrayList(); 40 | while (iterator.hasNext()) { 41 | list.add("Element " + iterator.next() + " belongs to partition " + index); 42 | } 43 | return list.iterator(); 44 | }, false); 45 | 46 | JavaPairRDD pairRDD = intRDD.mapPartitionsToPair(t -> { 47 | List> list = new ArrayList<>(); 48 | while (t.hasNext()) { 49 | int element = t.next(); 50 | list.add(element % 2 == 0 ? new Tuple2("even", element) 51 | : new Tuple2("odd", element)); 52 | } 53 | return list.iterator(); 54 | }); 55 | JavaPairRDD mapValues = pairRDD.mapValues(v1 -> v1 * 3); 56 | 57 | // System.out.println(mapValues.collect()); 58 | 59 | // intRDD.mapPartitionsToPair(f) 60 | /* 61 | * intRDD.mapPartitionsWithIndex(new Function2, Iterator>() { 63 | * 64 | * @Override public Iterator call(Integer v1, Iterator 65 | * v2) throws Exception { // TODO Auto-generated method stub return 66 | * null; } }, true); 67 | */ 68 | // System.out.println(mapPartitions.toDebugString()); 69 | 70 | // sort bykey 71 | JavaPairRDD monExpRDD = jsc 72 | .parallelizePairs(Arrays.asList(new Tuple2("Jan", "50,100,214,10"), 73 | new Tuple2("Feb", "60,314,223,77"))); 74 | 75 | JavaPairRDD monExpflattened = monExpRDD 76 | .flatMapValues(new Function>() { 77 | @Override 78 | public Iterable call(String v1) throws Exception { 79 | List list = new ArrayList<>(); 80 | 81 | String[] split = v1.split(","); 82 | 83 | for (String s : split) { 84 | list.add(Integer.parseInt(s)); 85 | } 86 | return list; 87 | } 88 | }); 89 | 90 | JavaPairRDD monExpflattened1 = monExpRDD.flatMapValues( 91 | v -> Arrays.asList(v.split(",")).stream().map(s -> Integer.parseInt(s)).collect(Collectors.toList())); 92 | 93 | JavaPairRDD repartitionAndSortWithinPartitions = monExpflattened 94 | .repartitionAndSortWithinPartitions(new HashPartitioner(2)); 95 | JavaPairRDD unPartitionedRDD = jsc.parallelizePairs(Arrays.asList(new Tuple2(8, "h"), 96 | new Tuple2(5, "e"), new Tuple2(4, "d"), 97 | new Tuple2(2, "a"), new Tuple2(7, "g"), 98 | new Tuple2(6, "f"),new Tuple2(1, "a"), 99 | new Tuple2(3, "c"),new Tuple2(3, "z"))); 100 | 101 | 102 | JavaPairRDD repartitionAndSortWithinPartitions2 = unPartitionedRDD.repartitionAndSortWithinPartitions(new HashPartitioner(3)); 103 | 104 | 105 | pairRDD.coalesce(2); 106 | 107 | 108 | 109 | 110 | JavaPairRDD pairRDD3 = jsc.parallelizePairs(Arrays.asList( 111 | new Tuple2("key1", "Austria"), new Tuple2("key2", "Australia"), 112 | new Tuple2("key3", "Antartica"), new Tuple2("key1", "Asia"), 113 | new Tuple2("key2", "France"),new Tuple2("key3", "Canada"), 114 | new Tuple2("key1", "Argentina"),new Tuple2("key2", "American Samoa"), 115 | new Tuple2("key3", "Germany")),1); 116 | // System.out.println(pairRDD3.getNumPartitions()); 117 | 118 | 
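// aggregateByKey below counts, per key, how many values start with "A": the
// zero value 0 initialises a per-partition counter, the first lambda (seqOp)
// folds each value into that counter within a partition (printing the value as
// a side effect), and the second lambda (combOp) adds the partial counts from
// different partitions. For the sample data above this yields roughly
// (key1 -> 3, key2 -> 2, key3 -> 1). combineByKey further down expresses the
// same computation with an explicit createCombiner function instead of a zero value.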
JavaPairRDD aggregateByKey = pairRDD3.aggregateByKey(0, (v1, v2) -> { 119 | System.out.println(v2); 120 | if(v2.startsWith("A")){ 121 | v1+=1; 122 | } 123 | 124 | return v1; 125 | }, (v1, v2) -> v1+v2); 126 | 127 | 128 | JavaPairRDD combineByKey = pairRDD3.combineByKey(v1 -> { 129 | if(v1.startsWith("A")){ 130 | return 1; 131 | } 132 | else{ 133 | return 0; 134 | } 135 | }, (v1, v2) -> { 136 | 137 | if(v2.startsWith("A")){ 138 | v1+=1; 139 | } 140 | 141 | return v1; 142 | }, (v1, v2) -> v1+v2); 143 | 144 | 145 | JavaRDD stringRDD = jsc.parallelize(Arrays.asList("Hello Spark", "Hello Java")); 146 | JavaPairRDD flatMapToPair = stringRDD.flatMapToPair(s -> Arrays.asList(s.split(" ")).stream() 147 | .map(token -> new Tuple2(token, 1)).collect(Collectors.toList()) 148 | .iterator()); 149 | flatMapToPair.foldByKey(0,(v1, v2) -> v1+v2).collect(); 150 | 151 | 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch8/Average.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch8; 2 | 3 | import java.io.Serializable; 4 | 5 | public class Average implements Serializable { 6 | private static final long serialVersionUID = 1L; 7 | private double sumVal; 8 | private long countVal; 9 | 10 | public Average() { 11 | } 12 | 13 | public Average(long sumVal, long countVal) { 14 | super(); 15 | this.sumVal = sumVal; 16 | this.countVal = countVal; 17 | } 18 | 19 | public double getSumVal() { 20 | return sumVal; 21 | } 22 | 23 | public void setSumVal(double sumVal) { 24 | this.sumVal = sumVal; 25 | } 26 | 27 | public long getCountVal() { 28 | return countVal; 29 | } 30 | 31 | public void setCountVal(long countVal) { 32 | this.countVal = countVal; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch8/AverageUDAF.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch8; 2 | 3 | import org.apache.spark.sql.Row; 4 | import org.apache.spark.sql.expressions.MutableAggregationBuffer; 5 | import org.apache.spark.sql.expressions.UserDefinedAggregateFunction; 6 | import org.apache.spark.sql.types.DataType; 7 | import org.apache.spark.sql.types.DataTypes; 8 | import org.apache.spark.sql.types.Metadata; 9 | import org.apache.spark.sql.types.StructField; 10 | import org.apache.spark.sql.types.StructType; 11 | 12 | public class AverageUDAF extends UserDefinedAggregateFunction { 13 | private static final long serialVersionUID = 1L; 14 | 15 | @Override 16 | public StructType inputSchema() { 17 | return new StructType(new StructField[] { new StructField("counter", DataTypes.DoubleType, true, Metadata.empty())}); 18 | } 19 | 20 | @Override 21 | public DataType dataType() { 22 | return DataTypes.DoubleType; 23 | } 24 | 25 | @Override 26 | public boolean deterministic() { 27 | return false; 28 | } 29 | 30 | 31 | 32 | @Override 33 | public StructType bufferSchema() { 34 | return new StructType() .add("sumVal", DataTypes.DoubleType) .add("countVal", DataTypes.DoubleType); 35 | } 36 | 37 | @Override 38 | public void initialize(MutableAggregationBuffer bufferAgg) { 39 | bufferAgg.update(0, 0.0); 40 | bufferAgg.update(1, 0.0); 41 | } 42 | 43 | @Override 44 | public void update(MutableAggregationBuffer bufferAgg, Row row) { 45 | bufferAgg.update(0, bufferAgg.getDouble(0)+row.getDouble(0)); 46 | bufferAgg.update(1, bufferAgg.getDouble(1)+2.0); 47 | } 48 | 49 | 
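// merge() is invoked when Spark combines aggregation buffers produced by
// update() on different partitions: the partial sums (slot 0) and the partial
// counts (slot 1) are simply added together. evaluate() then returns
// slot 0 / slot 1 as the final Double result of this UDAF.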
50 | @Override 51 | public void merge(MutableAggregationBuffer bufferAgg, Row row) { 52 | bufferAgg.update(0, bufferAgg.getDouble(0)+row.getDouble(0)); 53 | bufferAgg.update(1, bufferAgg.getDouble(1)+row.getDouble(1)); 54 | } 55 | 56 | 57 | @Override 58 | public Object evaluate(Row row) { 59 | return row.getDouble(0)/row.getDouble(1); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch8/CalcDaysUDF.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch8; 2 | 3 | import java.text.SimpleDateFormat; 4 | import java.util.Date; 5 | import java.util.concurrent.TimeUnit; 6 | 7 | import org.apache.spark.sql.api.java.UDF2; 8 | 9 | public class CalcDaysUDF implements UDF2 { 10 | private static final long serialVersionUID = 1L; 11 | @Override 12 | public Long call(String dateString,String format) throws Exception { 13 | SimpleDateFormat myFormat = new SimpleDateFormat(format); 14 | Date date1 = myFormat.parse(dateString); 15 | Date date2 = new Date(); 16 | long diff = date2.getTime() - date1.getTime(); 17 | return TimeUnit.DAYS.convert(diff, TimeUnit.MILLISECONDS); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch8/ContextCreation.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch8; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaSparkContext; 5 | import org.apache.spark.sql.SQLContext; 6 | 7 | public class ContextCreation { 8 | @SuppressWarnings("deprecation") 9 | public static void main(String[] args) { 10 | 11 | SparkConf conf =new SparkConf().setMaster("local").setAppName("Sql"); 12 | 13 | JavaSparkContext javaSparkContext = new JavaSparkContext(conf); 14 | 15 | SQLContext sqlContext = new SQLContext(javaSparkContext); 16 | 17 | //HiveContext hiveContext = new HiveContext(javaSparkContext); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch8/DatasetOperations.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch8; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import org.apache.log4j.Level; 7 | import org.apache.log4j.LogManager; 8 | import org.apache.log4j.Logger; 9 | import org.apache.spark.api.java.JavaRDD; 10 | import org.apache.spark.api.java.function.Function; 11 | import org.apache.spark.sql.AnalysisException; 12 | import org.apache.spark.sql.Dataset; 13 | import org.apache.spark.sql.Row; 14 | import org.apache.spark.sql.RowFactory; 15 | import org.apache.spark.sql.SaveMode; 16 | import org.apache.spark.sql.SparkSession; 17 | import org.apache.spark.sql.types.DataTypes; 18 | import org.apache.spark.sql.types.StructField; 19 | import org.apache.spark.sql.types.StructType; 20 | import static org.apache.spark.sql.functions.col; 21 | 22 | 23 | 24 | public class DatasetOperations { 25 | 26 | public static void main(String[] args) throws AnalysisException { 27 | //Window Specific property if Hadoop is not instaalled or HADOOP_HOME is not set 28 | System.setProperty("hadoop.home.dir", "E:\\hadoop"); 29 | 30 | //Build a Spark Session 31 | SparkSession sparkSession = SparkSession 32 | .builder() 33 | .master("local") 34 | .config("spark.sql.warehouse.dir","file:///E:/hadoop/warehouse") 35 | 
.appName("DatasetOperations") 36 | //.enableHiveSupport() 37 | .getOrCreate(); 38 | Logger rootLogger = LogManager.getRootLogger(); 39 | rootLogger.setLevel(Level.WARN); 40 | //Create a RDD 41 | JavaRDD deptRDD = sparkSession.sparkContext() 42 | .textFile("src/main/resources/dept.txt", 1) 43 | .toJavaRDD(); 44 | 45 | //Convert the RDD to RDD 46 | JavaRDD deptRows = deptRDD.filter(str-> !str.contains("deptno")).map(new Function() { 47 | private static final long serialVersionUID = 1L; 48 | @Override 49 | public Row call(String rowString) throws Exception { 50 | String[] cols = rowString.split(","); 51 | return RowFactory.create(cols[0].trim(), cols[1].trim(),cols[2].trim()); 52 | } 53 | }); 54 | 55 | //Create schema 56 | String[] schemaArr=deptRDD.first().split(","); 57 | List structFieldList = new ArrayList<>(); 58 | for (String fieldName : schemaArr) { 59 | StructField structField = DataTypes.createStructField(fieldName, DataTypes.StringType, true); 60 | structFieldList.add(structField); 61 | } 62 | StructType schema = DataTypes.createStructType(structFieldList); 63 | 64 | Dataset deptDf = sparkSession.createDataFrame(deptRows, schema); 65 | deptDf.printSchema(); 66 | deptDf.show(); 67 | 68 | deptDf.createOrReplaceTempView("dept"); 69 | 70 | Dataset result = sparkSession.sql("select loc,count(loc) from dept where deptno > 10 group by loc" ); 71 | result.show(); 72 | 73 | 74 | // sparkSession.newSession().sql("SELECT * FROM dept").show(); 75 | 76 | 77 | deptDf.createGlobalTempView("dept_global_view"); 78 | 79 | sparkSession.newSession().sql("SELECT deptno,dname,loc, rank() OVER (PARTITION BY loc ORDER BY deptno ) FROM global_temp.dept_global_view").show(); 80 | 81 | // sparkSession.newSession().sql("SELECT * FROM dept_global_view").show(); 82 | 83 | deptDf.write().mode(SaveMode.Overwrite).json("src/main/resources/output/dept"); 84 | deptDf.write().mode(SaveMode.Overwrite).format("csv").save("src/main/resources/output/deptText"); 85 | deptDf.write().mode("overwrite").format("csv").save("src/main/resources/output/deptText"); 86 | 87 | 88 | deptDf.write().mode(SaveMode.Overwrite).format("csv").saveAsTable("Department"); 89 | deptDf.write().mode(SaveMode.Overwrite).format("csv").option("path", "file:///E:/hadoop/bin").saveAsTable("Department"); 90 | 91 | // Read the CSV data 92 | Dataset emp_ds = sparkSession.read() 93 | .format("csv") 94 | .option("header", "true") 95 | .option("inferSchema", "true") 96 | .load("src/main/resources/employee.txt"); 97 | 98 | emp_ds.printSchema(); 99 | emp_ds.show(); 100 | 101 | emp_ds.select("empName" ,"empId").show(); 102 | 103 | emp_ds.select(col("empName").name("Employee Name") ,col("empId").cast(DataTypes.IntegerType).name("Employee Id")).show(); 104 | 105 | emp_ds.sort(col("empId").asc()).filter(col("salary").gt("2500")); 106 | 107 | emp_ds.select("job").groupBy(col("job")).count().show(); 108 | 109 | //emp_ds.as("A").join(deptDf.as("B"),col("deptno"),"left").printSchema(); 110 | 111 | emp_ds.as("A").join(deptDf.as("B"),emp_ds.col("deptno").equalTo(deptDf.col("deptno")),"left").select("A.empId","A.empName","A.job","A.manager","A.hiredate","A.salary","A.comm","A.deptno","B.dname","B.loc").show(); 112 | 113 | emp_ds.join(deptDf,emp_ds.col("deptno").equalTo(deptDf.col("deptno")),"right").show(); 114 | emp_ds.join(deptDf,emp_ds.col("deptno").equalTo(deptDf.col("deptno")),"right").logicalPlan(); 115 | 116 | emp_ds.join(deptDf,emp_ds.col("deptno").equalTo(deptDf.col("deptno")),"right").explain(); 117 | 118 | sparkSession.sql("show functions").show(false); 119 | 
sparkSession.sql("DESCRIBE FUNCTION add_months").show(false); 120 | sparkSession.sql("DESCRIBE FUNCTION EXTENDED add_months").show(false); 121 | 122 | 123 | } 124 | 125 | } 126 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch8/DfExample.java: -------------------------------------------------------------------------------- 1 | /*package com.packt.sfjd.ch8; 2 | 3 | import java.util.Arrays; 4 | 5 | import org.apache.spark.SparkConf; 6 | import org.apache.spark.api.java.JavaRDD; 7 | import org.apache.spark.api.java.JavaSparkContext; 8 | import org.apache.spark.sql.DataFrame; 9 | import org.apache.spark.sql.Dataset; 10 | import org.apache.spark.sql.Row; 11 | import org.apache.spark.sql.SQLContext; 12 | 13 | public class DfExample { 14 | public static void main(String[] args) { 15 | 16 | 17 | SparkConf conf =new SparkConf().setMaster("local").setAppName("Sql"); 18 | 19 | JavaSparkContext jsc = new JavaSparkContext(conf); 20 | JavaRDD empRDD = jsc.parallelize(Arrays.asList(new Employee("Foo", 1),new Employee("Bar", 1))); 21 | SQLContext sqlContext = new SQLContext(jsc); 22 | 23 | DataFrame df = sqlContext.createDataFrame(empRDD, Employee.class); 24 | 25 | DataFrame filter = df.filter("id >1"); 26 | 27 | filter.show(); 28 | 29 | } 30 | } 31 | */ -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch8/DsExample.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch8; 2 | 3 | import java.util.Arrays; 4 | 5 | import org.apache.spark.SparkConf; 6 | import org.apache.spark.api.java.JavaRDD; 7 | import org.apache.spark.api.java.JavaSparkContext; 8 | import org.apache.spark.api.java.function.FilterFunction; 9 | import org.apache.spark.network.protocol.Encoders; 10 | import org.apache.spark.sql.Dataset; 11 | import org.apache.spark.sql.Row; 12 | import org.apache.spark.sql.SparkSession; 13 | import static org.apache.spark.sql.functions.col; 14 | 15 | public class DsExample { 16 | public static void main(String[] args) { 17 | System.setProperty("hadoop.home.dir", "E:\\hadoop"); 18 | SparkSession sparkSession = SparkSession.builder() 19 | .master("local") 20 | .appName("Spark Session Example") 21 | .config("spark.driver.memory", "2G") 22 | .config("spark.sql.warehouse.dir", "E:\\hadoop\\warehouse") 23 | .getOrCreate(); 24 | 25 | JavaSparkContext jsc = new JavaSparkContext(sparkSession.sparkContext()); 26 | JavaRDD empRDD = jsc.parallelize(Arrays.asList(new Employee( 1,"Foo"),new Employee( 2,"Bar"))); 27 | 28 | //Dataset dataset = sparkSession.createDataFrame(empRDD, Employee.class); 29 | Dataset dsEmp = sparkSession.createDataset(empRDD.rdd(), org.apache.spark.sql.Encoders.bean(Employee.class)); 30 | Dataset filter = dsEmp.filter(emp->emp.getEmpId()>1); 31 | //filter.show(); 32 | 33 | 34 | Dataset dfEmp = sparkSession.createDataFrame(empRDD, Employee.class); 35 | dfEmp.show(); 36 | 37 | Dataset filter2 = dfEmp.filter(row->row.getInt(2)> 1); 38 | filter2.show(); 39 | 40 | //Three variants in which DataSet can be used 41 | 42 | dsEmp.printSchema(); 43 | 44 | //1. 45 | dsEmp.filter(new FilterFunction() { 46 | @Override 47 | public boolean call(Employee emp) throws Exception { 48 | return emp.getEmpId() > 1; 49 | } 50 | }).show(); 51 | 52 | dsEmp.filter(emp -> emp.getEmpId()>1).show(); 53 | 54 | //2. 55 | dsEmp.filter("empID > 1").show(); 56 | 57 | //3. 
DSL 58 | dsEmp.filter(col("empId").gt(1)).show(); 59 | 60 | 61 | 62 | 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch8/Employee.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch8; 2 | 3 | import java.io.Serializable; 4 | 5 | public class Employee implements Serializable { 6 | 7 | /** 8 | * 9 | */ 10 | private static final long serialVersionUID = 1L; 11 | 12 | 13 | 14 | private int empId; 15 | private String empName; 16 | private String job; 17 | private String manager; 18 | private String hiredate; 19 | private double salary; 20 | private String comm; 21 | private double deptNo; 22 | 23 | 24 | public Employee() { 25 | } 26 | public Employee(int empId, String empName, String job, String manager, String hiredate, int salary, String comm, 27 | int deptNo) { 28 | super(); 29 | this.empId = empId; 30 | this.empName = empName; 31 | this.job = job; 32 | this.manager = manager; 33 | this.hiredate = hiredate; 34 | this.salary = salary; 35 | this.comm = comm; 36 | this.deptNo = deptNo; 37 | } 38 | 39 | 40 | 41 | public Employee(int empId, String empName) { 42 | super(); 43 | this.empId = empId; 44 | this.empName = empName; 45 | } 46 | 47 | 48 | 49 | public int getEmpId() { 50 | return empId; 51 | } 52 | 53 | public void setEmpId(int empId) { 54 | this.empId = empId; 55 | } 56 | 57 | public String getEmpName() { 58 | return empName; 59 | } 60 | 61 | public void setEmpName(String empName) { 62 | this.empName = empName; 63 | } 64 | 65 | public String getJob() { 66 | return job; 67 | } 68 | 69 | public void setJob(String job) { 70 | this.job = job; 71 | } 72 | 73 | public String getManager() { 74 | return manager; 75 | } 76 | 77 | public void setManager(String manager) { 78 | this.manager = manager; 79 | } 80 | 81 | public String getHiredate() { 82 | return hiredate; 83 | } 84 | 85 | public void setHiredate(String hiredate) { 86 | this.hiredate = hiredate; 87 | } 88 | 89 | public double getSalary() { 90 | return salary; 91 | } 92 | 93 | public void setSalary(double salary) { 94 | this.salary = salary; 95 | } 96 | 97 | public String getComm() { 98 | return comm; 99 | } 100 | 101 | public void setComm(String comm) { 102 | this.comm = comm; 103 | } 104 | 105 | public double getDeptNo() { 106 | return deptNo; 107 | } 108 | 109 | public void setDeptNo(double deptNo) { 110 | this.deptNo = deptNo; 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch8/SparkSessionExample.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch8; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.SparkContext; 5 | import org.apache.spark.sql.SparkSession; 6 | 7 | import scala.Function1; 8 | import scala.Tuple2; 9 | import scala.collection.JavaConverters; 10 | import scala.collection.convert.Decorators.AsJava; 11 | import scala.collection.immutable.Map; 12 | 13 | public class SparkSessionExample { 14 | public static void main(String[] args) { 15 | SparkSession sparkSession = SparkSession.builder() 16 | .master("local") 17 | .appName("Spark Session Example") 18 | .enableHiveSupport() 19 | .config("spark.driver.memory", "2G") 20 | .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse") 21 | .getOrCreate(); 22 | 23 | sparkSession.conf().set("spark.driver.memory", "3G"); 24 | 25 | SparkContext 
sparkContext = sparkSession.sparkContext(); 26 | SparkConf conf = sparkSession.sparkContext().getConf(); 27 | 28 | Map all = sparkSession.conf().getAll(); 29 | System.out.println(JavaConverters.mapAsJavaMapConverter(all).asJava().get("spark.driver.memory")); 30 | 31 | 32 | 33 | 34 | 35 | 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch8/SparkSessionHeloWorld.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch8; 2 | 3 | import org.apache.spark.sql.Dataset; 4 | import org.apache.spark.sql.Row; 5 | import org.apache.spark.sql.SparkSession; 6 | 7 | public class SparkSessionHeloWorld { 8 | public static void main(String[] args) { 9 | System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils"); 10 | SparkSession sparkSession = SparkSession.builder() 11 | .master("local") 12 | .appName("CSV Read Example") 13 | .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse") 14 | .getOrCreate(); 15 | 16 | Dataset csv = sparkSession.read().format("com.databricks.spark.csv").option("header","true") 17 | .load("C:\\Users\\sgulati\\Documents\\my_docs\\book\\testdata\\emp.csv"); 18 | 19 | csv.createOrReplaceTempView("test"); 20 | Dataset sql = sparkSession.sql("select * from test"); 21 | sql.collectAsList(); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch8/TypeSafeUDAF.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch8; 2 | 3 | import java.io.Serializable; 4 | 5 | import org.apache.spark.sql.Encoder; 6 | import org.apache.spark.sql.Encoders; 7 | import org.apache.spark.sql.expressions.Aggregator; 8 | 9 | public class TypeSafeUDAF extends Aggregator implements Serializable{ 10 | private static final long serialVersionUID = 1L; 11 | 12 | public Average zero() { 13 | return new Average(0L, 0L); 14 | } 15 | 16 | public Average reduce(Average buffer, Employee employee) { 17 | double newSum = buffer.getSumVal() + employee.getSalary(); 18 | long newCount = buffer.getCountVal() + 1; 19 | buffer.setSumVal(newSum); 20 | buffer.setCountVal(newCount); 21 | return buffer; 22 | } 23 | 24 | public Average merge(Average b1, Average b2) { 25 | double mergedSum = b1.getSumVal() + b2.getSumVal(); 26 | long mergedCount = b1.getCountVal() + b2.getCountVal(); 27 | b1.setSumVal(mergedSum); 28 | b1.setCountVal(mergedCount); 29 | return b1; 30 | } 31 | 32 | public Double finish(Average reduction) { 33 | return ((double) reduction.getSumVal()) / reduction.getCountVal(); 34 | } 35 | 36 | public Encoder bufferEncoder() { 37 | return Encoders.bean(Average.class); 38 | } 39 | 40 | public Encoder outputEncoder() { 41 | return Encoders.DOUBLE(); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch8/UDFExample.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch8; 2 | 3 | import org.apache.log4j.Level; 4 | import org.apache.log4j.LogManager; 5 | import org.apache.log4j.Logger; 6 | import org.apache.spark.sql.Dataset; 7 | import org.apache.spark.sql.Encoders; 8 | import org.apache.spark.sql.Row; 9 | import org.apache.spark.sql.SparkSession; 10 | import org.apache.spark.sql.TypedColumn; 11 | import org.apache.spark.sql.api.java.UDF2; 12 | import 
org.apache.spark.sql.types.DataTypes; 13 | 14 | public class UDFExample { 15 | 16 | public static void main(String[] args) { 17 | //Windows-specific property if Hadoop is not installed or HADOOP_HOME is not set 18 | System.setProperty("hadoop.home.dir", "E:\\hadoop"); 19 | 20 | //Build a Spark Session 21 | SparkSession sparkSession = SparkSession 22 | .builder() 23 | .master("local") 24 | .config("spark.sql.warehouse.dir","file:///E:/hadoop/warehouse") 25 | .appName("EdgeBuilder") 26 | .getOrCreate(); 27 | Logger rootLogger = LogManager.getRootLogger(); 28 | rootLogger.setLevel(Level.WARN); 29 | // Read the CSV data 30 | Dataset emp_ds = sparkSession.read() 31 | .format("com.databricks.spark.csv") 32 | .option("header", "true") 33 | .option("inferSchema", "true") 34 | .load("src/main/resources/employee.txt"); 35 | 36 | UDF2 calcDays=new CalcDaysUDF(); 37 | //Registering the UDFs in Spark Session created above 38 | sparkSession.udf().register("calcDays", calcDays, DataTypes.LongType); 39 | 40 | emp_ds.createOrReplaceTempView("emp_ds"); 41 | 42 | emp_ds.printSchema(); 43 | emp_ds.show(); 44 | 45 | sparkSession.sql("select calcDays(hiredate,'dd-MM-yyyy') from emp_ds").show(); 46 | //Instantiate UDAF 47 | AverageUDAF calcAvg= new AverageUDAF(); 48 | //Register UDAF to SparkSession 49 | sparkSession.udf().register("calAvg", calcAvg); 50 | //Use UDAF 51 | sparkSession.sql("select deptno,calAvg(salary) from emp_ds group by deptno ").show(); 52 | 53 | // 54 | TypeSafeUDAF typeSafeUDAF=new TypeSafeUDAF(); 55 | 56 | Dataset emf = emp_ds.as(Encoders.bean(Employee.class)); 57 | emf.printSchema(); 58 | emf.show(); 59 | 60 | TypedColumn averageSalary = typeSafeUDAF.toColumn().name("averageTypeSafe"); 61 | Dataset result = emf.select(averageSalary); 62 | result.show(); 63 | 64 | 65 | } 66 | 67 | } 68 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch9/Calculator.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch9; 2 | 3 | import java.util.Scanner; 4 | 5 | public class Calculator { 6 | 7 | private static final String EXIT = "EXIT"; 8 | 9 | public static void main(String[] args) { 10 | 11 | Calculator calc = new Calculator(); 12 | Scanner s = new Scanner(System.in); 13 | while (true) { 14 | String res = calc.runCalc(s); 15 | if (res.equals(EXIT)) { 16 | break; 17 | } else { 18 | System.out.println(res); 19 | } 20 | } 21 | } 22 | 23 | private String runCalc(Scanner s) { 24 | System.out.println("Main Menu:"); 25 | System.out.println("1. Addition"); 26 | System.out.println("2. Subtraction"); 27 | System.out.println("3. Multiplication"); 28 | System.out.println("4. Division"); 29 | System.out.println("5.
Exit"); 30 | System.out.println("Enter your choice: "); 31 | int i = s.nextInt(); 32 | 33 | if (i == 5) { 34 | return EXIT; 35 | } 36 | 37 | System.out.println("ENTER FIRST NUMBER "); 38 | int a = s.nextInt(); 39 | 40 | System.out.println("ENTER SECOND NUMBER "); 41 | int b = s.nextInt(); 42 | 43 | int result = 0;// 'result' will store the result of operation 44 | 45 | switch (i) { 46 | case 1: 47 | result = a + b; 48 | break; 49 | case 2: 50 | result = a - b; 51 | break; 52 | case 3: 53 | result = a * b; 54 | break; 55 | case 4: 56 | result = a / b; 57 | break; 58 | 59 | default: 60 | return "Wrong Choice."; 61 | 62 | } 63 | 64 | return "Answer is " + result; 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch9/FileStreamingEx.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch9; 2 | 3 | import org.apache.hadoop.io.LongWritable; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 6 | import org.apache.log4j.Level; 7 | import org.apache.log4j.LogManager; 8 | import org.apache.log4j.Logger; 9 | import org.apache.spark.SparkConf; 10 | import org.apache.spark.api.java.JavaSparkContext; 11 | import org.apache.spark.streaming.Durations; 12 | import org.apache.spark.streaming.api.java.JavaDStream; 13 | import org.apache.spark.streaming.api.java.JavaPairDStream; 14 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 15 | 16 | public class FileStreamingEx { 17 | 18 | public static void main(String[] args) { 19 | //Window Specific property if Hadoop is not instaalled or HADOOP_HOME is not set 20 | System.setProperty("hadoop.home.dir", "E:\\hadoop"); 21 | //Logger rootLogger = LogManager.getRootLogger(); 22 | //rootLogger.setLevel(Level.WARN); 23 | SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]"); 24 | String inputDirectory="E:\\hadoop\\streamFolder\\"; 25 | 26 | JavaSparkContext sc = new JavaSparkContext(conf); 27 | JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.seconds(1)); 28 | // streamingContext.checkpoint("E:\\hadoop\\checkpoint"); 29 | Logger rootLogger = LogManager.getRootLogger(); 30 | rootLogger.setLevel(Level.WARN); 31 | 32 | JavaDStream streamfile = streamingContext.textFileStream(inputDirectory); 33 | streamfile.print(); 34 | streamfile.foreachRDD(rdd-> rdd.foreach(x -> System.out.println(x))); 35 | 36 | 37 | JavaPairDStream streamedFile = streamingContext.fileStream(inputDirectory, LongWritable.class, Text.class, TextInputFormat.class); 38 | streamedFile.print(); 39 | 40 | streamingContext.start(); 41 | 42 | 43 | try { 44 | streamingContext.awaitTermination(); 45 | } catch (InterruptedException e) { 46 | // TODO Auto-generated catch block 47 | e.printStackTrace(); 48 | } 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch9/FlightDetails.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch9; 2 | 3 | import java.io.Serializable; 4 | 5 | public class FlightDetails implements Serializable { 6 | private String flightId; 7 | private double temperature; 8 | private boolean landed; 9 | private long timestamp; 10 | 11 | public String getFlightId() { 12 | return flightId; 13 | } 14 | 15 | public void setFlightId(String flightId) { 16 | this.flightId = flightId; 17 | } 18 | 19 
| public double getTemperature() { 20 | return temperature; 21 | } 22 | 23 | public void setTemperature(double temperature) { 24 | this.temperature = temperature; 25 | } 26 | 27 | public boolean isLanded() { 28 | return landed; 29 | } 30 | 31 | public void setLanded(boolean landed) { 32 | this.landed = landed; 33 | } 34 | 35 | public long getTimestamp() { 36 | return timestamp; 37 | } 38 | 39 | public void setTimestamp(long timestamp) { 40 | this.timestamp = timestamp; 41 | } 42 | 43 | @Override 44 | public String toString() { 45 | return "FlightDetails [flightId=" + flightId + ", temperature=" + temperature + ", landed=" + landed 46 | + ", timestamp=" + timestamp + "]"; 47 | } 48 | 49 | public static void main(String[] args) { 50 | int x=1; 51 | int x1=1; 52 | int y =x++; 53 | int z = x1 + x1++; 54 | System.out.println(y+""+z); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch9/KafkaExample.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch9; 2 | 3 | import java.util.Arrays; 4 | import java.util.Collection; 5 | import java.util.Date; 6 | import java.util.HashMap; 7 | import java.util.Map; 8 | import java.util.stream.Collectors; 9 | 10 | import org.apache.kafka.clients.consumer.ConsumerRecord; 11 | import org.apache.kafka.common.serialization.StringDeserializer; 12 | import org.apache.log4j.Level; 13 | import org.apache.log4j.LogManager; 14 | import org.apache.log4j.Logger; 15 | import org.apache.spark.SparkConf; 16 | import org.apache.spark.api.java.JavaSparkContext; 17 | import org.apache.spark.streaming.Durations; 18 | import org.apache.spark.streaming.api.java.JavaDStream; 19 | import org.apache.spark.streaming.api.java.JavaInputDStream; 20 | import org.apache.spark.streaming.api.java.JavaPairDStream; 21 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 22 | import org.apache.spark.streaming.kafka010.ConsumerStrategies; 23 | import org.apache.spark.streaming.kafka010.KafkaUtils; 24 | import org.apache.spark.streaming.kafka010.LocationStrategies; 25 | 26 | import scala.Tuple2; 27 | 28 | public class KafkaExample { 29 | 30 | public static void main(String[] args) { 31 | //Window Specific property if Hadoop is not instaalled or HADOOP_HOME is not set 32 | System.setProperty("hadoop.home.dir", "E:\\hadoop"); 33 | //Logger rootLogger = LogManager.getRootLogger(); 34 | //rootLogger.setLevel(Level.WARN); 35 | SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]"); 36 | JavaSparkContext sc = new JavaSparkContext(conf); 37 | JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.minutes(2)); 38 | streamingContext.checkpoint("E:\\hadoop\\checkpoint"); 39 | Logger rootLogger = LogManager.getRootLogger(); 40 | rootLogger.setLevel(Level.WARN); 41 | Map kafkaParams = new HashMap<>(); 42 | kafkaParams.put("bootstrap.servers", "10.0.75.1:9092"); 43 | kafkaParams.put("key.deserializer", StringDeserializer.class); 44 | kafkaParams.put("value.deserializer", StringDeserializer.class); 45 | kafkaParams.put("group.id", "use_a_separate_group_id_for_each_strea"); 46 | kafkaParams.put("auto.offset.reset", "latest"); 47 | // kafkaParams.put("enable.auto.commit", false); 48 | 49 | Collection topics = Arrays.asList("mytopic", "anothertopic"); 50 | 51 | final JavaInputDStream> stream = KafkaUtils.createDirectStream(streamingContext,LocationStrategies.PreferConsistent(), 52 | 
ConsumerStrategies.Subscribe(topics, kafkaParams)); 53 | 54 | JavaPairDStream pairRDD = stream.mapToPair(record-> new Tuple2<>(record.key(), record.value())); 55 | 56 | pairRDD.foreachRDD(pRDD-> { pRDD.foreach(tuple-> System.out.println(new Date()+" :: Kafka msg key ::"+tuple._1() +" the val is ::"+tuple._2()));}); 57 | 58 | JavaDStream tweetRDD = pairRDD.map(x-> x._2()).map(new TweetText()); 59 | 60 | tweetRDD.foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" :: "+x))); 61 | 62 | JavaDStream hashtagRDD = tweetRDD.flatMap(twt-> Arrays.stream(twt.split(" ")).filter(str-> str.contains("#")).collect(Collectors.toList()).iterator() ); 63 | 64 | hashtagRDD.foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(x))); 65 | 66 | JavaPairDStream cntByVal = hashtagRDD.countByValue(); 67 | 68 | cntByVal.foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The count tag is ::"+x._1() +" and the val is ::"+x._2()))); 69 | 70 | /* hashtagRDD.window(Durations.seconds(60), Durations.seconds(30)) 71 | .countByValue() 72 | .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); 73 | 74 | hashtagRDD.countByValueAndWindow(Durations.seconds(60), Durations.seconds(30)) 75 | .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println("The window&count tag is ::"+x._1() +" and the val is ::"+x._2()))); 76 | */ 77 | hashtagRDD.window(Durations.minutes(8)).countByValue() 78 | .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); 79 | hashtagRDD.window(Durations.minutes(8),Durations.minutes(2)).countByValue() 80 | .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); 81 | hashtagRDD.window(Durations.minutes(12),Durations.minutes(8)).countByValue() 82 | .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); 83 | hashtagRDD.window(Durations.minutes(2),Durations.minutes(2)).countByValue() 84 | .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); 85 | hashtagRDD.window(Durations.minutes(12),Durations.minutes(12)).countByValue() 86 | .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); 87 | 88 | /*hashtagRDD.window(Durations.minutes(5),Durations.minutes(2)).countByValue() 89 | .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));*/ 90 | /* hashtagRDD.window(Durations.minutes(10),Durations.minutes(1)).countByValue() 91 | .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));*/ 92 | 93 | streamingContext.start(); 94 | try { 95 | streamingContext.awaitTermination(); 96 | } catch (InterruptedException e) { 97 | // TODO Auto-generated catch block 98 | e.printStackTrace(); 99 | } 100 | } 101 | 102 | 103 | } 104 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch9/StateFulProcessingExample.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch9; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | 
import org.apache.spark.api.java.JavaSparkContext; 7 | import org.apache.spark.api.java.Optional; 8 | import org.apache.spark.api.java.function.Function3; 9 | import org.apache.spark.sql.SparkSession; 10 | import org.apache.spark.streaming.Durations; 11 | import org.apache.spark.streaming.State; 12 | import org.apache.spark.streaming.StateSpec; 13 | import org.apache.spark.streaming.api.java.JavaDStream; 14 | import org.apache.spark.streaming.api.java.JavaMapWithStateDStream; 15 | import org.apache.spark.streaming.api.java.JavaPairDStream; 16 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; 17 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 18 | 19 | import com.fasterxml.jackson.databind.ObjectMapper; 20 | 21 | import scala.Tuple2; 22 | 23 | //{"flightId":"tz302","timestamp":1494423926816,"temperature":21.12,"landed":false} 24 | public class StateFulProcessingExample { 25 | public static void main(String[] args) throws InterruptedException { 26 | 27 | System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils"); 28 | 29 | SparkSession sparkSession = SparkSession.builder().master("local[*]").appName("Stateful Streaming Example") 30 | .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate(); 31 | 32 | JavaStreamingContext jssc= new JavaStreamingContext(new JavaSparkContext(sparkSession.sparkContext()), 33 | Durations.milliseconds(1000)); 34 | JavaReceiverInputDStream<String> inStream = jssc.socketTextStream("10.204.136.223", 9999); 35 | jssc.checkpoint("C:\\Users\\sgulati\\spark-checkpoint"); 36 | 37 | JavaDStream<FlightDetails> flightDetailsStream = inStream.map(x -> { 38 | ObjectMapper mapper = new ObjectMapper(); 39 | return mapper.readValue(x, FlightDetails.class); 40 | }); 41 | 42 | 43 | 44 | JavaPairDStream<String, FlightDetails> flightDetailsPairStream = flightDetailsStream 45 | .mapToPair(f -> new Tuple2<>(f.getFlightId(), f)); 46 | 47 | Function3<String, Optional<FlightDetails>, State<List<FlightDetails>>, Tuple2<String, Double>> mappingFunc = ( 48 | flightId, curFlightDetail, state) -> { 49 | List<FlightDetails> details = state.exists() ?
state.get() : new ArrayList<>(); 50 | 51 | boolean isLanded = false; 52 | 53 | if (curFlightDetail.isPresent()) { 54 | details.add(curFlightDetail.get()); 55 | if (curFlightDetail.get().isLanded()) { 56 | isLanded = true; 57 | } 58 | } 59 | Double avgSpeed = details.stream().mapToDouble(f -> f.getTemperature()).average().orElse(0.0); 60 | 61 | if (isLanded) { 62 | state.remove(); 63 | } else { 64 | state.update(details); 65 | } 66 | return new Tuple2<>(flightId, avgSpeed); 67 | }; 68 | 69 | JavaMapWithStateDStream<String, FlightDetails, List<FlightDetails>, Tuple2<String, Double>> streamWithState = flightDetailsPairStream 70 | .mapWithState(StateSpec.function(mappingFunc).timeout(Durations.minutes(5))); 71 | 72 | streamWithState.print(); 73 | jssc.start(); 74 | jssc.awaitTermination(); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch9/StateLessProcessingExample.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch9; 2 | 3 | import org.apache.spark.api.java.JavaSparkContext; 4 | import org.apache.spark.sql.SparkSession; 5 | import org.apache.spark.streaming.Durations; 6 | import org.apache.spark.streaming.api.java.JavaDStream; 7 | import org.apache.spark.streaming.api.java.JavaPairDStream; 8 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; 9 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 10 | 11 | import com.fasterxml.jackson.databind.ObjectMapper; 12 | 13 | import scala.Tuple2; 14 | 15 | public class StateLessProcessingExample { 16 | public static void main(String[] args) throws InterruptedException { 17 | 18 | System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils"); 19 | 20 | SparkSession sparkSession = SparkSession.builder().master("local[*]").appName("stateless Streaming Example") 21 | .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate(); 22 | 23 | JavaStreamingContext jssc = new JavaStreamingContext(new JavaSparkContext(sparkSession.sparkContext()), 24 | Durations.milliseconds(1000)); 25 | JavaReceiverInputDStream<String> inStream = jssc.socketTextStream("10.204.136.223", 9999); 26 | 27 | JavaDStream<FlightDetails> flightDetailsStream = inStream.map(x -> { 28 | ObjectMapper mapper = new ObjectMapper(); 29 | return mapper.readValue(x, FlightDetails.class); 30 | }); 31 | 32 | 33 | 34 | //flightDetailsStream.print(); 35 | 36 | //flightDetailsStream.foreachRDD((VoidFunction<JavaRDD<FlightDetails>>) rdd -> rdd.saveAsTextFile("hdfs://namenode:port/path")); 37 | 38 | JavaDStream<FlightDetails> window = flightDetailsStream.window(Durations.minutes(5),Durations.minutes(1)); 39 | 40 | JavaPairDStream<String, Double> transfomedWindow = window.mapToPair(f->new Tuple2<>(f.getFlightId(),f.getTemperature())).
41 | mapValues(t->new Tuple2<Double, Integer>(t,1)) 42 | .reduceByKey((t1, t2) -> new Tuple2<Double, Integer>(t1._1()+t2._1(), t1._2()+t2._2())).mapValues(t -> t._1()/t._2()); 43 | transfomedWindow.cache(); 44 | transfomedWindow.print(); 45 | 46 | jssc.start(); 47 | jssc.awaitTermination(); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch9/StructuredStreamingExample.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch9; 2 | 3 | import org.apache.spark.sql.Dataset; 4 | import org.apache.spark.sql.Encoders; 5 | import org.apache.spark.sql.Row; 6 | import org.apache.spark.sql.SparkSession; 7 | import org.apache.spark.sql.streaming.StreamingQuery; 8 | import org.apache.spark.sql.streaming.StreamingQueryException; 9 | 10 | import com.fasterxml.jackson.databind.ObjectMapper; 11 | 12 | public class StructuredStreamingExample { 13 | 14 | public static void main(String[] args) throws StreamingQueryException { 15 | System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils"); 16 | SparkSession sparkSession = SparkSession.builder().master("local[*]").appName("structured Streaming Example") 17 | .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate(); 18 | 19 | Dataset<Row> inStream = sparkSession.readStream().format("socket").option("host", "10.204.136.223") 20 | .option("port", 9999).load(); 21 | 22 | Dataset<FlightDetails> dsFlightDetails = inStream.as(Encoders.STRING()).map(x -> { 23 | ObjectMapper mapper = new ObjectMapper(); 24 | return mapper.readValue(x, FlightDetails.class); 25 | 26 | }, Encoders.bean(FlightDetails.class)); 27 | 28 | 29 | dsFlightDetails.createOrReplaceTempView("flight_details"); 30 | 31 | Dataset<Row> avdFlightDetails = sparkSession.sql("select flightId, avg(temperature) from flight_details group by flightId"); 32 | 33 | StreamingQuery query = avdFlightDetails.writeStream() 34 | .outputMode("complete") 35 | .format("console") 36 | .start(); 37 | 38 | query.awaitTermination(); 39 | 40 | 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch9/TweetText.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch9; 2 | 3 | import java.io.IOException; 4 | import java.io.Serializable; 5 | 6 | import org.apache.spark.api.java.function.Function; 7 | 8 | import com.fasterxml.jackson.databind.JsonNode; 9 | import com.fasterxml.jackson.databind.ObjectMapper; 10 | public class TweetText implements Function<String, String>, Serializable { 11 | 12 | /** 13 | * 14 | */ 15 | private static final long serialVersionUID = 1L; 16 | 17 | @Override 18 | public String call(String tweet) throws Exception { 19 | 20 | ObjectMapper mapper = new ObjectMapper(); 21 | try 22 | { 23 | JsonNode root = mapper.readValue(tweet, JsonNode.class); 24 | if (root.get("lang") != null && 25 | "en".equals(root.get("lang").textValue())) 26 | { 27 | if (root.get("id") != null && root.get("text") != null) 28 | { System.out.println("the text is ::"+root.get("text").textValue()); 29 | return root.get("text").textValue(); 30 | } 31 | return null; 32 | } 33 | return null; 34 | } 35 | catch (IOException ex) 36 | { 37 | ex.printStackTrace(); 38 | } 39 | return null; 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch9/WindowBatchInterval.java:
-------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch9; 2 | 3 | import java.util.Arrays; 4 | import java.util.Date; 5 | import java.util.List; 6 | 7 | import org.apache.log4j.Level; 8 | import org.apache.log4j.LogManager; 9 | import org.apache.log4j.Logger; 10 | import org.apache.spark.SparkConf; 11 | import org.apache.spark.api.java.JavaPairRDD; 12 | import org.apache.spark.api.java.JavaSparkContext; 13 | import org.apache.spark.api.java.StorageLevels; 14 | import org.apache.spark.streaming.Durations; 15 | import org.apache.spark.streaming.api.java.JavaDStream; 16 | import org.apache.spark.streaming.api.java.JavaPairDStream; 17 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; 18 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 19 | 20 | import scala.Tuple2; 21 | 22 | public class WindowBatchInterval { 23 | 24 | public static void main(String[] args) { 25 | //Windows-specific property if Hadoop is not installed or HADOOP_HOME is not set 26 | System.setProperty("hadoop.home.dir", "E:\\hadoop"); 27 | //Logger rootLogger = LogManager.getRootLogger(); 28 | //rootLogger.setLevel(Level.WARN); 29 | SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]"); 30 | 31 | 32 | JavaSparkContext sc = new JavaSparkContext(conf); 33 | JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.minutes(2)); 34 | streamingContext.checkpoint("E:\\hadoop\\checkpoint"); 35 | Logger rootLogger = LogManager.getRootLogger(); 36 | rootLogger.setLevel(Level.WARN); 37 | 38 | List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10)); 39 | JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples); 40 | 41 | 42 | JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER); 43 | 44 | JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() ); 45 | 46 | JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 ); 47 | 48 | wordCounts.print(); 49 | wordCounts.window(Durations.minutes(8)).countByValue() 50 | .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); 51 | wordCounts.window(Durations.minutes(8),Durations.minutes(2)).countByValue() 52 | .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); 53 | wordCounts.window(Durations.minutes(12),Durations.minutes(8)).countByValue() 54 | .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); 55 | wordCounts.window(Durations.minutes(2),Durations.minutes(2)).countByValue() 56 | .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); 57 | wordCounts.window(Durations.minutes(12),Durations.minutes(12)).countByValue() 58 | .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); 59 | 60 | //comment out the two operations below to make this run; their window/slide durations are not multiples of the 2-minute batch interval 61 | wordCounts.window(Durations.minutes(5),Durations.minutes(2)).countByValue() 62 | .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is
::"+x._1() +" and the val is ::"+x._2()))); 63 | wordCounts.window(Durations.minutes(10),Durations.minutes(1)).countByValue() 64 | .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); 65 | 66 | streamingContext.start(); 67 | try { 68 | streamingContext.awaitTermination(); 69 | } catch (InterruptedException e) { 70 | // TODO Auto-generated catch block 71 | e.printStackTrace(); 72 | } 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch9/WordCountRecoverableEx.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch9; 2 | 3 | import java.util.Arrays; 4 | import java.util.List; 5 | 6 | import org.apache.spark.SparkConf; 7 | import org.apache.spark.api.java.JavaPairRDD; 8 | import org.apache.spark.api.java.Optional; 9 | import org.apache.spark.api.java.StorageLevels; 10 | import org.apache.spark.api.java.function.Function0; 11 | import org.apache.spark.api.java.function.Function3; 12 | import org.apache.spark.streaming.Durations; 13 | import org.apache.spark.streaming.State; 14 | import org.apache.spark.streaming.StateSpec; 15 | import org.apache.spark.streaming.api.java.JavaDStream; 16 | import org.apache.spark.streaming.api.java.JavaMapWithStateDStream; 17 | import org.apache.spark.streaming.api.java.JavaPairDStream; 18 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; 19 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 20 | 21 | import scala.Tuple2; 22 | 23 | public class WordCountRecoverableEx { 24 | 25 | public static void main(String[] args) throws Exception { 26 | System.setProperty("hadoop.home.dir", "E:\\hadoop"); 27 | 28 | final String ip = "10.0.75.1"; 29 | final int port = Integer.parseInt("9000"); 30 | final String checkpointDirectory = "E:\\hadoop\\checkpoint"; 31 | // Function to create JavaStreamingContext without any output operations 32 | // (used to detect the new context) 33 | Function0 createContextFunc = new Function0() { 34 | @Override 35 | public JavaStreamingContext call() { 36 | return createContext(ip, port, checkpointDirectory); 37 | } 38 | }; 39 | 40 | JavaStreamingContext ssc = JavaStreamingContext.getOrCreate(checkpointDirectory, createContextFunc); 41 | ssc.start(); 42 | ssc.awaitTermination(); 43 | } 44 | 45 | protected static JavaStreamingContext createContext(String ip, int port, String checkpointDirectory) { 46 | SparkConf sparkConf = new SparkConf().setAppName("WordCountRecoverableEx").setMaster("local[*]"); 47 | JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1)); 48 | streamingContext.checkpoint(checkpointDirectory); 49 | // Initial state RDD input to mapWithState 50 | @SuppressWarnings("unchecked") 51 | List> tuples = Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("world", 1)); 52 | JavaPairRDD initialRDD = streamingContext.sparkContext().parallelizePairs(tuples); 53 | 54 | JavaReceiverInputDStream StreamingLines = streamingContext.socketTextStream(ip,port, StorageLevels.MEMORY_AND_DISK_SER); 55 | 56 | JavaDStream words = StreamingLines.flatMap(str -> Arrays.asList(str.split(" ")).iterator()); 57 | 58 | JavaPairDStream wordCounts = words.mapToPair(str -> new Tuple2<>(str, 1)) 59 | .reduceByKey((count1, count2) -> count1 + count2); 60 | 61 | // Update the cumulative count function 62 | Function3, State, Tuple2> mappingFunc = new Function3, 
State, Tuple2>() { 63 | @Override 64 | public Tuple2 call(String word, Optional one, State state) { 65 | int sum = one.orElse(0) + (state.exists() ? state.get() : 0); 66 | Tuple2 output = new Tuple2<>(word, sum); 67 | state.update(sum); 68 | return output; 69 | } 70 | }; 71 | 72 | // DStream made of get cumulative counts that get updated in every batch 73 | JavaMapWithStateDStream> stateDstream = wordCounts 74 | .mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD)); 75 | 76 | stateDstream.print(); 77 | return streamingContext; 78 | } 79 | } -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch9/WordCountSocketEx.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch9; 2 | 3 | import java.util.Arrays; 4 | import java.util.Iterator; 5 | import java.util.regex.Pattern; 6 | 7 | import org.apache.spark.SparkConf; 8 | import org.apache.spark.api.java.StorageLevels; 9 | import org.apache.spark.api.java.function.FlatMapFunction; 10 | import org.apache.spark.api.java.function.Function2; 11 | import org.apache.spark.api.java.function.PairFunction; 12 | import org.apache.spark.streaming.Durations; 13 | import org.apache.spark.streaming.api.java.JavaDStream; 14 | import org.apache.spark.streaming.api.java.JavaPairDStream; 15 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; 16 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 17 | 18 | import scala.Tuple2; 19 | 20 | public final class WordCountSocketEx { 21 | 22 | public static void main(String[] args) throws Exception { 23 | 24 | /* 25 | * if (args.length < 2) { 26 | * System.err.println("Usage: JavaNetworkWordCount "); 27 | * System.exit(1); } 28 | */ 29 | System.setProperty("hadoop.home.dir", "E:\\hadoop"); 30 | 31 | // Create the context with a 1 second batch size 32 | SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]"); 33 | JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1)); 34 | 35 | JavaReceiverInputDStream StreamingLines = streamingContext.socketTextStream("10.0.75.1", Integer.parseInt("9000"), 36 | StorageLevels.MEMORY_AND_DISK_SER); 37 | 38 | JavaDStream words = StreamingLines.flatMap(new FlatMapFunction() { 39 | @Override 40 | public Iterator call(String str) { 41 | return Arrays.asList(str.split(" ")).iterator(); 42 | } 43 | }); 44 | 45 | JavaPairDStream wordCounts = words.mapToPair(new PairFunction() { 46 | @Override 47 | public Tuple2 call(String str) { 48 | return new Tuple2<>(str, 1); 49 | } 50 | }).reduceByKey(new Function2() { 51 | @Override 52 | public Integer call(Integer count1, Integer count2) { 53 | return count1 + count2; 54 | } 55 | }); 56 | 57 | wordCounts.print(); 58 | streamingContext.start(); 59 | streamingContext.awaitTermination(); 60 | } 61 | } -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch9/WordCountSocketJava8Ex.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch9; 2 | 3 | import java.util.Arrays; 4 | import java.util.Iterator; 5 | import java.util.List; 6 | import java.util.regex.Pattern; 7 | 8 | import org.apache.spark.SparkConf; 9 | import org.apache.spark.api.java.JavaPairRDD; 10 | import org.apache.spark.api.java.StorageLevels; 11 | import org.apache.spark.api.java.function.FlatMapFunction; 12 | import 
org.apache.spark.api.java.function.Function; 13 | import org.apache.spark.api.java.function.Function2; 14 | import org.apache.spark.api.java.function.PairFunction; 15 | import org.apache.spark.streaming.Durations; 16 | import org.apache.spark.streaming.api.java.JavaDStream; 17 | import org.apache.spark.streaming.api.java.JavaPairDStream; 18 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; 19 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 20 | 21 | import scala.Tuple2; 22 | 23 | public final class WordCountSocketJava8Ex { 24 | 25 | public static void main(String[] args) throws Exception { 26 | 27 | System.setProperty("hadoop.home.dir", "E:\\hadoop"); 28 | 29 | SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]"); 30 | JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1)); 31 | 32 | List> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10)); 33 | JavaPairRDD initialRDD = streamingContext.sparkContext().parallelizePairs(tuples); 34 | 35 | 36 | JavaReceiverInputDStream StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER); 37 | 38 | JavaDStream words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() ); 39 | 40 | JavaPairDStream wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 ); 41 | 42 | wordCounts.print(); 43 | 44 | JavaPairDStream joinedDstream = wordCounts.transformToPair( 45 | new Function, JavaPairRDD>() { 46 | @Override public JavaPairRDD call(JavaPairRDD rdd) throws Exception { 47 | rdd.join(initialRDD).mapToPair(new PairFunction>, String, Integer>() { 48 | @Override 49 | public Tuple2 call(Tuple2> joinedTuple) 50 | throws Exception { 51 | // TODO Auto-generated method stub 52 | return new Tuple2<>( joinedTuple._1(), (joinedTuple._2()._1()+joinedTuple._2()._2()) ); 53 | } 54 | }); 55 | 56 | return rdd; 57 | } 58 | }); 59 | 60 | joinedDstream.print(); 61 | streamingContext.start(); 62 | streamingContext.awaitTermination(); 63 | } 64 | } -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch9/WordCountSocketStateful.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch9; 2 | 3 | import java.util.Arrays; 4 | import java.util.List; 5 | 6 | 7 | import org.apache.spark.SparkConf; 8 | import org.apache.spark.api.java.JavaPairRDD; 9 | import org.apache.spark.api.java.Optional; 10 | import org.apache.spark.api.java.StorageLevels; 11 | import org.apache.spark.api.java.function.Function3; 12 | import org.apache.spark.streaming.Durations; 13 | import org.apache.spark.streaming.State; 14 | import org.apache.spark.streaming.StateSpec; 15 | import org.apache.spark.streaming.api.java.JavaDStream; 16 | import org.apache.spark.streaming.api.java.JavaMapWithStateDStream; 17 | import org.apache.spark.streaming.api.java.JavaPairDStream; 18 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; 19 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 20 | 21 | import scala.Tuple2; 22 | 23 | public class WordCountSocketStateful { 24 | 25 | 26 | public static void main(String[] args) throws Exception { 27 | System.setProperty("hadoop.home.dir", "E:\\hadoop"); 28 | 29 | SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]"); 30 | 
JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1)); 31 | streamingContext.checkpoint("E:\\hadoop\\checkpoint"); 32 | // Initial state RDD input to mapWithState 33 | @SuppressWarnings("unchecked") 34 | List> tuples =Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("world", 1)); 35 | JavaPairRDD initialRDD = streamingContext.sparkContext().parallelizePairs(tuples); 36 | 37 | JavaReceiverInputDStream StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER); 38 | 39 | JavaDStream words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() ); 40 | 41 | JavaPairDStream wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 ); 42 | 43 | 44 | 45 | // Update the cumulative count function 46 | Function3, State, Tuple2> mappingFunc = 47 | new Function3, State, Tuple2>() { 48 | @Override 49 | public Tuple2 call(String word, Optional one, 50 | State state) { 51 | int sum = one.orElse(0) + (state.exists() ? state.get() : 0); 52 | Tuple2 output = new Tuple2<>(word, sum); 53 | state.update(sum); 54 | return output; 55 | } 56 | }; 57 | 58 | // DStream made of get cumulative counts that get updated in every batch 59 | JavaMapWithStateDStream> stateDstream = wordCounts.mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD)); 60 | 61 | stateDstream.print(); 62 | streamingContext.start(); 63 | streamingContext.awaitTermination(); 64 | } 65 | } -------------------------------------------------------------------------------- /src/main/java/com/packt/sfjd/ch9/WordCountTransformOpEx.java: -------------------------------------------------------------------------------- 1 | package com.packt.sfjd.ch9; 2 | 3 | import java.util.Arrays; 4 | import java.util.List; 5 | 6 | import org.apache.log4j.Level; 7 | import org.apache.log4j.LogManager; 8 | import org.apache.log4j.Logger; 9 | import org.apache.spark.SparkConf; 10 | import org.apache.spark.api.java.JavaPairRDD; 11 | import org.apache.spark.api.java.StorageLevels; 12 | import org.apache.spark.api.java.function.Function; 13 | import org.apache.spark.api.java.function.PairFunction; 14 | import org.apache.spark.streaming.Durations; 15 | import org.apache.spark.streaming.api.java.JavaDStream; 16 | import org.apache.spark.streaming.api.java.JavaPairDStream; 17 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; 18 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 19 | 20 | import scala.Tuple2; 21 | 22 | public final class WordCountTransformOpEx { 23 | 24 | public static void main(String[] args) throws Exception { 25 | 26 | System.setProperty("hadoop.home.dir", "E:\\hadoop"); 27 | 28 | SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]"); 29 | JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1)); 30 | Logger rootLogger = LogManager.getRootLogger(); 31 | rootLogger.setLevel(Level.WARN); 32 | List> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10)); 33 | JavaPairRDD initialRDD = streamingContext.sparkContext().parallelizePairs(tuples); 34 | 35 | 36 | JavaReceiverInputDStream StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER); 37 | 38 | JavaDStream words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() ); 39 | 40 | JavaPairDStream 
wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 ); 41 | 42 | wordCounts.print(); 43 | 44 | JavaPairDStream joinedDstream = wordCounts 45 | .transformToPair(new Function, JavaPairRDD>() { 46 | @Override 47 | public JavaPairRDD call(JavaPairRDD rdd) throws Exception { 48 | JavaPairRDD modRDD = rdd.join(initialRDD).mapToPair( 49 | new PairFunction>, String, Integer>() { 50 | @Override 51 | public Tuple2 call( 52 | Tuple2> joinedTuple) throws Exception { 53 | return new Tuple2<>(joinedTuple._1(),(joinedTuple._2()._1() + joinedTuple._2()._2())); 54 | } 55 | }); 56 | return modRDD; 57 | } 58 | }); 59 | 60 | joinedDstream.print(); 61 | streamingContext.start(); 62 | streamingContext.awaitTermination(); 63 | } 64 | } -------------------------------------------------------------------------------- /src/main/resources/Employee.txt: -------------------------------------------------------------------------------- 1 | empId,empName,job,manager,hiredate,salary,comm,deptno 2 | 7839, KING, PRESIDENT, null,17-11-1981,5000, null, 10 3 | 7698, BLAKE, MANAGER, 7839,1-5-1981, 2850, null, 30 4 | 7782, CLARK, MANAGER, 7839,9-6-1981, 2450, null, 10 5 | 7566, JONES, MANAGER, 7839,2-4-1981, 2975, null, 20 6 | 7788, SCOTT, ANALYST, 7566,13-7-1987,3000, null, 20 7 | 7902, FORD, ANALYST, 7566,3-12-1981,3000, null, 20 8 | 7369, SMITH, CLERK, 7902,17-12-1980,800, null, 20 9 | 7499, ALLEN, SALESMAN, 7698,20-2-1981,1600, 300, 30 10 | 7521, WARD, SALESMAN, 7698,22-2-1981,1250, 500, 30 11 | 7654, MARTIN, SALESMAN, 7698,28-9-1981,1250, 1400, 30 12 | 7844, TURNER, SALESMAN, 7698,8-9-1981,1500, 0, 30 13 | 7876, ADAMS, CLERK, 7788,13-7-1987,1100, null, 20 14 | 7900, JAMES, CLERK, 7698,3-12-1981,950, null, 30 15 | 7934, MILLER, CLERK, 7782,23-1-1982,1300, null, 10 -------------------------------------------------------------------------------- /src/main/resources/breakfast_menu.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Belgian Waffles 5 | $5.95 6 | Two of our famous Belgian Waffles with plenty of real maple syrup 7 | 650 8 | 9 | 10 | Strawberry Belgian Waffles 11 | $7.95 12 | Light Belgian waffles covered with strawberries and whipped cream 13 | 900 14 | 15 | 16 | Berry-Berry Belgian Waffles 17 | $8.95 18 | Light Belgian waffles covered with an assortment of fresh berries and whipped cream 19 | 900 20 | 21 | 22 | French Toast 23 | $4.50 24 | Thick slices made from our homemade sourdough bread 25 | 600 26 | 27 | 28 | Homestyle Breakfast 29 | $6.95 30 | Two eggs, bacon or sausage, toast, and our ever-popular hash browns 31 | 950 32 | 33 | 34 | -------------------------------------------------------------------------------- /src/main/resources/dept.txt: -------------------------------------------------------------------------------- 1 | deptno,dname,loc 2 | 10, ACCOUNTING, NEW YORK 3 | 20, RESEARCH, BOSTON 4 | 30, SALES, CHICAGO 5 | 40, OPERATIONS, BOSTON 6 | 50, ADMIN, CHICAGO -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.appender.myConsoleAppender=org.apache.log4j.ConsoleAppender 2 | log4j.appender.myConsoleAppender.layout=org.apache.log4j.PatternLayout 3 | log4j.appender.myConsoleAppender.layout.ConversionPattern=%d [%t] %-5p %c - %m%n 4 | 5 | log4j.appender.RollingAppender=org.apache.log4j.DailyRollingFileAppender 6 | 
log4j.appender.RollingAppender.File=/var/log/spark.log 7 | log4j.appender.RollingAppender.DatePattern='.'yyyy-MM-dd 8 | log4j.appender.RollingAppender.layout=org.apache.log4j.PatternLayout 9 | log4j.appender.RollingAppender.layout.ConversionPattern=[%p] %d %c %M - %m%n 10 | 11 | log4j.appender.RollingAppenderU=org.apache.log4j.DailyRollingFileAppender 12 | log4j.appender.RollingAppenderU.File=/var/log/sparkU.log 13 | log4j.appender.RollingAppenderU.DatePattern='.'yyyy-MM-dd 14 | log4j.appender.RollingAppenderU.layout=org.apache.log4j.PatternLayout 15 | log4j.appender.RollingAppenderU.layout.ConversionPattern=[%p] %d %c %M - %m%n 16 | 17 | 18 | # By default, everything goes to console and file 19 | log4j.rootLogger=WARN, RollingAppender, myConsoleAppender 20 | 21 | # My custom logging goes to another file 22 | log4j.logger.myLogger=INFO, RollingAppenderU 23 | 24 | # The noisier spark logs go to file only 25 | log4j.logger.spark.storage=INFO, RollingAppender 26 | log4j.additivity.spark.storage=false 27 | log4j.logger.spark.scheduler=INFO, RollingAppender 28 | log4j.additivity.spark.scheduler=false 29 | log4j.logger.spark.CacheTracker=INFO, RollingAppender 30 | log4j.additivity.spark.CacheTracker=false 31 | log4j.logger.spark.CacheTrackerActor=INFO, RollingAppender 32 | log4j.additivity.spark.CacheTrackerActor=false 33 | log4j.logger.spark.MapOutputTrackerActor=INFO, RollingAppender 34 | log4j.additivity.spark.MapOutputTrackerActor=false 35 | log4j.logger.spark.MapOutputTracker=INFO, RollingAppender 36 | log4j.additivty.spark.MapOutputTracker=false -------------------------------------------------------------------------------- /src/main/resources/logFileWithException.log: -------------------------------------------------------------------------------- 1 | Exception in thread "main" java.lang.UnsupportedClassVersionError: client : Unsupported major.minor version 52.0 at java.lang.ClassLoader.defineClass1(Native Method) at java.lang.ClassLoader.defineClass(Unknown Source) at java.security.SecureClassLoader.defineClass(Unknown Source) at java.net.URLClassLoader.defineClass(Unknown Source) at java.net.URLClassLoader.access$100(Unknown Source) at java.net.URLClassLoader$1.run(Unknown Source) at java.net.URLClassLoader$1.run(Unknown Source) at java.security.AccessController.doPrivileged(Native Method) at java.net.URLClassLoader.findClass(Unknown Source) at java.lang.ClassLoader.loadClass(Unknown Source) at sun.misc.Launcher$AppClassLoader.loadClass(Unknown Source) at java.lang.ClassLoader.loadClass(Unknown Source) at sun.launcher.LauncherHelper.checkAndLoadMain(Unknown Source) Press any key to continue . . . 2 | Exception in thread "main" java.lang.UnsupportedClassVersionError: client : Unsupported major.minor version 52.0 at java.lang.ClassLoader.defineClass1(Native Method) at java.lang.ClassLoader.defineClass(Unknown Source) at java.security.SecureClassLoader.defineClass(Unknown Source) at java.net.URLClassLoader.defineClass(Unknown Source) at java.net.URLClassLoader.access$100(Unknown Source) at java.net.URLClassLoader$1.run(Unknown Source) at java.net.URLClassLoader$1.run(Unknown Source) at java.security.AccessController.doPrivileged(Native Method) at java.net.URLClassLoader.findClass(Unknown Source) at java.lang.ClassLoader.loadClass(Unknown Source) at sun.misc.Launcher$AppClassLoader.loadClass(Unknown Source) at java.lang.ClassLoader.loadClass(Unknown Source) at sun.launcher.LauncherHelper.checkAndLoadMain(Unknown Source) Press any key to continue . . . 
3 | Exception in thread "main" java.lang.UnsupportedClassVersionError: client : Unsupported major.minor version 52.0 at java.lang.ClassLoader.defineClass1(Native Method) at java.lang.ClassLoader.defineClass(Unknown Source) at java.security.SecureClassLoader.defineClass(Unknown Source) at java.net.URLClassLoader.defineClass(Unknown Source) at java.net.URLClassLoader.access$100(Unknown Source) at java.net.URLClassLoader$1.run(Unknown Source) at java.net.URLClassLoader$1.run(Unknown Source) at java.security.AccessController.doPrivileged(Native Method) at java.net.URLClassLoader.findClass(Unknown Source) at java.lang.ClassLoader.loadClass(Unknown Source) at sun.misc.Launcher$AppClassLoader.loadClass(Unknown Source) at java.lang.ClassLoader.loadClass(Unknown Source) at sun.launcher.LauncherHelper.checkAndLoadMain(Unknown Source) Press any key to continue . . . 4 | Exception in thread "main" java.lang.UnsupportedClassVersionError: client : Unsupported major.minor version 52.0 at java.lang.ClassLoader.defineClass1(Native Method) at java.lang.ClassLoader.defineClass(Unknown Source) at java.security.SecureClassLoader.defineClass(Unknown Source) at java.net.URLClassLoader.defineClass(Unknown Source) at java.net.URLClassLoader.access$100(Unknown Source) at java.net.URLClassLoader$1.run(Unknown Source) at java.net.URLClassLoader$1.run(Unknown Source) at java.security.AccessController.doPrivileged(Native Method) at java.net.URLClassLoader.findClass(Unknown Source) at java.lang.ClassLoader.loadClass(Unknown Source) at sun.misc.Launcher$AppClassLoader.loadClass(Unknown Source) at java.lang.ClassLoader.loadClass(Unknown Source) at sun.launcher.LauncherHelper.checkAndLoadMain(Unknown Source) Press any key to continue . . . 5 | Exception in thread "main" java.lang.UnsupportedClassVersionError: client : Unsupported major.minor version 52.0 at java.lang.ClassLoader.defineClass1(Native Method) at java.lang.ClassLoader.defineClass(Unknown Source) at java.security.SecureClassLoader.defineClass(Unknown Source) at java.net.URLClassLoader.defineClass(Unknown Source) at java.net.URLClassLoader.access$100(Unknown Source) at java.net.URLClassLoader$1.run(Unknown Source) at java.net.URLClassLoader$1.run(Unknown Source) at java.security.AccessController.doPrivileged(Native Method) at java.net.URLClassLoader.findClass(Unknown Source) at java.lang.ClassLoader.loadClass(Unknown Source) at sun.misc.Launcher$AppClassLoader.loadClass(Unknown Source) at java.lang.ClassLoader.loadClass(Unknown Source) at sun.launcher.LauncherHelper.checkAndLoadMain(Unknown Source) Press any key to continue . . . 
-------------------------------------------------------------------------------- /src/main/resources/movies.csv: -------------------------------------------------------------------------------- 1 | movieId,title,genres 2 | 1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy 3 | 2,Jumanji (1995),Adventure|Children|Fantasy 4 | 3,Grumpier Old Men (1995),Comedy|Romance 5 | 4,Waiting to Exhale (1995),Comedy|Drama|Romance 6 | 5,Father of the Bride Part II (1995),Comedy 7 | 6,Heat (1995),Action|Crime|Thriller 8 | 7,Sabrina (1995),Comedy|Romance 9 | 8,Tom and Huck (1995),Adventure|Children 10 | 9,Sudden Death (1995),Action 11 | 10,GoldenEye (1995),Action|Adventure|Thriller 12 | 11,"American President The (1995)",Comedy|Drama|Romance 13 | 12,Dracula: Dead and Loving It (1995),Comedy|Horror 14 | 13,Balto (1995),Adventure|Animation|Children 15 | 14,Nixon (1995),Drama 16 | 15,Cutthroat Island (1995),Action|Adventure|Romance 17 | 16,Casino (1995),Crime|Drama 18 | 17,Sense and Sensibility (1995),Drama|Romance 19 | 18,Four Rooms (1995),Comedy 20 | 19,Ace Ventura: When Nature Calls (1995),Comedy 21 | 20,Money Train (1995),Action|Comedy|Crime|Drama|Thriller 22 | 21,Get Shorty (1995),Comedy|Crime|Thriller 23 | 22,Copycat (1995),Crime|Drama|Horror|Mystery|Thriller 24 | 23,Assassins (1995),Action|Crime|Thriller 25 | 24,Powder (1995),Drama|Sci-Fi 26 | 25,Leaving Las Vegas (1995),Drama|Romance 27 | 26,Othello (1995),Drama 28 | 27,Now and Then (1995),Children|Drama 29 | 28,Persuasion (1995),Drama|Romance 30 | 31 | 30,Shanghai Triad (Yao a yao yao dao waipo qiao) (1995),Crime|Drama 32 | 31,Dangerous Minds (1995),Drama 33 | 32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller 34 | 34,Babe (1995),Children|Drama 35 | 35,Carrington (1995),Drama|Romance 36 | 36,Dead Man Walking (1995),Crime|Drama 37 | 37,Across the Sea of Time (1995),Documentary|IMAX 38 | 38,It Takes Two (1995),Children|Comedy 39 | 39,Clueless (1995),Comedy|Romance 40 | 41 | 41,Richard III (1995),Drama|War 42 | 42,Dead Presidents (1995),Action|Crime|Drama 43 | 43,Restoration (1995),Drama 44 | 44,Mortal Kombat (1995),Action|Adventure|Fantasy 45 | 45,To Die For (1995),Comedy|Drama|Thriller 46 | 46,How to Make an American Quilt (1995),Drama|Romance 47 | 47,Seven (a.k.a. 
Se7en) (1995),Mystery|Thriller 48 | 48,Pocahontas (1995),Animation|Children|Drama|Musical|Romance 49 | 49,When Night Is Falling (1995),Drama|Romance 50 | -------------------------------------------------------------------------------- /src/main/resources/numSeries.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 2 5 | 4 6 | 5 7 | 6 8 | 5 9 | 7 10 | 8 11 | 9 12 | 10 -------------------------------------------------------------------------------- /src/main/resources/people.tsv: -------------------------------------------------------------------------------- 1 | virat~34~Batsman 2 | sachin~50~Batsman 3 | kumble~45~Bowler -------------------------------------------------------------------------------- /src/main/resources/pep_json.json: -------------------------------------------------------------------------------- 1 | { "year": "2013", "firstName": "DAVID", "county": "KINGS", "sex": "M", "cid": 272,"dateOfBirth":"2016-01-07T00:01:17Z" } 2 | { "year": "2013", "firstName": "JAYDEN", "county": "KINGS", "sex": "M", "cid": 268,"dateOfBirth":"2016-02-07T00:01:17Z" } 3 | { "year": "2013", "firstName": "JAYDEN", "county": "QUEENS", "sex": "M", "cid": 219,"dateOfBirth":"2016-03-07T00:01:17Z" } 4 | { "year": "2013", "firstName": "MOSHE", "county": "KINGS", "sex": "M", "cid": 219,"dateOfBirth":"2016-04-07T00:01:17Z" } 5 | { "year": "2013", "firstName": "ETHAN", "county": "QUEENS", "sex": "M", "cid": 216,"dateOfBirth":"2016-05-07T00:01:17Z" } --------------------------------------------------------------------------------