├── .gitignore ├── README.md ├── scala-examples ├── pom.xml ├── scripts │ ├── Rational.scala │ ├── Spark.scala │ ├── array.scala │ ├── class.scala │ ├── date.scala │ ├── fileread.scala │ ├── list.scala │ ├── loop.scala │ ├── printsamples.scala │ ├── set.scala │ ├── test.scala │ └── tupple.scala └── src │ └── main │ └── scala │ └── com │ └── sparkbyexamples │ ├── json │ └── TestJson.scala │ ├── list │ ├── ArrayExamples.scala │ ├── LinkedListMutableExamples.scala │ ├── ListBufferExamples.scala │ └── ListExamples.scala │ └── static │ ├── MapExamples.scala │ ├── StaticExample.scala │ └── Test.scala ├── scala-kafka ├── README.md ├── pom.xml └── src │ └── main │ ├── resources │ └── person.avsc │ └── scala │ └── com │ └── sparkbyexamples │ └── kafka │ ├── KafkaConsumerAssignApp.scala │ ├── KafkaConsumerSubscribeApp.scala │ ├── KafkaProducerApp.scala │ ├── KafkaProducerJson.scala │ ├── avro │ └── KafkaProducerAvro.scala │ ├── beans │ └── User.scala │ ├── jackson │ ├── KafkaConsumerWithUserObject.scala │ ├── KafkaProducerWithUserObject.scala │ ├── UserDeserializer.scala │ └── UserSerializer.scala │ ├── json │ └── KafkaProducerJson.scala │ ├── registry │ ├── KafkaConsumerAvroRegistry.scala │ ├── KafkaProducerAvroRegistry.scala │ ├── PersonKafkaConsumerAvroRegistry.scala │ └── PersonKafkaProducerAvroRegistry.scala │ └── streams │ └── KafkaStreams.scala ├── spark-avro-examples ├── pom.xml └── src │ └── main │ ├── resources │ └── person.avsc │ └── scala │ └── com │ └── sparkbyexamples │ └── spark │ └── dataframe │ └── avro │ ├── AvroExample.scala │ └── AvroUsingNestedSchema.scala ├── spark-hive ├── pom.xml └── src │ └── main │ └── scala │ └── com │ └── sparkbyexamples │ └── HBaseWrite.scala ├── spark-kafka ├── pom.xml └── src │ └── main │ ├── resources │ └── person.avsc │ └── scala │ └── com │ └── sparkbyexamples │ └── spark │ └── kafka │ └── json │ ├── KafkaConsumerJson.scala │ └── KafkaProduceJson.scala ├── spark-sql-examples ├── pom.xml └── src │ └── main │ ├── resources │ ├── books.xml │ ├── books_withnested_array.xml │ ├── free-zipcode-database.csv │ ├── kv.csv │ ├── persons.xml │ ├── persons_complex.xml │ ├── records.xml │ ├── schema.json │ ├── stream.csv │ ├── test.txt │ ├── txt │ │ ├── alice.txt │ │ ├── datasets.csv │ │ └── holmes.txt │ ├── zipcodes-noheader.csv │ ├── zipcodes.csv │ ├── zipcodes.json │ └── zipcodes_streaming │ │ ├── zipcode1.json │ │ ├── zipcode10.json │ │ ├── zipcode11.json │ │ ├── zipcode12.json │ │ ├── zipcode2.json │ │ ├── zipcode3.json │ │ ├── zipcode4.json │ │ ├── zipcode5.json │ │ ├── zipcode6.json │ │ ├── zipcode7.json │ │ ├── zipcode8.json │ │ └── zipcode9.json │ └── scala │ └── com │ └── sparkbyexamples │ └── spark │ ├── SparkSessionTest.scala │ ├── beans │ ├── Books.scala │ ├── BooksDiscounted.scala │ ├── BooksStruct.scala │ ├── BooksWithArray.scala │ ├── User.scala │ └── Zipcode.scala │ ├── dataframe │ ├── ArrayToColumn.scala │ ├── AvroExample.scala │ ├── CaseClassSparkSchema.scala │ ├── CastColumnType.scala │ ├── CreateDataFrame.scala │ ├── CreateEmptyDataFrameExample.scala │ ├── CreateEmptyDatasetExample.scala │ ├── DataFrameWithComplexDSL.scala │ ├── DataFrameWithSimpleDSL.scala │ ├── FromCSVFile.scala │ ├── FromCSVFile2.scala │ ├── FromJsonFile.scala │ ├── ParquetExample.scala │ ├── RenameColDataFrame.scala │ ├── SQLExample.scala │ ├── SaveDataFrame.scala │ ├── StructTypeUsage.scala │ ├── UDFDataFrame.scala │ ├── WithColumn.scala │ ├── functions │ │ ├── AnotherExample.scala │ │ ├── MathFunctions.scala │ │ ├── PivotExample.scala │ │ ├── StringFunctions.scala │ │ 
├── WhenOtherwise.scala │ │ ├── WindowGroupbyFirst.scala │ │ ├── collection │ │ │ ├── ArrayOfArrayType.scala │ │ │ ├── ArrayOfMapType.scala │ │ │ ├── ArrayOfStructType.scala │ │ │ ├── ArrayTypeExample.scala │ │ │ ├── ExplodeArrayAndMap.scala │ │ │ ├── MapFunctions.scala │ │ │ └── MapTypeExample.scala │ │ ├── datetime │ │ │ ├── AddTime.scala │ │ │ ├── CurrentDateAndTime.scala │ │ │ ├── DateAddMonths.scala │ │ │ ├── DateDiff.scala │ │ │ ├── DateExamples.scala │ │ │ ├── DateFormat.scala │ │ │ ├── DateLastDay.scala │ │ │ ├── DateToString.scala │ │ │ ├── DateTrunc.scala │ │ │ ├── DayAndWeekOfYear.scala │ │ │ ├── DayWeekAndWeekMonth.scala │ │ │ ├── GetTimeFromTimestamp.scala │ │ │ ├── StringToDate.scala │ │ │ ├── StringToTimestamp.scala │ │ │ ├── TimestampDiff.scala │ │ │ ├── TimestampToDate.scala │ │ │ ├── TimestampToString.scala │ │ │ └── unixtimeExample.scala │ │ ├── from_json.scala │ │ └── litTypeLit.scala │ └── xml │ │ ├── PersonsComplexXML.scala │ │ ├── PersonsXML.scala │ │ ├── ReadBooksXMLWithNestedArray.scala │ │ ├── ReadBooksXMLWithNestedArrayStruct.scala │ │ └── xstream │ │ └── WriteXML.scala │ ├── dataset │ ├── DataSetFromData.scala │ ├── DataSetWithCustomClass.scala │ └── xml │ │ ├── ReadBooksXML.scala │ │ ├── ReadBooksXMLWithNestedArray.scala │ │ ├── ReadBooksXMLWithNestedArrayDSL.scala │ │ ├── SparkXMLUsingXstream.scala │ │ └── sparkXml.scala │ ├── rdd │ ├── CreateEmptyRDD.scala │ ├── CreateRDD.scala │ ├── OperationsOnPair.scala │ ├── OperationsOnRDD.scala │ ├── PartitionBy.scala │ ├── RDDAccumulator.scala │ ├── RDDBroadcast.scala │ ├── RDDCache.scala │ ├── RDDFromCSVFile.scala │ ├── RDDFromDataUsingParallelize.scala │ ├── RDDFromParallelizeRange.scala │ ├── RDDFromWholeTextFile.scala │ ├── RDDHadoopInputFormat.scala │ ├── RDDPersist.scala │ ├── RDDReadFilesFromDirectory.scala │ ├── RDDSaveAsObjectFile.scala │ ├── RDDSequenceFiles.scala │ ├── ReadMultipleCSVFiles.scala │ ├── ReadMultipleFiles.scala │ ├── SortBy.scala │ ├── WordCount.scala │ ├── ZipCode.scala │ └── xml │ │ └── XmlRecordReader.scala │ └── stackoverflow │ ├── AddingLiterral.scala │ ├── Test.scala │ └── Test2.scala ├── spark-streaming ├── pom.xml └── src │ └── main │ ├── resources │ ├── folder_streaming │ │ ├── zipcode1.json │ │ ├── zipcode10.json │ │ ├── zipcode11.json │ │ ├── zipcode12.json │ │ ├── zipcode2.json │ │ ├── zipcode3.json │ │ ├── zipcode4.json │ │ ├── zipcode5.json │ │ ├── zipcode6.json │ │ ├── zipcode7.json │ │ ├── zipcode8.json │ │ └── zipcode9.json │ ├── person.avsc │ └── person.json │ └── scala │ └── com │ └── sparkbyexamples │ └── spark │ └── streaming │ ├── SparkStreamingFromDirectory.scala │ ├── SparkStreamingFromSocket.scala │ ├── batch │ ├── SparkBatchConsumeFromKafka.scala │ ├── SparkBatchConsumeFromKafkaAvro.scala │ ├── SparkBatchProduceToKafka.scala │ └── SparkBatchProduceToKafkaAvro.scala │ ├── inprogress │ ├── SparkStreamingForeachRDD.scala │ ├── SparkStreamingForeachWriter.scala │ ├── SparkStreamingFromDirectoryTmp.scala │ ├── SparkStreamingKafkaProducerZipcodeObject.scala │ ├── SparkStreamingToHDFS.scala │ ├── SparkStreamingToJDBC.scala │ ├── SparkStreamingToParquetFile.scala │ ├── SparkStreamingToS3.scala │ └── SparkStreamingTwitter.scala │ └── kafka │ ├── KafkaProduceAvro.scala │ ├── SparkStreamingConsumeKafka.scala │ ├── avro │ ├── KafkaConsumerAvro.scala │ └── KafkaProduceAvro.scala │ └── json │ └── SparkStreamingConsumerKafkaJson.scala └── spark2.3-avro-examples ├── pom.xml └── src └── main ├── resources └── person.avsc └── scala └── com └── sparkbyexamples └── spark └── dataframe └── 
avro └── AvroUsingDataBricks.scala /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .metadata 3 | .cache-main 4 | .classpath 5 | .project 6 | .settings 7 | *.class 8 | *.orig 9 | *.log 10 | target/ 11 | .DS_Store 12 | *.iml 13 | scalastyle-output.xml 14 | 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Explanation of all examples present on this project are available at https://sparkbyexamples.com/ -------------------------------------------------------------------------------- /scala-examples/scripts/Rational.scala: -------------------------------------------------------------------------------- 1 | 2 | class Rational(n:Int,d:Int){ 3 | require(d!=0) 4 | val number:Int = n 5 | val denom:Int = d 6 | override def toString() = n+"/"+d 7 | def this(n:Int)=this(n,1) 8 | 9 | def add(that:Rational): Rational ={ 10 | new Rational(number*that.denom + that.number*denom,denom*that.denom) 11 | } 12 | 13 | def +(that:Rational): Rational ={ 14 | new Rational(number*that.denom + that.number*denom,denom*that.denom) 15 | } 16 | } 17 | 18 | val a = new Rational(1,2) 19 | val b = new Rational(2,3) 20 | val c = a+b 21 | println(c) -------------------------------------------------------------------------------- /scala-examples/scripts/Spark.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.sql.SparkSession 2 | 3 | object SparkTest{ 4 | 5 | def main(args:Array[String]): Unit ={ 6 | 7 | val sparkSession = SparkSession.builder().appName("Naveen").master("local[1]").getOrCreate(); 8 | 9 | println("APP Name :"+sparkSession.sparkContext.appName); 10 | println("Deploy Mode :"+sparkSession.sparkContext.deployMode); 11 | println("Master :"+sparkSession.sparkContext.master); 12 | 13 | } 14 | } -------------------------------------------------------------------------------- /scala-examples/scripts/array.scala: -------------------------------------------------------------------------------- 1 | // Arrays are mutable 2 | println("Start") 3 | 4 | var j=0 5 | while(jprintln(s)) 11 | args.foreach(println) 12 | for(i<-args){ 13 | println(i);booleanArrayOps() 14 | } 15 | 16 | for(i<-1 to 2){ 17 | println(args(i)) 18 | } 19 | val arr1=Array("one","two","three") 20 | println("Count: "+arr1.count(a=>a.length==5)) 21 | 22 | val arr:Array[String]=new Array[String](3); 23 | arr(0)="1one" 24 | arr(1)="2two" 25 | arr.update(2,"3three"); 26 | println(arr.dropRight(2).length) 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /scala-examples/scripts/class.scala: -------------------------------------------------------------------------------- 1 | import scala.collection.mutable 2 | 3 | class CheckSumAccumulator{ 4 | 5 | private var sum = 0 6 | 7 | def add(b:Byte): Unit ={ 8 | sum+=b 9 | } 10 | 11 | def calc(): Int ={ 12 | ~(sum & 0XFF)+1 13 | } 14 | } 15 | 16 | object CheckSumAccumulator123{ 17 | private val cache = mutable.Map.empty[String,Int] 18 | 19 | def calculate(str:String): Int ={ 20 | 21 | if(cache.contains(str)) 22 | cache(str) 23 | else{ 24 | val csum = new CheckSumAccumulator(); 25 | for(c<-str) 26 | csum.add(c.toByte) 27 | cache += (str -> csum.calc()) 28 | csum.calc() 29 | } 30 | } 31 | } 32 | 33 | println(CheckSumAccumulator123.calculate("Naveen")) 34 | println(CheckSumAccumulator123.calculate("Praveen")) 35 | 
println(CheckSumAccumulator123.calculate("Naveen")) 36 | -------------------------------------------------------------------------------- /scala-examples/scripts/date.scala: -------------------------------------------------------------------------------- 1 | 2 | import java.time.{LocalDate, Period} 3 | import java.time.format.DateTimeFormatter 4 | import java.time.temporal.ChronoUnit 5 | import java.util.Calendar 6 | import java.time.ZoneId 7 | val dateFormat = DateTimeFormatter.ofPattern("ddMMyyyy") 8 | dateFormat.parse("").getTime 9 | val da:LocalDate = LocalDate.parse("13041981",dateFormat) 10 | 11 | val forDate = dateFormat.parse("13041981") 12 | 13 | println("Date: "+forDate) 14 | println("Date 2: "+da.toString) 15 | 16 | val today = LocalDate.now 17 | 18 | println("Now : "+today.format(dateFormat)) 19 | 20 | println("Years:"+ChronoUnit.YEARS.between(da,today)) 21 | 22 | println("Years:"+Period.between(da,today).getYears) 23 | 24 | println("EpochDay:"+da.atStartOfDay(ZoneId.systemDefault).toInstant.getEpochSecond) 25 | println("EpochDay:"+Calendar.getInstance().getTimeInMillis) 26 | 27 | 28 | -------------------------------------------------------------------------------- /scala-examples/scripts/fileread.scala: -------------------------------------------------------------------------------- 1 | 2 | import scala.io.Source 3 | 4 | for(s<-Source.fromFile("pom.xml").getLines()) 5 | println(s) -------------------------------------------------------------------------------- /scala-examples/scripts/list.scala: -------------------------------------------------------------------------------- 1 | //Lists are immutable like java String 2 | var l1:List[String] = List[String]("1","2") 3 | //l1(0)="one" - this statement fails 4 | l1=List("one","two","three") 5 | val l2=l1 6 | l2.foreach(l=>println(l)) 7 | l2.foreach(println) 8 | for(i<-l2) 9 | println(i) 10 | for(i<-0 to 2) 11 | println(l2(i)) 12 | val l3 = "zero" :: l2 13 | 14 | l3.foreach(l=>println(l)) 15 | 16 | println("Concatenated List") 17 | val l4 = "4" :: "5" :: "6" :: Nil 18 | 19 | val l5 = l3 ::: l4 20 | 21 | for(l<-l5) 22 | println(l) 23 | val l6 = List("will","wall","until") 24 | println("All containsl letter l :" + l6.forall(l=>l.endsWith("l"))) 25 | 26 | val l7 = l6.sortWith((a,b)=>a.charAt(0) > b.charAt(0)) 27 | println("After Sorting") 28 | l7.foreach(println) 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /scala-examples/scripts/loop.scala: -------------------------------------------------------------------------------- 1 | import java.io.File 2 | 3 | import scala.io.Source 4 | 5 | var files = (new File(".")).listFiles() 6 | for(file <- files if file.getName.endsWith(".scala") if file.getName.contains("loop")) 7 | println(file) 8 | 9 | for{ 10 | file <- files 11 | if file.getName.endsWith(".scala") 12 | if file.getName.contains("test") 13 | }println(file) 14 | 15 | def lines(fileName:String):Array[String]={ 16 | Source.fromFile(fileName).getLines().toArray 17 | } 18 | 19 | def readFiles(): Unit ={ 20 | 21 | val files = new File(".").listFiles() 22 | for{ 23 | file<-files 24 | if file.getName.endsWith(".scala") 25 | line<-lines(file.getName) 26 | trimLine = line.trim 27 | if trimLine.contains("println") 28 | }println(s"$file lines "+ trimLine) 29 | 30 | } 31 | readFiles() 32 | 33 | -------------------------------------------------------------------------------- /scala-examples/scripts/printsamples.scala: 
-------------------------------------------------------------------------------- 1 | 2 | var v = 10 3 | val b = 20 4 | 5 | println(s"My first value $v and second value $b") 6 | 7 | println(s"Add ${v+b}") 8 | println("a\\b") 9 | println(raw"a\\b") 10 | printf("My first %d and second %d",v,b) -------------------------------------------------------------------------------- /scala-examples/scripts/set.scala: -------------------------------------------------------------------------------- 1 | import com.experian.edf.oxygen.utils.JsonUtils 2 | import com.google.gson.JsonObject 3 | 4 | //Set by default immutable 5 | var s1 = Set("one","two","three","four") 6 | //s1(0)="1" - this statement fails 7 | s1.foreach(println) 8 | 9 | for(s<-s1) 10 | println(s) 11 | 12 | s1 += "zero" 13 | 14 | for(s<-s1) 15 | println(s) 16 | 17 | //Map 18 | 19 | var m1 = Map("a"->"A","b"->"B") 20 | 21 | val str = "{"+Map("ss" -> "yy", "aa" -> "bb").map{case (k, v) => "\""+k + "\":" + v}.mkString(",") + "}" 22 | 23 | 24 | println("----->"+str) 25 | val str1 = m1.foreach(m=> ("--->"+m._1 + ","+m._2)) 26 | println("----->"+str1) 27 | val s5 = m1.keySet 28 | 29 | for(s<-s5) 30 | println(m1(s)) 31 | 32 | println(m1.contains("a")) 33 | 34 | m1.foreach(a=>println(a._1 +","+a._2)) 35 | 36 | for(m<-m1) 37 | printf(m._1,m._2) 38 | 39 | for((a,b)<-m1){ 40 | printf("Key %s , value %s -", a,b) 41 | } -------------------------------------------------------------------------------- /scala-examples/scripts/test.scala: -------------------------------------------------------------------------------- 1 | import java.text.SimpleDateFormat 2 | import java.util.Date 3 | 4 | val asOfDateFormat = new SimpleDateFormat("yyyyMMdd") 5 | val str = "file:/C:/Users/a03078a/Documents/DataFabric/Workspace/bureau-australia-data/DefaultListingExtract_Experian_20181008041614.txt" 6 | println(str.lastIndexOf("Experian_")) 7 | val dateStr = str.substring(str.lastIndexOf("Experian_")+9,str.lastIndexOf("Experian_")+9+8) 8 | println("Extracted date:"+dateStr) 9 | val parseDate = asOfDateFormat.parse(dateStr) 10 | println("Parsed Date:"+parseDate) 11 | val longDate = asOfDateFormat.parse(dateStr).getTime 12 | 13 | 14 | println("Date in long:"+longDate) 15 | val reformatDate:Date = new Date() 16 | reformatDate.setTime(longDate); 17 | println("Reformat Date:"+reformatDate) 18 | val daStr = asOfDateFormat.format(reformatDate) 19 | 20 | 21 | println("final Date :"+daStr) 22 | 23 | -------------------------------------------------------------------------------- /scala-examples/scripts/tupple.scala: -------------------------------------------------------------------------------- 1 | val t = ("A",1,'c') 2 | 3 | println(t._1) -------------------------------------------------------------------------------- /scala-examples/src/main/scala/com/sparkbyexamples/json/TestJson.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.json 2 | 3 | object TestJson { 4 | 5 | def main(args: Array[String]): Unit = { 6 | 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /scala-examples/src/main/scala/com/sparkbyexamples/list/ArrayExamples.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.list 2 | 3 | object ArrayExamples extends App { 4 | 5 | var list1 = Array("A","B") 6 | list1(1)="AA" //Errro 7 | val list2 = list1.map(_.toLowerCase()) 8 | 9 | println(list1.mkString(",")) 10 | list1.foreach(f=>println(f)) 11 
| list1.foreach(println(_)) 12 | 13 | println("Reading a value form Index :"+list1(1)) 14 | println("Adding element 'C' to Arrays") 15 | 16 | //list1 += "C" // re-assigning 17 | println(list1.mkString(",")) 18 | 19 | println("Adding two Arrays") 20 | var list3 = list1 ++ list2 21 | 22 | println(list3.mkString(",")) 23 | 24 | println("Adding literal to each element in Lists") 25 | val list4 = list1.map(f=>f+"->") 26 | println(list4.mkString(",")) 27 | 28 | println("Convert all list elements to Int") 29 | val list5 = List("1","2","3","4","5") 30 | println(list5.map(f=>f.toInt).mkString(",")) 31 | } 32 | -------------------------------------------------------------------------------- /scala-examples/src/main/scala/com/sparkbyexamples/list/LinkedListMutableExamples.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.list 2 | 3 | import scala.collection.mutable 4 | 5 | object LinkedListMutableExamples { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | var list1 = mutable.LinkedList("A","B") 10 | list1(0)="C" 11 | list1(1)="D" 12 | // list1 = "C" :: list1 // Error 13 | //list1 = list1 + "X" //Error 14 | //list1 ++= "" //Error 15 | // list1 += "" //Error 16 | list1.append(list1) 17 | 18 | println("Modify an element on list") 19 | list1.foreach(println(_)) 20 | 21 | println("Create list2 from list1") 22 | var list2 = list1.map(_.toLowerCase()) 23 | list2.foreach(println(_)) 24 | 25 | println("Add list to existing list") 26 | list2 ++= list1 27 | //val list7 = list1 ::: list2 //Error 28 | list2.foreach(println(_)) 29 | 30 | println("Merge list1 & list2 and create list3") 31 | //val list3 = list1 ::: list2 // Error 32 | var list3 = list1 ++ list2 33 | 34 | list3.foreach(println(_)) 35 | 36 | //Converts list to map 37 | println("Convert list to map") 38 | val list4 = list1.map(f=>(f,f.toLowerCase())) 39 | list4.foreach(f=>println(f._1+f._2)) 40 | 41 | //A ListBuffer is like an array buffer except that it uses a linked list internally instead of an array 42 | // val list3 = mutable.ListBuffer("A","B") 43 | 44 | 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /scala-examples/src/main/scala/com/sparkbyexamples/list/ListBufferExamples.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.list 2 | 3 | import scala.collection.mutable 4 | 5 | object ListBufferExamples { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | var list1 = mutable.ListBuffer("A","B") 10 | list1(0)="C" 11 | list1(1)="D" 12 | // list1 = "C" :: list1 // Error 13 | //list1 = list1 + "" //Error 14 | //list1 ++= "E" //Error 15 | // list1 = list1 + "C"// Error 16 | list1 += "B" 17 | 18 | list1.append("A") 19 | 20 | println("Modify an element on list") 21 | list1.foreach(println(_)) 22 | 23 | println("Create list2 from list1") 24 | var list2 = list1.map(_.toLowerCase()) 25 | list2.appendAll(list1) 26 | list2.foreach(println(_)) 27 | 28 | println("Add list to existing list") 29 | list2 ++= list1 30 | //val list7 = list1 ::: list2 //Error 31 | list2.foreach(println(_)) 32 | 33 | println("Merge list1 & list2 and create list3") 34 | //val list3 = list1 ::: list2 // Error 35 | val list3 = list1 ++ list2 36 | list3.foreach(println(_)) 37 | 38 | //Converts list to map 39 | println("Convert list to map") 40 | val list4 = list1.map(f=>(f,f.toLowerCase())) 41 | list4.foreach(f=>println(f._1+f._2)) 42 | 43 | //A ListBuffer is like an array buffer except 
that it uses a linked list internally instead of an array 44 | // val list3 = mutable.ListBuffer("A","B") 45 | 46 | 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /scala-examples/src/main/scala/com/sparkbyexamples/list/ListExamples.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.list 2 | 3 | object ListExamples { 4 | 5 | def main(args:Array[String]) { 6 | 7 | var list1 = List("A","B") 8 | //list1(1)="AA" //Errro 9 | val list2 = list1.map(_.toLowerCase()) 10 | 11 | println(list1.mkString(",")) 12 | list1.foreach(f=>println(f)) 13 | list1.foreach(println(_)) 14 | 15 | println("Reading a value form Index :"+list1(1)) 16 | println("Adding element 'C' to List") 17 | //list1 += "D" 18 | list1 = "C" :: list1 // re-assigning 19 | println(list1.mkString(",")) 20 | 21 | println("Adding two Lists") 22 | var list3 = list1 ::: list2 23 | 24 | list3 :::= list2 25 | println(list3.mkString(",")) 26 | 27 | println("Adding literal to each element in Lists") 28 | val list4 = list1.map(f=>f+"->") 29 | println(list4.mkString(",")) 30 | 31 | println("Convert all list elements to Int") 32 | val list5 = List("1","2","3","4","5") 33 | println(list5.map(f=>f.toInt).mkString(",")) 34 | 35 | 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /scala-examples/src/main/scala/com/sparkbyexamples/static/MapExamples.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.static 2 | 3 | object MapExamples { 4 | 5 | 6 | } 7 | -------------------------------------------------------------------------------- /scala-examples/src/main/scala/com/sparkbyexamples/static/StaticExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples 2 | 3 | class StaticExample(className1:String) { 4 | 5 | private val className = className1 6 | 7 | def printObjectName(): Unit ={ 8 | println(StaticExample.objectName); 9 | } 10 | 11 | def getValue():String = { 12 | return StaticExample.objectName 13 | } 14 | def getClassName():String = { 15 | return className 16 | } 17 | 18 | 19 | } 20 | 21 | object StaticExample { 22 | 23 | private val objectName = " class name Static Example" 24 | val objectNamePublic = "public variable" 25 | var singletone:Option[String] = None 26 | 27 | def create(): Unit ={ 28 | if(singletone == None){ 29 | singletone = Some("value") 30 | } 31 | } 32 | 33 | def getClassName(staticExample:StaticExample) : String = { 34 | return staticExample.className 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /scala-examples/src/main/scala/com/sparkbyexamples/static/Test.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples 2 | 3 | object Test { 4 | 5 | def main(args:Array[String]): Unit ={ 6 | 7 | val O:Option[Any] = None 8 | 9 | StaticExample.singletone match { 10 | case None => println ("nome") 11 | case Some(_) => println(StaticExample.singletone.get) 12 | } 13 | 14 | StaticExample.create() 15 | println(StaticExample.objectNamePublic) 16 | 17 | StaticExample.singletone match { 18 | case None => println ("nome") 19 | case _ => println(StaticExample.singletone.get) 20 | } 21 | 22 | val staticExample: StaticExample = new StaticExample("My Name is Naveen") 23 | val staticExample2: StaticExample = new StaticExample("My Name 
is Prabha") 24 | println("staticExample.getClassName() : "+staticExample.getClassName()) 25 | println("staticExample2.getClassName() : "+staticExample2.getClassName()) 26 | 27 | println("staticExample.getValue() : "+staticExample.getValue()) 28 | 29 | println( StaticExample.getClassName(staticExample)) 30 | 31 | 32 | staticExample.printObjectName() 33 | 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /scala-kafka/README.md: -------------------------------------------------------------------------------- 1 | Apache Kafka producer and consumer example in scala 2 | -------------------------------------------------------------------------------- /scala-kafka/src/main/resources/person.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "Person", 4 | "namespace": "com.sparkbyexamples", 5 | "fields": [ 6 | {"name": "id","type": "int"}, 7 | {"name": "firstname","type": "string"}, 8 | {"name": "middlename","type": "string"}, 9 | {"name": "lastname","type": "string"}, 10 | {"name": "dob_year","type": "int"}, 11 | {"name": "dob_month","type": "int"}, 12 | {"name": "gender","type": "string"}, 13 | {"name": "salary","type": "int"} 14 | ] 15 | } -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/KafkaConsumerAssignApp.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.kafka 2 | 3 | import java.util 4 | import java.util.Properties 5 | import java.util.regex.Pattern 6 | 7 | import org.apache.kafka.clients.consumer.KafkaConsumer 8 | import org.apache.kafka.common.TopicPartition 9 | 10 | import scala.collection.JavaConverters._ 11 | 12 | object KafkaConsumerAssignApp { 13 | 14 | def main(args: Array[String]): Unit = { 15 | 16 | val prop:Properties = new Properties() 17 | prop.put("bootstrap.servers","192.168.1.100:9092") 18 | prop.put("key.deserializer","org.apache.kafka.common.serialization.StringDeserializer") 19 | prop.put("value.deserializer","org.apache.kafka.common.serialization.StringDeserializer") 20 | 21 | val consumer = new KafkaConsumer(prop) 22 | 23 | val tp1 = new TopicPartition("topic_text",1) 24 | val tp2 = new TopicPartition("my_topic_partition",1) 25 | 26 | val topics = List[TopicPartition](tp1,tp2) 27 | consumer.assign(topics.asJava) 28 | while(true){ 29 | 30 | val records = consumer.poll(10) 31 | for(record<-records.asScala){ 32 | 33 | println("Key: "+record.key() +", Value: "+record.value() +", Offset: "+record.offset() ) 34 | 35 | } 36 | } 37 | 38 | consumer.close()// close in finally block 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/KafkaConsumerSubscribeApp.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.kafka 2 | import java.util.{Collections, Properties} 3 | import java.util.regex.Pattern 4 | 5 | import org.apache.kafka.clients.consumer.KafkaConsumer 6 | 7 | import scala.collection.JavaConverters._ 8 | object KafkaConsumerSubscribeApp extends App { 9 | 10 | val props:Properties = new Properties() 11 | props.put("group.id", "test") 12 | props.put("bootstrap.servers","192.168.1.128:9092") 13 | props.put("key.deserializer","org.apache.kafka.common.serialization.StringDeserializer") 14 | 
props.put("value.deserializer","org.apache.kafka.common.serialization.StringDeserializer") 15 | props.put("enable.auto.commit", "true") 16 | props.put("auto.commit.interval.ms", "1000") 17 | val consumer = new KafkaConsumer(props) 18 | val topics = List("topic_text") 19 | try { 20 | consumer.subscribe(topics.asJava) 21 | //consumer.subscribe(Collections.singletonList("topic_partition")) 22 | //consumer.subscribe(Pattern.compile("topic_partition")) 23 | while (true) { 24 | val records = consumer.poll(10) 25 | for (record <- records.asScala) { 26 | println("Topic: " + record.topic() + ", Key: " + record.key() + ", Value: " + record.value() + 27 | ", Offset: " + record.offset() + ", Partition: " + record.partition()) 28 | } 29 | } 30 | }catch{ 31 | case e:Exception => e.printStackTrace() 32 | }finally { 33 | consumer.close() 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/KafkaProducerApp.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.kafka 2 | import java.util.Properties 3 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 4 | object KafkaProducerApp extends App { 5 | 6 | val props:Properties = new Properties() 7 | props.put("bootstrap.servers","192.168.1.128:9092") 8 | props.put("key.serializer","org.apache.kafka.common.serialization.StringSerializer") 9 | props.put("value.serializer","org.apache.kafka.common.serialization.StringSerializer") 10 | props.put("acks","all") 11 | 12 | val producer = new KafkaProducer[String, String](props) 13 | val topic = "text_topic" 14 | 15 | try { 16 | for (i <- 0 to 15) { 17 | val record = new ProducerRecord[String, String](topic, i.toString, "My Site is sparkbyexamples.com " + i) 18 | val metadata = producer.send(record) 19 | printf(s"sent record(key=%s value=%s) " + 20 | "meta(partition=%d, offset=%d)\n", 21 | record.key(), record.value(), metadata.get().partition(), 22 | metadata.get().offset()) 23 | } 24 | }catch{ 25 | case e:Exception => e.printStackTrace() 26 | }finally { 27 | producer.close() 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/KafkaProducerJson.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.kafka 2 | 3 | object KafkaProducerJson_ { 4 | 5 | def main(args: Array[String]): Unit = { 6 | 7 | 8 | 9 | 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/avro/KafkaProducerAvro.scala: -------------------------------------------------------------------------------- 1 | //import java.util.{Properties, UUID} 2 | // 3 | //import org.apache.avro.Schema 4 | //import org.apache.avro.Schema.Parser 5 | //import domain.User 6 | //import org.apache.avro.generic.GenericData 7 | //import org.apache.avro.generic.GenericRecord 8 | //import org.apache.avro.specific.SpecificDatumWriter 9 | //import java.io.ByteArrayOutputStream 10 | // 11 | //import org.apache.avro.io._ 12 | //import kafka.producer.{KeyedMessage, Producer, ProducerConfig} 13 | // 14 | //import scala.io.Source 15 | // 16 | //class KafkaProducer() { 17 | // 18 | // private val props = new Properties() 19 | // 20 | // props.put("metadata.broker.list", "localhost:9092") 21 | // props.put("message.send.max.retries", "5") 22 | 
// props.put("request.required.acks", "-1") 23 | // props.put("serializer.class", "kafka.serializer.DefaultEncoder") 24 | // props.put("client.id", UUID.randomUUID().toString()) 25 | // 26 | // private val producer = new Producer[String, Array[Byte]](new ProducerConfig(props)) 27 | // 28 | // //Read avro schema file 29 | // val schema: Schema = new Parser().parse(Source.fromURL(getClass.getResource("/schema.avsc")).mkString) 30 | // 31 | // // Create avro generic record object 32 | // val genericUser: GenericRecord = new GenericData.Record(schema) 33 | // 34 | // //Put data in that generic record 35 | // genericUser.put("id", "1") 36 | // genericUser.put("name", "sushil") 37 | // genericUser.put("email", null) 38 | // 39 | // // Serialize generic record into byte array 40 | // val writer = new SpecificDatumWriter[GenericRecord](schema) 41 | // val out = new ByteArrayOutputStream() 42 | // val encoder: BinaryEncoder = EncoderFactory.get().binaryEncoder(out, null) 43 | // writer.write(genericUser, encoder) 44 | // encoder.flush() 45 | // out.close() 46 | // 47 | // val serializedBytes: Array[Byte] = out.toByteArray() 48 | // 49 | // val queueMessage = new KeyedMessage[String, Array[Byte]](topic, serializedBytes) 50 | // producer.send(queueMessage) -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/beans/User.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.kafka.beans 2 | 3 | class User() { 4 | private var name:String = "" 5 | private var age:Int = 0 6 | 7 | def this(name: String, age: Int) { 8 | this() 9 | this.name =name 10 | this.age = age 11 | } 12 | 13 | def getName: String = this.name 14 | 15 | def getAge: Int = this.age 16 | 17 | override def toString: String = "User(" + name + ", " + age + ")" 18 | } 19 | -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/jackson/KafkaConsumerWithUserObject.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.kafka.jackson 2 | import java.util.Properties 3 | import com.sparkbyexamples.kafka.beans.User 4 | import org.apache.kafka.clients.consumer.KafkaConsumer 5 | import scala.collection.JavaConverters._ 6 | object KafkaConsumerWithUserObject extends App { 7 | val prop:Properties = new Properties() 8 | prop.put("group.id", "test") 9 | prop.put("bootstrap.servers","192.168.1.100:9092") 10 | prop.put("key.deserializer","org.apache.kafka.common.serialization.StringDeserializer") 11 | prop.put("value.deserializer","com.sparkbyexamples.kafka.jackson.UserDeserializer") 12 | prop.put("enable.auto.commit", "true") 13 | prop.put("auto.commit.interval.ms", "1000") 14 | val consumer = new KafkaConsumer[String,User](prop) 15 | val topics = List("user_user") 16 | try{ 17 | consumer.subscribe(topics.asJava) 18 | while(true){ 19 | val records = consumer.poll(10) 20 | for(record<-records.asScala){ 21 | println("Topic: "+record.topic()+", Key: "+record.key() +", Value: "+record.value().getName + 22 | ", Offset: "+record.offset() +", Partition: "+record.partition()) 23 | } 24 | } 25 | }catch{ 26 | case e:Exception => e.printStackTrace() 27 | }finally { 28 | consumer.close() 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/jackson/KafkaProducerWithUserObject.scala: 
-------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.kafka.jackson 2 | import java.util.Properties 3 | 4 | import com.sparkbyexamples.kafka.beans.User 5 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 6 | import org.apache.kafka.common.serialization.StringSerializer 7 | object KafkaProducerWithUserObject { 8 | val props:Properties = new Properties() 9 | props.put("bootstrap.servers","192.168.1.100:9092") 10 | props.put("key.serializer","org.apache.kafka.common.serialization.StringSerializer") 11 | props.put("value.serializer","com.sparkbyexamples.kafka.jackson.UserSerializer") 12 | props.put("acks","all") 13 | val producer = new KafkaProducer[String, User](props) 14 | try{ 15 | for(i <- 0 to 100) { 16 | val user = new User("My Name - "+i,i) 17 | val record = new ProducerRecord[String, User]("user_topic",i.toString,user) 18 | val metadata = producer.send(record) 19 | printf(s"sent record(key=%s value=%s) " + 20 | "meta(partition=%d, offset=%d)\n", 21 | record.key(), record.value(), metadata.get().partition(), 22 | metadata.get().offset()); 23 | } 24 | 25 | }catch{ 26 | case e:Exception => e.printStackTrace() 27 | }finally { 28 | producer.close() 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/jackson/UserDeserializer.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.kafka.jackson 2 | 3 | import java.util 4 | 5 | import com.sparkbyexamples.kafka.beans.User 6 | import org.apache.kafka.common.serialization.Deserializer 7 | import org.codehaus.jackson.map.ObjectMapper 8 | 9 | class UserDeserializer extends Deserializer[User] { 10 | override def configure(map: util.Map[String, _], b: Boolean): Unit = { 11 | } 12 | 13 | override def deserialize(s: String, bytes: Array[Byte]): User = { 14 | val mapper = new ObjectMapper() 15 | val user = mapper.readValue(bytes, classOf[User]) 16 | user 17 | } 18 | 19 | override def close(): Unit = { 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/jackson/UserSerializer.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.kafka.jackson 2 | 3 | import java.util 4 | 5 | import com.sparkbyexamples.kafka.beans.User 6 | import org.apache.kafka.common.serialization.Serializer 7 | import org.codehaus.jackson.map.ObjectMapper 8 | 9 | class UserSerializer extends Serializer[User]{ 10 | 11 | override def configure(map: util.Map[String, _], b: Boolean): Unit = { 12 | } 13 | 14 | override def serialize(s: String, t: User): Array[Byte] = { 15 | if(t==null) 16 | null 17 | else 18 | { 19 | val objectMapper = new ObjectMapper() 20 | objectMapper.writeValueAsString(t).getBytes 21 | } 22 | } 23 | 24 | override def close(): Unit = { 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/json/KafkaProducerJson.scala: -------------------------------------------------------------------------------- 1 | //package com.sparkbyexamples.kafka.json 2 | // 3 | //import java.util.Properties 4 | // 5 | //import org.apache.kafka.clients.producer.KafkaProducer 6 | // 7 | //object KafkaProducerJson { 8 | // 9 | // def main(args: Array[String]): Unit = { 10 | // 11 | // val 
props:Properties = new Properties() 12 | // props.put("bootstrap.servers","192.168.1.128:9092") 13 | // props.put("key.serializer","org.apache.kafka.common.serialization.StringSerializer") 14 | // props.put("value.serializer","org.apache.kafka.common.serialization.StringSerializer") 15 | // props.put("acks","all") 16 | // 17 | // val producer = new KafkaProducer[String, String](props) 18 | // val topic = "text_topic" 19 | // 20 | // 21 | // } 22 | //} 23 | -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/registry/KafkaConsumerAvroRegistry.scala: -------------------------------------------------------------------------------- 1 | //package com.sparkbyexamples.kafka.avro 2 | // 3 | //import java.util.Properties 4 | // 5 | //import org.apache.kafka.clients.consumer.ConsumerConfig 6 | //import org.apache.kafka.clients.consumer.KafkaConsumer 7 | //import java.util 8 | //import scala.collection.JavaConversions._ 9 | // 10 | //object KafkaConsumerAvroRegistry_ { 11 | // 12 | // def main(args: Array[String]): Unit = { 13 | // 14 | // val props = new Properties() 15 | // props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092") 16 | // props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG,"io.confluent.kafka.serializers.StringDeserializer.class") 17 | // props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,"io.confluent.kafka.serializers.KafkaAvroDeserializer.class") 18 | // 19 | // props.put(ConsumerConfig.GROUP_ID_CONFIG, "group1") 20 | // props.put("schema.registry.url", "http://localhost:8081"); 21 | // 22 | // val topic = "avro_topic" 23 | // val consumer = new KafkaConsumer[String, String](props) 24 | // consumer.subscribe(util.Arrays.asList(topic)) 25 | // while ({true}) { 26 | // val records = consumer.poll(100) 27 | // 28 | // for (record <- records) { 29 | // //System.out.printf("offset = %d, key = %s, value = %s \n", record.offset, record.key, record.value) 30 | // } 31 | // } 32 | // } 33 | //} -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/registry/KafkaProducerAvroRegistry.scala: -------------------------------------------------------------------------------- 1 | //package com.sparkbyexamples.kafka.avro 2 | // 3 | //import java.util.Properties 4 | // 5 | //import org.apache.avro.Schema 6 | //import org.apache.avro.generic.{GenericData, GenericRecord} 7 | //import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord} 8 | // 9 | //object KafkaProducerAvroRegistry_ { 10 | // 11 | // def main(args: Array[String]): Unit = { 12 | // 13 | // val props = new Properties() 14 | // props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092") 15 | // props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,"io.confluent.kafka.serializers.StringSerializer.class") 16 | // props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,"io.confluent.kafka.serializers.KafkaAvroSerializer.class"); 17 | // props.put("schema.registry.url", "http://localhost:8081"); 18 | // val producer = new KafkaProducer[Object, Object](props); 19 | // 20 | // val key = "key1"; 21 | // val userSchema = "{\"type\":\"record\"," + 22 | // "\"name\":\"myrecord\"," + 23 | // "\"fields\":[{\"name\":\"f1\",\"type\":\"string\"}]}"; 24 | // val parser = new Schema.Parser(); 25 | // val schema = parser.parse(userSchema); 26 | // val avroRecord:GenericRecord = new GenericData.Record(schema); 27 | // avroRecord.put("f1", 
"value1"); 28 | // 29 | // val record:ProducerRecord[Object, Object] = new ProducerRecord[Object, Object]("avro_topic", key, avroRecord); 30 | // 31 | // producer.send(record); 32 | // 33 | // } 34 | //} 35 | -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/registry/PersonKafkaConsumerAvroRegistry.scala: -------------------------------------------------------------------------------- 1 | //package com.sparkbyexamples.kafka.avro 2 | // 3 | //import java.util 4 | //import java.util.Properties 5 | // 6 | //import org.apache.kafka.clients.consumer.{ConsumerConfig, KafkaConsumer} 7 | // 8 | //import scala.collection.JavaConversions._ 9 | // 10 | //object PersonKafkaConsumerAvroRegistry_ { 11 | // 12 | // def main(args: Array[String]): Unit = { 13 | // 14 | // val props = new Properties() 15 | // props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092") 16 | // props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG,"io.confluent.kafka.serializers.StringDeserializer.class") 17 | // props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,"io.confluent.kafka.serializers.KafkaAvroDeserializer.class") 18 | // 19 | // props.put(ConsumerConfig.GROUP_ID_CONFIG, "group1") 20 | // props.put("schema.registry.url", "http://localhost:8081"); 21 | // 22 | // val topic = "avro_topic" 23 | // val consumer = new KafkaConsumer[String, String](props) 24 | // consumer.subscribe(util.Arrays.asList(topic)) 25 | // while ({true}) { 26 | // val records = consumer.poll(100) 27 | // 28 | // for (record <- records) { 29 | // //System.out.printf("offset = %d, key = %s, value = %s \n", record.offset, record.key, record.value) 30 | // } 31 | // } 32 | // } 33 | //} -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/registry/PersonKafkaProducerAvroRegistry.scala: -------------------------------------------------------------------------------- 1 | //package com.sparkbyexamples.kafka.avro 2 | // 3 | //import java.util.Properties 4 | // 5 | //import org.apache.avro.Schema 6 | //import org.apache.avro.generic.GenericRecord 7 | //import org.apache.avro.generic.GenericRecordBuilder 8 | //import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord} 9 | //import java.io.{ByteArrayOutputStream, File} 10 | // 11 | //import org.apache.avro.io.{BinaryEncoder, EncoderFactory} 12 | //import org.apache.avro.specific.SpecificDatumWriter 13 | // 14 | // 15 | // 16 | //object PersonKafkaProducerAvroRegistry { 17 | // 18 | // def main(args: Array[String]): Unit = { 19 | // 20 | // val props = new Properties() 21 | // props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.1.100:9092") 22 | // props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,"io.confluent.kafka.serializers.StringSerializer.class") 23 | // props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,"io.confluent.kafka.serializers.KafkaAvroSerializer.class"); 24 | // // props.put("serializer.class", "kafka.serializer.DefaultEncoder") 25 | // 26 | // val producer = new KafkaProducer[Object, Object](props); 27 | // 28 | // val key = "key1"; 29 | // 30 | // 31 | // val parser = new Schema.Parser(); 32 | // val schema = parser.parse(new File("src/main/resources/person.avsc")); 33 | // val genericRecordBuilder = new GenericRecordBuilder(schema) 34 | // 35 | // 36 | // val avroPerson = genericRecordBuilder 37 | // .set("firstName", "My First Name") 38 | // .set("lastName", "My 
last Name") 39 | // .set("birthDate", "My Date of Birth") 40 | // .build() 41 | // 42 | //// val writer = new SpecificDatumWriter[GenericRecord](schema) 43 | //// val out = new ByteArrayOutputStream() 44 | //// val encoder: BinaryEncoder = EncoderFactory.get().binaryEncoder(out, null) 45 | //// writer.write(avroPerson, encoder) 46 | //// encoder.flush() 47 | //// out.close() 48 | //// val serializedBytes: Array[Byte] = out.toByteArray() 49 | // 50 | // val record:ProducerRecord[Object, Object] = new ProducerRecord[Object, Object]("avro_topic", key, avroPerson); 51 | // 52 | // producer.send(record); 53 | // 54 | // } 55 | //} 56 | -------------------------------------------------------------------------------- /spark-avro-examples/src/main/resources/person.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "Person", 4 | "namespace": "com.sparkbyexamples", 5 | "fields": [ 6 | {"name": "id","type": "int"}, 7 | {"name": "firstname","type": "string"}, 8 | {"name": "middlename","type": "string"}, 9 | {"name": "lastname","type": "string"}, 10 | {"name": "dob_year","type": "int"}, 11 | {"name": "dob_month","type": "int"}, 12 | {"name": "gender","type": "string"}, 13 | {"name": "salary","type": "int"} 14 | ] 15 | } -------------------------------------------------------------------------------- /spark-avro-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/avro/AvroExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.avro 2 | 3 | import java.io.File 4 | import org.apache.avro.Schema 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.{SaveMode, SparkSession} 7 | /** 8 | * Spark Avro library example 9 | * Avro schema example 10 | * Avro file format 11 | * 12 | */ 13 | object AvroExample { 14 | 15 | def main(args: Array[String]): Unit = { 16 | 17 | val spark: SparkSession = SparkSession.builder().master("local[1]") 18 | .appName("SparkByExamples.com") 19 | .getOrCreate() 20 | 21 | val data = Seq((1,"James ", "", "Smith", 2018, 1, "M", 3000), 22 | (2,"Michael ", "Rose", "", 2010, 3, "M", 4000), 23 | (3,"Robert ", "", "Williams", 2010, 3, "M", 4000), 24 | (4,"Maria ", "Anne", "Jones", 2005, 5, "F", 4000), 25 | (5,"Jen", "Mary", "Brown", 2010, 7, "", -1) 26 | ) 27 | 28 | val columns = Seq("firstname", "middlename", "lastname", "dob_year", 29 | "dob_month", "gender", "salary") 30 | import spark.sqlContext.implicits._ 31 | val df = data.toDF(columns: _*) 32 | 33 | /** 34 | * Write Avro File 35 | */ 36 | df.write.format("avro") 37 | .mode(SaveMode.Overwrite) 38 | .save("C:/tmp/spark_out/avro/person.avro") 39 | 40 | /** 41 | * Read Avro File 42 | */ 43 | spark.read.format("avro").load("C:/tmp/spark_out/avro/person.avro").show() 44 | 45 | /** 46 | * Write Avro Partition 47 | */ 48 | df.write.partitionBy("dob_year","dob_month") 49 | .format("avro") 50 | .mode(SaveMode.Overwrite) 51 | .save("C:/tmp/spark_out/avro/person_partition.avro") 52 | 53 | /** 54 | * Reading Avro Partition 55 | */ 56 | spark.read 57 | .format("avro") 58 | .load("C:/tmp/spark_out/avro/person_partition.avro") 59 | .where(col("dob_year") === 2010) 60 | .show() 61 | 62 | /** 63 | * Explicit Avro schema 64 | */ 65 | val schemaAvro = new Schema.Parser() 66 | .parse(new File("src/main/resources/person.avsc")) 67 | 68 | spark.read 69 | .format("avro") 70 | .option("avroSchema", schemaAvro.toString) 71 | .load("C:/tmp/spark_out/avro/person.avro") 
72 | .show() 73 | 74 | /** 75 | * Avro Spark SQL 76 | */ 77 | spark.sqlContext.sql("CREATE TEMPORARY VIEW PERSON USING avro OPTIONS (path \"C:/tmp/spark_out/avro/person.avro\")") 78 | spark.sqlContext.sql("SELECT * FROM PERSON").show() 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /spark-avro-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/avro/AvroUsingNestedSchema.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.avro 2 | 3 | object AvroUsingNestedSchema_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-hive/src/main/scala/com/sparkbyexamples/HBaseWrite.scala: -------------------------------------------------------------------------------- 1 | //package com.sparkbyexamples 2 | // 3 | //import org.apache.spark.sql.SparkSession 4 | // 5 | //object HBaseWrite { 6 | // 7 | // def main(args: Array[String]): Unit = { 8 | // 9 | // val spark:SparkSession = SparkSession.builder() 10 | // .master("local[3]") 11 | // .appName("SparkByExample") 12 | // .getOrCreate() 13 | // 14 | // //Chaining multiple options 15 | // val df = spark.read. 16 | // options(Map("inferSchema"->"true","sep"->",","header"->"true")) 17 | // .csv("src/main/resources/zipcodes.csv") 18 | // df.show(false) 19 | // df.printSchema() 20 | // 21 | // def catalog = s"""{ 22 | // |"table":{"namespace":"default", "name":"Zipcode"}, 23 | // |"rowkey":"key", 24 | // |"columns":{ 25 | // |"RecordNumber":{"cf":"rowkey", "col":"RecordNumber", "type":"string"}, 26 | // |"Zipcode":{"cf":"ZipcodeCF", "col":"Zipcode", "type":"string"}, 27 | // |"ZipCodeType":{"cf":"ZipcodeCF", "col":"ZipCodeType", "type":"string"}, 28 | // |"City":{"cf":"ZipcodeCF", "col":"City", "type":"string"}, 29 | // |"State":{"cf":"ZipcodeCF", "col":"State", "type":"string"}, 30 | // |"LocationType":{"cf":"ZipcodeCF", "col":"LocationType", "type":"string"}, 31 | // |"Lat":{"cf":"ZipcodeCF", "col":"Lat", "type":"string"}, 32 | // |"Long":{"cf":"ZipcodeCF", "col":"Long", "type":"string"}, 33 | // |"Xaxis":{"cf":"ZipcodeCF", "col":"Xaxis", "type":"string"}, 34 | // |"Yaxis":{"cf":"ZipcodeCF", "col":"Yaxis", "type":"string"}, 35 | // |"Zaxis":{"cf":"ZipcodeCF", "col":"Zaxis", "type":"string"}, 36 | // |"WorldRegion":{"cf":"ZipcodeCF", "col":"WorldRegion", "type":"string"}, 37 | // |"Country":{"cf":"ZipcodeCF", "col":"Country", "type":"string"}, 38 | // |"LocationText":{"cf":"ZipcodeCF", "col":"LocationText", "type":"string"} 39 | // |} 40 | // |}""".stripMargin 41 | // 42 | // df.write 43 | // .option(HBaseTableCatalog.tableCatalog, catalog) 44 | // .option(HBaseTableCatalog.newTable, "5") 45 | // .format("org.apache.spark.sql.execution.datasources.hbase") 46 | // .save() 47 | // 48 | // } 49 | //} 50 | -------------------------------------------------------------------------------- /spark-kafka/src/main/resources/person.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "Person", 4 | "namespace": "com.sparkbyexamples", 5 | "fields": [ 6 | {"name": "firstname","type": "string"}, 7 | {"name": "middlename","type": "string"}, 8 | {"name": "lastname","type": "string"}, 9 | {"name": "dob_year","type": "int"}, 10 | {"name": "dob_month","type": "int"}, 11 | {"name": "gender","type": "string"}, 12 | {"name": "salary","type": "int"} 13 | ] 14 | } 
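The spark-kafka module ships the same Person Avro schema as the scala-kafka and spark-avro modules above, minus the id field. As a rough sketch of how a record matching this schema can be assembled with the standard Avro GenericRecordBuilder API (the same builder the commented-out registry producers use): the object name and the field values below are illustrative only, not code from the repository.

import java.io.File
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericRecord, GenericRecordBuilder}

object PersonRecordSketch {
  def main(args: Array[String]): Unit = {
    // Parse the schema shipped under src/main/resources
    val schema: Schema = new Schema.Parser().parse(new File("src/main/resources/person.avsc"))

    // Populate every field the schema declares; build() fails if a field without a default is left unset
    // (the names match the avsc above, the values are made up for illustration)
    val person: GenericRecord = new GenericRecordBuilder(schema)
      .set("firstname", "James")
      .set("middlename", "")
      .set("lastname", "Smith")
      .set("dob_year", 1980)
      .set("dob_month", 1)
      .set("gender", "M")
      .set("salary", 3000)
      .build()

    println(person)
  }
}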
-------------------------------------------------------------------------------- /spark-kafka/src/main/scala/com/sparkbyexamples/spark/kafka/json/KafkaConsumerJson.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.kafka.json 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object KafkaConsumerJson { 6 | def main(args:Array[String]): Unit = { 7 | 8 | val spark: SparkSession = SparkSession.builder().master("local[1]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | // Batch read from Kafka: printSchema() and show() are only valid on a non-streaming DataFrame 13 | val df = spark 14 | .read 15 | .format("kafka") 16 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 17 | .option("subscribe", "topic1") 18 | .load() 19 | 20 | df.printSchema() 21 | 22 | df.show() 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /spark-kafka/src/main/scala/com/sparkbyexamples/spark/kafka/json/KafkaProduceJson.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.kafka.json 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object KafkaProduceJson { 6 | 7 | def main(args:Array[String]): Unit ={ 8 | 9 | val spark: SparkSession = SparkSession.builder().master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | 13 | val data = Seq((1,"James ","","Smith",2018,1,"M",3000), 14 | (2,"Michael ","Rose","",2010,3,"M",4000), 15 | (3,"Robert ","","Williams",2010,3,"M",4000), 16 | (4,"Maria ","Anne","Jones",2005,5,"F",4000), 17 | (5,"Jen","Mary","Brown",2010,7,"",-1) 18 | ) 19 | 20 | val columns = Seq("id","firstname","middlename","lastname","dob_year", 21 | "dob_month","gender","salary") 22 | import spark.sqlContext.implicits._ 23 | val df = data.toDF(columns:_*) 24 | 25 | // toJSON returns a Dataset[String] whose single column is named "value", which is what the Kafka sink expects 26 | val ds = df.toJSON 27 | ds.printSchema() 28 | 29 | // The Dataset is static, so write it to Kafka as a batch instead of starting a streaming query 30 | ds.write 31 | .format("kafka") 32 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 33 | .option("topic", "text_topic") 34 | .save() 35 | 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/kv.csv: -------------------------------------------------------------------------------- 1 | key,value 2 | record1,My Name is Naveen 3 | record2,My Name is Praveen 4 | record3,My Name is Prabha -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/persons.xml: -------------------------------------------------------------------------------- 1 | <persons> 2 | <person> 3 | <firstname>James</firstname> 4 | <lastname>Smith</lastname> 5 | <middlename></middlename> 6 | <dob_year>1980</dob_year> 7 | <dob_month>1</dob_month> 8 | <gender>M</gender> 9 | <salary>10000</salary> 10 | <addresses> 11 | <address>
12 | <addressLine>123 ABC street</addressLine> 13 | <city>NewJersy</city> 14 | <state>NJ</state> 15 | </address> 16 | <address> 17 | <addressLine>456 apple street</addressLine> 18 | <city>newark</city> 19 | <state>DE</state> 20 | </address> 21 | </addresses> 22 | </person> 23 | <person> 24 | <firstname>Michael</firstname> 25 | <lastname></lastname> 26 | <middlename>Rose</middlename> 27 | <dob_year>1990</dob_year> 28 | <dob_month>6</dob_month> 29 | <gender>M</gender> 30 | <salary>10000</salary> 31 | <addresses> 32 | <address> 33 | <addressLine>4512 main st</addressLine> 34 | <city>new york</city> 35 | <state>NY</state> 36 | </address> 37 | <address> 38 | <addressLine>4367 orange st</addressLine> 39 | <city>sandiago</city> 40 | <state>CA</state> 41 | </address> 42 | </addresses> 43 | </person> 44 | </persons> 45 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/persons_complex.xml: -------------------------------------------------------------------------------- 1 | <persons> 2 | <person> 3 | <firstname>James</firstname> 4 | <lastname>Smith</lastname> 5 | <middlename></middlename> 6 | <dob_year>1980</dob_year> 7 | <dob_month>1</dob_month> 8 | <gender>M</gender> 9 | <salary>10000</salary> 10 | <addresses> 11 | <address> 12 | <addressLine>1 capler dr</addressLine> 13 | <city>new york</city> 14 | <state>NY</state> 15 | </address> 16 | <address> 17 | <addressLine>455 catalina dr</addressLine> 18 | <city>chicago</city> 19 | <state>IL</state> 20 | </address> 21 | </addresses> 22 | </person> 23 | <person> 24 | <firstname>Michael</firstname> 25 | <lastname></lastname> 26 | <middlename>Rose</middlename> 27 | <dob_year>1990</dob_year> 28 | <dob_month>6</dob_month> 29 | <gender>M</gender> 30 | <salary>10000</salary> 31 | <addresses> 32 | <address> 33 | <addressLine>2345 pasadena village</addressLine> 34 | <city>orlando</city> 35 | <state>FL</state> 36 | </address> 37 | <address> 38 | <addressLine>3 walnut dr</addressLine> 39 | <city>wilmington</city> 40 | <state>DE</state> 41 | </address> 42 | </addresses> 43 | </person> 44 | </persons>
-------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/records.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | John 4 | 10 5 | M 6 | 7 | 8 | Jenny 9 | 12 10 | F 11 | 12 | 13 | Janardhan 14 | 14 15 | M 16 | 17 | 18 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "type" : "struct", 3 | "fields" : [ { 4 | "name" : "name", 5 | "type" : { 6 | "type" : "struct", 7 | "fields" : [ { 8 | "name" : "firstname", 9 | "type" : "string", 10 | "nullable" : true, 11 | "metadata" : { } 12 | }, { 13 | "name" : "middlename", 14 | "type" : "string", 15 | "nullable" : true, 16 | "metadata" : { } 17 | }, { 18 | "name" : "lastname", 19 | "type" : "string", 20 | "nullable" : true, 21 | "metadata" : { } 22 | } ] 23 | }, 24 | "nullable" : true, 25 | "metadata" : { } 26 | }, { 27 | "name" : "dob", 28 | "type" : "string", 29 | "nullable" : true, 30 | "metadata" : { } 31 | }, { 32 | "name" : "gender", 33 | "type" : "string", 34 | "nullable" : true, 35 | "metadata" : { } 36 | }, { 37 | "name" : "salary", 38 | "type" : "integer", 39 | "nullable" : true, 40 | "metadata" : { } 41 | } ] 42 | } -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/stream.csv: -------------------------------------------------------------------------------- 1 | TotalCost|BirthDate|Gender|TotalChildren|ProductCategoryName 2 | 1000||Male|2|Technology 3 | 2000|1957-03-06||3|Beauty 4 | 3000|1959-03-06|Male||Car 5 | 4000|1953-03-06|Male|2| 6 | 5000|1957-03-06|Female|3|Beauty 7 | 6000|1959-03-06|Male|4|Car -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/test.txt: -------------------------------------------------------------------------------- 1 | Project Gutenberg’s 2 | Alice’s Adventures in Wonderland 3 | by Lewis Carroll 4 | This eBook is for the use 5 | of anyone anywhere 6 | at no cost and with 7 | Alice’s Adventures in Wonderland 8 | by Lewis Carroll 9 | This eBook is for the use 10 | of anyone anywhere 11 | at no cost and with 12 | This eBook is for the use 13 | of anyone anywhere 14 | at no cost and with 15 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode1.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":1,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Parc Parque, PR","Location":"NA-US-PR-PARC PARQUE","Decommisioned":false} 2 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode10.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":1,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Parc Parque, PR","Location":"NA-US-PR-PARC PARQUE","Decommisioned":false} 2 | 
{"RecordNumber":2,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PASEO COSTA DEL SUR","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Paseo Costa Del Sur, PR","Location":"NA-US-PR-PASEO COSTA DEL SUR","Decommisioned":false} 3 | {"RecordNumber":10,"Zipcode":709,"ZipCodeType":"STANDARD","City":"BDA SAN LUIS","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":18.14,"Long":-66.26,"Xaxis":0.38,"Yaxis":-0.86,"Zaxis":0.31,"WorldRegion":"NA","Country":"US","LocationText":"Bda San Luis, PR","Location":"NA-US-PR-BDA SAN LUIS","Decommisioned":false} 4 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode11.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":1,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Parc Parque, PR","Location":"NA-US-PR-PARC PARQUE","Decommisioned":false} 2 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode12.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":1,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Parc Parque, PR","Location":"NA-US-PR-PARC PARQUE","Decommisioned":false} -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode2.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":2,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PASEO COSTA DEL SUR","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Paseo Costa Del Sur, PR","Location":"NA-US-PR-PASEO COSTA DEL SUR","Decommisioned":false} 2 | {"RecordNumber":10,"Zipcode":709,"ZipCodeType":"STANDARD","City":"BDA SAN LUIS","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":18.14,"Long":-66.26,"Xaxis":0.38,"Yaxis":-0.86,"Zaxis":0.31,"WorldRegion":"NA","Country":"US","LocationText":"Bda San Luis, PR","Location":"NA-US-PR-BDA SAN LUIS","Decommisioned":false} 3 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode3.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":61391,"Zipcode":76166,"ZipCodeType":"UNIQUE","City":"CINGULAR WIRELESS","State":"TX","LocationType":"NOT ACCEPTABLE","Lat":32.72,"Long":-97.31,"Xaxis":-0.1,"Yaxis":-0.83,"Zaxis":0.54,"WorldRegion":"NA","Country":"US","LocationText":"Cingular Wireless, TX","Location":"NA-US-TX-CINGULAR WIRELESS","Decommisioned":false} 2 | {"RecordNumber":61392,"Zipcode":76177,"ZipCodeType":"STANDARD","City":"FORT WORTH","State":"TX","LocationType":"PRIMARY","Lat":32.75,"Long":-97.33,"Xaxis":-0.1,"Yaxis":-0.83,"Zaxis":0.54,"WorldRegion":"NA","Country":"US","LocationText":"Fort Worth, TX","Location":"NA-US-TX-FORT 
WORTH","Decommisioned":false,"TaxReturnsFiled":2126,"EstimatedPopulation":4053,"TotalWages":122396986} 3 | {"RecordNumber":61393,"Zipcode":76177,"ZipCodeType":"STANDARD","City":"FT WORTH","State":"TX","LocationType":"ACCEPTABLE","Lat":32.75,"Long":-97.33,"Xaxis":-0.1,"Yaxis":-0.83,"Zaxis":0.54,"WorldRegion":"NA","Country":"US","LocationText":"Ft Worth, TX","Location":"NA-US-TX-FT WORTH","Decommisioned":false,"TaxReturnsFiled":2126,"EstimatedPopulation":4053,"TotalWages":122396986} 4 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode4.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":4,"Zipcode":704,"ZipCodeType":"STANDARD","City":"URB EUGENE RICE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Urb Eugene Rice, PR","Location":"NA-US-PR-URB EUGENE RICE","Decommisioned":false} 2 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode5.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":39827,"Zipcode":85209,"ZipCodeType":"STANDARD","City":"MESA","State":"AZ","LocationType":"PRIMARY","Lat":33.37,"Long":-111.64,"Xaxis":-0.3,"Yaxis":-0.77,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Mesa, AZ","Location":"NA-US-AZ-MESA","Decommisioned":false,"TaxReturnsFiled":14962,"EstimatedPopulation":26883,"TotalWages":563792730,"Notes":"no NWS data, "} 2 | {"RecordNumber":39828,"Zipcode":85210,"ZipCodeType":"STANDARD","City":"MESA","State":"AZ","LocationType":"PRIMARY","Lat":33.38,"Long":-111.84,"Xaxis":-0.31,"Yaxis":-0.77,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Mesa, AZ","Location":"NA-US-AZ-MESA","Decommisioned":false,"TaxReturnsFiled":14374,"EstimatedPopulation":25446,"TotalWages":471000465} 3 | {"RecordNumber":49345,"Zipcode":32046,"ZipCodeType":"STANDARD","City":"HILLIARD","State":"FL","LocationType":"PRIMARY","Lat":30.69,"Long":-81.92,"Xaxis":0.12,"Yaxis":-0.85,"Zaxis":0.51,"WorldRegion":"NA","Country":"US","LocationText":"Hilliard, FL","Location":"NA-US-FL-HILLIARD","Decommisioned":false,"TaxReturnsFiled":3922,"EstimatedPopulation":7443,"TotalWages":133112149} 4 | {"RecordNumber":49346,"Zipcode":34445,"ZipCodeType":"PO BOX","City":"HOLDER","State":"FL","LocationType":"PRIMARY","Lat":28.96,"Long":-82.41,"Xaxis":0.11,"Yaxis":-0.86,"Zaxis":0.48,"WorldRegion":"NA","Country":"US","LocationText":"Holder, FL","Location":"NA-US-FL-HOLDER","Decommisioned":false} 5 | {"RecordNumber":49347,"Zipcode":32564,"ZipCodeType":"STANDARD","City":"HOLT","State":"FL","LocationType":"PRIMARY","Lat":30.72,"Long":-86.67,"Xaxis":0.04,"Yaxis":-0.85,"Zaxis":0.51,"WorldRegion":"NA","Country":"US","LocationText":"Holt, FL","Location":"NA-US-FL-HOLT","Decommisioned":false,"TaxReturnsFiled":1207,"EstimatedPopulation":2190,"TotalWages":36395913} 6 | {"RecordNumber":49348,"Zipcode":34487,"ZipCodeType":"PO BOX","City":"HOMOSASSA","State":"FL","LocationType":"PRIMARY","Lat":28.78,"Long":-82.61,"Xaxis":0.11,"Yaxis":-0.86,"Zaxis":0.48,"WorldRegion":"NA","Country":"US","LocationText":"Homosassa, FL","Location":"NA-US-FL-HOMOSASSA","Decommisioned":false} 7 | -------------------------------------------------------------------------------- 
/spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode6.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":10,"Zipcode":708,"ZipCodeType":"STANDARD","City":"BDA SAN LUIS","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":18.14,"Long":-66.26,"Xaxis":0.38,"Yaxis":-0.86,"Zaxis":0.31,"WorldRegion":"NA","Country":"US","LocationText":"Bda San Luis, PR","Location":"NA-US-PR-BDA SAN LUIS","Decommisioned":false} 2 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode7.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":54354,"Zipcode":36275,"ZipCodeType":"PO BOX","City":"SPRING GARDEN","State":"AL","LocationType":"PRIMARY","Lat":33.97,"Long":-85.55,"Xaxis":0.06,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Spring Garden, AL","Location":"NA-US-AL-SPRING GARDEN","Decommisioned":false} 2 | {"RecordNumber":54355,"Zipcode":35146,"ZipCodeType":"STANDARD","City":"SPRINGVILLE","State":"AL","LocationType":"PRIMARY","Lat":33.77,"Long":-86.47,"Xaxis":0.05,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Springville, AL","Location":"NA-US-AL-SPRINGVILLE","Decommisioned":false,"TaxReturnsFiled":4046,"EstimatedPopulation":7845,"TotalWages":172127599} 3 | {"RecordNumber":54356,"Zipcode":35585,"ZipCodeType":"STANDARD","City":"SPRUCE PINE","State":"AL","LocationType":"PRIMARY","Lat":34.37,"Long":-87.69,"Xaxis":0.03,"Yaxis":-0.82,"Zaxis":0.56,"WorldRegion":"NA","Country":"US","LocationText":"Spruce Pine, AL","Location":"NA-US-AL-SPRUCE PINE","Decommisioned":false,"TaxReturnsFiled":610,"EstimatedPopulation":1209,"TotalWages":18525517} 4 | {"RecordNumber":76511,"Zipcode":27007,"ZipCodeType":"STANDARD","City":"ASH HILL","State":"NC","LocationType":"NOT ACCEPTABLE","Lat":36.4,"Long":-80.56,"Xaxis":0.13,"Yaxis":-0.79,"Zaxis":0.59,"WorldRegion":"NA","Country":"US","LocationText":"Ash Hill, NC","Location":"NA-US-NC-ASH HILL","Decommisioned":false,"TaxReturnsFiled":842,"EstimatedPopulation":1666,"TotalWages":28876493} 5 | {"RecordNumber":76512,"Zipcode":27203,"ZipCodeType":"STANDARD","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":8355,"EstimatedPopulation":15228,"TotalWages":215474318} 6 | {"RecordNumber":76513,"Zipcode":27204,"ZipCodeType":"PO BOX","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":1035,"EstimatedPopulation":1816,"TotalWages":30322473} 7 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode8.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":54354,"Zipcode":36275,"ZipCodeType":"PO BOX","City":"SPRING GARDEN","State":"AL","LocationType":"PRIMARY","Lat":33.97,"Long":-85.55,"Xaxis":0.06,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Spring Garden, AL","Location":"NA-US-AL-SPRING GARDEN","Decommisioned":false} 2 | 
{"RecordNumber":54355,"Zipcode":35146,"ZipCodeType":"STANDARD","City":"SPRINGVILLE","State":"AL","LocationType":"PRIMARY","Lat":33.77,"Long":-86.47,"Xaxis":0.05,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Springville, AL","Location":"NA-US-AL-SPRINGVILLE","Decommisioned":false,"TaxReturnsFiled":4046,"EstimatedPopulation":7845,"TotalWages":172127599} 3 | {"RecordNumber":54356,"Zipcode":35585,"ZipCodeType":"STANDARD","City":"SPRUCE PINE","State":"AL","LocationType":"PRIMARY","Lat":34.37,"Long":-87.69,"Xaxis":0.03,"Yaxis":-0.82,"Zaxis":0.56,"WorldRegion":"NA","Country":"US","LocationText":"Spruce Pine, AL","Location":"NA-US-AL-SPRUCE PINE","Decommisioned":false,"TaxReturnsFiled":610,"EstimatedPopulation":1209,"TotalWages":18525517} 4 | {"RecordNumber":76511,"Zipcode":27007,"ZipCodeType":"STANDARD","City":"ASH HILL","State":"NC","LocationType":"NOT ACCEPTABLE","Lat":36.4,"Long":-80.56,"Xaxis":0.13,"Yaxis":-0.79,"Zaxis":0.59,"WorldRegion":"NA","Country":"US","LocationText":"Ash Hill, NC","Location":"NA-US-NC-ASH HILL","Decommisioned":false,"TaxReturnsFiled":842,"EstimatedPopulation":1666,"TotalWages":28876493} 5 | {"RecordNumber":76512,"Zipcode":27203,"ZipCodeType":"STANDARD","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":8355,"EstimatedPopulation":15228,"TotalWages":215474318} 6 | {"RecordNumber":76513,"Zipcode":27204,"ZipCodeType":"PO BOX","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":1035,"EstimatedPopulation":1816,"TotalWages":30322473} 7 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode9.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":76511,"Zipcode":27007,"ZipCodeType":"STANDARD","City":"ASH HILL","State":"NC","LocationType":"NOT ACCEPTABLE","Lat":36.4,"Long":-80.56,"Xaxis":0.13,"Yaxis":-0.79,"Zaxis":0.59,"WorldRegion":"NA","Country":"US","LocationText":"Ash Hill, NC","Location":"NA-US-NC-ASH HILL","Decommisioned":false,"TaxReturnsFiled":842,"EstimatedPopulation":1666,"TotalWages":28876493} 2 | {"RecordNumber":76512,"Zipcode":27203,"ZipCodeType":"STANDARD","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":8355,"EstimatedPopulation":15228,"TotalWages":215474318} 3 | {"RecordNumber":76513,"Zipcode":27204,"ZipCodeType":"PO BOX","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":1035,"EstimatedPopulation":1816,"TotalWages":30322473} 4 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/SparkSessionTest.scala: -------------------------------------------------------------------------------- 1 | 
package com.sparkbyexamples.spark 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object SparkSessionTest { 6 | 7 | def main(args:Array[String]): Unit ={ 8 | 9 | val spark = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExample") 12 | .getOrCreate(); 13 | 14 | println("First SparkContext:") 15 | println("APP Name :"+spark.sparkContext.appName); 16 | println("Deploy Mode :"+spark.sparkContext.deployMode); 17 | println("Master :"+spark.sparkContext.master); 18 | 19 | val sparkSession2 = SparkSession.builder() 20 | .master("local[1]") 21 | .appName("SparkByExample-test") 22 | .getOrCreate(); 23 | 24 | println("Second SparkContext:") 25 | println("APP Name :"+sparkSession2.sparkContext.appName); 26 | println("Deploy Mode :"+sparkSession2.sparkContext.deployMode); 27 | println("Master :"+sparkSession2.sparkContext.master); 28 | 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/beans/Books.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.beans 2 | 3 | case class Books(_id:String, author:String, description:String, price:Double, publish_date:String, title:String) -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/beans/BooksDiscounted.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.beans 2 | 3 | case class BooksDiscounted(_id:String, author:String, description:String, price:Double, publish_date:String, title:String, discountPrice:Double) 4 | 5 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/beans/BooksStruct.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.beans 2 | 3 | class BooksStruct { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/beans/BooksWithArray.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.beans 2 | 3 | case class BooksWithArray(_id:String, author:String, description:String, price:Double, publish_date:String, title:String,otherInfo:OtherInfo,stores:Stores) 4 | case class OtherInfo(pagesCount:String,language:String,country:String,address:Address) 5 | case class Address(addressline1:String,city:String,state:String) 6 | case class Stores(store:Array[Store]) 7 | case class Store(name:String) 8 | 9 | 10 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/beans/User.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.beans 2 | 3 | class User() { 4 | private var name:String = "" 5 | private var age:Int = 0 6 | 7 | def this(name: String, age: Int) { 8 | this() 9 | this.name =name 10 | this.age = age 11 | } 12 | 13 | def getName: String = this.name 14 | 15 | def getAge: Int = this.age 16 | 17 | override def toString: String = "User(" + name + ", " + age + ")" 18 | } 19 | -------------------------------------------------------------------------------- 
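As a quick, hedged illustration of how a bean like the Books case class above is typically used (this sketch is not a file in the repository; the object name and sample values are made up for illustration), a case class maps directly onto a typed Dataset through Spark's implicit encoders, so no explicit schema is needed:

import org.apache.spark.sql.SparkSession
import com.sparkbyexamples.spark.beans.Books

object BooksDatasetSketch extends App {

  val spark = SparkSession.builder()
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()

  import spark.implicits._

  // Illustrative rows following the Books fields: _id, author, description, price, publish_date, title
  val books = Seq(
    Books("bk101", "Gambardella", "An in-depth look at creating applications with XML.", 44.95, "2000-10-01", "XML Developer's Guide"),
    Books("bk102", "Ralls", "A former architect battles corporate zombies.", 5.95, "2000-12-16", "Midnight Rain")
  ).toDS() // Dataset[Books]; the schema is inferred from the case class

  books.printSchema()
  books.show(false)
}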
/spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/beans/Zipcode.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.beans 2 | 3 | import scala.beans.BeanProperty 4 | 5 | class Zipcode { 6 | 7 | @BeanProperty 8 | var RecordNumber = -1 9 | @BeanProperty 10 | var Zipcode="" 11 | @BeanProperty 12 | var ZipCodeType="" 13 | @BeanProperty 14 | var City="" 15 | @BeanProperty 16 | var State="" 17 | @BeanProperty 18 | var LocationType="" 19 | @BeanProperty 20 | var Lat="" 21 | @BeanProperty 22 | var Long="" 23 | @BeanProperty 24 | var Xaxis="" 25 | @BeanProperty 26 | var Yaxis="" 27 | @BeanProperty 28 | var Zaxis="" 29 | @BeanProperty 30 | var WorldRegion="" 31 | @BeanProperty 32 | var Country="" 33 | @BeanProperty 34 | var LocationText="" 35 | @BeanProperty 36 | var Location="" 37 | @BeanProperty 38 | var Decommisioned="" 39 | } 40 | 41 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/ArrayToColumn.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.types.{ArrayType, StringType, StructType} 4 | import org.apache.spark.sql.{Row, SparkSession} 5 | 6 | object ArrayToColumn extends App { 7 | 8 | val spark = SparkSession.builder().appName("SparkByExamples.com") 9 | .master("local[1]") 10 | .getOrCreate() 11 | 12 | val arrayData = Seq( 13 | Row("James",List("Java","Scala","C++")), 14 | Row("Michael",List("Spark","Java","C++")), 15 | Row("Robert",List("CSharp","VB","")) 16 | ) 17 | 18 | val arraySchema = new StructType().add("name",StringType) 19 | .add("subjects",ArrayType(StringType)) 20 | 21 | val arrayDF = spark.createDataFrame(spark.sparkContext.parallelize(arrayData),arraySchema) 22 | arrayDF.printSchema() 23 | arrayDF.show() 24 | 25 | val arrayDFColumn = arrayDF.select( // select each array element into its own column 26 | arrayDF("name") +: (0 until 3).map(i => arrayDF("subjects")(i).alias(s"LanguagesKnown$i")): _* 27 | ) 28 | 29 | arrayDFColumn.show(false) 30 | 31 | //How to convert Array of Array to column 32 | val arrayArrayData = Seq( 33 | Row("James",List(List("Java","Scala","C++"),List("Spark","Java"))), 34 | Row("Michael",List(List("Spark","Java","C++"),List("Spark","Java"))), 35 | Row("Robert",List(List("CSharp","VB"),List("Spark","Python"))) 36 | ) 37 | 38 | val arrayArraySchema = new StructType().add("name",StringType) 39 | .add("subjects",ArrayType(ArrayType(StringType))) 40 | 41 | val df = spark.createDataFrame(spark.sparkContext.parallelize(arrayArrayData),arrayArraySchema) 42 | df.printSchema() 43 | df.show() 44 | 45 | val df2 = df.select( 46 | df("name") +: (0 until 2).map(i => df("subjects")(i).alias(s"LanguagesKnown$i")): _* 47 | ) 48 | 49 | df2.show(false) 50 | } 51 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/AvroExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import java.io.File 4 | 5 | import org.apache.avro.Schema 6 | import org.apache.spark.sql.{SaveMode, SparkSession} 7 | import org.apache.spark.sql.functions._ 8 | 9 | /** 10 | * Spark Avro library example 11 | * Avro schema example 12 | * Avro file format 13 | * 14 | */ 15 | object AvroExample { 16 | 17 | def main(args: Array[String]): Unit = { 18 | 19 | 20 | val spark:
SparkSession = SparkSession.builder().master("local[1]") 21 | .appName("SparkByExamples.com") 22 | .getOrCreate() 23 | 24 | val data = Seq(("James ", "", "Smith", 2018, 1, "M", 3000), 25 | ("Michael ", "Rose", "", 2010, 3, "M", 4000), 26 | ("Robert ", "", "Williams", 2010, 3, "M", 4000), 27 | ("Maria ", "Anne", "Jones", 2005, 5, "F", 4000), 28 | ("Jen", "Mary", "Brown", 2010, 7, "", -1) 29 | ) 30 | 31 | val columns = Seq("firstname", "middlename", "lastname", "dob_year", 32 | "dob_month", "gender", "salary") 33 | import spark.sqlContext.implicits._ 34 | val df = data.toDF(columns: _*) 35 | 36 | /** 37 | * Write Avro File 38 | */ 39 | df.write.format("avro") 40 | .mode(SaveMode.Overwrite) 41 | .save("C:\\tmp\\spark_out\\avro\\person.avro") 42 | 43 | /** 44 | * Read Avro File 45 | */ 46 | spark.read.format("avro").load("C:\\tmp\\spark_out\\avro\\person.avro").show() 47 | 48 | /** 49 | * Write Avro Partition 50 | */ 51 | df.write.partitionBy("dob_year","dob_month") 52 | .format("avro") 53 | .mode(SaveMode.Overwrite) 54 | .save("C:\\tmp\\spark_out\\avro\\person_partition.avro") 55 | 56 | /** 57 | * Reading Avro Partition 58 | */ 59 | spark.read 60 | .format("avro") 61 | .load("C:\\tmp\\spark_out\\avro\\person_partition.avro") 62 | .where(col("dob_year") === 2010) 63 | .show() 64 | 65 | /** 66 | * Explicit Avro schema 67 | */ 68 | val schemaAvro = new Schema.Parser() 69 | .parse(new File("src/main/resources/person.avsc")) 70 | 71 | spark.read 72 | .format("avro") 73 | .option("avroSchema", schemaAvro.toString) 74 | .load("C:\\tmp\\spark_out\\avro\\person.avro") 75 | .show() 76 | 77 | /** 78 | * Avro Spark SQL 79 | */ 80 | spark.sqlContext.sql("CREATE TEMPORARY VIEW PERSON USING avro OPTIONS (path \"C:/tmp/spark_out/avro/person.avro\")") 81 | spark.sqlContext.sql("SELECT * FROM PERSON").show() 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/CaseClassSparkSchema.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.Encoders 4 | import org.apache.spark.sql.types.StructType 5 | 6 | object CaseClassSparkSchema extends App{ 7 | 8 | case class Name(first:String,last:String,middle:String) 9 | case class Employee(fullName:Name,age:Integer,gender:String) 10 | 11 | val encoderSchema = Encoders.product[Employee].schema 12 | encoderSchema.printTreeString() 13 | 14 | import org.apache.spark.sql.catalyst.ScalaReflection 15 | val schema = ScalaReflection.schemaFor[Employee].dataType.asInstanceOf[StructType] 16 | 17 | } 18 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/CastColumnType.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.types._ 5 | import org.apache.spark.sql.functions._ 6 | 7 | object CastColumnType extends App{ 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | val simpleData = Seq(Row("James",34,"2006-01-01","true","M",3000.60), 15 | Row("Michael",33,"1980-01-10","true","F",3300.80), 16 | Row("Robert",37,"06-01-1992","false","M",5000.50) 17 | ) 18 | 19 | val simpleSchema = 
StructType(Array( 20 | StructField("firstName",StringType,true), 21 | StructField("age",IntegerType,true), 22 | StructField("jobStartDate",StringType,true), 23 | StructField("isGraduated", StringType, true), 24 | StructField("gender", StringType, true), 25 | StructField("salary", DoubleType, true) 26 | )) 27 | 28 | val df = spark.createDataFrame(spark.sparkContext.parallelize(simpleData),simpleSchema) 29 | df.printSchema() 30 | df.show(false) 31 | 32 | //withColumn with the original column 33 | val df2 = df.withColumn("age",col("age").cast(StringType)) 34 | .withColumn("isGraduated",col("isGraduated").cast(BooleanType)) 35 | .withColumn("jobStartDate",col("jobStartDate").cast(DateType)) 36 | df2.printSchema() 37 | 38 | 39 | val df3 = df2.selectExpr("cast(age as int) age", 40 | "cast(isGraduated as string) isGraduated", 41 | "cast(jobStartDate as string) jobStartDate") 42 | df3.printSchema() 43 | df3.show(false) 44 | 45 | df3.createOrReplaceTempView("CastExample") 46 | val df4 = spark.sql("SELECT STRING(age),BOOLEAN(isGraduated),DATE(jobStartDate) from CastExample") 47 | df4.printSchema() 48 | df4.show(false) 49 | 50 | } 51 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/CreateDataFrame.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 4 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 5 | 6 | object CreateDataFrame { 7 | 8 | def main(args:Array[String]):Unit={ 9 | 10 | val spark:SparkSession = SparkSession.builder() 11 | .master("local[1]") 12 | .appName("SparkByExample") 13 | .getOrCreate() 14 | 15 | import spark.implicits._ 16 | val columns = Seq("language","users_count") 17 | val data = Seq(("Java", "20000"), ("Python", "100000"), ("Scala", "3000")) 18 | val rdd = spark.sparkContext.parallelize(data) 19 | 20 | 21 | //From RDD (USING toDF()) 22 | val dfFromRDD1 = rdd.toDF("language","users") 23 | 24 | //From RDD (USING createDataFrame) 25 | val dfFromRDD2 = spark.createDataFrame(rdd).toDF(columns:_*) 26 | 27 | //From RDD (USING createDataFrame and Adding schema using StructType) 28 | //convert RDD[T] to RDD[Row] 29 | val schema = StructType(columns 30 | .map(fieldName => StructField(fieldName, StringType, nullable = true))) 31 | val rowRDD = rdd.map(attributes => Row(attributes._1, attributes._2)) 32 | val dfFromRDD3 = spark.createDataFrame(rowRDD,schema) 33 | 34 | 35 | //From Data (USING toDF()) 36 | val dfFromData1 = data.toDF() 37 | 38 | //From Data (USING createDataFrame) 39 | var dfFromData2 = spark.createDataFrame(data).toDF(columns:_*) 40 | 41 | //From Data (USING createDataFrame and Adding schema using StructType) 42 | import scala.collection.JavaConversions._ 43 | val rowData = data 44 | .map(attributes => Row(attributes._1, attributes._2)) 45 | var dfFromData3 = spark.createDataFrame(rowData,schema) 46 | 47 | //From Data (USING createDataFrame and Adding bean class) 48 | //To-DO 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/CreateEmptyDataFrameExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import 
org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} 5 | 6 | object CreateEmptyDataFrameExample extends App { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | import spark.implicits._ 13 | 14 | 15 | val schema = StructType( 16 | StructField("firstName", StringType, true) :: 17 | StructField("lastName", IntegerType, false) :: 18 | StructField("middleName", IntegerType, false) :: Nil) 19 | 20 | val colSeq = Seq("firstName","lastName","middleName") 21 | 22 | case class Name(firstName: String, lastName: String, middleName:String) 23 | 24 | // Create empty dataframe using StructType schema 25 | val df = spark.createDataFrame(spark.sparkContext 26 | .emptyRDD[Row], schema) 27 | 28 | // Using implicit encoder 29 | 30 | Seq.empty[(String,String,String)].toDF(colSeq:_*) 31 | 32 | //Using case class 33 | 34 | Seq.empty[Name].toDF().printSchema() 35 | 36 | //Using emptyDataFrame 37 | spark.emptyDataFrame 38 | 39 | 40 | //Using emptyDataset 41 | 42 | 43 | } 44 | 45 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/CreateEmptyDatasetExample.scala: -------------------------------------------------------------------------------- 1 | 2 | package com.sparkbyexamples.spark.dataframe 3 | 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} 6 | 7 | object CreateEmptyDatasetExample extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | import spark.implicits._ 15 | 16 | val schema = StructType( 17 | StructField("firstName", StringType, true) :: 18 | StructField("lastName", IntegerType, false) :: 19 | StructField("middleName", IntegerType, false) :: Nil) 20 | 21 | val colSeq = Seq("firstName","lastName","middleName") 22 | 23 | case class Name(firstName: String, lastName: String, middleName:String) 24 | 25 | spark.createDataset(Seq.empty[Name]) 26 | spark.createDataset(Seq.empty[(String,String,String)]) 27 | spark.createDataset(spark.sparkContext.emptyRDD[Name]) 28 | Seq.empty[(String,String,String)].toDS() 29 | Seq.empty[Name].toDS() 30 | spark.emptyDataset[Name] 31 | } -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/DataFrameWithComplexDSL.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.functions._ 5 | case class Employee(firstName:String,lastName:String, email:String,salary:Int) 6 | case class Department(id:Int,name:String) 7 | case class DepartmentWithEmployees(department: Department, employees: Seq[Employee]) 8 | object DataFrameWithDSL2 { 9 | 10 | def main(args: Array[String]): Unit = { 11 | 12 | val department1 = Department(123456, "Computer Science") 13 | val department2 = Department(789012, "Mechanical Engineering") 14 | val department3 = Department(345678, "Theater and Drama") 15 | val department4 = Department(901234, "Indoor Recreation") 16 | 17 | //Create the Employees 18 | 19 | val employee1 = Employee("michael", "armbrust", "no-reply@berkeley.edu", 100000) 20 | val employee2 = Employee("xiangrui", "meng", 
"no-reply@stanford.edu", 120000) 21 | val employee3 = Employee("matei", "", "no-reply@waterloo.edu", 140000) 22 | val employee4 = Employee("", "wendell", "no-reply@berkeley.edu", 160000) 23 | 24 | //Create the DepartmentWithEmployees instances from Departments and Employees 25 | val departmentWithEmployees1 = DepartmentWithEmployees(department1, List(employee1, employee2)) 26 | val departmentWithEmployees2 = DepartmentWithEmployees(department2, List(employee3, employee4)) 27 | val departmentWithEmployees3 = DepartmentWithEmployees(department3, List(employee1, employee4)) 28 | val departmentWithEmployees4 = DepartmentWithEmployees(department4, List(employee2, employee3)) 29 | 30 | val data1 = Seq(departmentWithEmployees1,departmentWithEmployees2) 31 | 32 | val data2 = Seq(departmentWithEmployees3,departmentWithEmployees4) 33 | 34 | val spark: SparkSession = SparkSession.builder() 35 | .master("local[1]") 36 | .appName("SparkByExample") 37 | .getOrCreate() 38 | 39 | import spark.implicits._ 40 | 41 | val df = spark.createDataFrame(data1) 42 | val df2 = spark.createDataFrame(data2) 43 | 44 | //union 45 | val finalDF = df.union(df2) 46 | finalDF.printSchema() 47 | finalDF.show(false) 48 | 49 | finalDF.select("department.*").printSchema() 50 | finalDF.select(explode(col("employees"))).select("col.*").show(false) 51 | 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/DataFrameWithSimpleDSL.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | 5 | object DataFrameWithSimpleDSL { 6 | 7 | def main(args:Array[String]):Unit= { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | val filePath = "C://000_Projects/opt/BigData/zipcodes.csv" 15 | 16 | var df:DataFrame = spark.read.option("header","true").csv(filePath) 17 | df.printSchema() 18 | 19 | // Where 20 | df.select("*").where(df("RecordNumber") < 10).show() 21 | //Filter 22 | df.filter(df("State")==="PR").select("State").show() 23 | //Distinct 24 | df.select(df("State")).distinct().show() 25 | //Count 26 | println("Number of records"+df.count()) 27 | 28 | //When Otherwise 29 | //df.select(df("State"), case df("State") when "PR" then "PR123" 30 | 31 | // where with and and or conditions 32 | df.where(df("State") === "PR" && df("City").contains("DEL")).show() 33 | 34 | //Order or Sort by 35 | df.orderBy(df("RecordNumber").desc, df("State").asc).show() 36 | 37 | 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/FromCSVFile.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object FromCSVFile { 6 | 7 | def main(args:Array[String]):Unit= { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | spark.sparkContext 15 | 16 | val filePath="src/main/resources/zipcodes.csv" 17 | 18 | //Chaining multiple options 19 | val df2 = spark.read.options(Map("inferSchema"->"true","sep"->",","header"->"true")).csv(filePath) 20 | df2.show(false) 21 | df2.printSchema() 22 
| 23 | df2.write.json("c:/tmp/spark_output/zipcodes") 24 | 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/FromCSVFile2.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object FromCSVFile2 { 6 | 7 | def main(args:Array[String]):Unit= { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | val filePath="src/main/resources/stream.csv" 15 | 16 | val df = spark.read.options(Map("inferSchema"->"true","delimiter"->"|","header"->"true")).csv(filePath) 17 | 18 | val df2 = df.select("Gender", "BirthDate", "TotalCost", "TotalChildren", "ProductCategoryName") 19 | .filter("Gender is not null") 20 | .filter("BirthDate is not null") 21 | .filter("TotalChildren is not null") 22 | .filter("ProductCategoryName is not null") 23 | df2.show() 24 | 25 | df.select("Gender", "BirthDate", "TotalCost", "TotalChildren", "ProductCategoryName") 26 | .where(df("Gender").isNotNull && df("BirthDate").isNotNull && df("TotalChildren").isNotNull && df("ProductCategoryName").isNotNull ).show() 27 | 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/FromJsonFile.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object FromJsonFile { 6 | 7 | def main(args:Array[String]): Unit = { 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local[3]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | val sc = spark.sparkContext 14 | 15 | val rdd = sc.textFile("src/main/resources/zipcodes.json") 16 | //Todo : convert RDD to DataFrame 17 | rdd.collect().foreach(println) 18 | 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/ParquetExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object ParquetExample { 6 | 7 | def main(args:Array[String]):Unit= { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | val data = Seq(("James ","","Smith","36636","M",3000), 15 | ("Michael ","Rose","","40288","M",4000), 16 | ("Robert ","","Williams","42114","M",4000), 17 | ("Maria ","Anne","Jones","39192","F",4000), 18 | ("Jen","Mary","Brown","","F",-1) 19 | ) 20 | 21 | val columns = Seq("firstname","middlename","lastname","dob","gender","salary") 22 | import spark.sqlContext.implicits._ 23 | val df = data.toDF(columns:_*) 24 | 25 | df.show() 26 | df.printSchema() 27 | 28 | df.write 29 | .parquet("C:\\tmp\\output\\people.parquet") 30 | 31 | val parqDF = spark.read.parquet("C:\\tmp\\output\\people.parquet") 32 | parqDF.createOrReplaceTempView("ParquetTable") 33 | 34 | spark.sql("select * from ParquetTable where salary >= 4000").explain() 35 | val parkSQL = spark.sql("select * from ParquetTable where salary >= 4000 ") 36 | 37 | parkSQL.show() 38 | 
parkSQL.printSchema() 39 | 40 | df.write 41 | .partitionBy("gender","salary") 42 | .parquet("C:\\tmp\\output\\people2.parquet") 43 | 44 | val parqDF2 = spark.read.parquet("C:\\tmp\\output\\people2.parquet") 45 | parqDF2.createOrReplaceTempView("ParquetTable2") 46 | 47 | val df3 = spark.sql("select * from ParquetTable2 where gender='M' and salary >= 4000") 48 | df3.explain() 49 | df3.printSchema() 50 | df3.show() 51 | 52 | val parqDF3 = spark.read 53 | .parquet("C:\\tmp\\output\\people2.parquet\\gender=M") 54 | parqDF3.show() 55 | 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/RenameColDataFrame.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType} 5 | import org.apache.spark.sql.functions.{col, _} 6 | 7 | object RenameColDataFrame { 8 | 9 | def main(args:Array[String]):Unit= { 10 | 11 | val spark: SparkSession = SparkSession.builder() 12 | .master("local[1]") 13 | .appName("SparkByExamples.com") 14 | .getOrCreate() 15 | 16 | val data = Seq(Row(Row("James ","","Smith"),"36636","M",3000), 17 | Row(Row("Michael ","Rose",""),"40288","M",4000), 18 | Row(Row("Robert ","","Williams"),"42114","M",4000), 19 | Row(Row("Maria ","Anne","Jones"),"39192","F",4000), 20 | Row(Row("Jen","Mary","Brown"),"","F",-1) 21 | ) 22 | 23 | val schema = new StructType() 24 | .add("name",new StructType() 25 | .add("firstname",StringType) 26 | .add("middlename",StringType) 27 | .add("lastname",StringType)) 28 | .add("dob",StringType) 29 | .add("gender",StringType) 30 | .add("salary",IntegerType) 31 | 32 | val df = spark.createDataFrame(spark.sparkContext.parallelize(data),schema) 33 | 34 | df.printSchema() 35 | 36 | df.withColumnRenamed("dob","DateOfBirth") 37 | .printSchema() 38 | 39 | val schema2 = new StructType() 40 | .add("fname",StringType) 41 | .add("middlename",StringType) 42 | .add("lname",StringType) 43 | 44 | df.select(col("name").cast(schema2), 45 | col("dob"), 46 | col("gender"), 47 | col("salary")) 48 | .printSchema() 49 | 50 | df.select(col("name.firstname").as("fname"), 51 | col("name.middlename").as("mname"), 52 | col("name.lastname").as("lname"), 53 | col("dob"),col("gender"),col("salary")) 54 | .printSchema() 55 | 56 | df.withColumnRenamed("name.firstname","fname") 57 | .withColumnRenamed("name.middlename","mname") 58 | .withColumnRenamed("name.lastname","lname") 59 | .drop("name") 60 | .printSchema() 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/SQLExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | object DataFrameWithSQL_ { 6 | 7 | def main(args:Array[String]):Unit= { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | val data = Seq(1,2,3) 15 | 16 | import spark.sqlContext.implicits._ 17 | 18 | val df = data.toDF("field1") 19 | 20 | df.createOrReplaceTempView("table1") 21 | 22 | val df2 = spark.sql("select tb1.field1 as field1,tb2.field1 as field2 from table1 
tb1, table1 tb2 where tb1.field1 <> tb2.field1") 23 | df2.printSchema() 24 | df2.show(false) 25 | 26 | df2.createOrReplaceTempView("table2") 27 | 28 | val df3 = spark.sql("select distinct tb1.field1,tb1.field2 from table2 tb1, table2 tb2 where tb1.field1 == tb2.field2 and tb1.field2 == tb2.field1") 29 | 30 | df3.show(false) 31 | 32 | 33 | 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/SaveDataFrame.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | 5 | object SaveDataFrame { 6 | 7 | def main(args: Array[String]): Unit = { 8 | val spark: SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExample") 11 | .getOrCreate() 12 | 13 | val filePath = "C://000_Projects/opt/BigData/zipcodes.csv" 14 | 15 | var df:DataFrame = spark.read.option("header","true").csv(filePath) 16 | 17 | df.repartition(5).write.option("header","true").csv("c:/tmp/output/df1") 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/UDFDataFrame.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | object UDFDataFrame { 6 | def main(args:Array[String]): Unit = { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local[3]") 10 | .appName("SparkByExample") 11 | .getOrCreate() 12 | 13 | val data = Seq(("2018/01/23",23),("2018/01/24",24),("2018/02/20",25)) 14 | 15 | import spark.sqlContext.implicits._ 16 | val df = data.toDF("date1","day") 17 | 18 | val replace: String => String = _.replace("/","-") 19 | import org.apache.spark.sql.functions.udf 20 | val replaceUDF = udf(replace) 21 | val minDate = df.agg(min($"date1")).collect()(0).get(0) 22 | 23 | val df2 = df.select("*").filter( to_date(replaceUDF($"date1")) > date_add(to_date(replaceUDF(lit(minDate))),7 )) 24 | df2.show() 25 | } 26 | 27 | 28 | } 29 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/WithColumn.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.types.{ArrayType, IntegerType, MapType, StringType, StructType} 5 | import org.apache.spark.sql.functions._ 6 | object WithColumn { 7 | 8 | def main(args:Array[String]):Unit= { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .master("local[1]") 12 | .appName("SparkByExamples.com") 13 | .getOrCreate() 14 | 15 | val arrayStructureData = Seq( 16 | Row(Row("James ","","Smith"),"1","M",3100,List("Cricket","Movies"),Map("hair"->"black","eye"->"brown")), 17 | Row(Row("Michael ","Rose",""),"2","M",3100,List("Tennis"),Map("hair"->"brown","eye"->"black")), 18 | Row(Row("Robert ","","Williams"),"3","M",3100,List("Cooking","Football"),Map("hair"->"red","eye"->"gray")), 19 | Row(Row("Maria ","Anne","Jones"),"4","M",3100,null,Map("hair"->"blond","eye"->"red")), 20 | 
Row(Row("Jen","Mary","Brown"),"5","M",3100,List("Blogging"),Map("white"->"black","eye"->"black")) 21 | ) 22 | 23 | val arrayStructureSchema = new StructType() 24 | .add("name",new StructType() 25 | .add("firstname",StringType) 26 | .add("middlename",StringType) 27 | .add("lastname",StringType)) 28 | .add("id",StringType) 29 | .add("gender",StringType) 30 | .add("salary",IntegerType) 31 | .add("Hobbies", ArrayType(StringType)) 32 | .add("properties", MapType(StringType,StringType)) 33 | 34 | val df2 = spark.createDataFrame( 35 | spark.sparkContext.parallelize(arrayStructureData),arrayStructureSchema) 36 | 37 | //Change the column data type 38 | df2.withColumn("salary",df2("salary").cast("Integer")) 39 | 40 | //Derive a new column from existing 41 | val df4=df2.withColumn("CopiedColumn",df2("salary")* -1) 42 | 43 | //Transforming existing column 44 | val df5 = df2.withColumn("salary",df2("salary")*100) 45 | 46 | //You can also chain withColumn to change multiple columns 47 | 48 | //Renaming a column. 49 | val df3=df2.withColumnRenamed("gender","sex") 50 | df3.printSchema() 51 | 52 | //Droping a column 53 | val df6=df4.drop("CopiedColumn") 54 | println(df6.columns.contains("CopiedColumn")) 55 | 56 | //Adding a literal value 57 | df2.withColumn("Country", lit("USA")).printSchema() 58 | 59 | //Retrieving 60 | df2.show(false) 61 | df2.select("name").show(false) 62 | df2.select("name.firstname").show(false) 63 | df2.select("name.*").show(false) 64 | 65 | 66 | val df8 = df2.select(col("*"),explode(col("hobbies"))) 67 | df8.show(false) 68 | 69 | 70 | //df8.select(from_collection()) 71 | 72 | 73 | 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/AnotherExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions 2 | 3 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType} 4 | import org.apache.spark.sql.{Row, SparkSession} 5 | 6 | class AnotherExample { 7 | 8 | def main(args:Array[String]):Unit= { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .master("local[1]") 12 | .appName("SparkByExamples.com") 13 | .getOrCreate() 14 | 15 | /** 16 | * Simple using columns list 17 | */ 18 | val data = Seq(("James ","","Smith","2018","01","M",3000), 19 | ("Michael ","Rose","","2010","03","M",4000), 20 | ("Robert ","","Williams","2010","03","M",4000), 21 | ("Maria ","Anne","Jones","2005","05","F",4000), 22 | ("Jen","Mary","Brown","2010","07","",-1) 23 | ) 24 | 25 | val columns = Seq("firstname","middlename","lastname","dob_year","dob_month","gender","salary") 26 | import spark.sqlContext.implicits._ 27 | val df = data.toDF(columns:_*) 28 | 29 | /** 30 | * schema using Row data 31 | */ 32 | val data3 = Seq(Row("James ","","Smith","36636","M",3000), 33 | Row("Michael ","Rose","","40288","M",4000), 34 | Row("Robert ","","Williams","42114","M",4000), 35 | Row("Maria ","Anne","Jones","39192","F",4000), 36 | Row("Jen","Mary","Brown","","F",-1) 37 | ) 38 | 39 | val schema3 = new StructType() 40 | .add("firstname",StringType) 41 | .add("middlename",StringType) 42 | .add("lastname",StringType) 43 | .add("dob",StringType) 44 | .add("gender",StringType) 45 | .add("salary",IntegerType) 46 | 47 | val df3 = spark.createDataFrame(spark.sparkContext.parallelize(data3),schema3) 48 | 49 | /** 50 | * nested structure schema 51 | */ 52 | val data4 = Seq(Row(Row("James 
","","Smith"),"36636","M",3000), 53 | Row(Row("Michael ","Rose",""),"40288","M",4000), 54 | Row(Row("Robert ","","Williams"),"42114","M",4000), 55 | Row(Row("Maria ","Anne","Jones"),"39192","F",4000), 56 | Row(Row("Jen","Mary","Brown"),"","F",-1) 57 | ) 58 | 59 | val schema4 = new StructType() 60 | .add("name",new StructType() 61 | .add("firstname",StringType) 62 | .add("middlename",StringType) 63 | .add("lastname",StringType)) 64 | .add("dob",StringType) 65 | .add("gender",StringType) 66 | .add("salary",IntegerType) 67 | 68 | val df4 = spark.createDataFrame(spark.sparkContext.parallelize(data4),schema4) 69 | 70 | 71 | 72 | 73 | 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/MathFunctions.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | 6 | object MathFunctions { 7 | 8 | def main(args: Array[String]): Unit = { 9 | val spark = SparkSession.builder() 10 | .appName("sparkbyexamples.com") 11 | .master("local") 12 | .getOrCreate() 13 | spark.sparkContext.setLogLevel("ERROR") 14 | import spark.implicits._ 15 | val data = Seq((2,2.67),(3,3.12),(4,4.34),(5,1.10)) 16 | // data.sc 17 | // data.printSchema() 18 | // data.withColumn("factorial",factorial(col("number"))) 19 | // // .withColumn("ceil") 20 | // .show() 21 | 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/PivotExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | object PivotExample { 6 | def main(args:Array[String]):Unit= { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | 13 | val data = Seq(("Banana",1000,"USA"), ("Carrots",1500,"USA"), ("Beans",1600,"USA"), 14 | ("Orange",2000,"USA"),("Orange",2000,"USA"),("Banana",400,"China"), 15 | ("Carrots",1200,"China"),("Beans",1500,"China"),("Orange",4000,"China"), 16 | ("Banana",2000,"Canada"),("Carrots",2000,"Canada"),("Beans",2000,"Mexico")) 17 | 18 | 19 | 20 | import spark.sqlContext.implicits._ 21 | val df = data.toDF("Product","Amount","Country") 22 | df.show() 23 | 24 | //pivot 25 | val pivotDF = df.groupBy("Product","Country") 26 | .sum("Amount") 27 | .groupBy("Product") 28 | .pivot("Country") 29 | .sum("sum(Amount)") 30 | pivotDF.show() 31 | 32 | // val countries = Seq("USA","China","Canada","Mexico") 33 | // val pivotDF2 = df.groupBy("Product").pivot("Country", countries).sum("Amount") 34 | // pivotDF2.show() 35 | 36 | //unpivot 37 | //val unPivotDF = pivotDF.select($"Product",expr("stack(3, 'Canada', Canada, 'China', China, 'Mexico', Mexico) " + 38 | //"as (Country,Total)")).where("Total is not null") 39 | //unPivotDF.show() 40 | 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/StringFunctions.scala: -------------------------------------------------------------------------------- 1 | package 
com.sparkbyexamples.spark.dataframe.functions 2 | 3 | class StringFunctions { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/WhenOtherwise.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{when, _} 5 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} 6 | 7 | object WhenOtherwise { 8 | 9 | def main(args:Array[String]):Unit= { 10 | 11 | val spark: SparkSession = SparkSession.builder() 12 | .master("local[1]") 13 | .appName("SparkByExamples.com") 14 | .getOrCreate() 15 | 16 | import spark.sqlContext.implicits._ 17 | 18 | val data = List(("James ","","Smith","36636","M",60000), 19 | ("Michael ","Rose","","40288","M",70000), 20 | ("Robert ","","Williams","42114","",400000), 21 | ("Maria ","Anne","Jones","39192","F",500000), 22 | ("Jen","Mary","Brown","","F",0)) 23 | 24 | val cols = Seq("first_name","middle_name","last_name","dob","gender","salary") 25 | 26 | val df = spark.createDataFrame(data).toDF(cols:_*) 27 | 28 | val df2 = df.withColumn("gender", when(col("gender") === "M","Male") 29 | .when(col("gender") === "F","Female") 30 | .otherwise("Unknown")) 31 | 32 | 33 | val df3 = df.withColumn("gender", 34 | expr("case when gender = 'M' then 'Male' " + 35 | "when gender = 'F' then 'Female' " + 36 | "else 'Unknown' end")) 37 | 38 | val df4 = df.select(col("*"), when(col("gender") === "M","Male") 39 | .when(col("gender") === "F","Female") 40 | .otherwise("Unknown").alias("new_gender")) 41 | 42 | val df5 = df.select(col("*"), 43 | expr("case when gender = 'M' then 'Male' " + 44 | "when gender = 'F' then 'Female' " + 45 | "else 'Unknown' end").alias("new_gender")) 46 | 47 | val dataDF = Seq( 48 | (66, "a", "4"), (67, "a", "0"), (70, "b", "4"), (71, "d", "4" 49 | )).toDF("id", "code", "amt") 50 | 51 | df2.show() 52 | df3.show() 53 | df4.show() 54 | df5.show() 55 | dataDF.show() 56 | 57 | dataDF.withColumn("new_column", 58 | when(col("code") === "a" || col("code") === "d", "A") 59 | .when(col("code") === "b" and col("amt") === "4", "B") 60 | .otherwise("A1")) 61 | .show() 62 | 63 | //alternatively, we can also use "and" "or" operators 64 | dataDF.withColumn("new_column", 65 | when(col("code") === "a" or col("code") === "d", "A") 66 | .when(col("code") === "b" and col("amt") === "4", "B") 67 | .otherwise("A1")) 68 | .show() 69 | 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/ArrayOfArrayType.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.collection 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.functions.{explode, flatten} 5 | import org.apache.spark.sql.types.{ArrayType, StringType, StructType} 6 | 7 | object ArrayOfArrayType extends App { 8 | 9 | val spark = SparkSession.builder().appName("SparkByExamples.com") 10 | .master("local[1]") 11 | .getOrCreate() 12 | 13 | val arrayArrayData = Seq( 14 | Row("James",List(List("Java","Scala","C++"),List("Spark","Java"))), 15 | Row("Michael",List(List("Spark","Java","C++"),List("Spark","Java"))), 16 | 
Row("Robert",List(List("CSharp","VB"),List("Spark","Python"))) 17 | ) 18 | 19 | val arrayArraySchema = new StructType().add("name",StringType) 20 | .add("subjects",ArrayType(ArrayType(StringType))) 21 | 22 | val df = spark.createDataFrame( 23 | spark.sparkContext.parallelize(arrayArrayData),arrayArraySchema) 24 | df.printSchema() 25 | df.show(false) 26 | 27 | import spark.implicits._ 28 | val df2 = df.select($"name",explode($"subjects")) 29 | 30 | 31 | df2.printSchema() 32 | df2.show(false) 33 | 34 | //Convert Array of Array into Single array 35 | df.select($"name",flatten($"subjects")).show(false) 36 | 37 | } 38 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/ArrayOfMapType.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.collection 2 | 3 | 4 | import org.apache.spark.sql.{Row, SparkSession} 5 | import org.apache.spark.sql.functions.{explode} 6 | import org.apache.spark.sql.types._ 7 | 8 | object ArrayOfMapType extends App { 9 | val spark = SparkSession.builder().appName("SparkByExamples.com") 10 | .master("local[1]") 11 | .getOrCreate() 12 | 13 | val arrayMapSchema = new StructType().add("name",StringType) 14 | .add("properties", 15 | ArrayType(new MapType(StringType,StringType,true))) 16 | 17 | val arrayMapData = Seq( 18 | Row("James",List(Map("hair"->"black","eye"->"brown"), Map("height"->"5.9"))), 19 | Row("Michael",List(Map("hair"->"brown","eye"->"black"),Map("height"->"6"))), 20 | Row("Robert",List(Map("hair"->"red","eye"->"gray"),Map("height"->"6.3"))) 21 | ) 22 | 23 | val df = spark.createDataFrame( 24 | spark.sparkContext.parallelize(arrayMapData),arrayMapSchema) 25 | df.printSchema() 26 | df.show(false) 27 | 28 | import spark.implicits._ 29 | 30 | val df2 = df.select($"name",explode($"properties")) 31 | df2.printSchema() 32 | df2.show(false) 33 | } 34 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/ArrayOfStructType.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.collection 2 | 3 | import org.apache.spark.sql.functions._ 4 | import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType} 5 | import org.apache.spark.sql.{Row, SparkSession} 6 | 7 | object ArrayOfStructType extends App{ 8 | 9 | val spark = SparkSession.builder().appName("SparkByExamples.com") 10 | .master("local[1]") 11 | .getOrCreate() 12 | 13 | val arrayStructData = Seq( 14 | Row("James",List(Row("Java","XX",120),Row("Scala","XA",300))), 15 | Row("Michael",List(Row("Java","XY",200),Row("Scala","XB",500))), 16 | Row("Robert",List(Row("Java","XZ",400),Row("Scala","XC",250))), 17 | Row("Washington",null) 18 | ) 19 | 20 | val arrayStructSchema = new StructType().add("name",StringType) 21 | .add("booksIntersted",ArrayType(new StructType() 22 | .add("name",StringType) 23 | .add("author",StringType) 24 | .add("pages",IntegerType))) 25 | 26 | val df = spark.createDataFrame( 27 | spark.sparkContext.parallelize(arrayStructData),arrayStructSchema) 28 | df.printSchema() 29 | df.show(false) 30 | 31 | import spark.implicits._ 32 | val df2 = df.select($"name",explode($"booksIntersted")) 33 | df2.printSchema() 34 | df2.show(false) 35 | 36 | 
df2.groupBy($"name").agg(collect_list($"col").as("booksIntersted")) 37 | .show(false) 38 | 39 | } 40 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/MapTypeExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.collection 2 | import org.apache.spark.sql.functions.{col, explode, lit, map, map_concat, map_from_entries, map_keys, map_values} 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.types._ 5 | 6 | object MapTypeExample extends App { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | import spark.implicits._ 13 | 14 | //Creating DF with MapType 15 | val arrayStructureData = Seq( 16 | Row("James",List(Row("Newark","NY"),Row("Brooklyn","NY")), 17 | Map("hair"->"black","eye"->"brown"), Map("height"->"5.9")), 18 | Row("Michael",List(Row("SanJose","CA"),Row("Sandiago","CA")), 19 | Map("hair"->"brown","eye"->"black"),Map("height"->"6")), 20 | Row("Robert",List(Row("LasVegas","NV")), 21 | Map("hair"->"red","eye"->"gray"),Map("height"->"6.3")), 22 | Row("Maria",null,Map("hair"->"blond","eye"->"red"), 23 | Map("height"->"5.6")), 24 | Row("Jen",List(Row("LAX","CA"),Row("Orange","CA")), 25 | Map("white"->"black","eye"->"black"),Map("height"->"5.2")) 26 | ) 27 | 28 | 29 | val mapType = DataTypes.createMapType(StringType,StringType) 30 | 31 | val arrayStructureSchema = new StructType() 32 | .add("name",StringType) 33 | .add("addresses", ArrayType(new StructType() 34 | .add("city",StringType) 35 | .add("state",StringType))) 36 | .add("properties", mapType) 37 | .add("secondProp", MapType(StringType,StringType)) 38 | 39 | val mapTypeDF = spark.createDataFrame( 40 | spark.sparkContext.parallelize(arrayStructureData),arrayStructureSchema) 41 | mapTypeDF.printSchema() 42 | mapTypeDF.show() 43 | 44 | mapTypeDF.select(col("name"),map_keys(col("properties"))).show(false) 45 | mapTypeDF.select(col("name"),map_values(col("properties"))).show(false) 46 | mapTypeDF.select(col("name"),map_concat(col("properties"),col("secondProp"))).show(false) 47 | 48 | 49 | 50 | } 51 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/AddTime.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{expr,col} 5 | object AddTime extends App { 6 | 7 | val spark:SparkSession = SparkSession.builder() 8 | .master("local") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | spark.sparkContext.setLogLevel("ERROR") 12 | 13 | import spark.sqlContext.implicits._ 14 | 15 | spark.sql( "select current_timestamp," + 16 | "cast(current_timestamp as TIMESTAMP) + INTERVAL 2 hours as added_hours," + 17 | "cast(current_timestamp as TIMESTAMP) + INTERVAL 5 minutes as added_minutes," + 18 | "cast(current_timestamp as TIMESTAMP) + INTERVAL 55 seconds as added_seconds" 19 | ).show(false) 20 | 21 | 22 | val df = Seq(("2019-07-01 12:01:19.101"), 23 | ("2019-06-24 12:01:19.222"), 24 | ("2019-11-16 16:44:55.406"), 25 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp") 26 | 27 | 28 | 
df.createOrReplaceTempView("AddTimeExample") 29 | 30 | val df2 = spark.sql("select input_timestamp, " + 31 | "cast(input_timestamp as TIMESTAMP) + INTERVAL 2 hours as added_hours," + 32 | "cast(input_timestamp as TIMESTAMP) + INTERVAL 5 minutes as added_minutes," + 33 | "cast(input_timestamp as TIMESTAMP) + INTERVAL 55 seconds as added_seconds from AddTimeExample" 34 | ) 35 | df2.show(false) 36 | 37 | df.withColumn("added_hours",col("input_timestamp") + expr("INTERVAL 2 HOURS")) 38 | .withColumn("added_minutes",col("input_timestamp") + expr("INTERVAL 2 minutes")) 39 | .withColumn("added_seconds",col("input_timestamp") + expr("INTERVAL 2 seconds")) 40 | .show(false) 41 | } 42 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/CurrentDateAndTime.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | 6 | object CurrentDateAndTime extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.sqlContext.implicits._ 15 | 16 | //Get current Date & Time 17 | val df = Seq((1)).toDF("seq") 18 | 19 | val curDate = df.withColumn("current_date",current_date().as("current_date")) 20 | .withColumn("current_timestamp",current_timestamp().as("current_timestamp")) 21 | curDate.show(false) 22 | 23 | 24 | curDate.select(date_format(col("current_timestamp"),"MM-dd-yyyy").as("date"), 25 | date_format(col("current_timestamp"),"HH:mm:ss.SSS").as("time"), 26 | date_format(col("current_date"), "MM-dd-yyyy").as("current_date_formateed")) 27 | .show(false) 28 | 29 | 30 | } 31 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DateAddMonths.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | 6 | object DateAddMonths extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.sqlContext.implicits._ 15 | 16 | Seq(("2019-01-23"),("2019-06-24"),("2019-09-20")).toDF("date").select( 17 | col("date"), 18 | add_months(col("date"),3).as("add_months"), 19 | add_months(col("date"),-3).as("sub_months"), 20 | date_add(col("date"),4).as("date_add"), 21 | date_sub(col("date"),4).as("date_sub") 22 | ).show() 23 | 24 | Seq(("06-03-2009"),("07-24-2009")).toDF("date").select( 25 | col("Date"), 26 | add_months(to_date(col("Date"),"MM-dd-yyyy"),3).as("add_months"), 27 | add_months(to_date(col("Date"),"MM-dd-yyyy"),-3).as("add_months2"), 28 | date_add(to_date(col("Date"),"MM-dd-yyyy"),3).as("date_add"), 29 | date_add(to_date(col("Date"),"MM-dd-yyyy"),-3).as("date_add2"), 30 | date_sub(to_date(col("Date"),"MM-dd-yyyy"),3).as("date_sub") 31 | ).show() 32 | 33 | } 34 | -------------------------------------------------------------------------------- 
/spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DateDiff.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.functions._ 4 | import org.apache.spark.sql.{DataFrame, SparkSession} 5 | 6 | object DateDiff extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.sqlContext.implicits._ 15 | 16 | //Difference between two dates in days 17 | Seq(("2019-07-01"),("2019-06-24"),("2019-08-24"),("2018-07-23")).toDF("date") 18 | .select( 19 | col("date"), 20 | current_date().as("current_date"), 21 | datediff(current_date(),col("date")).as("datediff") 22 | ).show() 23 | 24 | // Difference between two dates in Months and Years 25 | val df = Seq(("2019-07-01"),("2019-06-24"),("2019-08-24"),("2018-12-23"),("2018-07-20")) 26 | .toDF("startDate").select( 27 | col("startDate"),current_date().as("endDate") 28 | ) 29 | 30 | calculateDiff(df) 31 | 32 | //Difference between two dates when the dates are not in the Spark DateType format 'yyyy-MM-dd'. 33 | //Note that when dates are not in the Spark DateType format, Spark's date functions return null. 34 | //Hence, first convert the input dates to Spark DateType using the to_date function 35 | val dfDate = Seq(("07-01-2019"),("06-24-2019"),("08-24-2019"),("12-23-2018"),("07-20-2018")) 36 | .toDF("startDate").select( 37 | to_date(col("startDate"),"MM-dd-yyyy").as("startDate"), 38 | current_date().as("endDate") 39 | ) 40 | 41 | calculateDiff(dfDate) 42 | 43 | def calculateDiff(df:DataFrame): Unit ={ 44 | df.withColumn("datesDiff", datediff(col("endDate"),col("startDate"))) 45 | .withColumn("monthsDiff", months_between( 46 | col("endDate"),col("startDate"))) 47 | .withColumn("monthsDiff_round",round(months_between( 48 | col("endDate"),col("startDate")),2)) 49 | .withColumn("yearsDiff",months_between( 50 | col("endDate"),col("startDate"),true).divide(12)) 51 | .withColumn("yearsDiff_round",round(months_between( 52 | col("endDate"),col("startDate"),true).divide(12),2)) 53 | .show() 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DateExamples.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | 6 | object DateExamples { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val spark:SparkSession = SparkSession.builder() 11 | .master("local[3]") 12 | .appName("SparkByExample") 13 | .getOrCreate() 14 | spark.sparkContext.setLogLevel("ERROR") 15 | val data = Seq(("2019-01-23"),("2019-06-24"),("2019-09-20")) 16 | 17 | import spark.sqlContext.implicits._ 18 | val df = data.toDF("date") 19 | 20 | //date_format 21 | Seq(("2019-01-23")).toDF("InputDate").select( 22 | current_date().as("current_date"), 23 | col("InputDate"), 24 | date_format(col("InputDate"), "MM-dd-yyyy").as("date_format") 25 | ).show() 26 | 27 | //to_date 28 | Seq(("04/13/2019")).toDF("InputDate").select( 29 | col("InputDate"), 30 | to_date(col("InputDate"), "MM/dd/yyyy").as("to_date") 31 | ).show() 32 | 33 | 34 | //datediff 35 |
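// datediff(end, start) returns the number of days between the two dates; here each input date is compared against current_date().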
Seq(("2019-01-23"),("2019-06-24"),("2019-09-20")).toDF("date").select( 36 | col("date"), 37 | current_date(), 38 | datediff(current_date(),col("date")).as("datediff") 39 | ).show() 40 | 41 | //months_between 42 | Seq(("2019-01-23"),("2019-06-24"),("2019-09-20")).toDF("date").select( 43 | col("date"), 44 | current_date(), 45 | datediff(current_date(),col("date")).as("datediff"), 46 | months_between(current_date(),col("date")).as("months_between") 47 | ).show() 48 | 49 | //Trunc 50 | Seq(("2019-01-23"),("2019-06-24"),("2019-09-20")).toDF("date").select( 51 | col("date"), 52 | trunc(col("date"),"Month").as("Month_Trunc"), 53 | trunc(col("date"),"Year").as("Month_Year"), 54 | trunc(col("date"),"Month").as("Month_Trunc") 55 | ).show() 56 | 57 | Seq(("2019-01-23"),("2019-06-24"),("2019-09-20")).toDF("date").select( 58 | col("date"), 59 | add_months(col("date"),3).as("add_months"), 60 | add_months(col("date"),-3).as("sub_months"), 61 | date_add(col("date"),4).as("date_add"), 62 | date_sub(col("date"),4).as("date_sub") 63 | ).show() 64 | 65 | Seq(("2019-01-23"),("2019-06-24"),("2019-09-20")).toDF("date").select( 66 | col("date"), 67 | year(col("date")).as("year"), 68 | month(col("date")).as("month"), 69 | dayofweek(col("date")).as("dayofweek"), 70 | dayofmonth(col("date")).as("dayofmonth"), 71 | dayofyear(col("date")).as("dayofyear"), 72 | next_day(col("date"),"Sunday").as("next_day"), 73 | weekofyear(col("date")).as("weekofyear") 74 | ).show() 75 | 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DateLastDay.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{col, last_day, to_date} 5 | 6 | object DateLastDay extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.sqlContext.implicits._ 15 | 16 | Seq(("2019-01-01"),("2020-02-24"),("2019-02-24"), 17 | ("2019-05-01"),("2018-03-24"),("2007-12-19")) 18 | .toDF("Date").select( 19 | col("Date"), 20 | last_day(col("Date")).as("last_day") 21 | ).show() 22 | 23 | 24 | Seq(("06-03-2009"),("07-24-2009")).toDF("Date").select( 25 | col("Date"), 26 | last_day(to_date(col("Date"),"MM-dd-yyyy")).as("last_day") 27 | ).show() 28 | 29 | } 30 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DateToString.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import com.sparkbyexamples.spark.dataframe.functions.datetime.DateFormat.spark 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions.{current_date, current_timestamp, date_format} 6 | 7 | object DateToString extends App { 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | import spark.sqlContext.implicits._ 16 | 17 | Seq(1).toDF("seq").select( 18 | current_date().as("current_date"), 19 | date_format(current_timestamp(),"yyyy MM 
dd").as("yyyy MM dd"), 20 | date_format(current_timestamp(),"MM/dd/yyyy hh:mm").as("MM/dd/yyyy"), 21 | date_format(current_timestamp(),"yyyy MMM dd").as("yyyy MMMM dd"), 22 | date_format(current_timestamp(),"yyyy MMMM dd E").as("yyyy MMMM dd E") 23 | ).show(false) 24 | 25 | } 26 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DayAndWeekOfYear.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{col, date_format, to_timestamp} 5 | 6 | 7 | object DayAndWeekOfYear extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | import spark.sqlContext.implicits._ 16 | 17 | val df = Seq(("2019-01-03 12:01:19.000"), 18 | ("2019-02-01 12:01:19.000"), 19 | ("2019-7-16 16:44:55.406"), 20 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp") 21 | 22 | df.withColumn("input_timestamp", 23 | to_timestamp(col("input_timestamp"))) 24 | .withColumn("day_of_year", date_format(col("input_timestamp"), "D")) 25 | .withColumn("week_of_year", date_format(col("input_timestamp"), "w")) 26 | 27 | .show(false) 28 | } -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DayWeekAndWeekMonth.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{col, to_timestamp,date_format} 5 | 6 | 7 | object DayWeekAndWeekMonth extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | import spark.sqlContext.implicits._ 16 | 17 | val df = Seq(("2019-07-01 12:01:19.000"), 18 | ("2019-06-24 12:01:19.000"), 19 | ("2019-11-16 16:44:55.406"), 20 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp") 21 | 22 | df.withColumn("input_timestamp", 23 | to_timestamp(col("input_timestamp"))) 24 | .withColumn("week_day_number", date_format(col("input_timestamp"), "u")) 25 | .withColumn("week_day_abb", date_format(col("input_timestamp"), "E")) 26 | .withColumn("week_day_full", date_format(col("input_timestamp"), "EEEE")) 27 | .withColumn("week_of_month", date_format(col("input_timestamp"), "W")) 28 | .show(false) 29 | } -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/GetTimeFromTimestamp.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{col,hour,minute,second} 5 | 6 | object GetTimeFromTimestamp extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.sqlContext.implicits._ 15 
| 16 | val df = Seq(("2019-07-01 12:01:19.000"), 17 | ("2019-06-24 12:01:19.000"), 18 | ("2019-11-16 16:44:55.406"), 19 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp") 20 | 21 | 22 | df.withColumn("hour", hour(col("input_timestamp"))) 23 | .withColumn("minute", minute(col("input_timestamp"))) 24 | .withColumn("second", second(col("input_timestamp"))) 25 | .show(false) 26 | 27 | } 28 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/StringToDate.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{col, to_date} 5 | 6 | object StringToDate extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.sqlContext.implicits._ 15 | 16 | Seq(("06-03-2009"),("07-24-2009")).toDF("Date").select( 17 | col("Date"), 18 | to_date(col("Date"),"MM-dd-yyyy").as("to_date") 19 | ).show() 20 | } 21 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/StringToTimestamp.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | import org.apache.spark.sql.types.LongType 6 | 7 | object StringToTimestamp extends App { 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | import spark.sqlContext.implicits._ 16 | 17 | //String to timestamps 18 | val df = Seq(("2019-07-01 12:01:19.000"), 19 | ("2019-06-24 12:01:19.000"), 20 | ("2019-11-16 16:44:55.406"), 21 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp") 22 | 23 | df.withColumn("datetype_timestamp", 24 | to_timestamp(col("input_timestamp"))) 25 | .printSchema() 26 | 27 | 28 | //Convert string to timestamp when input string has just time 29 | val df1 = Seq(("12:01:19.345"), 30 | ("12:01:20.567"), 31 | ("16:02:44.406"), 32 | ("16:50:59.406")) 33 | .toDF("input_timestamp") 34 | 35 | df1.withColumn("datetype_timestamp", 36 | to_timestamp(col("input_timestamp"),"HH:mm:ss.SSS")) 37 | .show(false) 38 | 39 | //when dates are not in Spark DateType format 'yyyy-MM-dd HH:mm:ss.SSS'. 
40 | //Note that when timestamps are not in this format, Spark's timestamp functions return null. 41 | //Hence, first convert the input strings to TimestampType using the to_timestamp function 42 | val dfDate = Seq(("07-01-2019 12 01 19 406"), 43 | ("06-24-2019 12 01 19 406"), 44 | ("11-16-2019 16 44 55 406"), 45 | ("11-16-2019 16 50 59 406")).toDF("input_timestamp") 46 | 47 | dfDate.withColumn("datetype_timestamp", 48 | to_timestamp(col("input_timestamp"),"MM-dd-yyyy HH mm ss SSS")) 49 | .show(false) 50 | 51 | 52 | } 53 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/TimestampToDate.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{col, to_date, to_timestamp} 5 | import org.apache.spark.sql.types.DateType 6 | 7 | object TimestampToDate extends App { 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | import spark.sqlContext.implicits._ 16 | 17 | val df = Seq(("2019-07-01 12:01:19.000"), 18 | ("2019-06-24 12:01:19.000"), 19 | ("2019-11-16 16:44:55.406"), 20 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp") 21 | 22 | //Timestamp String to DateType 23 | df.withColumn("datetype", 24 | to_date(col("input_timestamp"),"yyyy-MM-dd")) 25 | .show(false) 26 | 27 | //Timestamp type to DateType 28 | df.withColumn("ts",to_timestamp(col("input_timestamp"))) 29 | .withColumn("datetype",to_date(col("ts"))) 30 | .show(false) 31 | 32 | //Using Cast 33 | df.withColumn("ts",to_timestamp(col("input_timestamp"))) 34 | .withColumn("datetype",col("ts").cast(DateType)) 35 | .show(false) 36 | } 37 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/TimestampToString.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{current_date, current_timestamp, date_format} 5 | 6 | object TimestampToString extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.sqlContext.implicits._ 15 | 16 | 17 | Seq(1).toDF("seq").select( 18 | current_timestamp().as("current_timestamp"), 19 | date_format(current_timestamp(),"yyyy MM dd").as("yyyy MM dd"), 20 | date_format(current_timestamp(),"MM/dd/yyyy hh:mm").as("MM/dd/yyyy"), 21 | date_format(current_timestamp(),"yyyy MMM dd").as("yyyy MMMM dd"), 22 | date_format(current_timestamp(),"yyyy MMMM dd E").as("yyyy MMMM dd E") 23 | ).show(false) 24 | 25 | } 26 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/unixtimeExample.scala: -------------------------------------------------------------------------------- 1 | // 2 | // 3 | //package com.sparkbyexamples.spark.dataframe.functions.datetime 4 | // 5 | //import org.apache.spark.sql.SparkSession 6 | //import org.apache.spark.sql.functions.{col, from_unixtime, unix_timestamp} 7 | // 8 | // 9 | //object unixtimeExample extends App { 10 | // 11 | // val spark:SparkSession = SparkSession.builder() 12 | // .master("local") 13 | // .appName("SparkByExamples.com") 14 | // .getOrCreate() 15 | // spark.sparkContext.setLogLevel("ERROR") 16 | // 17 | // import spark.sqlContext.implicits._ 18 | // 19 | // val df = Seq(("2019-07-01 12:01:19"), 20 | // ("2019-06-24 12:01:19"), 21 | // ("2019-11-16 16:44:55"), 22 | // ("2019-11-16 16:50:59")).toDF("input_timestamp") 23 | // 24 | // 25 | // val df2 = df.withColumn("unix_timestamp", unix_timestamp(col("input_timestamp"))) 26 | // .withColumn("current_unix_timestamp", unix_timestamp()) 27 | // df2.show(false) 28 | // 29 | // df2.withColumn("from_unixtime",from_unixtime(col("unix_timestamp"))) 30 | // .show(false) 31 | // 32 | //} 33 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/from_json.scala: -------------------------------------------------------------------------------- 1 | //package com.sparkbyexamples.spark.dataframe.functions 2 | // 3 | //import org.apache.spark.sql.SparkSession 4 | //import org.apache.spark.sql.functions.col 5 | //import org.apache.spark.sql.types.{StringType, StructType} 6 | // 7 | //object from_json { 8 | // def main(args:Array[String]):Unit= { 9 | // 10 | // val spark: SparkSession = SparkSession.builder() 11 | // .master("local[1]") 12 | // .appName("SparkByExample") 13 | // .getOrCreate() 14 | // 15 | // 16 | // val data = Seq(("1","{\"name\":\"Anne\",\"Age\":\"12\",\"country\":\"Denmark\"}"), 17 | // ("2","{\"name\":\"Zen\",\"Age\":\"24\"}"), 18 | // ("3","{\"name\":\"Fred\",\"Age\":\"20\",\"country\":\"France\"}"), 19 | // ("4","{\"name\":\"Mona\",\"Age\":\"18\",\"country\":\"Denmark\"}") 20 | // ) 21 | // 22 | // import spark.sqlContext.implicits._ 23 | // val df = data.toDF("ID","details_Json") 24 | // 25 | // val schema = (new StructType()).add("name",StringType,true) 26 | // .add("Age",StringType,true) 27 | // .add("country",StringType,true) 28 | // 29 | // val df2 = df.withColumn("details_Struct", from_json($"details_Json", schema)) 30 | // .withColumn("country",col("details_Struct").getField("country")) 31 | // .filter(col("country").equalTo("Denmark")) 32 | // 33 | // 34 | // df2.printSchema() 35 | // df2.show(false) 36 | // } 37 | //} 38 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/litTypeLit.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.types.IntegerType 5 | 6 | object litTypeLit extends App { 7 | 8 | 9 | 10 | val spark = SparkSession.builder() 11 | .appName("sparkbyexamples.com") 12 | .master("local") 13 | .getOrCreate() 14 | 15 | import spark.sqlContext.implicits._ 16 | import org.apache.spark.sql.functions._ 17 | 18 | val data = Seq(("111",50000),("222",60000),("333",40000)) 19 | val df = data.toDF("EmpId","Salary") 20 | val df2 = df.select(col("EmpId"),col("Salary"),lit("1").as("lit_value1")) 21 | df2.show() 22 | 23 | val df3 = df2.withColumn("lit_value2", 24 | when(col("Salary") >=40000 && col("Salary") <= 50000, lit("100").cast(IntegerType)) 25 |
.otherwise(lit("200").cast(IntegerType)) 26 | ) 27 | 28 | df3.show() 29 | 30 | val df4 = df3.withColumn("typedLit_seq",typedLit(Seq(1, 2, 3))) 31 | .withColumn("typedLit_map",typedLit(Map("a" -> 1, "b" -> 2))) 32 | .withColumn("typedLit_struct",typedLit(("a", 2, 1.0))) 33 | 34 | df4.printSchema() 35 | df4.show() 36 | 37 | } 38 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/xml/PersonsComplexXML.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.xml 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType} 5 | 6 | object PersonsComplexXML { 7 | 8 | def main(args: Array[String]): Unit = { 9 | val spark = SparkSession.builder().master("local[1]") 10 | .appName("SparkByExample") 11 | .getOrCreate() 12 | 13 | /* 14 | Read XML File 15 | */ 16 | val df = spark.read 17 | .format("xml") 18 | .option("rowTag", "person") 19 | .load("src/main/resources/persons_complex.xml") 20 | 21 | df.printSchema() 22 | 23 | df.show() 24 | val schema = new StructType() 25 | .add("_id",StringType) 26 | .add("firstname",StringType) 27 | .add("middlename",StringType) 28 | .add("lastname",StringType) 29 | .add("dob_year",StringType) 30 | .add("dob_month",StringType) 31 | .add("gender",StringType) 32 | .add("salary",StringType) 33 | .add("addresses", new StructType() 34 | .add("address",ArrayType( 35 | new StructType() 36 | .add("_type",StringType) 37 | .add("addressLine",StringType) 38 | .add("city",StringType) 39 | .add("state",StringType) 40 | ) 41 | ) 42 | ) 43 | 44 | val df2 = spark.read 45 | .format("xml") 46 | .option("rowTag", "person") 47 | .schema(schema) 48 | .load("src/main/resources/persons.xml") 49 | 50 | // df.foreach(row=>{ 51 | // println("ID:"+row.getAs("_id") ) 52 | // println("ID:"+row(0)) 53 | // println("ID:"+row.get(0)) 54 | // println(row.getAs("addresses")) 55 | // // println("ID:"+row.getString(0)) 56 | // }) 57 | // 58 | df2.write 59 | .format("com.databricks.spark.xml") 60 | .option("rootTag", "persons") 61 | .option("rowTag", "person") 62 | .save("src/main/resources/persons_new.xml") 63 | 64 | } 65 | } 66 | 67 | 68 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/xml/PersonsXML.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.xml 2 | 3 | import org.apache.spark.sql.{SparkSession, types} 4 | import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType} 5 | 6 | object PersonsXML { 7 | 8 | def main(args: Array[String]): Unit = { 9 | val spark = SparkSession.builder().master("local[1]") 10 | .appName("SparkByExample") 11 | .getOrCreate() 12 | 13 | /* 14 | Read XML File 15 | */ 16 | val df = spark.read 17 | .format("xml") 18 | .option("rowTag", "person") 19 | .load("src/main/resources/persons.xml") 20 | 21 | df.printSchema() 22 | df.show() 23 | 24 | val schema = new StructType() 25 | .add("_id",StringType) 26 | .add("firstname",StringType) 27 | .add("middlename",StringType) 28 | .add("lastname",StringType) 29 | .add("dob_year",StringType) 30 | .add("dob_month",StringType) 31 | .add("gender",StringType) 32 | .add("salary",StringType) 33 | 34 | val df2 = spark.read 35 | .format("xml") 36 | .option("rowTag", "person") 37 | 
.schema(schema) 38 | .load("src/main/resources/persons.xml") 39 | 40 | df2.write 41 | .format("com.databricks.spark.xml") 42 | .option("rootTag", "persons") 43 | .option("rowTag", "person") 44 | .save("src/main/resources/persons_new.xml") 45 | 46 | } 47 | } 48 | 49 | 50 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/xml/ReadBooksXMLWithNestedArray.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.xml 2 | 3 | import com.sparkbyexamples.spark.beans.BooksWithArray 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 6 | import org.apache.spark.sql.types.StructType 7 | 8 | object ReadBooksXMLWithNestedArray { 9 | 10 | def main(args: Array[String]): Unit = { 11 | val spark = SparkSession.builder().master("local[1]") 12 | .appName("SparkByExample") 13 | .getOrCreate() 14 | 15 | val df = spark.sqlContext.read 16 | .format("com.databricks.spark.xml") 17 | .option("rowTag", "book") 18 | .load("src/main/resources/books_withnested_array.xml") 19 | 20 | df.printSchema() 21 | df.show() 22 | 23 | df.foreach(row=>{ 24 | println(""+row.getAs("author")+","+row.getAs("_id")) 25 | println(row.getStruct(4).getAs("country")) 26 | println(row.getStruct(4).getClass) 27 | val arr = row.getStruct(7).getList(0) 28 | for (i<-0 to arr.size-1){ 29 | val b = arr.get(i).asInstanceOf[GenericRowWithSchema] 30 | println(""+b.getAs("name") +","+b.getAs("location")) 31 | } 32 | }) 33 | 34 | } 35 | } 36 | 37 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/xml/ReadBooksXMLWithNestedArrayStruct.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.xml 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 5 | import org.apache.spark.sql.types._ 6 | 7 | object ReadBooksXMLWithNestedArrayStruct { 8 | 9 | def main(args: Array[String]): Unit = { 10 | val spark = SparkSession.builder().master("local[1]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | val customSchema = StructType(Array( 15 | StructField("_id", StringType, nullable = true), 16 | StructField("author", StringType, nullable = true), 17 | StructField("description", StringType, nullable = true), 18 | StructField("genre", StringType ,nullable = true), 19 | StructField("price", DoubleType, nullable = true), 20 | StructField("publish_date", StringType, nullable = true), 21 | StructField("title", StringType, nullable = true), 22 | StructField("otherInfo",StructType(Array( 23 | StructField("pagesCount", StringType, nullable = true), 24 | StructField("language", StringType, nullable = true), 25 | StructField("country", StringType, nullable = true), 26 | StructField("address", StructType(Array( 27 | StructField("addressline1", StringType, nullable = true), 28 | StructField("city", StringType, nullable = true), 29 | StructField("state", StringType, nullable = true) 30 | )) 31 | )) 32 | )), 33 | StructField("stores",StructType(Array( 34 | StructField("store",ArrayType( 35 | StructType(Array( 36 | StructField("location",StringType,true), 37 | StructField("name",StringType,true) 38 | )) 39 | )) 40 | ))) 41 | )) 42 | 43 | val df = spark.sqlContext.read 44 | 
.format("com.databricks.spark.xml") 45 | .option("rowTag", "book") 46 | .schema(customSchema) 47 | .load("src/main/resources/books_withnested_array.xml") 48 | 49 | df.printSchema() 50 | df.show() 51 | 52 | df.foreach(row=>{ 53 | println(""+row.getAs("author")+","+row.getAs("_id")) 54 | println(row.getStruct(4).getAs("country")) 55 | println(row.getStruct(4).getClass) 56 | val arr = row.getStruct(7).getList(0) 57 | for (i<-0 to arr.size-1){ 58 | val b = arr.get(i).asInstanceOf[GenericRowWithSchema] 59 | println(""+b.getAs("name") +","+b.getAs("location")) 60 | } 61 | }) 62 | 63 | } 64 | } 65 | 66 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/xml/xstream/WriteXML.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.xml.xstream 2 | 3 | import com.thoughtworks.xstream.XStream 4 | import com.thoughtworks.xstream.io.xml.DomDriver 5 | import org.apache.spark.sql.types.{StringType, StructType} 6 | import org.apache.spark.sql.{Row, SparkSession} 7 | 8 | object WriteXML { 9 | def main(args: Array[String]): Unit = { 10 | 11 | val spark:SparkSession = SparkSession.builder() 12 | .master("local") 13 | .appName("SparkByExample") 14 | .getOrCreate() 15 | 16 | val sc = spark.sparkContext 17 | 18 | val data = Seq(Row("1",Row("James ","","Smith"),"36636","M","3000"), 19 | Row("2",Row("Michael ","Rose",""),"40288","M","4000"), 20 | Row("3",Row("Robert ","","Williams"),"42114","M","4000"), 21 | Row("4",Row("Maria ","Anne","Jones"),"39192","F","4000"), 22 | Row("5",Row("Jen","Mary","Brown"),"","F","-1") 23 | ) 24 | 25 | val schema = new StructType() 26 | .add("id",StringType) 27 | .add("name",new StructType() 28 | .add("firstName",StringType) 29 | .add("middleName",StringType) 30 | .add("lastName",StringType)) 31 | .add("ssn",StringType) 32 | .add("gender",StringType) 33 | .add("salary",StringType) 34 | 35 | case class Name(firstName:String,middleName:String,lastName:String) 36 | case class Person(id:String,name:Name,ssn:String,gender:String,salary:String) 37 | import spark.sqlContext.implicits._ 38 | 39 | val df = spark.createDataFrame(spark.sparkContext.parallelize(data),schema)//.as[Person] 40 | 41 | val ds = df.mapPartitions(part=>{ 42 | val xstream = new XStream(new DomDriver) 43 | val data = part.map(ite=>{ 44 | val nameRow:Row = ite.getAs[Row]("name") 45 | val name= Name(nameRow.getAs("firstName"),nameRow.getAs("firstName"),nameRow.getAs("firstName")) 46 | val person = Person(ite.getAs("id"),name,ite.getAs("ssn"),ite.getAs("gender"),ite.getAs("salary")) 47 | //xstream.aliasType("Person",Class[String]) 48 | val xmlString = xstream.toXML(person) 49 | xmlString 50 | }) 51 | data 52 | }) 53 | 54 | ds.write.text("c:/tmp/xstream.xml") 55 | 56 | // val df2 = spark.createDataFrame(spark.sparkContext.parallelize(data),schema).as[Person] 57 | // 58 | // val ds2 = df2.mapPartitions(part=>{ 59 | // val xstream = new XStream(new DomDriver) 60 | // val person = part.map(ite=>{ 61 | // val xmlString = xstream.toXML(person) 62 | // xmlString 63 | // }) 64 | // person 65 | // }) 66 | // ds2.write.text("c:/tmp/xstream_2.xml") 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataset/DataSetFromData.scala: -------------------------------------------------------------------------------- 1 | package 
com.sparkbyexamples.spark.dataset 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object DataSetFromData { 6 | 7 | def main(args:Array[String]):Unit= { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | val data = Seq((1,2),(3,4),(5,6)) 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataset/DataSetWithCustomClass.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataset 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | class Test(field1:String,field2:String,field3:String) extends Serializable{ 6 | 7 | 8 | } 9 | 10 | object TestEncoders { 11 | implicit def testEncoder: org.apache.spark.sql.Encoder[Test] = 12 | org.apache.spark.sql.Encoders.kryo[Test] 13 | } 14 | object DataSetWithCustomClass { 15 | 16 | def main(args:Array[String]):Unit= { 17 | 18 | val spark: SparkSession = SparkSession.builder() 19 | .master("local[1]") 20 | .appName("SparkByExample") 21 | .getOrCreate() 22 | 23 | val test:Test = new Test("Field1","Field2","Field3") 24 | 25 | import spark.sqlContext.implicits._ 26 | import org.apache.spark.sql.Encoders 27 | import TestEncoders._ 28 | // implicit val encoder = Encoders.bean[Test](classOf[Test]) 29 | 30 | val data = Seq(test) 31 | val rdd = spark.sparkContext.parallelize(data) 32 | val ds = spark.createDataset(rdd) 33 | 34 | val ds2 = ds.selectExpr("CAST(value AS String)") 35 | .as[(String)] 36 | 37 | 38 | ds.printSchema() 39 | ds2.show(false) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataset/xml/ReadBooksXML.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataset.xml 2 | 3 | import com.sparkbyexamples.spark.beans.{Books, BooksDiscounted} 4 | import org.apache.spark.sql.{Encoders, SparkSession} 5 | 6 | object ReadBooksXML { 7 | 8 | def main(args: Array[String]): Unit = { 9 | val spark = SparkSession.builder().master("local[1]") 10 | .appName("SparkByExample") 11 | .getOrCreate() 12 | 13 | import spark.implicits._ 14 | 15 | val ds = spark.sqlContext.read 16 | .format("com.databricks.spark.xml") 17 | .option("rowTag", "book") 18 | .load("src/main/resources/books.xml").as[Books] 19 | 20 | 21 | val newds = ds.map(f=>{ 22 | BooksDiscounted(f._id,f.author,f.description,f.price,f.publish_date,f.title, f.price - f.price*20/100) 23 | }) 24 | 25 | newds.printSchema() 26 | newds.show() 27 | 28 | newds.foreach(f=>{ 29 | println("Price :"+f.price + ", Discounted Price :"+f.discountPrice) 30 | }) 31 | 32 | //First element 33 | println("First Element" +newds.first()._id) 34 | 35 | } 36 | } 37 | 38 | 39 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataset/xml/ReadBooksXMLWithNestedArray.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataset.xml 2 | 3 | import com.sparkbyexamples.spark.beans.{Books, BooksWithArray} 4 | import org.apache.spark.sql.{SparkSession, functions} 5 | 6 | object ReadBooksXMLWithNestedArray { 7 | 8 | def main(args: Array[String]): Unit = { 9 | val spark = SparkSession.builder().master("local[1]") 10 | 
.appName("SparkByExample") 11 | .getOrCreate() 12 | 13 | import spark.implicits._ 14 | val ds = spark.sqlContext.read 15 | .format("com.databricks.spark.xml") 16 | .option("rowTag", "book") 17 | .load("src/main/resources/books_withnested_array.xml").as[BooksWithArray] 18 | 19 | ds.printSchema() 20 | ds.show() 21 | 22 | ds.foreach(f=>{ 23 | println(f.author+","+f.otherInfo.country+","+f.otherInfo.address.addressline1) 24 | for(s<-f.stores.store){ 25 | println(s.name) 26 | } 27 | 28 | }) 29 | 30 | } 31 | } 32 | 33 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataset/xml/ReadBooksXMLWithNestedArrayDSL.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataset.xml 2 | 3 | 4 | 5 | import com.sparkbyexamples.spark.beans.Books 6 | import org.apache.spark.sql.{Encoders, SparkSession, functions} 7 | 8 | object ReadBooksXMLWithNestedArrayDSL { 9 | 10 | def main(args: Array[String]): Unit = { 11 | val spark = SparkSession.builder().master("local[1]") 12 | .appName("SparkByExample") 13 | .getOrCreate() 14 | 15 | import spark.implicits._ 16 | val xmlDF = spark.sqlContext.read 17 | .format("com.databricks.spark.xml") 18 | .option("rowTag", "book") 19 | .load("src/main/resources/books_withnested_array.xml") 20 | 21 | xmlDF.printSchema() 22 | println(xmlDF.count()) 23 | 24 | xmlDF.show() 25 | 26 | xmlDF.select(xmlDF("title"),xmlDF("price")*100).show() 27 | 28 | xmlDF.select("author").show() 29 | 30 | 31 | xmlDF.select("stores").show() 32 | 33 | xmlDF.withColumn("store", functions.explode(xmlDF("stores.store"))).show() 34 | 35 | val df = xmlDF.withColumn("store", functions.explode(xmlDF("stores.store"))) 36 | .select("_id","author","stores.country","store.name") 37 | 38 | val storeDF = xmlDF.select("stores.store") 39 | storeDF.printSchema() 40 | 41 | df.foreach(f=>{ 42 | println(f.getAs("_id")) 43 | }) 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | } 52 | } 53 | 54 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataset/xml/SparkXMLUsingXstream.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.xml 2 | 3 | import com.thoughtworks.xstream.XStream 4 | import com.thoughtworks.xstream.io.xml.DomDriver 5 | import org.apache.spark.sql.SparkSession 6 | 7 | case class Animal(cri:String,taille:Int) 8 | 9 | object SparkXMLUsingXStream{ 10 | def main(args: Array[String]): Unit = { 11 | val spark = SparkSession. 
12 | builder.master ("local[*]") 13 | .appName ("sparkbyexamples.com") 14 | .getOrCreate () 15 | 16 | var animal:Animal = Animal("Rugissement",150) 17 | val xstream1 = new XStream(new DomDriver()) 18 | xstream1.alias("testAni",classOf[Animal]) 19 | xstream1.aliasField("cricri",classOf[Animal],"cri") 20 | val xmlString = Seq(xstream1.toXML(animal)) 21 | 22 | import spark.implicits._ 23 | val newDf = xmlString.toDF() 24 | newDf.show(false) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataset/xml/sparkXml.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataset.xml 2 | 3 | import org.apache.spark.sql.functions.{col, explode} 4 | import org.apache.spark.sql.{SQLContext, SparkSession} 5 | 6 | object sparkXml { 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark = SparkSession. 10 | builder.master("local[*]") 11 | //.config("spark.debug.maxToStringFields", "100") 12 | .appName("Insight Application Big Data") 13 | .getOrCreate() 14 | 15 | val df = spark.read 16 | .format("com.databricks.spark.xml") 17 | .option("rowTag", "row") 18 | .load("src/main/resources/input.xml") 19 | df.createOrReplaceTempView("categ_entry") 20 | 21 | df.printSchema() 22 | spark.sql("Select c26['_VALUE'] as value, c26['_m'] as option from categ_entry").show(false) 23 | 24 | val df2 = df.withColumn("c26Struct",explode(df("c26"))) 25 | df2.select(col("c26Struct._VALUE").alias("value"),col("c26Struct._m").alias("option") ).show(false) 26 | 27 | 28 | 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/CreateEmptyRDD.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object CreateEmptyRDD extends App{ 6 | 7 | val spark:SparkSession = SparkSession.builder() 8 | .master("local[3]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | val rdd = spark.sparkContext.emptyRDD 13 | val rddString = spark.sparkContext.emptyRDD[String] 14 | 15 | println(rdd) 16 | println(rddString) 17 | println("Num of Partitions: "+rdd.getNumPartitions) 18 | 19 | //rddString.saveAsTextFile("test.txt") // returns error 20 | 21 | val rdd2 = spark.sparkContext.parallelize(Seq.empty[String]) 22 | println(rdd2) 23 | println("Num of Partitions: "+rdd2.getNumPartitions) 24 | 25 | //rdd2.saveAsTextFile("test3.txt") 26 | 27 | // Pair RDD 28 | 29 | type dataType = (String,Int) 30 | var pairRDD = spark.sparkContext.emptyRDD[dataType] 31 | println(pairRDD) 32 | 33 | } 34 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/CreateRDD.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object CreateRDD { 6 | 7 | def main(args:Array[String]): Unit ={ 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local[3]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | //Create RDD from collection 15 | val rdd=spark.sparkContext.parallelize( 16 | Seq(("Java", 20000), ("Python", 100000), ("Scala", 3000)) 17 | ) 18 | 19 | //Create RDD from another RDD 20 | val 
rdd2 = rdd.map(row=>{ 21 | (row._1,row._2+100) 22 | }) 23 | 24 | rdd2.foreach(println) 25 | 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/OperationsOnRDD.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object OperationsOnRDD { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark = SparkSession.builder() 10 | .appName("SparkByExample") 11 | .master("local") 12 | .getOrCreate() 13 | 14 | spark.sparkContext.setLogLevel("ERROR") 15 | 16 | val rdd = spark.sparkContext.parallelize( 17 | List("Germany India USA","USA London Russia","Mexico Brazil Canada China") 18 | ) 19 | 20 | val listRdd = spark.sparkContext.parallelize(List(9,2,3,4,5,6,7,8)) 21 | 22 | //reduce 23 | println("Minimum :"+listRdd.reduce((a,b)=> a min b)) 24 | println("Maximum :"+listRdd.reduce((a,b)=> a max b)) 25 | println("Sum :"+listRdd.reduce((a,b)=> a + b)) 26 | 27 | //flatMap 28 | val wordsRdd = rdd.flatMap(_.split(" ")) 29 | wordsRdd.foreach(println) 30 | 31 | //sortBy 32 | println("Sort by word name") 33 | val sortRdd = wordsRdd.sortBy(f=>f) // also can write f=>f 34 | 35 | //GroupBy 36 | val groupRdd = wordsRdd.groupBy(word=>word.length) 37 | groupRdd.foreach(println) 38 | 39 | //map 40 | val tupp2Rdd = wordsRdd.map(f=>(f,1)) 41 | tupp2Rdd.foreach(println) 42 | 43 | 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/PartitionBy.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.HashPartitioner 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.SparkSession 6 | 7 | object PartitionBy { 8 | 9 | 10 | def main(args:Array[String]): Unit = { 11 | 12 | val spark:SparkSession = SparkSession.builder() 13 | .master("local[3]") 14 | .appName("SparkByExample") 15 | .getOrCreate() 16 | 17 | val sc = spark.sparkContext 18 | 19 | val rdd = sc.textFile("C://000_Projects/opt/BigData/zipcodes.csv") 20 | 21 | val rdd2:RDD[Array[String]] = rdd.map(m=>m.split(",")) 22 | 23 | 24 | val rdd3 = rdd2.map(a=>(a(1),a.mkString(","))) 25 | 26 | val rdd4 = rdd3.partitionBy(new HashPartitioner(3)) 27 | 28 | rdd4.saveAsTextFile("c:/tmp/output/partition") 29 | 30 | 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDAccumulator.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | object RDDAccumulator_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDBroadcast.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | object RDDBroadcast_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDCache.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | object RDDCache_ { 4 | 
5 | } 6 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDFromCSVFile.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object RDDFromCSVFile { 7 | 8 | def main(args:Array[String]): Unit ={ 9 | 10 | def splitString(row:String):Array[String]={ 11 | row.split(",") 12 | } 13 | 14 | val spark:SparkSession = SparkSession.builder() 15 | .master("local[3]") 16 | .appName("SparkByExample") 17 | .getOrCreate() 18 | val sc = spark.sparkContext 19 | 20 | val rdd = sc.textFile("src/main/resources/zipcodes-noheader.csv") 21 | 22 | val rdd2:RDD[ZipCode] = rdd.map(row=>{ 23 | val strArray = splitString(row) 24 | ZipCode(strArray(0).toInt,strArray(1),strArray(3),strArray(4)) 25 | }) 26 | 27 | rdd2.foreach(a=>println(a.city)) 28 | } 29 | 30 | } 31 | 32 | 33 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDFromDataUsingParallelize.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.SQLContext 6 | 7 | object RDDFromDataUsingParallelize { 8 | 9 | def main(args: Array[String]): Unit = { 10 | val spark:SparkSession = SparkSession.builder() 11 | .master("local[3]") 12 | .appName("SparkByExample") 13 | .getOrCreate() 14 | val rdd:RDD[Int] = spark.sparkContext.parallelize(List(1,2,3,4,5)) 15 | val rddCollect:Array[Int] = rdd.collect() 16 | println("Number of Partitions: "+rdd.getNumPartitions) 17 | println("Action: First element: "+rdd.first()) 18 | println("Action: RDD converted to Array[Int] : ") 19 | rddCollect.foreach(println) 20 | 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDFromParallelizeRange.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object RDDFromParallelizeRange { 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local[3]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | val sc = spark.sparkContext 15 | 16 | val rdd4:RDD[Range] = sc.parallelize(List(1 to 1000)) 17 | println("Number of Partitions : "+rdd4.getNumPartitions) 18 | 19 | val rdd5 = rdd4.repartition(5) 20 | println("Number of Partitions : "+rdd5.getNumPartitions) 21 | 22 | val rdd6:Array[Range] = rdd5.collect() 23 | println(rdd6.mkString(",")) 24 | 25 | val rdd7:Array[Array[Range]] = rdd5.glom().collect() 26 | println("After glom"); 27 | rdd7.foreach(f=>{ 28 | println("For each partition") 29 | f.foreach(f1=>println(f1)) 30 | }) 31 | 32 | 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDFromWholeTextFile.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import 
org.apache.spark.sql.SparkSession 5 | 6 | object RDDFromWholeTextFile { 7 | 8 | def main(args:Array[String]): Unit = { 9 | 10 | val spark:SparkSession = SparkSession.builder() 11 | .master("local[3]") 12 | .appName("SparkByExamples.com") 13 | .getOrCreate() 14 | val sc = spark.sparkContext 15 | 16 | val rdd = sc.wholeTextFiles("C://000_Projects/opt/BigData/alice.txt") 17 | rdd.foreach(a=>println(a._1+"---->"+a._2)) 18 | 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDHadoopInputFormat.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | object RDDHadoopInputFormat_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDPersist.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | object RDDPersist_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDReadFilesFromDirectory.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | object RDDReadFilesFromDirectory_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDSaveAsObjectFile.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | object RDDSaveAsObjectFile_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDSequenceFiles.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | object RDDSequenceFiles_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/ReadMultipleCSVFiles.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object ReadMultipleCSVFiles extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | println("spark read csv files from a directory into RDD") 16 | val rddFromFile = spark.sparkContext.textFile("C:/tmp/files/text01.csv") 17 | println(rddFromFile.getClass) 18 | 19 | val rdd = rddFromFile.map(f=>{ 20 | f.split(",") 21 | }) 22 | 23 | println("Iterate RDD") 24 | rdd.foreach(f=>{ 25 | println("Col1:"+f(0)+",Col2:"+f(1)) 26 | }) 27 | println(rdd) 28 | 29 | println("Get data Using collect") 30 | rdd.collect().foreach(f=>{ 31 | println("Col1:"+f(0)+",Col2:"+f(1)) 32 | }) 33 | 34 | println("read all csv files from a directory to single RDD") 35 | val rdd2 = spark.sparkContext.textFile("C:/tmp/files/*") 36 | rdd2.foreach(f=>{ 37 | println(f) 38 | }) 39 | 40 | println("read csv files base on wildcard character") 41 | val rdd3 = 
spark.sparkContext.textFile("C:/tmp/files/text*.csv") 42 | rdd3.foreach(f=>{ 43 | println(f) 44 | }) 45 | 46 | println("read multiple csv files into a RDD") 47 | val rdd4 = spark.sparkContext.textFile("C:/tmp/files/text01.csv,C:/tmp/files/text02.csv") 48 | rdd4.foreach(f=>{ 49 | println(f) 50 | }) 51 | 52 | } 53 | 54 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/ReadMultipleFiles.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object ReadMultipleFiles extends App { 6 | 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | println("read all text files from a directory to single RDD") 16 | val rdd = spark.sparkContext.textFile("C:/tmp/files/*") 17 | rdd.foreach(f=>{ 18 | println(f) 19 | }) 20 | 21 | println("read text files base on wildcard character") 22 | val rdd2 = spark.sparkContext.textFile("C:/tmp/files/text*.txt") 23 | rdd2.foreach(f=>{ 24 | println(f) 25 | }) 26 | 27 | println("read multiple text files into a RDD") 28 | val rdd3 = spark.sparkContext.textFile("C:/tmp/files/text01.txt,C:/tmp/files/text02.txt") 29 | rdd3.foreach(f=>{ 30 | println(f) 31 | }) 32 | 33 | println("Read files and directory together") 34 | val rdd4 = spark.sparkContext.textFile("C:/tmp/files/text01.txt,C:/tmp/files/text02.txt,C:/tmp/files/*") 35 | rdd4.foreach(f=>{ 36 | println(f) 37 | }) 38 | 39 | 40 | val rddWhole = spark.sparkContext.wholeTextFiles("C:/tmp/files/*") 41 | rddWhole.foreach(f=>{ 42 | println(f._1+"=>"+f._2) 43 | }) 44 | 45 | val rdd5 = spark.sparkContext.textFile("C:/tmp/files/*") 46 | val rdd6 = rdd5.map(f=>{ 47 | f.split(",") 48 | }) 49 | 50 | rdd6.foreach(f => { 51 | println("Col1:"+f(0)+",Col2:"+f(1)) 52 | }) 53 | 54 | } 55 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/SortBy.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} 6 | 7 | object SortBy { 8 | 9 | def main(args: Array[String]): Unit = { 10 | 11 | val spark:SparkSession = SparkSession.builder() 12 | .master("local[3]") 13 | .appName("SparkByExample") 14 | .getOrCreate() 15 | 16 | val sc = spark.sparkContext 17 | 18 | val rdd:RDD[String] = sc.textFile("C://000_Projects/opt/BigData/zipcodes-noheader.csv") 19 | 20 | val rddZip:RDD[ZipCode] = rdd.map(f=>{ 21 | val arr = split(f) 22 | ZipCode(arr(0).toInt,arr(1),arr(3),arr(4)) 23 | }) 24 | 25 | //SortBy 26 | val rddSort = rddZip.sortBy(f=>f.recordNumber) 27 | rddSort.collect().foreach(f=>println(f.toString)) 28 | 29 | //SorybyKey 30 | //First create pairRDD 31 | val rddTuple=rddZip.map(f=>{ 32 | Tuple2(f.recordNumber,f.toString) 33 | }) 34 | rddTuple.sortByKey().collect().foreach(f=>println(f._2)) 35 | } 36 | 37 | def split(str:String): Array[String] ={ 38 | str.split(",") 39 | } 40 | 41 | } 42 | 43 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/WordCount.scala: 
-------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object WordCount { 7 | 8 | 9 | def main(args:Array[String]): Unit = { 10 | 11 | val spark:SparkSession = SparkSession.builder() 12 | .master("local[3]") 13 | .appName("SparkByExample") 14 | .getOrCreate() 15 | 16 | val sc = spark.sparkContext 17 | 18 | val rdd:RDD[String] = sc.textFile("src/main/scala/test.txt") 19 | 20 | // rdd.collect 21 | rdd.collect().foreach(println) 22 | 23 | // rdd flatMap 24 | val rdd2 = rdd.flatMap(f=>f.split(" ")) 25 | rdd2.foreach(f=>println(f)) 26 | 27 | //Create a Tuple by adding 1 to each word 28 | val rdd3 = rdd2.map(m=>(m,1)) 29 | rdd3.foreach(println) 30 | 31 | //Filter 32 | val rdd4 = rdd3.filter(a=> a._1.startsWith("a")) 33 | rdd4.foreach(println) 34 | 35 | //ReduceBy 36 | val rdd5 = rdd3.reduceByKey(_ + _) 37 | rdd5.foreach(println) 38 | 39 | //Swap word,count and sort by key 40 | rdd5.map(a=>(a._2,a._1)).sortByKey().foreach(println) 41 | 42 | 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/ZipCode.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | case class ZipCode(recordNumber:Int,zipCode:String,city:String,state:String) 4 | 5 | 6 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/xml/XmlRecordReader.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd.xml 2 | 3 | import com.databricks.spark.xml.XmlInputFormat 4 | import org.apache.hadoop.conf.Configuration 5 | import org.apache.hadoop.io.{LongWritable, Text} 6 | import org.apache.spark.api.java.JavaSparkContext 7 | import org.apache.spark.api.java.function.VoidFunction 8 | import org.apache.spark.sql.SparkSession 9 | 10 | import scala.xml.XML 11 | 12 | 13 | object XmlRecordReader { 14 | def main(args: Array[String]): Unit = { 15 | val sparkSession = SparkSession.builder.appName("XmlRecordReader").master("local").getOrCreate 16 | val javaSparkContext = new JavaSparkContext(sparkSession.sparkContext) 17 | val configuration = new Configuration 18 | configuration.set("xmlinput.start", "") 19 | configuration.set("xmlinput.end", "") 20 | configuration.set("mapreduce.input.fileinputformat.inputdir", "src/main/resources/records.xml") 21 | val javaPairRDD = javaSparkContext.newAPIHadoopRDD(configuration, classOf[XmlInputFormat], classOf[LongWritable], classOf[Text]) 22 | javaPairRDD.foreach(new VoidFunction[Tuple2[LongWritable, Text]]() { 23 | @throws[Exception] 24 | override def call(tuple: Tuple2[LongWritable, Text]): Unit = { // TODO Auto-generated method stub 25 | 26 | val xml = XML.loadString(tuple._2.toString) 27 | val forecast = (xml \ "Name") text 28 | 29 | println("forecast" + forecast) 30 | 31 | } 32 | }) 33 | } 34 | } 35 | 36 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/stackoverflow/AddingLiterral.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.stackoverflow 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import 
org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} 5 | case class Employee(EmpId: String, Experience: Double, Salary: Double) 6 | 7 | case class Employee2(EmpId: EmpData, Experience: EmpData, Salary: EmpData) 8 | case class EmpData(key: String,value:String) 9 | object AddingLiterral { 10 | def main(args: Array[String]): Unit = { 11 | 12 | val spark = SparkSession.builder() 13 | .master("local[1]") 14 | .appName("SparkByExample") 15 | .getOrCreate(); 16 | import spark.sqlContext.implicits._ 17 | import org.apache.spark.sql.functions._ 18 | val data = Seq(("111",5,50000),("222",6,60000),("333",7,60000)) 19 | val df = data.toDF("EmpId","Experience","Salary") 20 | 21 | val newdf = df.withColumn("EmpId", struct(lit("1").as("key"),col("EmpId").as("value"))) 22 | .withColumn("Experience", struct(lit("2").as("key"),col("Experience").as("value"))) 23 | .withColumn("Salary", struct(lit("3").as("key"),col("Salary").as("value"))) 24 | .show(false) 25 | 26 | val ds = df.as[Employee] 27 | val newDS = ds.map(rec=>{ 28 | (EmpData("1",rec.EmpId), EmpData("2",rec.Experience.toString),EmpData("3",rec.Salary.toString)) 29 | }) 30 | val finalDS = newDS.toDF("EmpId","Experience","Salary").as[Employee2] 31 | finalDS.show(false) 32 | // newDS.withColumnRenamed("_1","EmpId") 33 | // .withColumnRenamed("_2","Experience") 34 | // .withColumnRenamed("_3","Salary") 35 | // .show(false) 36 | 37 | 38 | 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/stackoverflow/Test.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.stackoverflow 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | import org.apache.spark.sql.functions._ 5 | object Test { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExample") 12 | .getOrCreate(); 13 | import spark.sqlContext.implicits._ 14 | 15 | val df1:DataFrame = Seq( 16 | ("Mark", "2018-02-20 00:00:00"), 17 | ("Alex", "2018-03-01 00:00:00"), 18 | ("Bob", "2018-03-01 00:00:00"), 19 | ("Mark", "2018-07-01 00:00:00"), 20 | ("Kate", "2018-07-01 00:00:00") 21 | ).toDF("USER_NAME", "REQUEST_DATE") 22 | 23 | df1.show() 24 | 25 | val df2: DataFrame = Seq( 26 | ("Alex", "2018-01-01 00:00:00", "2018-02-01 00:00:00", "OUT"), 27 | ("Bob", "2018-02-01 00:00:00", "2018-02-05 00:00:00", "IN"), 28 | ("Mark", "2018-02-01 00:00:00", "2018-03-01 00:00:00", "IN"), 29 | ("Mark", "2018-05-01 00:00:00", "2018-08-01 00:00:00", "OUT"), 30 | ("Meggy", "2018-02-01 00:00:00", "2018-02-01 00:00:00", "OUT") 31 | ).toDF("NAME", "START_DATE", "END_DATE", "STATUS") 32 | 33 | df2.show() 34 | 35 | val df3 = df1.join(df2, col("USER_NAME") === col("NAME"), "left_outer") 36 | 37 | 38 | df3.groupBy("USER_NAME","REQUEST_DATE") 39 | 40 | val df4 = df3.withColumn("USER_STATUS", when($"REQUEST_DATE" > $"START_DATE" and $"REQUEST_DATE" < $"END_DATE", "Our user") otherwise ("Not our user")) 41 | 42 | df4.select("USER_NAME","REQUEST_DATE","USER_STATUS").distinct()show(false) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/stackoverflow/Test2.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.stackoverflow 2 | 3 | import 
org.apache.spark.sql.SparkSession 4 | 5 | object Test2 { 6 | 7 | // def main(args: Array[String]): Unit = { 8 | // 9 | // val spark = SparkSession.builder() 10 | // .master("local[1]") 11 | // .appName("SparkByExample") 12 | // .getOrCreate(); 13 | // 14 | // val peopleDFCsv = spark.read.format("csv") 15 | // .load("src/main/resources/stack.csv") 16 | // 17 | // val d = peopleDFCsv.map(row=>{ 18 | // val col1=row.get(1) 19 | // val col2=row.get(1) 20 | // (col1,col2) 21 | // }).toDF() 22 | // 23 | // } 24 | } 25 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/folder_streaming/zipcode1.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":1,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Parc Parque, PR","Location":"NA-US-PR-PARC PARQUE","Decommisioned":false} 2 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/folder_streaming/zipcode10.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":1,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Parc Parque, PR","Location":"NA-US-PR-PARC PARQUE","Decommisioned":false} 2 | {"RecordNumber":2,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PASEO COSTA DEL SUR","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Paseo Costa Del Sur, PR","Location":"NA-US-PR-PASEO COSTA DEL SUR","Decommisioned":false} 3 | {"RecordNumber":10,"Zipcode":709,"ZipCodeType":"STANDARD","City":"BDA SAN LUIS","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":18.14,"Long":-66.26,"Xaxis":0.38,"Yaxis":-0.86,"Zaxis":0.31,"WorldRegion":"NA","Country":"US","LocationText":"Bda San Luis, PR","Location":"NA-US-PR-BDA SAN LUIS","Decommisioned":false} 4 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/folder_streaming/zipcode11.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":1,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Parc Parque, PR","Location":"NA-US-PR-PARC PARQUE","Decommisioned":false} 2 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/folder_streaming/zipcode2.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":2,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PASEO COSTA DEL SUR","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Paseo Costa Del Sur, PR","Location":"NA-US-PR-PASEO COSTA DEL SUR","Decommisioned":false} 2 | {"RecordNumber":10,"Zipcode":709,"ZipCodeType":"STANDARD","City":"BDA SAN LUIS","State":"PR","LocationType":"NOT 
ACCEPTABLE","Lat":18.14,"Long":-66.26,"Xaxis":0.38,"Yaxis":-0.86,"Zaxis":0.31,"WorldRegion":"NA","Country":"US","LocationText":"Bda San Luis, PR","Location":"NA-US-PR-BDA SAN LUIS","Decommisioned":false} 3 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/folder_streaming/zipcode3.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":61391,"Zipcode":76166,"ZipCodeType":"UNIQUE","City":"CINGULAR WIRELESS","State":"TX","LocationType":"NOT ACCEPTABLE","Lat":32.72,"Long":-97.31,"Xaxis":-0.1,"Yaxis":-0.83,"Zaxis":0.54,"WorldRegion":"NA","Country":"US","LocationText":"Cingular Wireless, TX","Location":"NA-US-TX-CINGULAR WIRELESS","Decommisioned":false} 2 | {"RecordNumber":61392,"Zipcode":76177,"ZipCodeType":"STANDARD","City":"FORT WORTH","State":"TX","LocationType":"PRIMARY","Lat":32.75,"Long":-97.33,"Xaxis":-0.1,"Yaxis":-0.83,"Zaxis":0.54,"WorldRegion":"NA","Country":"US","LocationText":"Fort Worth, TX","Location":"NA-US-TX-FORT WORTH","Decommisioned":false,"TaxReturnsFiled":2126,"EstimatedPopulation":4053,"TotalWages":122396986} 3 | {"RecordNumber":61393,"Zipcode":76177,"ZipCodeType":"STANDARD","City":"FT WORTH","State":"TX","LocationType":"ACCEPTABLE","Lat":32.75,"Long":-97.33,"Xaxis":-0.1,"Yaxis":-0.83,"Zaxis":0.54,"WorldRegion":"NA","Country":"US","LocationText":"Ft Worth, TX","Location":"NA-US-TX-FT WORTH","Decommisioned":false,"TaxReturnsFiled":2126,"EstimatedPopulation":4053,"TotalWages":122396986} 4 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/folder_streaming/zipcode4.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":4,"Zipcode":704,"ZipCodeType":"STANDARD","City":"URB EUGENE RICE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Urb Eugene Rice, PR","Location":"NA-US-PR-URB EUGENE RICE","Decommisioned":false} 2 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/folder_streaming/zipcode5.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":39827,"Zipcode":85209,"ZipCodeType":"STANDARD","City":"MESA","State":"AZ","LocationType":"PRIMARY","Lat":33.37,"Long":-111.64,"Xaxis":-0.3,"Yaxis":-0.77,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Mesa, AZ","Location":"NA-US-AZ-MESA","Decommisioned":false,"TaxReturnsFiled":14962,"EstimatedPopulation":26883,"TotalWages":563792730,"Notes":"no NWS data, "} 2 | {"RecordNumber":39828,"Zipcode":85210,"ZipCodeType":"STANDARD","City":"MESA","State":"AZ","LocationType":"PRIMARY","Lat":33.38,"Long":-111.84,"Xaxis":-0.31,"Yaxis":-0.77,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Mesa, AZ","Location":"NA-US-AZ-MESA","Decommisioned":false,"TaxReturnsFiled":14374,"EstimatedPopulation":25446,"TotalWages":471000465} 3 | {"RecordNumber":49345,"Zipcode":32046,"ZipCodeType":"STANDARD","City":"HILLIARD","State":"FL","LocationType":"PRIMARY","Lat":30.69,"Long":-81.92,"Xaxis":0.12,"Yaxis":-0.85,"Zaxis":0.51,"WorldRegion":"NA","Country":"US","LocationText":"Hilliard, FL","Location":"NA-US-FL-HILLIARD","Decommisioned":false,"TaxReturnsFiled":3922,"EstimatedPopulation":7443,"TotalWages":133112149} 4 | {"RecordNumber":49346,"Zipcode":34445,"ZipCodeType":"PO 
BOX","City":"HOLDER","State":"FL","LocationType":"PRIMARY","Lat":28.96,"Long":-82.41,"Xaxis":0.11,"Yaxis":-0.86,"Zaxis":0.48,"WorldRegion":"NA","Country":"US","LocationText":"Holder, FL","Location":"NA-US-FL-HOLDER","Decommisioned":false} 5 | {"RecordNumber":49347,"Zipcode":32564,"ZipCodeType":"STANDARD","City":"HOLT","State":"FL","LocationType":"PRIMARY","Lat":30.72,"Long":-86.67,"Xaxis":0.04,"Yaxis":-0.85,"Zaxis":0.51,"WorldRegion":"NA","Country":"US","LocationText":"Holt, FL","Location":"NA-US-FL-HOLT","Decommisioned":false,"TaxReturnsFiled":1207,"EstimatedPopulation":2190,"TotalWages":36395913} 6 | {"RecordNumber":49348,"Zipcode":34487,"ZipCodeType":"PO BOX","City":"HOMOSASSA","State":"FL","LocationType":"PRIMARY","Lat":28.78,"Long":-82.61,"Xaxis":0.11,"Yaxis":-0.86,"Zaxis":0.48,"WorldRegion":"NA","Country":"US","LocationText":"Homosassa, FL","Location":"NA-US-FL-HOMOSASSA","Decommisioned":false} 7 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/folder_streaming/zipcode6.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":10,"Zipcode":708,"ZipCodeType":"STANDARD","City":"BDA SAN LUIS","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":18.14,"Long":-66.26,"Xaxis":0.38,"Yaxis":-0.86,"Zaxis":0.31,"WorldRegion":"NA","Country":"US","LocationText":"Bda San Luis, PR","Location":"NA-US-PR-BDA SAN LUIS","Decommisioned":false} 2 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/folder_streaming/zipcode7.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":54354,"Zipcode":36275,"ZipCodeType":"PO BOX","City":"SPRING GARDEN","State":"AL","LocationType":"PRIMARY","Lat":33.97,"Long":-85.55,"Xaxis":0.06,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Spring Garden, AL","Location":"NA-US-AL-SPRING GARDEN","Decommisioned":false} 2 | {"RecordNumber":54355,"Zipcode":35146,"ZipCodeType":"STANDARD","City":"SPRINGVILLE","State":"AL","LocationType":"PRIMARY","Lat":33.77,"Long":-86.47,"Xaxis":0.05,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Springville, AL","Location":"NA-US-AL-SPRINGVILLE","Decommisioned":false,"TaxReturnsFiled":4046,"EstimatedPopulation":7845,"TotalWages":172127599} 3 | {"RecordNumber":54356,"Zipcode":35585,"ZipCodeType":"STANDARD","City":"SPRUCE PINE","State":"AL","LocationType":"PRIMARY","Lat":34.37,"Long":-87.69,"Xaxis":0.03,"Yaxis":-0.82,"Zaxis":0.56,"WorldRegion":"NA","Country":"US","LocationText":"Spruce Pine, AL","Location":"NA-US-AL-SPRUCE PINE","Decommisioned":false,"TaxReturnsFiled":610,"EstimatedPopulation":1209,"TotalWages":18525517} 4 | {"RecordNumber":76511,"Zipcode":27007,"ZipCodeType":"STANDARD","City":"ASH HILL","State":"NC","LocationType":"NOT ACCEPTABLE","Lat":36.4,"Long":-80.56,"Xaxis":0.13,"Yaxis":-0.79,"Zaxis":0.59,"WorldRegion":"NA","Country":"US","LocationText":"Ash Hill, NC","Location":"NA-US-NC-ASH HILL","Decommisioned":false,"TaxReturnsFiled":842,"EstimatedPopulation":1666,"TotalWages":28876493} 5 | {"RecordNumber":76512,"Zipcode":27203,"ZipCodeType":"STANDARD","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, 
NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":8355,"EstimatedPopulation":15228,"TotalWages":215474318} 6 | {"RecordNumber":76513,"Zipcode":27204,"ZipCodeType":"PO BOX","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":1035,"EstimatedPopulation":1816,"TotalWages":30322473} 7 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/folder_streaming/zipcode8.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":54354,"Zipcode":36275,"ZipCodeType":"PO BOX","City":"SPRING GARDEN","State":"AL","LocationType":"PRIMARY","Lat":33.97,"Long":-85.55,"Xaxis":0.06,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Spring Garden, AL","Location":"NA-US-AL-SPRING GARDEN","Decommisioned":false} 2 | {"RecordNumber":54355,"Zipcode":35146,"ZipCodeType":"STANDARD","City":"SPRINGVILLE","State":"AL","LocationType":"PRIMARY","Lat":33.77,"Long":-86.47,"Xaxis":0.05,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Springville, AL","Location":"NA-US-AL-SPRINGVILLE","Decommisioned":false,"TaxReturnsFiled":4046,"EstimatedPopulation":7845,"TotalWages":172127599} 3 | {"RecordNumber":54356,"Zipcode":35585,"ZipCodeType":"STANDARD","City":"SPRUCE PINE","State":"AL","LocationType":"PRIMARY","Lat":34.37,"Long":-87.69,"Xaxis":0.03,"Yaxis":-0.82,"Zaxis":0.56,"WorldRegion":"NA","Country":"US","LocationText":"Spruce Pine, AL","Location":"NA-US-AL-SPRUCE PINE","Decommisioned":false,"TaxReturnsFiled":610,"EstimatedPopulation":1209,"TotalWages":18525517} 4 | {"RecordNumber":76511,"Zipcode":27007,"ZipCodeType":"STANDARD","City":"ASH HILL","State":"NC","LocationType":"NOT ACCEPTABLE","Lat":36.4,"Long":-80.56,"Xaxis":0.13,"Yaxis":-0.79,"Zaxis":0.59,"WorldRegion":"NA","Country":"US","LocationText":"Ash Hill, NC","Location":"NA-US-NC-ASH HILL","Decommisioned":false,"TaxReturnsFiled":842,"EstimatedPopulation":1666,"TotalWages":28876493} 5 | {"RecordNumber":76512,"Zipcode":27203,"ZipCodeType":"STANDARD","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":8355,"EstimatedPopulation":15228,"TotalWages":215474318} 6 | {"RecordNumber":76513,"Zipcode":27204,"ZipCodeType":"PO BOX","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":1035,"EstimatedPopulation":1816,"TotalWages":30322473} 7 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/folder_streaming/zipcode9.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":76511,"Zipcode":27007,"ZipCodeType":"STANDARD","City":"ASH HILL","State":"NC","LocationType":"NOT ACCEPTABLE","Lat":36.4,"Long":-80.56,"Xaxis":0.13,"Yaxis":-0.79,"Zaxis":0.59,"WorldRegion":"NA","Country":"US","LocationText":"Ash Hill, NC","Location":"NA-US-NC-ASH 
HILL","Decommisioned":false,"TaxReturnsFiled":842,"EstimatedPopulation":1666,"TotalWages":28876493} 2 | {"RecordNumber":76512,"Zipcode":27203,"ZipCodeType":"STANDARD","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":8355,"EstimatedPopulation":15228,"TotalWages":215474318} 3 | {"RecordNumber":76513,"Zipcode":27204,"ZipCodeType":"PO BOX","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":1035,"EstimatedPopulation":1816,"TotalWages":30322473} 4 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/person.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "Person", 4 | "namespace": "com.sparkbyexamples", 5 | "fields": [ 6 | {"name": "id","type": ["int", "null"]}, 7 | {"name": "firstname","type": ["string", "null"]}, 8 | {"name": "middlename","type": ["string", "null"]}, 9 | {"name": "lastname","type": ["string", "null"]}, 10 | {"name": "dob_year","type": ["int", "null"]}, 11 | {"name": "dob_month","type": ["int", "null"]}, 12 | {"name": "gender","type": ["string", "null"]}, 13 | {"name": "salary","type": ["int", "null"]} 14 | ] 15 | } -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/person.json: -------------------------------------------------------------------------------- 1 | {"id":1,"firstname":"James ","middlename":"","lastname":"Smith","dob_year":2018,"dob_month":1,"gender":"M","salary":3000} 2 | {"id":2,"firstname":"Michael ","middlename":"Rose","lastname":"","dob_year":2010,"dob_month":3,"gender":"M","salary":4000} 3 | {"id":3,"firstname":"Robert ","middlename":"","lastname":"Williams","dob_year":2010,"dob_month":3,"gender":"M","salary":4000} 4 | {"id":4,"firstname":"Maria ","middlename":"Anne","lastname":"Jones","dob_year":2005,"dob_month":5,"gender":"F","salary":4000} 5 | {"id":5,"firstname":"Jen","middlename":"Mary","lastname":"Brown","dob_year":2010,"dob_month":7,"gender":"","salary":-1} 6 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/SparkStreamingFromDirectory.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} 5 | 6 | object SparkStreamingFromDirectory { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val spark:SparkSession = SparkSession.builder() 11 | .master("local[3]") 12 | .appName("SparkByExample") 13 | .getOrCreate() 14 | 15 | spark.sparkContext.setLogLevel("ERROR") 16 | 17 | val schema = StructType( 18 | List( 19 | StructField("RecordNumber", IntegerType, true), 20 | StructField("Zipcode", StringType, true), 21 | StructField("ZipCodeType", StringType, true), 22 | StructField("City", StringType, true), 23 | StructField("State", StringType, true), 24 | StructField("LocationType", StringType, true), 25 | StructField("Lat", 
StringType, true), 26 | StructField("Long", StringType, true), 27 | StructField("Xaxis", StringType, true), 28 | StructField("Yaxis", StringType, true), 29 | StructField("Zaxis", StringType, true), 30 | StructField("WorldRegion", StringType, true), 31 | StructField("Country", StringType, true), 32 | StructField("LocationText", StringType, true), 33 | StructField("Location", StringType, true), 34 | StructField("Decommisioned", StringType, true) 35 | ) 36 | ) 37 | 38 | val df = spark.readStream 39 | .schema(schema) 40 | .json("c:/tmp/stream_folder") 41 | 42 | df.printSchema() 43 | 44 | val groupDF = df.select("Zipcode") 45 | .groupBy("Zipcode").count() 46 | groupDF.printSchema() 47 | 48 | groupDF.writeStream 49 | .format("console") 50 | .outputMode("complete") 51 | .option("truncate",false) 52 | .option("newRows",30) 53 | .start() 54 | .awaitTermination() 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/SparkStreamingFromSocket.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{explode, split} 5 | 6 | object SparkStreamingFromSocket { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val spark:SparkSession = SparkSession.builder() 11 | .master("local[3]") 12 | .appName("SparkByExample") 13 | .getOrCreate() 14 | 15 | spark.sparkContext.setLogLevel("ERROR") 16 | 17 | val df = spark.readStream 18 | .format("socket") 19 | .option("host","192.168.1.100") 20 | .option("port","7890") 21 | .load() 22 | 23 | df.printSchema() 24 | 25 | val wordsDF = df.select(explode(split(df("value")," ")).alias("word")) 26 | 27 | val count = wordsDF.groupBy("word").count() 28 | 29 | val query = count.writeStream 30 | .format("console") 31 | .outputMode("complete") 32 | .start() 33 | .awaitTermination() 34 | 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/batch/SparkBatchConsumeFromKafka.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.batch 2 | 3 | import org.apache.spark.sql.SparkSession 4 | //https://spark.apache.org/docs/2.3.0/structured-streaming-kafka-integration.html 5 | object SparkBatchConsumeFromKafka { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("https://SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | spark.sparkContext.setLogLevel("ERROR") 15 | 16 | val df = spark 17 | .read 18 | .format("kafka") 19 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 20 | .option("subscribe", "text_topic6") 21 | // .option("startingOffsets", """{"topic1":{"0":23,"1":-2},"topic2":{"0":-2}}""") 22 | // .option("endingOffsets", """{"topic1":{"0":50,"1":-1},"topic2":{"0":-1}}""") 23 | 24 | // .option("subscribePattern", "topic.*") 25 | // .option("startingOffsets", "earliest") 26 | // .option("endingOffsets", "latest") 27 | .load() 28 | 29 | df.printSchema() 30 | 31 | // Displays Data in Binary 32 | df.show() 33 | 34 | //// Displays Data in String 35 | val df2 = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)","topic") 36 | df2.show(false) 37 | } 38 | } 39 | 
-------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/batch/SparkBatchConsumeFromKafkaAvro.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.batch 2 | 3 | import java.io.File 4 | 5 | import org.apache.avro.Schema 6 | import org.apache.spark.sql.SparkSession 7 | import org.apache.spark.sql.functions.col 8 | import org.apache.spark.sql.avro.from_avro 9 | 10 | //https://spark.apache.org/docs/2.3.0/structured-streaming-kafka-integration.html 11 | object SparkBatchConsumeFromKafkaAvro { 12 | 13 | def main(args: Array[String]): Unit = { 14 | 15 | val spark: SparkSession = SparkSession.builder() 16 | .master("local[1]") 17 | .appName("https://SparkByExamples.com") 18 | .getOrCreate() 19 | 20 | spark.sparkContext.setLogLevel("ERROR") 21 | 22 | val df = spark 23 | .read 24 | .format("kafka") 25 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 26 | .option("subscribe", "avro_topic1") 27 | .load() 28 | 29 | df.printSchema() 30 | 31 | /* 32 | Displays Data in Binary 33 | */ 34 | 35 | val schemaAvro = new Schema.Parser() 36 | .parse(new File("src/main/resources/person.avsc")) 37 | 38 | /* 39 | Displays Data in String 40 | */ 41 | val df2 = df.select(from_avro(col("value"),schemaAvro.toString ).as("value")) 42 | .selectExpr("CAST(value AS STRING)") 43 | 44 | df2.select("value.gender").show() 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/batch/SparkBatchProduceToKafka.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.batch 2 | import org.apache.spark.sql.SparkSession 3 | //https://spark.apache.org/docs/2.3.0/structured-streaming-kafka-integration.html 4 | object SparkBatchProduceToKafka { 5 | 6 | def main(args: Array[String]): Unit = { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExample") 11 | .getOrCreate() 12 | 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | val data = Seq (("iphone", "2007"),("iphone 3G","2008"), 16 | ("iphone 3GS","2009"), 17 | ("iphone 4","2010"), 18 | ("iphone 4S","2011"), 19 | ("iphone 5","2012"), 20 | ("iphone 8","2014"), 21 | ("iphone 10","2017")) 22 | 23 | val df = spark.createDataFrame(data).toDF("key","value") 24 | 25 | /* 26 | since we are using dataframe which is already in text, 27 | selectExpr is optional. 28 | If the bytes of the Kafka records represent UTF8 strings, 29 | we can simply use a cast to convert the binary data 30 | into the correct type. 
31 | 32 | df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") 33 | */ 34 | df.write 35 | .format("kafka") 36 | .option("kafka.bootstrap.servers","192.168.1.100:9092") 37 | .option("topic","text_topic6") 38 | .save() 39 | 40 | 41 | val data2 = Seq((1,"James ","","Smith",2018,1,"M",3000), 42 | (2,"Michael ","Rose","",2010,3,"M",4000), 43 | (3,"Robert ","","Williams",2010,3,"M",4000), 44 | (4,"Maria ","Anne","Jones",2005,5,"F",4000), 45 | (5,"Jen","Mary","Brown",2010,7,"",-1) 46 | ) 47 | 48 | val columns = Seq("id","firstname","middlename","lastname","dob_year", 49 | "dob_month","gender","salary") 50 | import spark.sqlContext.implicits._ 51 | val df2 = data2.toDF(columns:_*) 52 | 53 | /* 54 | Writing Json as a Value to Kafka topic 55 | */ 56 | df2.toJSON.write 57 | .format("kafka") 58 | .option("kafka.bootstrap.servers","192.168.1.100:9092") 59 | .option("topic","text_topic6") 60 | .save() 61 | 62 | /* 63 | Another way of Writing Json 64 | By sending key and value to Kafka 65 | using to_json() 66 | */ 67 | df2.selectExpr("CAST(id AS STRING) AS key", "to_json(struct(*)) AS value") 68 | .write 69 | .format("kafka") 70 | .option("kafka.bootstrap.servers","192.168.1.100:9092") 71 | .option("topic","text_topic6") 72 | .save() 73 | 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/batch/SparkBatchProduceToKafkaAvro.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.batch 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.col 5 | import org.apache.spark.sql.avro.to_avro 6 | //https://spark.apache.org/docs/2.3.0/structured-streaming-kafka-integration.html 7 | object SparkBatchProduceToKafkaAvro { 8 | 9 | def main(args: Array[String]): Unit = { 10 | 11 | val spark: SparkSession = SparkSession.builder() 12 | .master("local[1]") 13 | .appName("SparkByExample") 14 | .getOrCreate() 15 | 16 | spark.sparkContext.setLogLevel("ERROR") 17 | 18 | /* 19 | Write Avro to Kafka 20 | */ 21 | val data2 = Seq((1,"James ","","Smith",2018,1,"M",3000), 22 | (2,"Michael ","Rose","",2010,3,"M",4000), 23 | (3,"Robert ","","Williams",2010,3,"M",4000), 24 | (4,"Maria ","Anne","Jones",2005,5,"F",4000), 25 | (5,"Jen","Mary","Brown",2010,7,"",-1) 26 | ) 27 | 28 | val columns = Seq("id","firstname","middlename","lastname","dob_year", 29 | "dob_month","gender","salary") 30 | import spark.sqlContext.implicits._ 31 | val df2 = data2.toDF(columns:_*) 32 | 33 | df2.toJSON.select(to_avro(col("value")).as("value")) 34 | .write 35 | .format("kafka") 36 | .option("kafka.bootstrap.servers","192.168.1.100:9092") 37 | .option("topic","avro_topic1") 38 | .save() 39 | 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/inprogress/SparkStreamingForeachRDD.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.inprogress 2 | 3 | object SparkStreamingForeachRDD_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/inprogress/SparkStreamingForeachWriter.scala: -------------------------------------------------------------------------------- 1 | package 
com.sparkbyexamples.spark.streaming.inprogress 2 | 3 | object SparkStreamingForeachWriter_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/inprogress/SparkStreamingFromDirectoryTmp.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.inprogress 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} 5 | 6 | object SparkStreamingFromDirectoryTmp { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val spark:SparkSession = SparkSession.builder() 11 | .master("local[3]") 12 | .appName("SparkByExample") 13 | .getOrCreate() 14 | 15 | spark.sparkContext.setLogLevel("ERROR") 16 | 17 | val schema = StructType( 18 | List( 19 | StructField("RecordNumber", IntegerType, true), 20 | StructField("Zipcode", StringType, true), 21 | StructField("ZipCodeType", StringType, true), 22 | StructField("City", StringType, true), 23 | StructField("State", StringType, true), 24 | StructField("LocationType", StringType, true), 25 | StructField("Lat", StringType, true), 26 | StructField("Long", StringType, true), 27 | StructField("Xaxis", StringType, true), 28 | StructField("Yaxis", StringType, true), 29 | StructField("Zaxis", StringType, true), 30 | StructField("WorldRegion", StringType, true), 31 | StructField("Country", StringType, true), 32 | StructField("LocationText", StringType, true), 33 | StructField("Location", StringType, true), 34 | StructField("Decommisioned", StringType, true) 35 | ) 36 | ) 37 | 38 | val df = spark.readStream 39 | //.option("header","true") 40 | //.option("maxFilesPerTrigger",3) 41 | .schema(schema) 42 | .json("c:/tmp/stream_folder") 43 | //.text("c:/tmp/stream_folder") 44 | 45 | df.printSchema() 46 | 47 | // val groupDF = df.select( 48 | // get_json_object(col("value").cast("string"),"$.Zipcode") 49 | // .alias("Zipcode")).groupBy("Zipcode").count() 50 | 51 | val groupDF = df.select("Zipcode") 52 | .groupBy("Zipcode").count() 53 | groupDF.printSchema() 54 | 55 | groupDF.writeStream 56 | .format("console") 57 | .outputMode("complete") 58 | .option("truncate",false) 59 | .option("newRows",30) 60 | .start() 61 | .awaitTermination() 62 | 63 | 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/inprogress/SparkStreamingKafkaProducerZipcodeObject.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.inprogress 2 | 3 | import org.apache.spark.sql.SparkSession 4 | object SparkStreamingKafkaUserObject { 5 | 6 | def main(args: Array[String]): Unit = { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .master("local[3]") 10 | .appName("SparkByExample") 11 | .getOrCreate() 12 | 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | val df = spark.readStream 16 | .option("header", "true") 17 | .option("maxFilesPerTrigger", 3) 18 | .text("c:/tmp/stream_folder") 19 | 20 | df.printSchema() 21 | 22 | df.writeStream 23 | .format("kafka") 24 | .outputMode("append") 25 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 26 | .option("topic", "topic_text") 27 | .option("checkpointLocation", "c:/tmp/checkpoint") 28 | .start() 29 | .awaitTermination() 30 | } 31 | } 32 | 
-------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/inprogress/SparkStreamingToHDFS.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.inprogress 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object SparkStreamingToHDFS_ { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[3]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | spark.sparkContext.setLogLevel("ERROR") 15 | 16 | val df = spark.readStream 17 | .format("socket") 18 | .option("host","localhost") 19 | .option("port","9090") 20 | .load() 21 | 22 | df.printSchema() 23 | 24 | 25 | 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/inprogress/SparkStreamingToJDBC.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.inprogress 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object SparkStreamingToJDBC_ { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[3]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | spark.sparkContext.setLogLevel("ERROR") 15 | 16 | val df = spark.readStream 17 | .format("socket") 18 | .option("host","localhost") 19 | .option("port","9090") 20 | .load() 21 | 22 | df.printSchema() 23 | 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/inprogress/SparkStreamingToParquetFile.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.inprogress 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object SparkStreamingToParquetFile_ { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[3]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | spark.sparkContext.setLogLevel("ERROR") 15 | 16 | val df = spark.readStream 17 | .format("socket") 18 | .option("host","localhost") 19 | .option("port","9090") 20 | .load() 21 | 22 | //Parquet doesn't support complete mode hence, 23 | //we can't write aggregated output 24 | df.writeStream 25 | .format("parquet") 26 | .outputMode("append") 27 | .option("path","c:/tmp/spark_out/parquet") 28 | .option("checkpointLocation", "c:/tmp/checkpoint") 29 | .start() 30 | .awaitTermination() 31 | 32 | 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/inprogress/SparkStreamingToS3.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.inprogress 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object SparkStreamingToS3_ { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[3]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | spark.sparkContext.setLogLevel("ERROR") 15 | 16 | val df = spark.readStream 17 | .format("socket") 18 | .option("host","localhost") 19 | 
.option("port","9090") 20 | .load() 21 | 22 | df.printSchema() 23 | 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/inprogress/SparkStreamingTwitter.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.inprogress 2 | 3 | object SparkStreamingTwitter_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/kafka/KafkaProduceAvro.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.kafka 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object KafkaProduceAvro { 6 | 7 | def main(args:Array[String]): Unit ={ 8 | 9 | 10 | val spark: SparkSession = SparkSession.builder().master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | spark.sparkContext.setLogLevel("ERROR") 15 | 16 | val data = Seq((1,"James ","","Smith",2018,1,"M",3000), 17 | (2,"Michael ","Rose","",2010,3,"M",4000), 18 | (3,"Robert ","","Williams",2010,3,"M",4000), 19 | (4,"Maria ","Anne","Jones",2005,5,"F",4000), 20 | (5,"Jen","Mary","Brown",2010,7,"",-1) 21 | ) 22 | 23 | val columns = Seq("id","firstname","middlename","lastname","dob_year", 24 | "dob_month","gender","salary") 25 | import spark.sqlContext.implicits._ 26 | val df = data.toDF(columns:_*) 27 | 28 | df.write.json("c:/tmp/person.json") 29 | 30 | 31 | 32 | // // `from_avro` requires Avro schema in JSON string format. 33 | // val jsonFormatSchema = new String(Files.readAllBytes(Paths.get("src/main/resources/person.avsc"))) 34 | // 35 | // val df = spark 36 | // .readStream 37 | // .format("kafka") 38 | // .option("kafka.bootstrap.servers", "host1:port1,host2:port2") 39 | // .option("subscribe", "topic1") 40 | // .load() 41 | // 42 | // // 1. Decode the Avro data into a struct; 43 | // // 2. Filter by column `favorite_color`; 44 | // // 3. Encode the column `name` in Avro format. 
45 | // val output = df 46 | // .select(from_avro(col("value"), jsonFormatSchema) as "user") 47 | // .where(col("user.favorite_color") === "red") 48 | // .select(to_avro(col("user.name")) as "value") 49 | 50 | // val data = Seq (("iphone", "2007"),("iphone 3G","2008"), 51 | // ("iphone 3GS","2009"), 52 | // ("iphone 4","2010"), 53 | // ("iphone 4S","2011"), 54 | // ("iphone 5","2012"), 55 | // ("iphone 8","2014"), 56 | // ("iphone 10","2017")) 57 | // 58 | // val df = spark.createDataFrame(data).toDF("key","value") 59 | 60 | val ds = df.toJSON 61 | ds.printSchema() 62 | 63 | val query = ds 64 | .writeStream 65 | .format("kafka") 66 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 67 | .option("topic", "text_topic") 68 | .start() 69 | 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/kafka/SparkStreamingConsumeKafka.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.kafka 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{explode, split} 5 | 6 | object SparkStreamingConsumeKafka { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val spark:SparkSession = SparkSession.builder() 11 | .master("local[3]") 12 | .appName("SparkByExample") 13 | .getOrCreate() 14 | 15 | spark.sparkContext.setLogLevel("ERROR") 16 | 17 | val df = spark.readStream 18 | .format("kafka") 19 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 20 | .option("subscribe", "topic_text") 21 | //.option("subscribePattern", "topic.*") 22 | .option("startingOffsets", "earliest") // Other possible values assign and latest 23 | .load() 24 | 25 | df.printSchema() 26 | 27 | val groupCount = df.select(explode(split(df("value")," ")).alias("word")) 28 | .groupBy("word").count() 29 | 30 | groupCount.writeStream 31 | .format("console") 32 | .outputMode("complete") 33 | .start() 34 | .awaitTermination() 35 | 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/kafka/avro/KafkaConsumerAvro.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.kafka.avro 2 | import java.nio.file.{Files, Paths} 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.avro._ 5 | import org.apache.spark.sql.functions.col 6 | 7 | object KafkaConsumerAvro { 8 | def main(args: Array[String]): Unit = { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .master("local") 12 | .appName("SparkByExample.com") 13 | .getOrCreate() 14 | 15 | spark.sparkContext.setLogLevel("ERROR") 16 | 17 | val df = spark.readStream 18 | .format("kafka") 19 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 20 | .option("subscribe", "avro_topic") 21 | .option("startingOffsets", "earliest") // From starting 22 | .load() 23 | 24 | /* 25 | Prints Kafka schema with columns (topic, offset, partition e.t.c) 26 | */ 27 | df.printSchema() 28 | 29 | /* 30 | Read schema to convert Avro data to DataFrame 31 | */ 32 | val jsonFormatSchema = new String( 33 | Files.readAllBytes(Paths.get("./src/main/resources/person.avsc"))) 34 | 35 | val personDF = df.select(from_avro(col("value"), jsonFormatSchema).as("person")) 36 | .select("person.*") 37 | 38 | personDF.printSchema() 39 | 40 | /* 41 | Stream data to 
Console for testing 42 | */ 43 | personDF.writeStream 44 | .format("console") 45 | .outputMode("append") 46 | .start() 47 | .awaitTermination() 48 | 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/kafka/avro/KafkaProduceAvro.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.kafka.avro 2 | import org.apache.spark.sql.SparkSession 3 | import org.apache.spark.sql.functions.{col, from_json,to_json,struct} 4 | import org.apache.spark.sql.avro.to_avro 5 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType} 6 | object KafkaProduceAvro { 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExample.com") 12 | .getOrCreate() 13 | 14 | /* 15 | Disable logging as it writes too much log 16 | */ 17 | spark.sparkContext.setLogLevel("ERROR") 18 | 19 | /* 20 | This consumes JSON data from Kafka 21 | */ 22 | val df = spark.readStream 23 | .format("kafka") 24 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 25 | .option("subscribe", "json_topic") 26 | .option("startingOffsets", "earliest") // From starting 27 | .load() 28 | 29 | /* 30 | Prints Kafka schema with columns (topic, offset, partition e.t.c) 31 | */ 32 | df.printSchema() 33 | 34 | val schema = new StructType() 35 | .add("id",IntegerType) 36 | .add("firstname",StringType) 37 | .add("middlename",StringType) 38 | .add("lastname",StringType) 39 | .add("dob_year",IntegerType) 40 | .add("dob_month",IntegerType) 41 | .add("gender",StringType) 42 | .add("salary",IntegerType) 43 | 44 | /* 45 | Converts JSON string to DataFrame 46 | */ 47 | val personDF = df.selectExpr("CAST(value AS STRING)") // First convert binary to string 48 | .select(from_json(col("value"), schema).as("data")) 49 | 50 | 51 | personDF.printSchema() 52 | /* 53 | *uncomment below code if you want to write it to console for testing. 
54 | */ 55 | // person.select(to_json(struct("data.*")).as("value")) 56 | // .writeStream 57 | // .format("console") 58 | // .outputMode("append") 59 | // .start() 60 | // .awaitTermination() 61 | 62 | /* 63 | * Convert DataFrame columns to Avro format and name it as "value" 64 | * And send this Avro data to Kafka topic 65 | */ 66 | 67 | personDF.select(to_avro(struct("data.*")) as "value") 68 | .writeStream 69 | .format("kafka") 70 | .outputMode("append") 71 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 72 | .option("topic", "avro_topic") 73 | .option("checkpointLocation","c:/tmp") 74 | .start() 75 | .awaitTermination() 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/kafka/json/SparkStreamingConsumerKafkaJson.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.kafka.json 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{col, from_json} 5 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType} 6 | 7 | object SparkStreamingConsumerKafkaJson { 8 | 9 | def main(args: Array[String]): Unit = { 10 | 11 | val spark: SparkSession = SparkSession.builder() 12 | .master("local[3]") 13 | .appName("SparkByExample") 14 | .getOrCreate() 15 | 16 | spark.sparkContext.setLogLevel("ERROR") 17 | 18 | val df = spark.readStream 19 | .format("kafka") 20 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 21 | .option("subscribe", "json_topic") 22 | .option("startingOffsets", "earliest") // From starting 23 | .load() 24 | 25 | df.printSchema() 26 | 27 | //df.show(false) 28 | //org.apache.spark.sql.AnalysisException: Queries with streaming sources must be executed with writeStream.start();; 29 | 30 | val schema = new StructType() 31 | .add("id",IntegerType) 32 | .add("firstname",StringType) 33 | .add("middlename",StringType) 34 | .add("lastname",StringType) 35 | .add("dob_year",IntegerType) 36 | .add("dob_month",IntegerType) 37 | .add("gender",StringType) 38 | .add("salary",IntegerType) 39 | 40 | val person = df.selectExpr("CAST(value AS STRING)") 41 | .select(from_json(col("value"), schema).as("data")) 42 | .select("data.*") 43 | 44 | /** 45 | *uncomment below code if you want to write it to console for testing. 46 | */ 47 | // val query = person.writeStream 48 | // .format("console") 49 | // .outputMode("append") 50 | // .start() 51 | // .awaitTermination() 52 | 53 | /** 54 | *uncomment below code if you want to write it to kafka topic. 
55 | */ 56 | person.selectExpr("CAST(id AS STRING) AS key", "to_json(struct(*)) AS value") 57 | .writeStream 58 | .format("kafka") 59 | .outputMode("append") 60 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 61 | .option("topic", "josn_data_topic") 62 | .option("checkpointLocation", "c:/tmp/checkpoint") 63 | .start() 64 | .awaitTermination() 65 | 66 | 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /spark2.3-avro-examples/src/main/resources/person.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "Person", 4 | "namespace": "com.sparkbyexamples", 5 | "fields": [ 6 | {"name": "id","type": "int"}, 7 | {"name": "firstname","type": "string"}, 8 | {"name": "middlename","type": "string"}, 9 | {"name": "lastname","type": "string"}, 10 | {"name": "dob_year","type": "int"}, 11 | {"name": "dob_month","type": "int"}, 12 | {"name": "gender","type": "string"}, 13 | {"name": "salary","type": "int"} 14 | ] 15 | } --------------------------------------------------------------------------------
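The person.avsc above belongs to the spark2.3-avro-examples module; on Spark 2.3 the built-in Avro data source and the from_avro/to_avro helpers used in the spark-streaming module are not available (they arrived in Spark 2.4), so a schema like this is typically paired with the external Databricks spark-avro package. As a minimal, hypothetical sketch (not a file from this repository; the object name, master setting, and output path are assumptions), a DataFrame matching those fields could be round-tripped like this:

package com.sparkbyexamples.spark.dataframe.avro

import org.apache.spark.sql.SparkSession

// Hypothetical sketch for Spark 2.3: round-trip a Person DataFrame through
// the external Databricks Avro data source ("com.databricks.spark.avro").
object Spark23AvroRoundTrip {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .appName("SparkByExamples.com")
      .getOrCreate()

    import spark.sqlContext.implicits._

    // Columns mirror the fields declared in person.avsc.
    val df = Seq((1, "James ", "", "Smith", 2018, 1, "M", 3000))
      .toDF("id", "firstname", "middlename", "lastname",
        "dob_year", "dob_month", "gender", "salary")

    // Write Avro files, then read them back and print the result.
    df.write.format("com.databricks.spark.avro").mode("overwrite").save("c:/tmp/person_avro")
    spark.read.format("com.databricks.spark.avro").load("c:/tmp/person_avro").show(false)
  }
}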