├── .gitignore ├── README.md ├── scala-examples ├── pom.xml ├── scripts │ ├── Rational.scala │ ├── Spark.scala │ ├── array.scala │ ├── class.scala │ ├── date.scala │ ├── fileread.scala │ ├── list.scala │ ├── loop.scala │ ├── printsamples.scala │ ├── set.scala │ ├── test.scala │ └── tupple.scala └── src │ └── main │ └── scala │ └── com │ └── sparkbyexamples │ ├── json │ └── TestJson.scala │ ├── list │ ├── ArrayExamples.scala │ ├── LinkedListMutableExamples.scala │ ├── ListBufferExamples.scala │ └── ListExamples.scala │ └── static │ ├── MapExamples.scala │ ├── StaticExample.scala │ └── Test.scala ├── scala-kafka ├── README.md ├── pom.xml └── src │ └── main │ ├── resources │ └── person.avsc │ └── scala │ └── com │ └── sparkbyexamples │ └── kafka │ ├── KafkaConsumerAssignApp.scala │ ├── KafkaConsumerSubscribeApp.scala │ ├── KafkaProducerApp.scala │ ├── KafkaProducerJson.scala │ ├── avro │ └── KafkaProducerAvro.scala │ ├── beans │ └── User.scala │ ├── jackson │ ├── KafkaConsumerWithUserObject.scala │ ├── KafkaProducerWithUserObject.scala │ ├── UserDeserializer.scala │ └── UserSerializer.scala │ ├── json │ └── KafkaProducerJson.scala │ ├── registry │ ├── KafkaConsumerAvroRegistry.scala │ ├── KafkaProducerAvroRegistry.scala │ ├── PersonKafkaConsumerAvroRegistry.scala │ └── PersonKafkaProducerAvroRegistry.scala │ └── streams │ └── KafkaStreams.scala ├── spark-avro-examples ├── pom.xml └── src │ └── main │ ├── resources │ └── person.avsc │ └── scala │ └── com │ └── sparkbyexamples │ └── spark │ └── dataframe │ └── avro │ ├── AvroExample.scala │ └── AvroUsingNestedSchema.scala ├── spark-hive ├── pom.xml └── src │ └── main │ └── scala │ └── com │ └── sparkbyexamples │ └── HBaseWrite.scala ├── spark-kafka ├── pom.xml └── src │ └── main │ ├── resources │ └── person.avsc │ └── scala │ └── com │ └── sparkbyexamples │ └── spark │ └── kafka │ └── json │ ├── KafkaConsumerJson.scala │ └── KafkaProduceJson.scala ├── spark-sql-examples ├── pom.xml └── src │ └── main │ ├── resources │ ├── books.xml │ ├── books_withnested_array.xml │ ├── free-zipcode-database.csv │ ├── kv.csv │ ├── persons.xml │ ├── persons_complex.xml │ ├── records.xml │ ├── schema.json │ ├── stream.csv │ ├── test.txt │ ├── txt │ │ ├── alice.txt │ │ ├── datasets.csv │ │ └── holmes.txt │ ├── zipcodes-noheader.csv │ ├── zipcodes.csv │ ├── zipcodes.json │ └── zipcodes_streaming │ │ ├── zipcode1.json │ │ ├── zipcode10.json │ │ ├── zipcode11.json │ │ ├── zipcode12.json │ │ ├── zipcode2.json │ │ ├── zipcode3.json │ │ ├── zipcode4.json │ │ ├── zipcode5.json │ │ ├── zipcode6.json │ │ ├── zipcode7.json │ │ ├── zipcode8.json │ │ └── zipcode9.json │ └── scala │ └── com │ └── sparkbyexamples │ └── spark │ ├── SparkSessionTest.scala │ ├── beans │ ├── Books.scala │ ├── BooksDiscounted.scala │ ├── BooksStruct.scala │ ├── BooksWithArray.scala │ ├── User.scala │ └── Zipcode.scala │ ├── dataframe │ ├── ArrayToColumn.scala │ ├── AvroExample.scala │ ├── CaseClassSparkSchema.scala │ ├── CastColumnType.scala │ ├── CreateDataFrame.scala │ ├── CreateEmptyDataFrameExample.scala │ ├── CreateEmptyDatasetExample.scala │ ├── DataFrameWithComplexDSL.scala │ ├── DataFrameWithSimpleDSL.scala │ ├── FromCSVFile.scala │ ├── FromCSVFile2.scala │ ├── FromJsonFile.scala │ ├── ParquetExample.scala │ ├── RenameColDataFrame.scala │ ├── SQLExample.scala │ ├── SaveDataFrame.scala │ ├── StructTypeUsage.scala │ ├── UDFDataFrame.scala │ ├── WithColumn.scala │ ├── functions │ │ ├── AnotherExample.scala │ │ ├── MathFunctions.scala │ │ ├── PivotExample.scala │ │ ├── StringFunctions.scala │ │ 
├── WhenOtherwise.scala │ │ ├── WindowGroupbyFirst.scala │ │ ├── collection │ │ │ ├── ArrayOfArrayType.scala │ │ │ ├── ArrayOfMapType.scala │ │ │ ├── ArrayOfStructType.scala │ │ │ ├── ArrayTypeExample.scala │ │ │ ├── ExplodeArrayAndMap.scala │ │ │ ├── MapFunctions.scala │ │ │ └── MapTypeExample.scala │ │ ├── datetime │ │ │ ├── AddTime.scala │ │ │ ├── CurrentDateAndTime.scala │ │ │ ├── DateAddMonths.scala │ │ │ ├── DateDiff.scala │ │ │ ├── DateExamples.scala │ │ │ ├── DateFormat.scala │ │ │ ├── DateLastDay.scala │ │ │ ├── DateToString.scala │ │ │ ├── DateTrunc.scala │ │ │ ├── DayAndWeekOfYear.scala │ │ │ ├── DayWeekAndWeekMonth.scala │ │ │ ├── GetTimeFromTimestamp.scala │ │ │ ├── StringToDate.scala │ │ │ ├── StringToTimestamp.scala │ │ │ ├── TimestampDiff.scala │ │ │ ├── TimestampToDate.scala │ │ │ ├── TimestampToString.scala │ │ │ └── unixtimeExample.scala │ │ ├── from_json.scala │ │ └── litTypeLit.scala │ └── xml │ │ ├── PersonsComplexXML.scala │ │ ├── PersonsXML.scala │ │ ├── ReadBooksXMLWithNestedArray.scala │ │ ├── ReadBooksXMLWithNestedArrayStruct.scala │ │ └── xstream │ │ └── WriteXML.scala │ ├── dataset │ ├── DataSetFromData.scala │ ├── DataSetWithCustomClass.scala │ └── xml │ │ ├── ReadBooksXML.scala │ │ ├── ReadBooksXMLWithNestedArray.scala │ │ ├── ReadBooksXMLWithNestedArrayDSL.scala │ │ ├── SparkXMLUsingXstream.scala │ │ └── sparkXml.scala │ ├── rdd │ ├── CreateEmptyRDD.scala │ ├── CreateRDD.scala │ ├── OperationsOnPair.scala │ ├── OperationsOnRDD.scala │ ├── PartitionBy.scala │ ├── RDDAccumulator.scala │ ├── RDDBroadcast.scala │ ├── RDDCache.scala │ ├── RDDFromCSVFile.scala │ ├── RDDFromDataUsingParallelize.scala │ ├── RDDFromParallelizeRange.scala │ ├── RDDFromWholeTextFile.scala │ ├── RDDHadoopInputFormat.scala │ ├── RDDPersist.scala │ ├── RDDReadFilesFromDirectory.scala │ ├── RDDSaveAsObjectFile.scala │ ├── RDDSequenceFiles.scala │ ├── ReadMultipleCSVFiles.scala │ ├── ReadMultipleFiles.scala │ ├── SortBy.scala │ ├── WordCount.scala │ ├── ZipCode.scala │ └── xml │ │ └── XmlRecordReader.scala │ └── stackoverflow │ ├── AddingLiterral.scala │ ├── Test.scala │ └── Test2.scala ├── spark-streaming ├── pom.xml └── src │ └── main │ ├── resources │ ├── folder_streaming │ │ ├── zipcode1.json │ │ ├── zipcode10.json │ │ ├── zipcode11.json │ │ ├── zipcode12.json │ │ ├── zipcode2.json │ │ ├── zipcode3.json │ │ ├── zipcode4.json │ │ ├── zipcode5.json │ │ ├── zipcode6.json │ │ ├── zipcode7.json │ │ ├── zipcode8.json │ │ └── zipcode9.json │ ├── person.avsc │ └── person.json │ └── scala │ └── com │ └── sparkbyexamples │ └── spark │ └── streaming │ ├── SparkStreamingFromDirectory.scala │ ├── SparkStreamingFromSocket.scala │ ├── batch │ ├── SparkBatchConsumeFromKafka.scala │ ├── SparkBatchConsumeFromKafkaAvro.scala │ ├── SparkBatchProduceToKafka.scala │ └── SparkBatchProduceToKafkaAvro.scala │ ├── inprogress │ ├── SparkStreamingForeachRDD.scala │ ├── SparkStreamingForeachWriter.scala │ ├── SparkStreamingFromDirectoryTmp.scala │ ├── SparkStreamingKafkaProducerZipcodeObject.scala │ ├── SparkStreamingToHDFS.scala │ ├── SparkStreamingToJDBC.scala │ ├── SparkStreamingToParquetFile.scala │ ├── SparkStreamingToS3.scala │ └── SparkStreamingTwitter.scala │ └── kafka │ ├── KafkaProduceAvro.scala │ ├── SparkStreamingConsumeKafka.scala │ ├── avro │ ├── KafkaConsumerAvro.scala │ └── KafkaProduceAvro.scala │ └── json │ └── SparkStreamingConsumerKafkaJson.scala └── spark2.3-avro-examples ├── pom.xml └── src └── main ├── resources └── person.avsc └── scala └── com └── sparkbyexamples └── spark └── dataframe └── 
avro └── AvroUsingDataBricks.scala /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .metadata 3 | .cache-main 4 | .classpath 5 | .project 6 | .settings 7 | *.class 8 | *.orig 9 | *.log 10 | target/ 11 | .DS_Store 12 | *.iml 13 | scalastyle-output.xml 14 | 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Explanation of all examples present on this project are available at https://sparkbyexamples.com/ -------------------------------------------------------------------------------- /scala-examples/scripts/Rational.scala: -------------------------------------------------------------------------------- 1 | 2 | class Rational(n:Int,d:Int){ 3 | require(d!=0) 4 | val number:Int = n 5 | val denom:Int = d 6 | override def toString() = n+"/"+d 7 | def this(n:Int)=this(n,1) 8 | 9 | def add(that:Rational): Rational ={ 10 | new Rational(number*that.denom + that.number*denom,denom*that.denom) 11 | } 12 | 13 | def +(that:Rational): Rational ={ 14 | new Rational(number*that.denom + that.number*denom,denom*that.denom) 15 | } 16 | } 17 | 18 | val a = new Rational(1,2) 19 | val b = new Rational(2,3) 20 | val c = a+b 21 | println(c) -------------------------------------------------------------------------------- /scala-examples/scripts/Spark.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.sql.SparkSession 2 | 3 | object SparkTest{ 4 | 5 | def main(args:Array[String]): Unit ={ 6 | 7 | val sparkSession = SparkSession.builder().appName("Naveen").master("local[1]").getOrCreate(); 8 | 9 | println("APP Name :"+sparkSession.sparkContext.appName); 10 | println("Deploy Mode :"+sparkSession.sparkContext.deployMode); 11 | println("Master :"+sparkSession.sparkContext.master); 12 | 13 | } 14 | } -------------------------------------------------------------------------------- /scala-examples/scripts/array.scala: -------------------------------------------------------------------------------- 1 | // Arrays are mutable 2 | println("Start") 3 | 4 | var j=0 5 | while(jprintln(s)) 11 | args.foreach(println) 12 | for(i<-args){ 13 | println(i);booleanArrayOps() 14 | } 15 | 16 | for(i<-1 to 2){ 17 | println(args(i)) 18 | } 19 | val arr1=Array("one","two","three") 20 | println("Count: "+arr1.count(a=>a.length==5)) 21 | 22 | val arr:Array[String]=new Array[String](3); 23 | arr(0)="1one" 24 | arr(1)="2two" 25 | arr.update(2,"3three"); 26 | println(arr.dropRight(2).length) 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /scala-examples/scripts/class.scala: -------------------------------------------------------------------------------- 1 | import scala.collection.mutable 2 | 3 | class CheckSumAccumulator{ 4 | 5 | private var sum = 0 6 | 7 | def add(b:Byte): Unit ={ 8 | sum+=b 9 | } 10 | 11 | def calc(): Int ={ 12 | ~(sum & 0XFF)+1 13 | } 14 | } 15 | 16 | object CheckSumAccumulator123{ 17 | private val cache = mutable.Map.empty[String,Int] 18 | 19 | def calculate(str:String): Int ={ 20 | 21 | if(cache.contains(str)) 22 | cache(str) 23 | else{ 24 | val csum = new CheckSumAccumulator(); 25 | for(c<-str) 26 | csum.add(c.toByte) 27 | cache += (str -> csum.calc()) 28 | csum.calc() 29 | } 30 | } 31 | } 32 | 33 | println(CheckSumAccumulator123.calculate("Naveen")) 34 | println(CheckSumAccumulator123.calculate("Praveen")) 35 | 
println(CheckSumAccumulator123.calculate("Naveen")) 36 | -------------------------------------------------------------------------------- /scala-examples/scripts/date.scala: -------------------------------------------------------------------------------- 1 | 2 | import java.time.{LocalDate, Period} 3 | import java.time.format.DateTimeFormatter 4 | import java.time.temporal.ChronoUnit 5 | import java.util.Calendar 6 | import java.time.ZoneId 7 | val dateFormat = DateTimeFormatter.ofPattern("ddMMyyyy") 8 | dateFormat.parse("").getTime 9 | val da:LocalDate = LocalDate.parse("13041981",dateFormat) 10 | 11 | val forDate = dateFormat.parse("13041981") 12 | 13 | println("Date: "+forDate) 14 | println("Date 2: "+da.toString) 15 | 16 | val today = LocalDate.now 17 | 18 | println("Now : "+today.format(dateFormat)) 19 | 20 | println("Years:"+ChronoUnit.YEARS.between(da,today)) 21 | 22 | println("Years:"+Period.between(da,today).getYears) 23 | 24 | println("EpochDay:"+da.atStartOfDay(ZoneId.systemDefault).toInstant.getEpochSecond) 25 | println("EpochDay:"+Calendar.getInstance().getTimeInMillis) 26 | 27 | 28 | -------------------------------------------------------------------------------- /scala-examples/scripts/fileread.scala: -------------------------------------------------------------------------------- 1 | 2 | import scala.io.Source 3 | 4 | for(s<-Source.fromFile("pom.xml").getLines()) 5 | println(s) -------------------------------------------------------------------------------- /scala-examples/scripts/list.scala: -------------------------------------------------------------------------------- 1 | //Lists are immutable like java String 2 | var l1:List[String] = List[String]("1","2") 3 | //l1(0)="one" - this statement fails 4 | l1=List("one","two","three") 5 | val l2=l1 6 | l2.foreach(l=>println(l)) 7 | l2.foreach(println) 8 | for(i<-l2) 9 | println(i) 10 | for(i<-0 to 2) 11 | println(l2(i)) 12 | val l3 = "zero" :: l2 13 | 14 | l3.foreach(l=>println(l)) 15 | 16 | println("Concatenated List") 17 | val l4 = "4" :: "5" :: "6" :: Nil 18 | 19 | val l5 = l3 ::: l4 20 | 21 | for(l<-l5) 22 | println(l) 23 | val l6 = List("will","wall","until") 24 | println("All containsl letter l :" + l6.forall(l=>l.endsWith("l"))) 25 | 26 | val l7 = l6.sortWith((a,b)=>a.charAt(0) > b.charAt(0)) 27 | println("After Sorting") 28 | l7.foreach(println) 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /scala-examples/scripts/loop.scala: -------------------------------------------------------------------------------- 1 | import java.io.File 2 | 3 | import scala.io.Source 4 | 5 | var files = (new File(".")).listFiles() 6 | for(file <- files if file.getName.endsWith(".scala") if file.getName.contains("loop")) 7 | println(file) 8 | 9 | for{ 10 | file <- files 11 | if file.getName.endsWith(".scala") 12 | if file.getName.contains("test") 13 | }println(file) 14 | 15 | def lines(fileName:String):Array[String]={ 16 | Source.fromFile(fileName).getLines().toArray 17 | } 18 | 19 | def readFiles(): Unit ={ 20 | 21 | val files = new File(".").listFiles() 22 | for{ 23 | file<-files 24 | if file.getName.endsWith(".scala") 25 | line<-lines(file.getName) 26 | trimLine = line.trim 27 | if trimLine.contains("println") 28 | }println(s"$file lines "+ trimLine) 29 | 30 | } 31 | readFiles() 32 | 33 | -------------------------------------------------------------------------------- /scala-examples/scripts/printsamples.scala: 
-------------------------------------------------------------------------------- 1 | 2 | var v = 10 3 | val b = 20 4 | 5 | println(s"My first value $v and second value $b") 6 | 7 | println(s"Add ${v+b}") 8 | println("a\\b") 9 | println(raw"a\\b") 10 | printf("My first %d and second %d",v,b) -------------------------------------------------------------------------------- /scala-examples/scripts/set.scala: -------------------------------------------------------------------------------- 1 | import com.experian.edf.oxygen.utils.JsonUtils 2 | import com.google.gson.JsonObject 3 | 4 | //Set by default immutable 5 | var s1 = Set("one","two","three","four") 6 | //s1(0)="1" - this statement fails 7 | s1.foreach(println) 8 | 9 | for(s<-s1) 10 | println(s) 11 | 12 | s1 += "zero" 13 | 14 | for(s<-s1) 15 | println(s) 16 | 17 | //Map 18 | 19 | var m1 = Map("a"->"A","b"->"B") 20 | 21 | val str = "{"+Map("ss" -> "yy", "aa" -> "bb").map{case (k, v) => "\""+k + "\":" + v}.mkString(",") + "}" 22 | 23 | 24 | println("----->"+str) 25 | val str1 = m1.foreach(m=> ("--->"+m._1 + ","+m._2)) 26 | println("----->"+str1) 27 | val s5 = m1.keySet 28 | 29 | for(s<-s5) 30 | println(m1(s)) 31 | 32 | println(m1.contains("a")) 33 | 34 | m1.foreach(a=>println(a._1 +","+a._2)) 35 | 36 | for(m<-m1) 37 | printf(m._1,m._2) 38 | 39 | for((a,b)<-m1){ 40 | printf("Key %s , value %s -", a,b) 41 | } -------------------------------------------------------------------------------- /scala-examples/scripts/test.scala: -------------------------------------------------------------------------------- 1 | import java.text.SimpleDateFormat 2 | import java.util.Date 3 | 4 | val asOfDateFormat = new SimpleDateFormat("yyyyMMdd") 5 | val str = "file:/C:/Users/a03078a/Documents/DataFabric/Workspace/bureau-australia-data/DefaultListingExtract_Experian_20181008041614.txt" 6 | println(str.lastIndexOf("Experian_")) 7 | val dateStr = str.substring(str.lastIndexOf("Experian_")+9,str.lastIndexOf("Experian_")+9+8) 8 | println("Extracted date:"+dateStr) 9 | val parseDate = asOfDateFormat.parse(dateStr) 10 | println("Parsed Date:"+parseDate) 11 | val longDate = asOfDateFormat.parse(dateStr).getTime 12 | 13 | 14 | println("Date in long:"+longDate) 15 | val reformatDate:Date = new Date() 16 | reformatDate.setTime(longDate); 17 | println("Reformat Date:"+reformatDate) 18 | val daStr = asOfDateFormat.format(reformatDate) 19 | 20 | 21 | println("final Date :"+daStr) 22 | 23 | -------------------------------------------------------------------------------- /scala-examples/scripts/tupple.scala: -------------------------------------------------------------------------------- 1 | val t = ("A",1,'c') 2 | 3 | println(t._1) -------------------------------------------------------------------------------- /scala-examples/src/main/scala/com/sparkbyexamples/json/TestJson.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.json 2 | 3 | object TestJson { 4 | 5 | def main(args: Array[String]): Unit = { 6 | 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /scala-examples/src/main/scala/com/sparkbyexamples/list/ArrayExamples.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.list 2 | 3 | object ArrayExamples extends App { 4 | 5 | var list1 = Array("A","B") 6 | list1(1)="AA" //Errro 7 | val list2 = list1.map(_.toLowerCase()) 8 | 9 | println(list1.mkString(",")) 10 | list1.foreach(f=>println(f)) 11 
| list1.foreach(println(_)) 12 | 13 | println("Reading a value form Index :"+list1(1)) 14 | println("Adding element 'C' to Arrays") 15 | 16 | //list1 += "C" // re-assigning 17 | println(list1.mkString(",")) 18 | 19 | println("Adding two Arrays") 20 | var list3 = list1 ++ list2 21 | 22 | println(list3.mkString(",")) 23 | 24 | println("Adding literal to each element in Lists") 25 | val list4 = list1.map(f=>f+"->") 26 | println(list4.mkString(",")) 27 | 28 | println("Convert all list elements to Int") 29 | val list5 = List("1","2","3","4","5") 30 | println(list5.map(f=>f.toInt).mkString(",")) 31 | } 32 | -------------------------------------------------------------------------------- /scala-examples/src/main/scala/com/sparkbyexamples/list/LinkedListMutableExamples.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.list 2 | 3 | import scala.collection.mutable 4 | 5 | object LinkedListMutableExamples { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | var list1 = mutable.LinkedList("A","B") 10 | list1(0)="C" 11 | list1(1)="D" 12 | // list1 = "C" :: list1 // Error 13 | //list1 = list1 + "X" //Error 14 | //list1 ++= "" //Error 15 | // list1 += "" //Error 16 | list1.append(list1) 17 | 18 | println("Modify an element on list") 19 | list1.foreach(println(_)) 20 | 21 | println("Create list2 from list1") 22 | var list2 = list1.map(_.toLowerCase()) 23 | list2.foreach(println(_)) 24 | 25 | println("Add list to existing list") 26 | list2 ++= list1 27 | //val list7 = list1 ::: list2 //Error 28 | list2.foreach(println(_)) 29 | 30 | println("Merge list1 & list2 and create list3") 31 | //val list3 = list1 ::: list2 // Error 32 | var list3 = list1 ++ list2 33 | 34 | list3.foreach(println(_)) 35 | 36 | //Converts list to map 37 | println("Convert list to map") 38 | val list4 = list1.map(f=>(f,f.toLowerCase())) 39 | list4.foreach(f=>println(f._1+f._2)) 40 | 41 | //A ListBuffer is like an array buffer except that it uses a linked list internally instead of an array 42 | // val list3 = mutable.ListBuffer("A","B") 43 | 44 | 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /scala-examples/src/main/scala/com/sparkbyexamples/list/ListBufferExamples.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.list 2 | 3 | import scala.collection.mutable 4 | 5 | object ListBufferExamples { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | var list1 = mutable.ListBuffer("A","B") 10 | list1(0)="C" 11 | list1(1)="D" 12 | // list1 = "C" :: list1 // Error 13 | //list1 = list1 + "" //Error 14 | //list1 ++= "E" //Error 15 | // list1 = list1 + "C"// Error 16 | list1 += "B" 17 | 18 | list1.append("A") 19 | 20 | println("Modify an element on list") 21 | list1.foreach(println(_)) 22 | 23 | println("Create list2 from list1") 24 | var list2 = list1.map(_.toLowerCase()) 25 | list2.appendAll(list1) 26 | list2.foreach(println(_)) 27 | 28 | println("Add list to existing list") 29 | list2 ++= list1 30 | //val list7 = list1 ::: list2 //Error 31 | list2.foreach(println(_)) 32 | 33 | println("Merge list1 & list2 and create list3") 34 | //val list3 = list1 ::: list2 // Error 35 | val list3 = list1 ++ list2 36 | list3.foreach(println(_)) 37 | 38 | //Converts list to map 39 | println("Convert list to map") 40 | val list4 = list1.map(f=>(f,f.toLowerCase())) 41 | list4.foreach(f=>println(f._1+f._2)) 42 | 43 | //A ListBuffer is like an array buffer except 
that it uses a linked list internally instead of an array 44 | // val list3 = mutable.ListBuffer("A","B") 45 | 46 | 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /scala-examples/src/main/scala/com/sparkbyexamples/list/ListExamples.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.list 2 | 3 | object ListExamples { 4 | 5 | def main(args:Array[String]) { 6 | 7 | var list1 = List("A","B") 8 | //list1(1)="AA" //Errro 9 | val list2 = list1.map(_.toLowerCase()) 10 | 11 | println(list1.mkString(",")) 12 | list1.foreach(f=>println(f)) 13 | list1.foreach(println(_)) 14 | 15 | println("Reading a value form Index :"+list1(1)) 16 | println("Adding element 'C' to List") 17 | //list1 += "D" 18 | list1 = "C" :: list1 // re-assigning 19 | println(list1.mkString(",")) 20 | 21 | println("Adding two Lists") 22 | var list3 = list1 ::: list2 23 | 24 | list3 :::= list2 25 | println(list3.mkString(",")) 26 | 27 | println("Adding literal to each element in Lists") 28 | val list4 = list1.map(f=>f+"->") 29 | println(list4.mkString(",")) 30 | 31 | println("Convert all list elements to Int") 32 | val list5 = List("1","2","3","4","5") 33 | println(list5.map(f=>f.toInt).mkString(",")) 34 | 35 | 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /scala-examples/src/main/scala/com/sparkbyexamples/static/MapExamples.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.static 2 | 3 | object MapExamples { 4 | 5 | 6 | } 7 | -------------------------------------------------------------------------------- /scala-examples/src/main/scala/com/sparkbyexamples/static/StaticExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples 2 | 3 | class StaticExample(className1:String) { 4 | 5 | private val className = className1 6 | 7 | def printObjectName(): Unit ={ 8 | println(StaticExample.objectName); 9 | } 10 | 11 | def getValue():String = { 12 | return StaticExample.objectName 13 | } 14 | def getClassName():String = { 15 | return className 16 | } 17 | 18 | 19 | } 20 | 21 | object StaticExample { 22 | 23 | private val objectName = " class name Static Example" 24 | val objectNamePublic = "public variable" 25 | var singletone:Option[String] = None 26 | 27 | def create(): Unit ={ 28 | if(singletone == None){ 29 | singletone = Some("value") 30 | } 31 | } 32 | 33 | def getClassName(staticExample:StaticExample) : String = { 34 | return staticExample.className 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /scala-examples/src/main/scala/com/sparkbyexamples/static/Test.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples 2 | 3 | object Test { 4 | 5 | def main(args:Array[String]): Unit ={ 6 | 7 | val O:Option[Any] = None 8 | 9 | StaticExample.singletone match { 10 | case None => println ("nome") 11 | case Some(_) => println(StaticExample.singletone.get) 12 | } 13 | 14 | StaticExample.create() 15 | println(StaticExample.objectNamePublic) 16 | 17 | StaticExample.singletone match { 18 | case None => println ("nome") 19 | case _ => println(StaticExample.singletone.get) 20 | } 21 | 22 | val staticExample: StaticExample = new StaticExample("My Name is Naveen") 23 | val staticExample2: StaticExample = new StaticExample("My Name 
is Prabha") 24 | println("staticExample.getClassName() : "+staticExample.getClassName()) 25 | println("staticExample2.getClassName() : "+staticExample2.getClassName()) 26 | 27 | println("staticExample.getValue() : "+staticExample.getValue()) 28 | 29 | println( StaticExample.getClassName(staticExample)) 30 | 31 | 32 | staticExample.printObjectName() 33 | 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /scala-kafka/README.md: -------------------------------------------------------------------------------- 1 | Apache Kafka producer and consumer example in scala 2 | -------------------------------------------------------------------------------- /scala-kafka/src/main/resources/person.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "Person", 4 | "namespace": "com.sparkbyexamples", 5 | "fields": [ 6 | {"name": "id","type": "int"}, 7 | {"name": "firstname","type": "string"}, 8 | {"name": "middlename","type": "string"}, 9 | {"name": "lastname","type": "string"}, 10 | {"name": "dob_year","type": "int"}, 11 | {"name": "dob_month","type": "int"}, 12 | {"name": "gender","type": "string"}, 13 | {"name": "salary","type": "int"} 14 | ] 15 | } -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/KafkaConsumerAssignApp.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.kafka 2 | 3 | import java.util 4 | import java.util.Properties 5 | import java.util.regex.Pattern 6 | 7 | import org.apache.kafka.clients.consumer.KafkaConsumer 8 | import org.apache.kafka.common.TopicPartition 9 | 10 | import scala.collection.JavaConverters._ 11 | 12 | object KafkaConsumerAssignApp { 13 | 14 | def main(args: Array[String]): Unit = { 15 | 16 | val prop:Properties = new Properties() 17 | prop.put("bootstrap.servers","192.168.1.100:9092") 18 | prop.put("key.deserializer","org.apache.kafka.common.serialization.StringDeserializer") 19 | prop.put("value.deserializer","org.apache.kafka.common.serialization.StringDeserializer") 20 | 21 | val consumer = new KafkaConsumer(prop) 22 | 23 | val tp1 = new TopicPartition("topic_text",1) 24 | val tp2 = new TopicPartition("my_topic_partition",1) 25 | 26 | val topics = List[TopicPartition](tp1,tp2) 27 | consumer.assign(topics.asJava) 28 | while(true){ 29 | 30 | val records = consumer.poll(10) 31 | for(record<-records.asScala){ 32 | 33 | println("Key: "+record.key() +", Value: "+record.value() +", Offset: "+record.offset() ) 34 | 35 | } 36 | } 37 | 38 | consumer.close()// close in finally block 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/KafkaConsumerSubscribeApp.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.kafka 2 | import java.util.{Collections, Properties} 3 | import java.util.regex.Pattern 4 | 5 | import org.apache.kafka.clients.consumer.KafkaConsumer 6 | 7 | import scala.collection.JavaConverters._ 8 | object KafkaConsumerSubscribeApp extends App { 9 | 10 | val props:Properties = new Properties() 11 | props.put("group.id", "test") 12 | props.put("bootstrap.servers","192.168.1.128:9092") 13 | props.put("key.deserializer","org.apache.kafka.common.serialization.StringDeserializer") 14 | 
props.put("value.deserializer","org.apache.kafka.common.serialization.StringDeserializer") 15 | props.put("enable.auto.commit", "true") 16 | props.put("auto.commit.interval.ms", "1000") 17 | val consumer = new KafkaConsumer(props) 18 | val topics = List("topic_text") 19 | try { 20 | consumer.subscribe(topics.asJava) 21 | //consumer.subscribe(Collections.singletonList("topic_partition")) 22 | //consumer.subscribe(Pattern.compile("topic_partition")) 23 | while (true) { 24 | val records = consumer.poll(10) 25 | for (record <- records.asScala) { 26 | println("Topic: " + record.topic() + ", Key: " + record.key() + ", Value: " + record.value() + 27 | ", Offset: " + record.offset() + ", Partition: " + record.partition()) 28 | } 29 | } 30 | }catch{ 31 | case e:Exception => e.printStackTrace() 32 | }finally { 33 | consumer.close() 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/KafkaProducerApp.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.kafka 2 | import java.util.Properties 3 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 4 | object KafkaProducerApp extends App { 5 | 6 | val props:Properties = new Properties() 7 | props.put("bootstrap.servers","192.168.1.128:9092") 8 | props.put("key.serializer","org.apache.kafka.common.serialization.StringSerializer") 9 | props.put("value.serializer","org.apache.kafka.common.serialization.StringSerializer") 10 | props.put("acks","all") 11 | 12 | val producer = new KafkaProducer[String, String](props) 13 | val topic = "text_topic" 14 | 15 | try { 16 | for (i <- 0 to 15) { 17 | val record = new ProducerRecord[String, String](topic, i.toString, "My Site is sparkbyexamples.com " + i) 18 | val metadata = producer.send(record) 19 | printf(s"sent record(key=%s value=%s) " + 20 | "meta(partition=%d, offset=%d)\n", 21 | record.key(), record.value(), metadata.get().partition(), 22 | metadata.get().offset()) 23 | } 24 | }catch{ 25 | case e:Exception => e.printStackTrace() 26 | }finally { 27 | producer.close() 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/KafkaProducerJson.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.kafka 2 | 3 | object KafkaProducerJson_ { 4 | 5 | def main(args: Array[String]): Unit = { 6 | 7 | 8 | 9 | 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/avro/KafkaProducerAvro.scala: -------------------------------------------------------------------------------- 1 | //import java.util.{Properties, UUID} 2 | // 3 | //import org.apache.avro.Schema 4 | //import org.apache.avro.Schema.Parser 5 | //import domain.User 6 | //import org.apache.avro.generic.GenericData 7 | //import org.apache.avro.generic.GenericRecord 8 | //import org.apache.avro.specific.SpecificDatumWriter 9 | //import java.io.ByteArrayOutputStream 10 | // 11 | //import org.apache.avro.io._ 12 | //import kafka.producer.{KeyedMessage, Producer, ProducerConfig} 13 | // 14 | //import scala.io.Source 15 | // 16 | //class KafkaProducer() { 17 | // 18 | // private val props = new Properties() 19 | // 20 | // props.put("metadata.broker.list", "localhost:9092") 21 | // props.put("message.send.max.retries", "5") 22 | 
// props.put("request.required.acks", "-1") 23 | // props.put("serializer.class", "kafka.serializer.DefaultEncoder") 24 | // props.put("client.id", UUID.randomUUID().toString()) 25 | // 26 | // private val producer = new Producer[String, Array[Byte]](new ProducerConfig(props)) 27 | // 28 | // //Read avro schema file 29 | // val schema: Schema = new Parser().parse(Source.fromURL(getClass.getResource("/schema.avsc")).mkString) 30 | // 31 | // // Create avro generic record object 32 | // val genericUser: GenericRecord = new GenericData.Record(schema) 33 | // 34 | // //Put data in that generic record 35 | // genericUser.put("id", "1") 36 | // genericUser.put("name", "sushil") 37 | // genericUser.put("email", null) 38 | // 39 | // // Serialize generic record into byte array 40 | // val writer = new SpecificDatumWriter[GenericRecord](schema) 41 | // val out = new ByteArrayOutputStream() 42 | // val encoder: BinaryEncoder = EncoderFactory.get().binaryEncoder(out, null) 43 | // writer.write(genericUser, encoder) 44 | // encoder.flush() 45 | // out.close() 46 | // 47 | // val serializedBytes: Array[Byte] = out.toByteArray() 48 | // 49 | // val queueMessage = new KeyedMessage[String, Array[Byte]](topic, serializedBytes) 50 | // producer.send(queueMessage) -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/beans/User.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.kafka.beans 2 | 3 | class User() { 4 | private var name:String = "" 5 | private var age:Int = 0 6 | 7 | def this(name: String, age: Int) { 8 | this() 9 | this.name =name 10 | this.age = age 11 | } 12 | 13 | def getName: String = this.name 14 | 15 | def getAge: Int = this.age 16 | 17 | override def toString: String = "User(" + name + ", " + age + ")" 18 | } 19 | -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/jackson/KafkaConsumerWithUserObject.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.kafka.jackson 2 | import java.util.Properties 3 | import com.sparkbyexamples.kafka.beans.User 4 | import org.apache.kafka.clients.consumer.KafkaConsumer 5 | import scala.collection.JavaConverters._ 6 | object KafkaConsumerWithUserObject extends App { 7 | val prop:Properties = new Properties() 8 | prop.put("group.id", "test") 9 | prop.put("bootstrap.servers","192.168.1.100:9092") 10 | prop.put("key.deserializer","org.apache.kafka.common.serialization.StringDeserializer") 11 | prop.put("value.deserializer","com.sparkbyexamples.kafka.jackson.UserDeserializer") 12 | prop.put("enable.auto.commit", "true") 13 | prop.put("auto.commit.interval.ms", "1000") 14 | val consumer = new KafkaConsumer[String,User](prop) 15 | val topics = List("user_user") 16 | try{ 17 | consumer.subscribe(topics.asJava) 18 | while(true){ 19 | val records = consumer.poll(10) 20 | for(record<-records.asScala){ 21 | println("Topic: "+record.topic()+", Key: "+record.key() +", Value: "+record.value().getName + 22 | ", Offset: "+record.offset() +", Partition: "+record.partition()) 23 | } 24 | } 25 | }catch{ 26 | case e:Exception => e.printStackTrace() 27 | }finally { 28 | consumer.close() 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/jackson/KafkaProducerWithUserObject.scala: 
-------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.kafka.jackson 2 | import java.util.Properties 3 | 4 | import com.sparkbyexamples.kafka.beans.User 5 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 6 | import org.apache.kafka.common.serialization.StringSerializer 7 | object KafkaProducerWithUserObject { 8 | val props:Properties = new Properties() 9 | props.put("bootstrap.servers","192.168.1.100:9092") 10 | props.put("key.serializer","org.apache.kafka.common.serialization.StringSerializer") 11 | props.put("value.serializer","com.sparkbyexamples.kafka.jackson.UserSerializer") 12 | props.put("acks","all") 13 | val producer = new KafkaProducer[String, User](props) 14 | try{ 15 | for(i <- 0 to 100) { 16 | val user = new User("My Name - "+i,i) 17 | val record = new ProducerRecord[String, User]("user_topic",i.toString,user) 18 | val metadata = producer.send(record) 19 | printf(s"sent record(key=%s value=%s) " + 20 | "meta(partition=%d, offset=%d)\n", 21 | record.key(), record.value(), metadata.get().partition(), 22 | metadata.get().offset()); 23 | } 24 | 25 | }catch{ 26 | case e:Exception => e.printStackTrace() 27 | }finally { 28 | producer.close() 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/jackson/UserDeserializer.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.kafka.jackson 2 | 3 | import java.util 4 | 5 | import com.sparkbyexamples.kafka.beans.User 6 | import org.apache.kafka.common.serialization.Deserializer 7 | import org.codehaus.jackson.map.ObjectMapper 8 | 9 | class UserDeserializer extends Deserializer[User] { 10 | override def configure(map: util.Map[String, _], b: Boolean): Unit = { 11 | } 12 | 13 | override def deserialize(s: String, bytes: Array[Byte]): User = { 14 | val mapper = new ObjectMapper() 15 | val user = mapper.readValue(bytes, classOf[User]) 16 | user 17 | } 18 | 19 | override def close(): Unit = { 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/jackson/UserSerializer.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.kafka.jackson 2 | 3 | import java.util 4 | 5 | import com.sparkbyexamples.kafka.beans.User 6 | import org.apache.kafka.common.serialization.Serializer 7 | import org.codehaus.jackson.map.ObjectMapper 8 | 9 | class UserSerializer extends Serializer[User]{ 10 | 11 | override def configure(map: util.Map[String, _], b: Boolean): Unit = { 12 | } 13 | 14 | override def serialize(s: String, t: User): Array[Byte] = { 15 | if(t==null) 16 | null 17 | else 18 | { 19 | val objectMapper = new ObjectMapper() 20 | objectMapper.writeValueAsString(t).getBytes 21 | } 22 | } 23 | 24 | override def close(): Unit = { 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/json/KafkaProducerJson.scala: -------------------------------------------------------------------------------- 1 | //package com.sparkbyexamples.kafka.json 2 | // 3 | //import java.util.Properties 4 | // 5 | //import org.apache.kafka.clients.producer.KafkaProducer 6 | // 7 | //object KafkaProducerJson { 8 | // 9 | // def main(args: Array[String]): Unit = { 10 | // 11 | // val 
props:Properties = new Properties() 12 | // props.put("bootstrap.servers","192.168.1.128:9092") 13 | // props.put("key.serializer","org.apache.kafka.common.serialization.StringSerializer") 14 | // props.put("value.serializer","org.apache.kafka.common.serialization.StringSerializer") 15 | // props.put("acks","all") 16 | // 17 | // val producer = new KafkaProducer[String, String](props) 18 | // val topic = "text_topic" 19 | // 20 | // 21 | // } 22 | //} 23 | -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/registry/KafkaConsumerAvroRegistry.scala: -------------------------------------------------------------------------------- 1 | //package com.sparkbyexamples.kafka.avro 2 | // 3 | //import java.util.Properties 4 | // 5 | //import org.apache.kafka.clients.consumer.ConsumerConfig 6 | //import org.apache.kafka.clients.consumer.KafkaConsumer 7 | //import java.util 8 | //import scala.collection.JavaConversions._ 9 | // 10 | //object KafkaConsumerAvroRegistry_ { 11 | // 12 | // def main(args: Array[String]): Unit = { 13 | // 14 | // val props = new Properties() 15 | // props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092") 16 | // props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG,"io.confluent.kafka.serializers.StringDeserializer.class") 17 | // props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,"io.confluent.kafka.serializers.KafkaAvroDeserializer.class") 18 | // 19 | // props.put(ConsumerConfig.GROUP_ID_CONFIG, "group1") 20 | // props.put("schema.registry.url", "http://localhost:8081"); 21 | // 22 | // val topic = "avro_topic" 23 | // val consumer = new KafkaConsumer[String, String](props) 24 | // consumer.subscribe(util.Arrays.asList(topic)) 25 | // while ({true}) { 26 | // val records = consumer.poll(100) 27 | // 28 | // for (record <- records) { 29 | // //System.out.printf("offset = %d, key = %s, value = %s \n", record.offset, record.key, record.value) 30 | // } 31 | // } 32 | // } 33 | //} -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/registry/KafkaProducerAvroRegistry.scala: -------------------------------------------------------------------------------- 1 | //package com.sparkbyexamples.kafka.avro 2 | // 3 | //import java.util.Properties 4 | // 5 | //import org.apache.avro.Schema 6 | //import org.apache.avro.generic.{GenericData, GenericRecord} 7 | //import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord} 8 | // 9 | //object KafkaProducerAvroRegistry_ { 10 | // 11 | // def main(args: Array[String]): Unit = { 12 | // 13 | // val props = new Properties() 14 | // props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092") 15 | // props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,"io.confluent.kafka.serializers.StringSerializer.class") 16 | // props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,"io.confluent.kafka.serializers.KafkaAvroSerializer.class"); 17 | // props.put("schema.registry.url", "http://localhost:8081"); 18 | // val producer = new KafkaProducer[Object, Object](props); 19 | // 20 | // val key = "key1"; 21 | // val userSchema = "{\"type\":\"record\"," + 22 | // "\"name\":\"myrecord\"," + 23 | // "\"fields\":[{\"name\":\"f1\",\"type\":\"string\"}]}"; 24 | // val parser = new Schema.Parser(); 25 | // val schema = parser.parse(userSchema); 26 | // val avroRecord:GenericRecord = new GenericData.Record(schema); 27 | // avroRecord.put("f1", 
"value1"); 28 | // 29 | // val record:ProducerRecord[Object, Object] = new ProducerRecord[Object, Object]("avro_topic", key, avroRecord); 30 | // 31 | // producer.send(record); 32 | // 33 | // } 34 | //} 35 | -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/registry/PersonKafkaConsumerAvroRegistry.scala: -------------------------------------------------------------------------------- 1 | //package com.sparkbyexamples.kafka.avro 2 | // 3 | //import java.util 4 | //import java.util.Properties 5 | // 6 | //import org.apache.kafka.clients.consumer.{ConsumerConfig, KafkaConsumer} 7 | // 8 | //import scala.collection.JavaConversions._ 9 | // 10 | //object PersonKafkaConsumerAvroRegistry_ { 11 | // 12 | // def main(args: Array[String]): Unit = { 13 | // 14 | // val props = new Properties() 15 | // props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092") 16 | // props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG,"io.confluent.kafka.serializers.StringDeserializer.class") 17 | // props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,"io.confluent.kafka.serializers.KafkaAvroDeserializer.class") 18 | // 19 | // props.put(ConsumerConfig.GROUP_ID_CONFIG, "group1") 20 | // props.put("schema.registry.url", "http://localhost:8081"); 21 | // 22 | // val topic = "avro_topic" 23 | // val consumer = new KafkaConsumer[String, String](props) 24 | // consumer.subscribe(util.Arrays.asList(topic)) 25 | // while ({true}) { 26 | // val records = consumer.poll(100) 27 | // 28 | // for (record <- records) { 29 | // //System.out.printf("offset = %d, key = %s, value = %s \n", record.offset, record.key, record.value) 30 | // } 31 | // } 32 | // } 33 | //} -------------------------------------------------------------------------------- /scala-kafka/src/main/scala/com/sparkbyexamples/kafka/registry/PersonKafkaProducerAvroRegistry.scala: -------------------------------------------------------------------------------- 1 | //package com.sparkbyexamples.kafka.avro 2 | // 3 | //import java.util.Properties 4 | // 5 | //import org.apache.avro.Schema 6 | //import org.apache.avro.generic.GenericRecord 7 | //import org.apache.avro.generic.GenericRecordBuilder 8 | //import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord} 9 | //import java.io.{ByteArrayOutputStream, File} 10 | // 11 | //import org.apache.avro.io.{BinaryEncoder, EncoderFactory} 12 | //import org.apache.avro.specific.SpecificDatumWriter 13 | // 14 | // 15 | // 16 | //object PersonKafkaProducerAvroRegistry { 17 | // 18 | // def main(args: Array[String]): Unit = { 19 | // 20 | // val props = new Properties() 21 | // props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.1.100:9092") 22 | // props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,"io.confluent.kafka.serializers.StringSerializer.class") 23 | // props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,"io.confluent.kafka.serializers.KafkaAvroSerializer.class"); 24 | // // props.put("serializer.class", "kafka.serializer.DefaultEncoder") 25 | // 26 | // val producer = new KafkaProducer[Object, Object](props); 27 | // 28 | // val key = "key1"; 29 | // 30 | // 31 | // val parser = new Schema.Parser(); 32 | // val schema = parser.parse(new File("src/main/resources/person.avsc")); 33 | // val genericRecordBuilder = new GenericRecordBuilder(schema) 34 | // 35 | // 36 | // val avroPerson = genericRecordBuilder 37 | // .set("firstName", "My First Name") 38 | // .set("lastName", "My 
last Name") 39 | // .set("birthDate", "My Date of Birth") 40 | // .build() 41 | // 42 | //// val writer = new SpecificDatumWriter[GenericRecord](schema) 43 | //// val out = new ByteArrayOutputStream() 44 | //// val encoder: BinaryEncoder = EncoderFactory.get().binaryEncoder(out, null) 45 | //// writer.write(avroPerson, encoder) 46 | //// encoder.flush() 47 | //// out.close() 48 | //// val serializedBytes: Array[Byte] = out.toByteArray() 49 | // 50 | // val record:ProducerRecord[Object, Object] = new ProducerRecord[Object, Object]("avro_topic", key, avroPerson); 51 | // 52 | // producer.send(record); 53 | // 54 | // } 55 | //} 56 | -------------------------------------------------------------------------------- /spark-avro-examples/src/main/resources/person.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "Person", 4 | "namespace": "com.sparkbyexamples", 5 | "fields": [ 6 | {"name": "id","type": "int"}, 7 | {"name": "firstname","type": "string"}, 8 | {"name": "middlename","type": "string"}, 9 | {"name": "lastname","type": "string"}, 10 | {"name": "dob_year","type": "int"}, 11 | {"name": "dob_month","type": "int"}, 12 | {"name": "gender","type": "string"}, 13 | {"name": "salary","type": "int"} 14 | ] 15 | } -------------------------------------------------------------------------------- /spark-avro-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/avro/AvroExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.avro 2 | 3 | import java.io.File 4 | import org.apache.avro.Schema 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.{SaveMode, SparkSession} 7 | /** 8 | * Spark Avro library example 9 | * Avro schema example 10 | * Avro file format 11 | * 12 | */ 13 | object AvroExample { 14 | 15 | def main(args: Array[String]): Unit = { 16 | 17 | val spark: SparkSession = SparkSession.builder().master("local[1]") 18 | .appName("SparkByExamples.com") 19 | .getOrCreate() 20 | 21 | val data = Seq((1,"James ", "", "Smith", 2018, 1, "M", 3000), 22 | (2,"Michael ", "Rose", "", 2010, 3, "M", 4000), 23 | (3,"Robert ", "", "Williams", 2010, 3, "M", 4000), 24 | (4,"Maria ", "Anne", "Jones", 2005, 5, "F", 4000), 25 | (5,"Jen", "Mary", "Brown", 2010, 7, "", -1) 26 | ) 27 | 28 | val columns = Seq("firstname", "middlename", "lastname", "dob_year", 29 | "dob_month", "gender", "salary") 30 | import spark.sqlContext.implicits._ 31 | val df = data.toDF(columns: _*) 32 | 33 | /** 34 | * Write Avro File 35 | */ 36 | df.write.format("avro") 37 | .mode(SaveMode.Overwrite) 38 | .save("C:/tmp/spark_out/avro/person.avro") 39 | 40 | /** 41 | * Read Avro File 42 | */ 43 | spark.read.format("avro").load("C:/tmp/spark_out/avro/person.avro").show() 44 | 45 | /** 46 | * Write Avro Partition 47 | */ 48 | df.write.partitionBy("dob_year","dob_month") 49 | .format("avro") 50 | .mode(SaveMode.Overwrite) 51 | .save("C:/tmp/spark_out/avro/person_partition.avro") 52 | 53 | /** 54 | * Reading Avro Partition 55 | */ 56 | spark.read 57 | .format("avro") 58 | .load("C:/tmp/spark_out/avro/person_partition.avro") 59 | .where(col("dob_year") === 2010) 60 | .show() 61 | 62 | /** 63 | * Explicit Avro schema 64 | */ 65 | val schemaAvro = new Schema.Parser() 66 | .parse(new File("src/main/resources/person.avsc")) 67 | 68 | spark.read 69 | .format("avro") 70 | .option("avroSchema", schemaAvro.toString) 71 | .load("C:/tmp/spark_out/avro/person.avro") 
72 | .show() 73 | 74 | /** 75 | * Avro Spark SQL 76 | */ 77 | spark.sqlContext.sql("CREATE TEMPORARY VIEW PERSON USING avro OPTIONS (path \"C:/tmp/spark_out/avro/person.avro\")") 78 | spark.sqlContext.sql("SELECT * FROM PERSON").show() 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /spark-avro-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/avro/AvroUsingNestedSchema.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.avro 2 | 3 | object AvroUsingNestedSchema_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-hive/src/main/scala/com/sparkbyexamples/HBaseWrite.scala: -------------------------------------------------------------------------------- 1 | //package com.sparkbyexamples 2 | // 3 | //import org.apache.spark.sql.SparkSession 4 | // 5 | //object HBaseWrite { 6 | // 7 | // def main(args: Array[String]): Unit = { 8 | // 9 | // val spark:SparkSession = SparkSession.builder() 10 | // .master("local[3]") 11 | // .appName("SparkByExample") 12 | // .getOrCreate() 13 | // 14 | // //Chaining multiple options 15 | // val df = spark.read. 16 | // options(Map("inferSchema"->"true","sep"->",","header"->"true")) 17 | // .csv("src/main/resources/zipcodes.csv") 18 | // df.show(false) 19 | // df.printSchema() 20 | // 21 | // def catalog = s"""{ 22 | // |"table":{"namespace":"default", "name":"Zipcode"}, 23 | // |"rowkey":"key", 24 | // |"columns":{ 25 | // |"RecordNumber":{"cf":"rowkey", "col":"RecordNumber", "type":"string"}, 26 | // |"Zipcode":{"cf":"ZipcodeCF", "col":"Zipcode", "type":"string"}, 27 | // |"ZipCodeType":{"cf":"ZipcodeCF", "col":"ZipCodeType", "type":"string"}, 28 | // |"City":{"cf":"ZipcodeCF", "col":"City", "type":"string"}, 29 | // |"State":{"cf":"ZipcodeCF", "col":"State", "type":"string"}, 30 | // |"LocationType":{"cf":"ZipcodeCF", "col":"LocationType", "type":"string"}, 31 | // |"Lat":{"cf":"ZipcodeCF", "col":"Lat", "type":"string"}, 32 | // |"Long":{"cf":"ZipcodeCF", "col":"Long", "type":"string"}, 33 | // |"Xaxis":{"cf":"ZipcodeCF", "col":"Xaxis", "type":"string"}, 34 | // |"Yaxis":{"cf":"ZipcodeCF", "col":"Yaxis", "type":"string"}, 35 | // |"Zaxis":{"cf":"ZipcodeCF", "col":"Zaxis", "type":"string"}, 36 | // |"WorldRegion":{"cf":"ZipcodeCF", "col":"WorldRegion", "type":"string"}, 37 | // |"Country":{"cf":"ZipcodeCF", "col":"Country", "type":"string"}, 38 | // |"LocationText":{"cf":"ZipcodeCF", "col":"LocationText", "type":"string"} 39 | // |} 40 | // |}""".stripMargin 41 | // 42 | // df.write 43 | // .option(HBaseTableCatalog.tableCatalog, catalog) 44 | // .option(HBaseTableCatalog.newTable, "5") 45 | // .format("org.apache.spark.sql.execution.datasources.hbase") 46 | // .save() 47 | // 48 | // } 49 | //} 50 | -------------------------------------------------------------------------------- /spark-kafka/src/main/resources/person.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "Person", 4 | "namespace": "com.sparkbyexamples", 5 | "fields": [ 6 | {"name": "firstname","type": "string"}, 7 | {"name": "middlename","type": "string"}, 8 | {"name": "lastname","type": "string"}, 9 | {"name": "dob_year","type": "int"}, 10 | {"name": "dob_month","type": "int"}, 11 | {"name": "gender","type": "string"}, 12 | {"name": "salary","type": "int"} 13 | ] 14 | } 
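The spark-kafka module ships the same Person Avro schema as the scala-kafka and spark-avro modules above, minus the id field. As a rough sketch of how a record matching this schema can be assembled with the standard Avro GenericRecordBuilder API (the same builder the commented-out registry producers use): the object name and the field values below are illustrative only, not code from the repository.

import java.io.File
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericRecord, GenericRecordBuilder}

object PersonRecordSketch {
  def main(args: Array[String]): Unit = {
    // Parse the schema shipped under src/main/resources
    val schema: Schema = new Schema.Parser().parse(new File("src/main/resources/person.avsc"))

    // Populate every field the schema declares; build() fails if a field without a default is left unset
    // (the names match the avsc above, the values are made up for illustration)
    val person: GenericRecord = new GenericRecordBuilder(schema)
      .set("firstname", "James")
      .set("middlename", "")
      .set("lastname", "Smith")
      .set("dob_year", 1980)
      .set("dob_month", 1)
      .set("gender", "M")
      .set("salary", 3000)
      .build()

    println(person)
  }
}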
-------------------------------------------------------------------------------- /spark-kafka/src/main/scala/com/sparkbyexamples/spark/kafka/json/KafkaConsumerJson.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.kafka.json 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object KafkaConsumerJson { 6 | def main(args:Array[String]): Unit = { 7 | 8 | val spark: SparkSession = SparkSession.builder().master("local[1]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | // Batch read from Kafka: printSchema() and show() are only valid on a non-streaming DataFrame 13 | val df = spark 14 | .read 15 | .format("kafka") 16 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 17 | .option("subscribe", "topic1") 18 | .load() 19 | 20 | df.printSchema() 21 | 22 | df.show() 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /spark-kafka/src/main/scala/com/sparkbyexamples/spark/kafka/json/KafkaProduceJson.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.kafka.json 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object KafkaProduceJson { 6 | 7 | def main(args:Array[String]): Unit ={ 8 | 9 | val spark: SparkSession = SparkSession.builder().master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | 13 | val data = Seq((1,"James ","","Smith",2018,1,"M",3000), 14 | (2,"Michael ","Rose","",2010,3,"M",4000), 15 | (3,"Robert ","","Williams",2010,3,"M",4000), 16 | (4,"Maria ","Anne","Jones",2005,5,"F",4000), 17 | (5,"Jen","Mary","Brown",2010,7,"",-1) 18 | ) 19 | 20 | val columns = Seq("id","firstname","middlename","lastname","dob_year", 21 | "dob_month","gender","salary") 22 | import spark.sqlContext.implicits._ 23 | val df = data.toDF(columns:_*) 24 | 25 | // toJSON returns a Dataset[String] whose single column is named "value", which is what the Kafka sink expects 26 | val ds = df.toJSON 27 | ds.printSchema() 28 | 29 | // The Dataset is static, so write it to Kafka as a batch instead of starting a streaming query 30 | ds.write 31 | .format("kafka") 32 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 33 | .option("topic", "text_topic") 34 | .save() 35 | 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/kv.csv: -------------------------------------------------------------------------------- 1 | key,value 2 | record1,My Name is Naveen 3 | record2,My Name is Praveen 4 | record3,My Name is Prabha -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/persons.xml: -------------------------------------------------------------------------------- 1 | <persons> 2 | <person> 3 | <firstname>James</firstname> 4 | <lastname>Smith</lastname> 5 | <middlename></middlename> 6 | <dob_year>1980</dob_year> 7 | <dob_month>1</dob_month> 8 | <gender>M</gender> 9 | <salary>10000</salary> 10 | <addresses> 11 | <address>
12 | <addressLine>123 ABC street</addressLine> 13 | <city>NewJersy</city> 14 | <state>NJ</state> 15 | </address> 16 | <address> 17 | <addressLine>456 apple street</addressLine> 18 | <city>newark</city> 19 | <state>DE</state> 20 | </address> 21 | </addresses> 22 | </person> 23 | <person> 24 | <firstname>Michael</firstname> 25 | <lastname></lastname> 26 | <middlename>Rose</middlename> 27 | <dob_year>1990</dob_year> 28 | <dob_month>6</dob_month> 29 | <gender>M</gender> 30 | <salary>10000</salary> 31 | <addresses> 32 | <address> 33 | <addressLine>4512 main st</addressLine> 34 | <city>new york</city> 35 | <state>NY</state> 36 | </address> 37 | <address> 38 | <addressLine>4367 orange st</addressLine> 39 | <city>sandiago</city> 40 | <state>CA</state> 41 | </address> 42 | </addresses> 43 | </person> 44 | </persons> 45 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/persons_complex.xml: -------------------------------------------------------------------------------- 1 | <persons> 2 | <person> 3 | <firstname>James</firstname> 4 | <lastname>Smith</lastname> 5 | <middlename></middlename> 6 | <dob_year>1980</dob_year> 7 | <dob_month>1</dob_month> 8 | <gender>M</gender> 9 | <salary>10000</salary> 10 | <addresses> 11 | <address> 12 | <addressLine>1 capler dr</addressLine> 13 | <city>new york</city> 14 | <state>NY</state> 15 | </address> 16 | <address> 17 | <addressLine>455 catalina dr</addressLine> 18 | <city>chicago</city> 19 | <state>IL</state> 20 | </address> 21 | </addresses> 22 | </person> 23 | <person> 24 | <firstname>Michael</firstname> 25 | <lastname></lastname> 26 | <middlename>Rose</middlename> 27 | <dob_year>1990</dob_year> 28 | <dob_month>6</dob_month> 29 | <gender>M</gender> 30 | <salary>10000</salary> 31 | <addresses> 32 | <address> 33 | <addressLine>2345 pasadena village</addressLine> 34 | <city>orlando</city> 35 | <state>FL</state> 36 | </address> 37 | <address> 38 | <addressLine>3 walnut dr</addressLine> 39 | <city>wilmington</city> 40 | <state>DE</state> 41 | </address> 42 | </addresses> 43 | </person> 44 | </persons>
-------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/records.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | John 4 | 10 5 | M 6 | 7 | 8 | Jenny 9 | 12 10 | F 11 | 12 | 13 | Janardhan 14 | 14 15 | M 16 | 17 | 18 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "type" : "struct", 3 | "fields" : [ { 4 | "name" : "name", 5 | "type" : { 6 | "type" : "struct", 7 | "fields" : [ { 8 | "name" : "firstname", 9 | "type" : "string", 10 | "nullable" : true, 11 | "metadata" : { } 12 | }, { 13 | "name" : "middlename", 14 | "type" : "string", 15 | "nullable" : true, 16 | "metadata" : { } 17 | }, { 18 | "name" : "lastname", 19 | "type" : "string", 20 | "nullable" : true, 21 | "metadata" : { } 22 | } ] 23 | }, 24 | "nullable" : true, 25 | "metadata" : { } 26 | }, { 27 | "name" : "dob", 28 | "type" : "string", 29 | "nullable" : true, 30 | "metadata" : { } 31 | }, { 32 | "name" : "gender", 33 | "type" : "string", 34 | "nullable" : true, 35 | "metadata" : { } 36 | }, { 37 | "name" : "salary", 38 | "type" : "integer", 39 | "nullable" : true, 40 | "metadata" : { } 41 | } ] 42 | } -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/stream.csv: -------------------------------------------------------------------------------- 1 | TotalCost|BirthDate|Gender|TotalChildren|ProductCategoryName 2 | 1000||Male|2|Technology 3 | 2000|1957-03-06||3|Beauty 4 | 3000|1959-03-06|Male||Car 5 | 4000|1953-03-06|Male|2| 6 | 5000|1957-03-06|Female|3|Beauty 7 | 6000|1959-03-06|Male|4|Car -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/test.txt: -------------------------------------------------------------------------------- 1 | Project Gutenberg’s 2 | Alice’s Adventures in Wonderland 3 | by Lewis Carroll 4 | This eBook is for the use 5 | of anyone anywhere 6 | at no cost and with 7 | Alice’s Adventures in Wonderland 8 | by Lewis Carroll 9 | This eBook is for the use 10 | of anyone anywhere 11 | at no cost and with 12 | This eBook is for the use 13 | of anyone anywhere 14 | at no cost and with 15 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode1.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":1,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Parc Parque, PR","Location":"NA-US-PR-PARC PARQUE","Decommisioned":false} 2 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode10.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":1,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Parc Parque, PR","Location":"NA-US-PR-PARC PARQUE","Decommisioned":false} 2 | 
{"RecordNumber":2,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PASEO COSTA DEL SUR","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Paseo Costa Del Sur, PR","Location":"NA-US-PR-PASEO COSTA DEL SUR","Decommisioned":false} 3 | {"RecordNumber":10,"Zipcode":709,"ZipCodeType":"STANDARD","City":"BDA SAN LUIS","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":18.14,"Long":-66.26,"Xaxis":0.38,"Yaxis":-0.86,"Zaxis":0.31,"WorldRegion":"NA","Country":"US","LocationText":"Bda San Luis, PR","Location":"NA-US-PR-BDA SAN LUIS","Decommisioned":false} 4 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode11.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":1,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Parc Parque, PR","Location":"NA-US-PR-PARC PARQUE","Decommisioned":false} 2 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode12.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":1,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Parc Parque, PR","Location":"NA-US-PR-PARC PARQUE","Decommisioned":false} -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode2.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":2,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PASEO COSTA DEL SUR","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Paseo Costa Del Sur, PR","Location":"NA-US-PR-PASEO COSTA DEL SUR","Decommisioned":false} 2 | {"RecordNumber":10,"Zipcode":709,"ZipCodeType":"STANDARD","City":"BDA SAN LUIS","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":18.14,"Long":-66.26,"Xaxis":0.38,"Yaxis":-0.86,"Zaxis":0.31,"WorldRegion":"NA","Country":"US","LocationText":"Bda San Luis, PR","Location":"NA-US-PR-BDA SAN LUIS","Decommisioned":false} 3 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode3.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":61391,"Zipcode":76166,"ZipCodeType":"UNIQUE","City":"CINGULAR WIRELESS","State":"TX","LocationType":"NOT ACCEPTABLE","Lat":32.72,"Long":-97.31,"Xaxis":-0.1,"Yaxis":-0.83,"Zaxis":0.54,"WorldRegion":"NA","Country":"US","LocationText":"Cingular Wireless, TX","Location":"NA-US-TX-CINGULAR WIRELESS","Decommisioned":false} 2 | {"RecordNumber":61392,"Zipcode":76177,"ZipCodeType":"STANDARD","City":"FORT WORTH","State":"TX","LocationType":"PRIMARY","Lat":32.75,"Long":-97.33,"Xaxis":-0.1,"Yaxis":-0.83,"Zaxis":0.54,"WorldRegion":"NA","Country":"US","LocationText":"Fort Worth, TX","Location":"NA-US-TX-FORT 
WORTH","Decommisioned":false,"TaxReturnsFiled":2126,"EstimatedPopulation":4053,"TotalWages":122396986} 3 | {"RecordNumber":61393,"Zipcode":76177,"ZipCodeType":"STANDARD","City":"FT WORTH","State":"TX","LocationType":"ACCEPTABLE","Lat":32.75,"Long":-97.33,"Xaxis":-0.1,"Yaxis":-0.83,"Zaxis":0.54,"WorldRegion":"NA","Country":"US","LocationText":"Ft Worth, TX","Location":"NA-US-TX-FT WORTH","Decommisioned":false,"TaxReturnsFiled":2126,"EstimatedPopulation":4053,"TotalWages":122396986} 4 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode4.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":4,"Zipcode":704,"ZipCodeType":"STANDARD","City":"URB EUGENE RICE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Urb Eugene Rice, PR","Location":"NA-US-PR-URB EUGENE RICE","Decommisioned":false} 2 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode5.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":39827,"Zipcode":85209,"ZipCodeType":"STANDARD","City":"MESA","State":"AZ","LocationType":"PRIMARY","Lat":33.37,"Long":-111.64,"Xaxis":-0.3,"Yaxis":-0.77,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Mesa, AZ","Location":"NA-US-AZ-MESA","Decommisioned":false,"TaxReturnsFiled":14962,"EstimatedPopulation":26883,"TotalWages":563792730,"Notes":"no NWS data, "} 2 | {"RecordNumber":39828,"Zipcode":85210,"ZipCodeType":"STANDARD","City":"MESA","State":"AZ","LocationType":"PRIMARY","Lat":33.38,"Long":-111.84,"Xaxis":-0.31,"Yaxis":-0.77,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Mesa, AZ","Location":"NA-US-AZ-MESA","Decommisioned":false,"TaxReturnsFiled":14374,"EstimatedPopulation":25446,"TotalWages":471000465} 3 | {"RecordNumber":49345,"Zipcode":32046,"ZipCodeType":"STANDARD","City":"HILLIARD","State":"FL","LocationType":"PRIMARY","Lat":30.69,"Long":-81.92,"Xaxis":0.12,"Yaxis":-0.85,"Zaxis":0.51,"WorldRegion":"NA","Country":"US","LocationText":"Hilliard, FL","Location":"NA-US-FL-HILLIARD","Decommisioned":false,"TaxReturnsFiled":3922,"EstimatedPopulation":7443,"TotalWages":133112149} 4 | {"RecordNumber":49346,"Zipcode":34445,"ZipCodeType":"PO BOX","City":"HOLDER","State":"FL","LocationType":"PRIMARY","Lat":28.96,"Long":-82.41,"Xaxis":0.11,"Yaxis":-0.86,"Zaxis":0.48,"WorldRegion":"NA","Country":"US","LocationText":"Holder, FL","Location":"NA-US-FL-HOLDER","Decommisioned":false} 5 | {"RecordNumber":49347,"Zipcode":32564,"ZipCodeType":"STANDARD","City":"HOLT","State":"FL","LocationType":"PRIMARY","Lat":30.72,"Long":-86.67,"Xaxis":0.04,"Yaxis":-0.85,"Zaxis":0.51,"WorldRegion":"NA","Country":"US","LocationText":"Holt, FL","Location":"NA-US-FL-HOLT","Decommisioned":false,"TaxReturnsFiled":1207,"EstimatedPopulation":2190,"TotalWages":36395913} 6 | {"RecordNumber":49348,"Zipcode":34487,"ZipCodeType":"PO BOX","City":"HOMOSASSA","State":"FL","LocationType":"PRIMARY","Lat":28.78,"Long":-82.61,"Xaxis":0.11,"Yaxis":-0.86,"Zaxis":0.48,"WorldRegion":"NA","Country":"US","LocationText":"Homosassa, FL","Location":"NA-US-FL-HOMOSASSA","Decommisioned":false} 7 | -------------------------------------------------------------------------------- 
/spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode6.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":10,"Zipcode":708,"ZipCodeType":"STANDARD","City":"BDA SAN LUIS","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":18.14,"Long":-66.26,"Xaxis":0.38,"Yaxis":-0.86,"Zaxis":0.31,"WorldRegion":"NA","Country":"US","LocationText":"Bda San Luis, PR","Location":"NA-US-PR-BDA SAN LUIS","Decommisioned":false} 2 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode7.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":54354,"Zipcode":36275,"ZipCodeType":"PO BOX","City":"SPRING GARDEN","State":"AL","LocationType":"PRIMARY","Lat":33.97,"Long":-85.55,"Xaxis":0.06,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Spring Garden, AL","Location":"NA-US-AL-SPRING GARDEN","Decommisioned":false} 2 | {"RecordNumber":54355,"Zipcode":35146,"ZipCodeType":"STANDARD","City":"SPRINGVILLE","State":"AL","LocationType":"PRIMARY","Lat":33.77,"Long":-86.47,"Xaxis":0.05,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Springville, AL","Location":"NA-US-AL-SPRINGVILLE","Decommisioned":false,"TaxReturnsFiled":4046,"EstimatedPopulation":7845,"TotalWages":172127599} 3 | {"RecordNumber":54356,"Zipcode":35585,"ZipCodeType":"STANDARD","City":"SPRUCE PINE","State":"AL","LocationType":"PRIMARY","Lat":34.37,"Long":-87.69,"Xaxis":0.03,"Yaxis":-0.82,"Zaxis":0.56,"WorldRegion":"NA","Country":"US","LocationText":"Spruce Pine, AL","Location":"NA-US-AL-SPRUCE PINE","Decommisioned":false,"TaxReturnsFiled":610,"EstimatedPopulation":1209,"TotalWages":18525517} 4 | {"RecordNumber":76511,"Zipcode":27007,"ZipCodeType":"STANDARD","City":"ASH HILL","State":"NC","LocationType":"NOT ACCEPTABLE","Lat":36.4,"Long":-80.56,"Xaxis":0.13,"Yaxis":-0.79,"Zaxis":0.59,"WorldRegion":"NA","Country":"US","LocationText":"Ash Hill, NC","Location":"NA-US-NC-ASH HILL","Decommisioned":false,"TaxReturnsFiled":842,"EstimatedPopulation":1666,"TotalWages":28876493} 5 | {"RecordNumber":76512,"Zipcode":27203,"ZipCodeType":"STANDARD","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":8355,"EstimatedPopulation":15228,"TotalWages":215474318} 6 | {"RecordNumber":76513,"Zipcode":27204,"ZipCodeType":"PO BOX","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":1035,"EstimatedPopulation":1816,"TotalWages":30322473} 7 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode8.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":54354,"Zipcode":36275,"ZipCodeType":"PO BOX","City":"SPRING GARDEN","State":"AL","LocationType":"PRIMARY","Lat":33.97,"Long":-85.55,"Xaxis":0.06,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Spring Garden, AL","Location":"NA-US-AL-SPRING GARDEN","Decommisioned":false} 2 | 
{"RecordNumber":54355,"Zipcode":35146,"ZipCodeType":"STANDARD","City":"SPRINGVILLE","State":"AL","LocationType":"PRIMARY","Lat":33.77,"Long":-86.47,"Xaxis":0.05,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Springville, AL","Location":"NA-US-AL-SPRINGVILLE","Decommisioned":false,"TaxReturnsFiled":4046,"EstimatedPopulation":7845,"TotalWages":172127599} 3 | {"RecordNumber":54356,"Zipcode":35585,"ZipCodeType":"STANDARD","City":"SPRUCE PINE","State":"AL","LocationType":"PRIMARY","Lat":34.37,"Long":-87.69,"Xaxis":0.03,"Yaxis":-0.82,"Zaxis":0.56,"WorldRegion":"NA","Country":"US","LocationText":"Spruce Pine, AL","Location":"NA-US-AL-SPRUCE PINE","Decommisioned":false,"TaxReturnsFiled":610,"EstimatedPopulation":1209,"TotalWages":18525517} 4 | {"RecordNumber":76511,"Zipcode":27007,"ZipCodeType":"STANDARD","City":"ASH HILL","State":"NC","LocationType":"NOT ACCEPTABLE","Lat":36.4,"Long":-80.56,"Xaxis":0.13,"Yaxis":-0.79,"Zaxis":0.59,"WorldRegion":"NA","Country":"US","LocationText":"Ash Hill, NC","Location":"NA-US-NC-ASH HILL","Decommisioned":false,"TaxReturnsFiled":842,"EstimatedPopulation":1666,"TotalWages":28876493} 5 | {"RecordNumber":76512,"Zipcode":27203,"ZipCodeType":"STANDARD","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":8355,"EstimatedPopulation":15228,"TotalWages":215474318} 6 | {"RecordNumber":76513,"Zipcode":27204,"ZipCodeType":"PO BOX","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":1035,"EstimatedPopulation":1816,"TotalWages":30322473} 7 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/resources/zipcodes_streaming/zipcode9.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":76511,"Zipcode":27007,"ZipCodeType":"STANDARD","City":"ASH HILL","State":"NC","LocationType":"NOT ACCEPTABLE","Lat":36.4,"Long":-80.56,"Xaxis":0.13,"Yaxis":-0.79,"Zaxis":0.59,"WorldRegion":"NA","Country":"US","LocationText":"Ash Hill, NC","Location":"NA-US-NC-ASH HILL","Decommisioned":false,"TaxReturnsFiled":842,"EstimatedPopulation":1666,"TotalWages":28876493} 2 | {"RecordNumber":76512,"Zipcode":27203,"ZipCodeType":"STANDARD","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":8355,"EstimatedPopulation":15228,"TotalWages":215474318} 3 | {"RecordNumber":76513,"Zipcode":27204,"ZipCodeType":"PO BOX","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":1035,"EstimatedPopulation":1816,"TotalWages":30322473} 4 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/SparkSessionTest.scala: -------------------------------------------------------------------------------- 1 | 
package com.sparkbyexamples.spark 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object SparkSessionTest { 6 | 7 | def main(args:Array[String]): Unit ={ 8 | 9 | val spark = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExample") 12 | .getOrCreate(); 13 | 14 | println("First SparkContext:") 15 | println("APP Name :"+spark.sparkContext.appName); 16 | println("Deploy Mode :"+spark.sparkContext.deployMode); 17 | println("Master :"+spark.sparkContext.master); 18 | 19 | val sparkSession2 = SparkSession.builder() 20 | .master("local[1]") 21 | .appName("SparkByExample-test") 22 | .getOrCreate(); 23 | 24 | println("Second SparkContext:") 25 | println("APP Name :"+sparkSession2.sparkContext.appName); 26 | println("Deploy Mode :"+sparkSession2.sparkContext.deployMode); 27 | println("Master :"+sparkSession2.sparkContext.master); 28 | 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/beans/Books.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.beans 2 | 3 | case class Books(_id:String, author:String, description:String, price:Double, publish_date:String, title:String) -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/beans/BooksDiscounted.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.beans 2 | 3 | case class BooksDiscounted(_id:String, author:String, description:String, price:Double, publish_date:String, title:String, discountPrice:Double) 4 | 5 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/beans/BooksStruct.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.beans 2 | 3 | class BooksStruct { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/beans/BooksWithArray.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.beans 2 | 3 | case class BooksWithArray(_id:String, author:String, description:String, price:Double, publish_date:String, title:String,otherInfo:OtherInfo,stores:Stores) 4 | case class OtherInfo(pagesCount:String,language:String,country:String,address:Address) 5 | case class Address(addressline1:String,city:String,state:String) 6 | case class Stores(store:Array[Store]) 7 | case class Store(name:String) 8 | 9 | 10 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/beans/User.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.beans 2 | 3 | class User() { 4 | private var name:String = "" 5 | private var age:Int = 0 6 | 7 | def this(name: String, age: Int) { 8 | this() 9 | this.name =name 10 | this.age = age 11 | } 12 | 13 | def getName: String = this.name 14 | 15 | def getAge: Int = this.age 16 | 17 | override def toString: String = "User(" + name + ", " + age + ")" 18 | } 19 | -------------------------------------------------------------------------------- 
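As a quick, hedged illustration of how a bean like the Books case class above is typically used (this sketch is not a file in the repository; the object name and sample values are made up for illustration), a case class maps directly onto a typed Dataset through Spark's implicit encoders, so no explicit schema is needed:

import org.apache.spark.sql.SparkSession
import com.sparkbyexamples.spark.beans.Books

object BooksDatasetSketch extends App {

  val spark = SparkSession.builder()
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()

  import spark.implicits._

  // Illustrative rows following the Books fields: _id, author, description, price, publish_date, title
  val books = Seq(
    Books("bk101", "Gambardella", "An in-depth look at creating applications with XML.", 44.95, "2000-10-01", "XML Developer's Guide"),
    Books("bk102", "Ralls", "A former architect battles corporate zombies.", 5.95, "2000-12-16", "Midnight Rain")
  ).toDS() // Dataset[Books]; the schema is inferred from the case class

  books.printSchema()
  books.show(false)
}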
/spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/beans/Zipcode.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.beans 2 | 3 | import scala.beans.BeanProperty 4 | 5 | class Zipcode { 6 | 7 | @BeanProperty 8 | var RecordNumber = -1 9 | @BeanProperty 10 | var Zipcode="" 11 | @BeanProperty 12 | var ZipCodeType="" 13 | @BeanProperty 14 | var City="" 15 | @BeanProperty 16 | var State="" 17 | @BeanProperty 18 | var LocationType="" 19 | @BeanProperty 20 | var Lat="" 21 | @BeanProperty 22 | var Long="" 23 | @BeanProperty 24 | var Xaxis="" 25 | @BeanProperty 26 | var Yaxis="" 27 | @BeanProperty 28 | var Zaxis="" 29 | @BeanProperty 30 | var WorldRegion="" 31 | @BeanProperty 32 | var Country="" 33 | @BeanProperty 34 | var LocationText="" 35 | @BeanProperty 36 | var Location="" 37 | @BeanProperty 38 | var Decommisioned="" 39 | } 40 | 41 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/ArrayToColumn.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.types.{ArrayType, StringType, StructType} 4 | import org.apache.spark.sql.{Row, SparkSession} 5 | 6 | object ArrayToColumn extends App { 7 | 8 | val spark = SparkSession.builder().appName("SparkByExamples.com") 9 | .master("local[1]") 10 | .getOrCreate() 11 | 12 | val arrayData = Seq( 13 | Row("James",List("Java","Scala","C++")), 14 | Row("Michael",List("Spark","Java","C++")), 15 | Row("Robert",List("CSharp","VB","")) 16 | ) 17 | 18 | val arraySchema = new StructType().add("name",StringType) 19 | .add("subjects",ArrayType(StringType)) 20 | 21 | val arrayDF = spark.createDataFrame(spark.sparkContext.parallelize(arrayData),arraySchema) 22 | arrayDF.printSchema() 23 | arrayDF.show() 24 | 25 | val arrayDFColumn = arrayDF.select( // select each array element into its own column 26 | arrayDF("name") +: (0 until 3).map(i => arrayDF("subjects")(i).alias(s"LanguagesKnown$i")): _* 27 | ) 28 | 29 | arrayDFColumn.show(false) 30 | 31 | //How to convert Array of Array to column 32 | val arrayArrayData = Seq( 33 | Row("James",List(List("Java","Scala","C++"),List("Spark","Java"))), 34 | Row("Michael",List(List("Spark","Java","C++"),List("Spark","Java"))), 35 | Row("Robert",List(List("CSharp","VB"),List("Spark","Python"))) 36 | ) 37 | 38 | val arrayArraySchema = new StructType().add("name",StringType) 39 | .add("subjects",ArrayType(ArrayType(StringType))) 40 | 41 | val df = spark.createDataFrame(spark.sparkContext.parallelize(arrayArrayData),arrayArraySchema) 42 | df.printSchema() 43 | df.show() 44 | 45 | val df2 = df.select( 46 | df("name") +: (0 until 2).map(i => df("subjects")(i).alias(s"LanguagesKnown$i")): _* 47 | ) 48 | 49 | df2.show(false) 50 | } 51 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/AvroExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import java.io.File 4 | 5 | import org.apache.avro.Schema 6 | import org.apache.spark.sql.{SaveMode, SparkSession} 7 | import org.apache.spark.sql.functions._ 8 | 9 | /** 10 | * Spark Avro library example 11 | * Avro schema example 12 | * Avro file format 13 | * 14 | */ 15 | object AvroExample { 16 | 17 | def main(args: Array[String]): Unit = { 18 | 19 | 20 | val spark:
SparkSession = SparkSession.builder().master("local[1]") 21 | .appName("SparkByExamples.com") 22 | .getOrCreate() 23 | 24 | val data = Seq(("James ", "", "Smith", 2018, 1, "M", 3000), 25 | ("Michael ", "Rose", "", 2010, 3, "M", 4000), 26 | ("Robert ", "", "Williams", 2010, 3, "M", 4000), 27 | ("Maria ", "Anne", "Jones", 2005, 5, "F", 4000), 28 | ("Jen", "Mary", "Brown", 2010, 7, "", -1) 29 | ) 30 | 31 | val columns = Seq("firstname", "middlename", "lastname", "dob_year", 32 | "dob_month", "gender", "salary") 33 | import spark.sqlContext.implicits._ 34 | val df = data.toDF(columns: _*) 35 | 36 | /** 37 | * Write Avro File 38 | */ 39 | df.write.format("avro") 40 | .mode(SaveMode.Overwrite) 41 | .save("C:\\tmp\\spark_out\\avro\\person.avro") 42 | 43 | /** 44 | * Read Avro File 45 | */ 46 | spark.read.format("avro").load("C:\\tmp\\spark_out\\avro\\person.avro").show() 47 | 48 | /** 49 | * Write Avro Partition 50 | */ 51 | df.write.partitionBy("dob_year","dob_month") 52 | .format("avro") 53 | .mode(SaveMode.Overwrite) 54 | .save("C:\\tmp\\spark_out\\avro\\person_partition.avro") 55 | 56 | /** 57 | * Reading Avro Partition 58 | */ 59 | spark.read 60 | .format("avro") 61 | .load("C:\\tmp\\spark_out\\avro\\person_partition.avro") 62 | .where(col("dob_year") === 2010) 63 | .show() 64 | 65 | /** 66 | * Explicit Avro schema 67 | */ 68 | val schemaAvro = new Schema.Parser() 69 | .parse(new File("src/main/resources/person.avsc")) 70 | 71 | spark.read 72 | .format("avro") 73 | .option("avroSchema", schemaAvro.toString) 74 | .load("C:\\tmp\\spark_out\\avro\\person.avro") 75 | .show() 76 | 77 | /** 78 | * Avro Spark SQL 79 | */ 80 | spark.sqlContext.sql("CREATE TEMPORARY VIEW PERSON USING avro OPTIONS (path \"C:/tmp/spark_out/avro/person.avro\")") 81 | spark.sqlContext.sql("SELECT * FROM PERSON").show() 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/CaseClassSparkSchema.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.Encoders 4 | import org.apache.spark.sql.types.StructType 5 | 6 | object CaseClassSparkSchema extends App{ 7 | 8 | case class Name(first:String,last:String,middle:String) 9 | case class Employee(fullName:Name,age:Integer,gender:String) 10 | 11 | val encoderSchema = Encoders.product[Employee].schema 12 | encoderSchema.printTreeString() 13 | 14 | import org.apache.spark.sql.catalyst.ScalaReflection 15 | val schema = ScalaReflection.schemaFor[Employee].dataType.asInstanceOf[StructType] 16 | 17 | } 18 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/CastColumnType.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.types._ 5 | import org.apache.spark.sql.functions._ 6 | 7 | object CastColumnType extends App{ 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | val simpleData = Seq(Row("James",34,"2006-01-01","true","M",3000.60), 15 | Row("Michael",33,"1980-01-10","true","F",3300.80), 16 | Row("Robert",37,"06-01-1992","false","M",5000.50) 17 | ) 18 | 19 | val simpleSchema = 
StructType(Array( 20 | StructField("firstName",StringType,true), 21 | StructField("age",IntegerType,true), 22 | StructField("jobStartDate",StringType,true), 23 | StructField("isGraduated", StringType, true), 24 | StructField("gender", StringType, true), 25 | StructField("salary", DoubleType, true) 26 | )) 27 | 28 | val df = spark.createDataFrame(spark.sparkContext.parallelize(simpleData),simpleSchema) 29 | df.printSchema() 30 | df.show(false) 31 | 32 | //withColumn with the original column 33 | val df2 = df.withColumn("age",col("age").cast(StringType)) 34 | .withColumn("isGraduated",col("isGraduated").cast(BooleanType)) 35 | .withColumn("jobStartDate",col("jobStartDate").cast(DateType)) 36 | df2.printSchema() 37 | 38 | 39 | val df3 = df2.selectExpr("cast(age as int) age", 40 | "cast(isGraduated as string) isGraduated", 41 | "cast(jobStartDate as string) jobStartDate") 42 | df3.printSchema() 43 | df3.show(false) 44 | 45 | df3.createOrReplaceTempView("CastExample") 46 | val df4 = spark.sql("SELECT STRING(age),BOOLEAN(isGraduated),DATE(jobStartDate) from CastExample") 47 | df4.printSchema() 48 | df4.show(false) 49 | 50 | } 51 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/CreateDataFrame.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 4 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 5 | 6 | object CreateDataFrame { 7 | 8 | def main(args:Array[String]):Unit={ 9 | 10 | val spark:SparkSession = SparkSession.builder() 11 | .master("local[1]") 12 | .appName("SparkByExample") 13 | .getOrCreate() 14 | 15 | import spark.implicits._ 16 | val columns = Seq("language","users_count") 17 | val data = Seq(("Java", "20000"), ("Python", "100000"), ("Scala", "3000")) 18 | val rdd = spark.sparkContext.parallelize(data) 19 | 20 | 21 | //From RDD (USING toDF()) 22 | val dfFromRDD1 = rdd.toDF("language","users") 23 | 24 | //From RDD (USING createDataFrame) 25 | val dfFromRDD2 = spark.createDataFrame(rdd).toDF(columns:_*) 26 | 27 | //From RDD (USING createDataFrame and Adding schema using StructType) 28 | //convert RDD[T] to RDD[Row] 29 | val schema = StructType(columns 30 | .map(fieldName => StructField(fieldName, StringType, nullable = true))) 31 | val rowRDD = rdd.map(attributes => Row(attributes._1, attributes._2)) 32 | val dfFromRDD3 = spark.createDataFrame(rowRDD,schema) 33 | 34 | 35 | //From Data (USING toDF()) 36 | val dfFromData1 = data.toDF() 37 | 38 | //From Data (USING createDataFrame) 39 | var dfFromData2 = spark.createDataFrame(data).toDF(columns:_*) 40 | 41 | //From Data (USING createDataFrame and Adding schema using StructType) 42 | import scala.collection.JavaConversions._ 43 | val rowData = data 44 | .map(attributes => Row(attributes._1, attributes._2)) 45 | var dfFromData3 = spark.createDataFrame(rowData,schema) 46 | 47 | //From Data (USING createDataFrame and Adding bean class) 48 | //To-DO 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/CreateEmptyDataFrameExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import 
org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} 5 | 6 | object CreateEmptyDataFrameExample extends App { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | import spark.implicits._ 13 | 14 | 15 | val schema = StructType( 16 | StructField("firstName", StringType, true) :: 17 | StructField("lastName", IntegerType, false) :: 18 | StructField("middleName", IntegerType, false) :: Nil) 19 | 20 | val colSeq = Seq("firstName","lastName","middleName") 21 | 22 | case class Name(firstName: String, lastName: String, middleName:String) 23 | 24 | // Create empty dataframe using StructType schema 25 | val df = spark.createDataFrame(spark.sparkContext 26 | .emptyRDD[Row], schema) 27 | 28 | // Using implicit encoder 29 | 30 | Seq.empty[(String,String,String)].toDF(colSeq:_*) 31 | 32 | //Using case class 33 | 34 | Seq.empty[Name].toDF().printSchema() 35 | 36 | //Using emptyDataFrame 37 | spark.emptyDataFrame 38 | 39 | 40 | //Using emptyDataset 41 | 42 | 43 | } 44 | 45 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/CreateEmptyDatasetExample.scala: -------------------------------------------------------------------------------- 1 | 2 | package com.sparkbyexamples.spark.dataframe 3 | 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} 6 | 7 | object CreateEmptyDatasetExample extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | import spark.implicits._ 15 | 16 | val schema = StructType( 17 | StructField("firstName", StringType, true) :: 18 | StructField("lastName", IntegerType, false) :: 19 | StructField("middleName", IntegerType, false) :: Nil) 20 | 21 | val colSeq = Seq("firstName","lastName","middleName") 22 | 23 | case class Name(firstName: String, lastName: String, middleName:String) 24 | 25 | spark.createDataset(Seq.empty[Name]) 26 | spark.createDataset(Seq.empty[(String,String,String)]) 27 | spark.createDataset(spark.sparkContext.emptyRDD[Name]) 28 | Seq.empty[(String,String,String)].toDS() 29 | Seq.empty[Name].toDS() 30 | spark.emptyDataset[Name] 31 | } -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/DataFrameWithComplexDSL.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.functions._ 5 | case class Employee(firstName:String,lastName:String, email:String,salary:Int) 6 | case class Department(id:Int,name:String) 7 | case class DepartmentWithEmployees(department: Department, employees: Seq[Employee]) 8 | object DataFrameWithDSL2 { 9 | 10 | def main(args: Array[String]): Unit = { 11 | 12 | val department1 = Department(123456, "Computer Science") 13 | val department2 = Department(789012, "Mechanical Engineering") 14 | val department3 = Department(345678, "Theater and Drama") 15 | val department4 = Department(901234, "Indoor Recreation") 16 | 17 | //Create the Employees 18 | 19 | val employee1 = Employee("michael", "armbrust", "no-reply@berkeley.edu", 100000) 20 | val employee2 = Employee("xiangrui", "meng", 
"no-reply@stanford.edu", 120000) 21 | val employee3 = Employee("matei", "", "no-reply@waterloo.edu", 140000) 22 | val employee4 = Employee("", "wendell", "no-reply@berkeley.edu", 160000) 23 | 24 | //Create the DepartmentWithEmployees instances from Departments and Employees 25 | val departmentWithEmployees1 = DepartmentWithEmployees(department1, List(employee1, employee2)) 26 | val departmentWithEmployees2 = DepartmentWithEmployees(department2, List(employee3, employee4)) 27 | val departmentWithEmployees3 = DepartmentWithEmployees(department3, List(employee1, employee4)) 28 | val departmentWithEmployees4 = DepartmentWithEmployees(department4, List(employee2, employee3)) 29 | 30 | val data1 = Seq(departmentWithEmployees1,departmentWithEmployees2) 31 | 32 | val data2 = Seq(departmentWithEmployees3,departmentWithEmployees4) 33 | 34 | val spark: SparkSession = SparkSession.builder() 35 | .master("local[1]") 36 | .appName("SparkByExample") 37 | .getOrCreate() 38 | 39 | import spark.implicits._ 40 | 41 | val df = spark.createDataFrame(data1) 42 | val df2 = spark.createDataFrame(data2) 43 | 44 | //union 45 | val finalDF = df.union(df2) 46 | finalDF.printSchema() 47 | finalDF.show(false) 48 | 49 | finalDF.select("department.*").printSchema() 50 | finalDF.select(explode(col("employees"))).select("col.*").show(false) 51 | 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/DataFrameWithSimpleDSL.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | 5 | object DataFrameWithSimpleDSL { 6 | 7 | def main(args:Array[String]):Unit= { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | val filePath = "C://000_Projects/opt/BigData/zipcodes.csv" 15 | 16 | var df:DataFrame = spark.read.option("header","true").csv(filePath) 17 | df.printSchema() 18 | 19 | // Where 20 | df.select("*").where(df("RecordNumber") < 10).show() 21 | //Filter 22 | df.filter(df("State")==="PR").select("State").show() 23 | //Distinct 24 | df.select(df("State")).distinct().show() 25 | //Count 26 | println("Number of records"+df.count()) 27 | 28 | //When Otherwise 29 | //df.select(df("State"), case df("State") when "PR" then "PR123" 30 | 31 | // where with and and or conditions 32 | df.where(df("State") === "PR" && df("City").contains("DEL")).show() 33 | 34 | //Order or Sort by 35 | df.orderBy(df("RecordNumber").desc, df("State").asc).show() 36 | 37 | 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/FromCSVFile.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object FromCSVFile { 6 | 7 | def main(args:Array[String]):Unit= { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | spark.sparkContext 15 | 16 | val filePath="src/main/resources/zipcodes.csv" 17 | 18 | //Chaining multiple options 19 | val df2 = spark.read.options(Map("inferSchema"->"true","sep"->",","header"->"true")).csv(filePath) 20 | df2.show(false) 21 | df2.printSchema() 22 
| 23 | df2.write.json("c:/tmp/spark_output/zipcodes") 24 | 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/FromCSVFile2.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object FromCSVFile2 { 6 | 7 | def main(args:Array[String]):Unit= { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | val filePath="src/main/resources/stream.csv" 15 | 16 | val df = spark.read.options(Map("inferSchema"->"true","delimiter"->"|","header"->"true")).csv(filePath) 17 | 18 | val df2 = df.select("Gender", "BirthDate", "TotalCost", "TotalChildren", "ProductCategoryName") 19 | .filter("Gender is not null") 20 | .filter("BirthDate is not null") 21 | .filter("TotalChildren is not null") 22 | .filter("ProductCategoryName is not null") 23 | df2.show() 24 | 25 | df.select("Gender", "BirthDate", "TotalCost", "TotalChildren", "ProductCategoryName") 26 | .where(df("Gender").isNotNull && df("BirthDate").isNotNull && df("TotalChildren").isNotNull && df("ProductCategoryName").isNotNull ).show() 27 | 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/FromJsonFile.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object FromJsonFile { 6 | 7 | def main(args:Array[String]): Unit = { 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local[3]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | val sc = spark.sparkContext 14 | 15 | val rdd = sc.textFile("src/main/resources/zipcodes.json") 16 | //Todo : convert RDD to DataFrame 17 | rdd.collect().foreach(println) 18 | 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/ParquetExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object ParquetExample { 6 | 7 | def main(args:Array[String]):Unit= { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | val data = Seq(("James ","","Smith","36636","M",3000), 15 | ("Michael ","Rose","","40288","M",4000), 16 | ("Robert ","","Williams","42114","M",4000), 17 | ("Maria ","Anne","Jones","39192","F",4000), 18 | ("Jen","Mary","Brown","","F",-1) 19 | ) 20 | 21 | val columns = Seq("firstname","middlename","lastname","dob","gender","salary") 22 | import spark.sqlContext.implicits._ 23 | val df = data.toDF(columns:_*) 24 | 25 | df.show() 26 | df.printSchema() 27 | 28 | df.write 29 | .parquet("C:\\tmp\\output\\people.parquet") 30 | 31 | val parqDF = spark.read.parquet("C:\\tmp\\output\\people.parquet") 32 | parqDF.createOrReplaceTempView("ParquetTable") 33 | 34 | spark.sql("select * from ParquetTable where salary >= 4000").explain() 35 | val parkSQL = spark.sql("select * from ParquetTable where salary >= 4000 ") 36 | 37 | parkSQL.show() 38 | 
parkSQL.printSchema() 39 | 40 | df.write 41 | .partitionBy("gender","salary") 42 | .parquet("C:\\tmp\\output\\people2.parquet") 43 | 44 | val parqDF2 = spark.read.parquet("C:\\tmp\\output\\people2.parquet") 45 | parqDF2.createOrReplaceTempView("ParquetTable2") 46 | 47 | val df3 = spark.sql("select * from ParquetTable2 where gender='M' and salary >= 4000") 48 | df3.explain() 49 | df3.printSchema() 50 | df3.show() 51 | 52 | val parqDF3 = spark.read 53 | .parquet("C:\\tmp\\output\\people2.parquet\\gender=M") 54 | parqDF3.show() 55 | 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/RenameColDataFrame.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType} 5 | import org.apache.spark.sql.functions.{col, _} 6 | 7 | object RenameColDataFrame { 8 | 9 | def main(args:Array[String]):Unit= { 10 | 11 | val spark: SparkSession = SparkSession.builder() 12 | .master("local[1]") 13 | .appName("SparkByExamples.com") 14 | .getOrCreate() 15 | 16 | val data = Seq(Row(Row("James ","","Smith"),"36636","M",3000), 17 | Row(Row("Michael ","Rose",""),"40288","M",4000), 18 | Row(Row("Robert ","","Williams"),"42114","M",4000), 19 | Row(Row("Maria ","Anne","Jones"),"39192","F",4000), 20 | Row(Row("Jen","Mary","Brown"),"","F",-1) 21 | ) 22 | 23 | val schema = new StructType() 24 | .add("name",new StructType() 25 | .add("firstname",StringType) 26 | .add("middlename",StringType) 27 | .add("lastname",StringType)) 28 | .add("dob",StringType) 29 | .add("gender",StringType) 30 | .add("salary",IntegerType) 31 | 32 | val df = spark.createDataFrame(spark.sparkContext.parallelize(data),schema) 33 | 34 | df.printSchema() 35 | 36 | df.withColumnRenamed("dob","DateOfBirth") 37 | .printSchema() 38 | 39 | val schema2 = new StructType() 40 | .add("fname",StringType) 41 | .add("middlename",StringType) 42 | .add("lname",StringType) 43 | 44 | df.select(col("name").cast(schema2), 45 | col("dob"), 46 | col("gender"), 47 | col("salary")) 48 | .printSchema() 49 | 50 | df.select(col("name.firstname").as("fname"), 51 | col("name.middlename").as("mname"), 52 | col("name.lastname").as("lname"), 53 | col("dob"),col("gender"),col("salary")) 54 | .printSchema() 55 | 56 | df.withColumnRenamed("name.firstname","fname") 57 | .withColumnRenamed("name.middlename","mname") 58 | .withColumnRenamed("name.lastname","lname") 59 | .drop("name") 60 | .printSchema() 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/SQLExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | object DataFrameWithSQL_ { 6 | 7 | def main(args:Array[String]):Unit= { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | val data = Seq(1,2,3) 15 | 16 | import spark.sqlContext.implicits._ 17 | 18 | val df = data.toDF("field1") 19 | 20 | df.createOrReplaceTempView("table1") 21 | 22 | val df2 = spark.sql("select tb1.field1 as field1,tb2.field1 as field2 from table1 
tb1, table1 tb2 where tb1.field1 <> tb2.field1") 23 | df2.printSchema() 24 | df2.show(false) 25 | 26 | df2.createOrReplaceTempView("table2") 27 | 28 | val df3 = spark.sql("select distinct tb1.field1,tb1.field2 from table2 tb1, table2 tb2 where tb1.field1 == tb2.field2 and tb1.field2 == tb2.field1") 29 | 30 | df3.show(false) 31 | 32 | 33 | 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/SaveDataFrame.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | 5 | object SaveDataFrame { 6 | 7 | def main(args: Array[String]): Unit = { 8 | val spark: SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExample") 11 | .getOrCreate() 12 | 13 | val filePath = "C://000_Projects/opt/BigData/zipcodes.csv" 14 | 15 | var df:DataFrame = spark.read.option("header","true").csv(filePath) 16 | 17 | df.repartition(5).write.option("header","true").csv("c:/tmp/output/df1") 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/UDFDataFrame.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | object UDFDataFrame { 6 | def main(args:Array[String]): Unit = { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local[3]") 10 | .appName("SparkByExample") 11 | .getOrCreate() 12 | 13 | val data = Seq(("2018/01/23",23),("2018/01/24",24),("2018/02/20",25)) 14 | 15 | import spark.sqlContext.implicits._ 16 | val df = data.toDF("date1","day") 17 | 18 | val replace: String => String = _.replace("/","-") 19 | import org.apache.spark.sql.functions.udf 20 | val replaceUDF = udf(replace) 21 | val minDate = df.agg(min($"date1")).collect()(0).get(0) 22 | 23 | val df2 = df.select("*").filter( to_date(replaceUDF($"date1")) > date_add(to_date(replaceUDF(lit(minDate))),7 )) 24 | df2.show() 25 | } 26 | 27 | 28 | } 29 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/WithColumn.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.types.{ArrayType, IntegerType, MapType, StringType, StructType} 5 | import org.apache.spark.sql.functions._ 6 | object WithColumn { 7 | 8 | def main(args:Array[String]):Unit= { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .master("local[1]") 12 | .appName("SparkByExamples.com") 13 | .getOrCreate() 14 | 15 | val arrayStructureData = Seq( 16 | Row(Row("James ","","Smith"),"1","M",3100,List("Cricket","Movies"),Map("hair"->"black","eye"->"brown")), 17 | Row(Row("Michael ","Rose",""),"2","M",3100,List("Tennis"),Map("hair"->"brown","eye"->"black")), 18 | Row(Row("Robert ","","Williams"),"3","M",3100,List("Cooking","Football"),Map("hair"->"red","eye"->"gray")), 19 | Row(Row("Maria ","Anne","Jones"),"4","M",3100,null,Map("hair"->"blond","eye"->"red")), 20 | 
Row(Row("Jen","Mary","Brown"),"5","M",3100,List("Blogging"),Map("white"->"black","eye"->"black")) 21 | ) 22 | 23 | val arrayStructureSchema = new StructType() 24 | .add("name",new StructType() 25 | .add("firstname",StringType) 26 | .add("middlename",StringType) 27 | .add("lastname",StringType)) 28 | .add("id",StringType) 29 | .add("gender",StringType) 30 | .add("salary",IntegerType) 31 | .add("Hobbies", ArrayType(StringType)) 32 | .add("properties", MapType(StringType,StringType)) 33 | 34 | val df2 = spark.createDataFrame( 35 | spark.sparkContext.parallelize(arrayStructureData),arrayStructureSchema) 36 | 37 | //Change the column data type 38 | df2.withColumn("salary",df2("salary").cast("Integer")) 39 | 40 | //Derive a new column from existing 41 | val df4=df2.withColumn("CopiedColumn",df2("salary")* -1) 42 | 43 | //Transforming existing column 44 | val df5 = df2.withColumn("salary",df2("salary")*100) 45 | 46 | //You can also chain withColumn to change multiple columns 47 | 48 | //Renaming a column. 49 | val df3=df2.withColumnRenamed("gender","sex") 50 | df3.printSchema() 51 | 52 | //Droping a column 53 | val df6=df4.drop("CopiedColumn") 54 | println(df6.columns.contains("CopiedColumn")) 55 | 56 | //Adding a literal value 57 | df2.withColumn("Country", lit("USA")).printSchema() 58 | 59 | //Retrieving 60 | df2.show(false) 61 | df2.select("name").show(false) 62 | df2.select("name.firstname").show(false) 63 | df2.select("name.*").show(false) 64 | 65 | 66 | val df8 = df2.select(col("*"),explode(col("hobbies"))) 67 | df8.show(false) 68 | 69 | 70 | //df8.select(from_collection()) 71 | 72 | 73 | 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/AnotherExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions 2 | 3 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType} 4 | import org.apache.spark.sql.{Row, SparkSession} 5 | 6 | class AnotherExample { 7 | 8 | def main(args:Array[String]):Unit= { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .master("local[1]") 12 | .appName("SparkByExamples.com") 13 | .getOrCreate() 14 | 15 | /** 16 | * Simple using columns list 17 | */ 18 | val data = Seq(("James ","","Smith","2018","01","M",3000), 19 | ("Michael ","Rose","","2010","03","M",4000), 20 | ("Robert ","","Williams","2010","03","M",4000), 21 | ("Maria ","Anne","Jones","2005","05","F",4000), 22 | ("Jen","Mary","Brown","2010","07","",-1) 23 | ) 24 | 25 | val columns = Seq("firstname","middlename","lastname","dob_year","dob_month","gender","salary") 26 | import spark.sqlContext.implicits._ 27 | val df = data.toDF(columns:_*) 28 | 29 | /** 30 | * schema using Row data 31 | */ 32 | val data3 = Seq(Row("James ","","Smith","36636","M",3000), 33 | Row("Michael ","Rose","","40288","M",4000), 34 | Row("Robert ","","Williams","42114","M",4000), 35 | Row("Maria ","Anne","Jones","39192","F",4000), 36 | Row("Jen","Mary","Brown","","F",-1) 37 | ) 38 | 39 | val schema3 = new StructType() 40 | .add("firstname",StringType) 41 | .add("middlename",StringType) 42 | .add("lastname",StringType) 43 | .add("dob",StringType) 44 | .add("gender",StringType) 45 | .add("salary",IntegerType) 46 | 47 | val df3 = spark.createDataFrame(spark.sparkContext.parallelize(data3),schema3) 48 | 49 | /** 50 | * nested structure schema 51 | */ 52 | val data4 = Seq(Row(Row("James 
","","Smith"),"36636","M",3000), 53 | Row(Row("Michael ","Rose",""),"40288","M",4000), 54 | Row(Row("Robert ","","Williams"),"42114","M",4000), 55 | Row(Row("Maria ","Anne","Jones"),"39192","F",4000), 56 | Row(Row("Jen","Mary","Brown"),"","F",-1) 57 | ) 58 | 59 | val schema4 = new StructType() 60 | .add("name",new StructType() 61 | .add("firstname",StringType) 62 | .add("middlename",StringType) 63 | .add("lastname",StringType)) 64 | .add("dob",StringType) 65 | .add("gender",StringType) 66 | .add("salary",IntegerType) 67 | 68 | val df4 = spark.createDataFrame(spark.sparkContext.parallelize(data4),schema4) 69 | 70 | 71 | 72 | 73 | 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/MathFunctions.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | 6 | object MathFunctions { 7 | 8 | def main(args: Array[String]): Unit = { 9 | val spark = SparkSession.builder() 10 | .appName("sparkbyexamples.com") 11 | .master("local") 12 | .getOrCreate() 13 | spark.sparkContext.setLogLevel("ERROR") 14 | import spark.implicits._ 15 | val data = Seq((2,2.67),(3,3.12),(4,4.34),(5,1.10)) 16 | // data.sc 17 | // data.printSchema() 18 | // data.withColumn("factorial",factorial(col("number"))) 19 | // // .withColumn("ceil") 20 | // .show() 21 | 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/PivotExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | object PivotExample { 6 | def main(args:Array[String]):Unit= { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | 13 | val data = Seq(("Banana",1000,"USA"), ("Carrots",1500,"USA"), ("Beans",1600,"USA"), 14 | ("Orange",2000,"USA"),("Orange",2000,"USA"),("Banana",400,"China"), 15 | ("Carrots",1200,"China"),("Beans",1500,"China"),("Orange",4000,"China"), 16 | ("Banana",2000,"Canada"),("Carrots",2000,"Canada"),("Beans",2000,"Mexico")) 17 | 18 | 19 | 20 | import spark.sqlContext.implicits._ 21 | val df = data.toDF("Product","Amount","Country") 22 | df.show() 23 | 24 | //pivot 25 | val pivotDF = df.groupBy("Product","Country") 26 | .sum("Amount") 27 | .groupBy("Product") 28 | .pivot("Country") 29 | .sum("sum(Amount)") 30 | pivotDF.show() 31 | 32 | // val countries = Seq("USA","China","Canada","Mexico") 33 | // val pivotDF2 = df.groupBy("Product").pivot("Country", countries).sum("Amount") 34 | // pivotDF2.show() 35 | 36 | //unpivot 37 | //val unPivotDF = pivotDF.select($"Product",expr("stack(3, 'Canada', Canada, 'China', China, 'Mexico', Mexico) " + 38 | //"as (Country,Total)")).where("Total is not null") 39 | //unPivotDF.show() 40 | 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/StringFunctions.scala: -------------------------------------------------------------------------------- 1 | package 
com.sparkbyexamples.spark.dataframe.functions 2 | 3 | class StringFunctions { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/WhenOtherwise.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{when, _} 5 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} 6 | 7 | object WhenOtherwise { 8 | 9 | def main(args:Array[String]):Unit= { 10 | 11 | val spark: SparkSession = SparkSession.builder() 12 | .master("local[1]") 13 | .appName("SparkByExamples.com") 14 | .getOrCreate() 15 | 16 | import spark.sqlContext.implicits._ 17 | 18 | val data = List(("James ","","Smith","36636","M",60000), 19 | ("Michael ","Rose","","40288","M",70000), 20 | ("Robert ","","Williams","42114","",400000), 21 | ("Maria ","Anne","Jones","39192","F",500000), 22 | ("Jen","Mary","Brown","","F",0)) 23 | 24 | val cols = Seq("first_name","middle_name","last_name","dob","gender","salary") 25 | 26 | val df = spark.createDataFrame(data).toDF(cols:_*) 27 | 28 | val df2 = df.withColumn("gender", when(col("gender") === "M","Male") 29 | .when(col("gender") === "F","Female") 30 | .otherwise("Unknown")) 31 | 32 | 33 | val df3 = df.withColumn("gender", 34 | expr("case when gender = 'M' then 'Male' " + 35 | "when gender = 'F' then 'Female' " + 36 | "else 'Unknown' end")) 37 | 38 | val df4 = df.select(col("*"), when(col("gender") === "M","Male") 39 | .when(col("gender") === "F","Female") 40 | .otherwise("Unknown").alias("new_gender")) 41 | 42 | val df5 = df.select(col("*"), 43 | expr("case when gender = 'M' then 'Male' " + 44 | "when gender = 'F' then 'Female' " + 45 | "else 'Unknown' end").alias("new_gender")) 46 | 47 | val dataDF = Seq( 48 | (66, "a", "4"), (67, "a", "0"), (70, "b", "4"), (71, "d", "4" 49 | )).toDF("id", "code", "amt") 50 | 51 | df2.show() 52 | df3.show() 53 | df4.show() 54 | df5.show() 55 | dataDF.show() 56 | 57 | dataDF.withColumn("new_column", 58 | when(col("code") === "a" || col("code") === "d", "A") 59 | .when(col("code") === "b" and col("amt") === "4", "B") 60 | .otherwise("A1")) 61 | .show() 62 | 63 | //alternatively, we can also use "and" "or" operators 64 | dataDF.withColumn("new_column", 65 | when(col("code") === "a" or col("code") === "d", "A") 66 | .when(col("code") === "b" and col("amt") === "4", "B") 67 | .otherwise("A1")) 68 | .show() 69 | 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/ArrayOfArrayType.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.collection 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.functions.{explode, flatten} 5 | import org.apache.spark.sql.types.{ArrayType, StringType, StructType} 6 | 7 | object ArrayOfArrayType extends App { 8 | 9 | val spark = SparkSession.builder().appName("SparkByExamples.com") 10 | .master("local[1]") 11 | .getOrCreate() 12 | 13 | val arrayArrayData = Seq( 14 | Row("James",List(List("Java","Scala","C++"),List("Spark","Java"))), 15 | Row("Michael",List(List("Spark","Java","C++"),List("Spark","Java"))), 16 | 
Row("Robert",List(List("CSharp","VB"),List("Spark","Python"))) 17 | ) 18 | 19 | val arrayArraySchema = new StructType().add("name",StringType) 20 | .add("subjects",ArrayType(ArrayType(StringType))) 21 | 22 | val df = spark.createDataFrame( 23 | spark.sparkContext.parallelize(arrayArrayData),arrayArraySchema) 24 | df.printSchema() 25 | df.show(false) 26 | 27 | import spark.implicits._ 28 | val df2 = df.select($"name",explode($"subjects")) 29 | 30 | 31 | df2.printSchema() 32 | df2.show(false) 33 | 34 | //Convert Array of Array into Single array 35 | df.select($"name",flatten($"subjects")).show(false) 36 | 37 | } 38 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/ArrayOfMapType.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.collection 2 | 3 | 4 | import org.apache.spark.sql.{Row, SparkSession} 5 | import org.apache.spark.sql.functions.{explode} 6 | import org.apache.spark.sql.types._ 7 | 8 | object ArrayOfMapType extends App { 9 | val spark = SparkSession.builder().appName("SparkByExamples.com") 10 | .master("local[1]") 11 | .getOrCreate() 12 | 13 | val arrayMapSchema = new StructType().add("name",StringType) 14 | .add("properties", 15 | ArrayType(new MapType(StringType,StringType,true))) 16 | 17 | val arrayMapData = Seq( 18 | Row("James",List(Map("hair"->"black","eye"->"brown"), Map("height"->"5.9"))), 19 | Row("Michael",List(Map("hair"->"brown","eye"->"black"),Map("height"->"6"))), 20 | Row("Robert",List(Map("hair"->"red","eye"->"gray"),Map("height"->"6.3"))) 21 | ) 22 | 23 | val df = spark.createDataFrame( 24 | spark.sparkContext.parallelize(arrayMapData),arrayMapSchema) 25 | df.printSchema() 26 | df.show(false) 27 | 28 | import spark.implicits._ 29 | 30 | val df2 = df.select($"name",explode($"properties")) 31 | df2.printSchema() 32 | df2.show(false) 33 | } 34 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/ArrayOfStructType.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.collection 2 | 3 | import org.apache.spark.sql.functions._ 4 | import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType} 5 | import org.apache.spark.sql.{Row, SparkSession} 6 | 7 | object ArrayOfStructType extends App{ 8 | 9 | val spark = SparkSession.builder().appName("SparkByExamples.com") 10 | .master("local[1]") 11 | .getOrCreate() 12 | 13 | val arrayStructData = Seq( 14 | Row("James",List(Row("Java","XX",120),Row("Scala","XA",300))), 15 | Row("Michael",List(Row("Java","XY",200),Row("Scala","XB",500))), 16 | Row("Robert",List(Row("Java","XZ",400),Row("Scala","XC",250))), 17 | Row("Washington",null) 18 | ) 19 | 20 | val arrayStructSchema = new StructType().add("name",StringType) 21 | .add("booksIntersted",ArrayType(new StructType() 22 | .add("name",StringType) 23 | .add("author",StringType) 24 | .add("pages",IntegerType))) 25 | 26 | val df = spark.createDataFrame( 27 | spark.sparkContext.parallelize(arrayStructData),arrayStructSchema) 28 | df.printSchema() 29 | df.show(false) 30 | 31 | import spark.implicits._ 32 | val df2 = df.select($"name",explode($"booksIntersted")) 33 | df2.printSchema() 34 | df2.show(false) 35 | 36 | 
df2.groupBy($"name").agg(collect_list($"col").as("booksIntersted")) 37 | .show(false) 38 | 39 | } 40 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/MapTypeExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.collection 2 | import org.apache.spark.sql.functions.{col, explode, lit, map, map_concat, map_from_entries, map_keys, map_values} 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.types._ 5 | 6 | object MapTypeExample extends App { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | import spark.implicits._ 13 | 14 | //Creating DF with MapType 15 | val arrayStructureData = Seq( 16 | Row("James",List(Row("Newark","NY"),Row("Brooklyn","NY")), 17 | Map("hair"->"black","eye"->"brown"), Map("height"->"5.9")), 18 | Row("Michael",List(Row("SanJose","CA"),Row("Sandiago","CA")), 19 | Map("hair"->"brown","eye"->"black"),Map("height"->"6")), 20 | Row("Robert",List(Row("LasVegas","NV")), 21 | Map("hair"->"red","eye"->"gray"),Map("height"->"6.3")), 22 | Row("Maria",null,Map("hair"->"blond","eye"->"red"), 23 | Map("height"->"5.6")), 24 | Row("Jen",List(Row("LAX","CA"),Row("Orange","CA")), 25 | Map("white"->"black","eye"->"black"),Map("height"->"5.2")) 26 | ) 27 | 28 | 29 | val mapType = DataTypes.createMapType(StringType,StringType) 30 | 31 | val arrayStructureSchema = new StructType() 32 | .add("name",StringType) 33 | .add("addresses", ArrayType(new StructType() 34 | .add("city",StringType) 35 | .add("state",StringType))) 36 | .add("properties", mapType) 37 | .add("secondProp", MapType(StringType,StringType)) 38 | 39 | val mapTypeDF = spark.createDataFrame( 40 | spark.sparkContext.parallelize(arrayStructureData),arrayStructureSchema) 41 | mapTypeDF.printSchema() 42 | mapTypeDF.show() 43 | 44 | mapTypeDF.select(col("name"),map_keys(col("properties"))).show(false) 45 | mapTypeDF.select(col("name"),map_values(col("properties"))).show(false) 46 | mapTypeDF.select(col("name"),map_concat(col("properties"),col("secondProp"))).show(false) 47 | 48 | 49 | 50 | } 51 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/AddTime.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{expr,col} 5 | object AddTime extends App { 6 | 7 | val spark:SparkSession = SparkSession.builder() 8 | .master("local") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | spark.sparkContext.setLogLevel("ERROR") 12 | 13 | import spark.sqlContext.implicits._ 14 | 15 | spark.sql( "select current_timestamp," + 16 | "cast(current_timestamp as TIMESTAMP) + INTERVAL 2 hours as added_hours," + 17 | "cast(current_timestamp as TIMESTAMP) + INTERVAL 5 minutes as added_minutes," + 18 | "cast(current_timestamp as TIMESTAMP) + INTERVAL 55 seconds as added_seconds" 19 | ).show(false) 20 | 21 | 22 | val df = Seq(("2019-07-01 12:01:19.101"), 23 | ("2019-06-24 12:01:19.222"), 24 | ("2019-11-16 16:44:55.406"), 25 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp") 26 | 27 | 28 | 
df.createOrReplaceTempView("AddTimeExample") 29 | 30 | val df2 = spark.sql("select input_timestamp, " + 31 | "cast(input_timestamp as TIMESTAMP) + INTERVAL 2 hours as added_hours," + 32 | "cast(input_timestamp as TIMESTAMP) + INTERVAL 5 minutes as added_minutes," + 33 | "cast(input_timestamp as TIMESTAMP) + INTERVAL 55 seconds as added_seconds from AddTimeExample" 34 | ) 35 | df2.show(false) 36 | 37 | df.withColumn("added_hours",col("input_timestamp") + expr("INTERVAL 2 HOURS")) 38 | .withColumn("added_minutes",col("input_timestamp") + expr("INTERVAL 2 minutes")) 39 | .withColumn("added_seconds",col("input_timestamp") + expr("INTERVAL 2 seconds")) 40 | .show(false) 41 | } 42 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/CurrentDateAndTime.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | 6 | object CurrentDateAndTime extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.sqlContext.implicits._ 15 | 16 | //Get current Date & Time 17 | val df = Seq((1)).toDF("seq") 18 | 19 | val curDate = df.withColumn("current_date",current_date().as("current_date")) 20 | .withColumn("current_timestamp",current_timestamp().as("current_timestamp")) 21 | curDate.show(false) 22 | 23 | 24 | curDate.select(date_format(col("current_timestamp"),"MM-dd-yyyy").as("date"), 25 | date_format(col("current_timestamp"),"HH:mm:ss.SSS").as("time"), 26 | date_format(col("current_date"), "MM-dd-yyyy").as("current_date_formateed")) 27 | .show(false) 28 | 29 | 30 | } 31 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DateAddMonths.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | 6 | object DateAddMonths extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.sqlContext.implicits._ 15 | 16 | Seq(("2019-01-23"),("2019-06-24"),("2019-09-20")).toDF("date").select( 17 | col("date"), 18 | add_months(col("date"),3).as("add_months"), 19 | add_months(col("date"),-3).as("sub_months"), 20 | date_add(col("date"),4).as("date_add"), 21 | date_sub(col("date"),4).as("date_sub") 22 | ).show() 23 | 24 | Seq(("06-03-2009"),("07-24-2009")).toDF("date").select( 25 | col("Date"), 26 | add_months(to_date(col("Date"),"MM-dd-yyyy"),3).as("add_months"), 27 | add_months(to_date(col("Date"),"MM-dd-yyyy"),-3).as("add_months2"), 28 | date_add(to_date(col("Date"),"MM-dd-yyyy"),3).as("date_add"), 29 | date_add(to_date(col("Date"),"MM-dd-yyyy"),-3).as("date_add2"), 30 | date_sub(to_date(col("Date"),"MM-dd-yyyy"),3).as("date_sub") 31 | ).show() 32 | 33 | } 34 | -------------------------------------------------------------------------------- 
/spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DateDiff.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.functions._ 4 | import org.apache.spark.sql.{DataFrame, SparkSession} 5 | 6 | object DateDiff extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.sqlContext.implicits._ 15 | 16 | //Difference between two dates in days 17 | Seq(("2019-07-01"),("2019-06-24"),("2019-08-24"),("2018-07-23")).toDF("date") 18 | .select( 19 | col("date"), 20 | current_date().as("current_date"), 21 | datediff(current_date(),col("date")).as("datediff") 22 | ).show() 23 | 24 | // Difference between two dates in Months and Years 25 | val df = Seq(("2019-07-01"),("2019-06-24"),("2019-08-24"),("2018-12-23"),("2018-07-20")) 26 | .toDF("startDate").select( 27 | col("startDate"),current_date().as("endDate") 28 | ) 29 | 30 | calculateDiff(df) 31 | 32 | //Difference between two dates when the dates are not in the Spark DateType format 'yyyy-MM-dd'. 33 | //Note that when dates are not in the Spark DateType format, Spark's date functions return null. 34 | //Hence, first convert the input dates to Spark DateType using the to_date function 35 | val dfDate = Seq(("07-01-2019"),("06-24-2019"),("08-24-2019"),("12-23-2018"),("07-20-2018")) 36 | .toDF("startDate").select( 37 | to_date(col("startDate"),"MM-dd-yyyy").as("startDate"), 38 | current_date().as("endDate") 39 | ) 40 | 41 | calculateDiff(dfDate) 42 | 43 | def calculateDiff(df:DataFrame): Unit ={ 44 | df.withColumn("datesDiff", datediff(col("endDate"),col("startDate"))) 45 | .withColumn("monthsDiff", months_between( 46 | col("endDate"),col("startDate"))) 47 | .withColumn("monthsDiff_round",round(months_between( 48 | col("endDate"),col("startDate")),2)) 49 | .withColumn("yearsDiff",months_between( 50 | col("endDate"),col("startDate"),true).divide(12)) 51 | .withColumn("yearsDiff_round",round(months_between( 52 | col("endDate"),col("startDate"),true).divide(12),2)) 53 | .show() 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DateExamples.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | 6 | object DateExamples { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val spark:SparkSession = SparkSession.builder() 11 | .master("local[3]") 12 | .appName("SparkByExample") 13 | .getOrCreate() 14 | spark.sparkContext.setLogLevel("ERROR") 15 | val data = Seq(("2019-01-23"),("2019-06-24"),("2019-09-20")) 16 | 17 | import spark.sqlContext.implicits._ 18 | val df = data.toDF("date") 19 | 20 | //date_format 21 | Seq(("2019-01-23")).toDF("InputDate").select( 22 | current_date().as("current_date"), 23 | col("InputDate"), 24 | date_format(col("InputDate"), "MM-dd-yyyy").as("date_format") 25 | ).show() 26 | 27 | //to_date 28 | Seq(("04/13/2019")).toDF("InputDate").select( 29 | col("InputDate"), 30 | to_date(col("InputDate"), "MM/dd/yyyy").as("to_date") 31 | ).show() 32 | 33 | 34 | //datediff 35 |
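// datediff(end, start) returns the number of days between the two dates; here each input date is compared against current_date().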
Seq(("2019-01-23"),("2019-06-24"),("2019-09-20")).toDF("date").select( 36 | col("date"), 37 | current_date(), 38 | datediff(current_date(),col("date")).as("datediff") 39 | ).show() 40 | 41 | //months_between 42 | Seq(("2019-01-23"),("2019-06-24"),("2019-09-20")).toDF("date").select( 43 | col("date"), 44 | current_date(), 45 | datediff(current_date(),col("date")).as("datediff"), 46 | months_between(current_date(),col("date")).as("months_between") 47 | ).show() 48 | 49 | //Trunc 50 | Seq(("2019-01-23"),("2019-06-24"),("2019-09-20")).toDF("date").select( 51 | col("date"), 52 | trunc(col("date"),"Month").as("Month_Trunc"), 53 | trunc(col("date"),"Year").as("Month_Year"), 54 | trunc(col("date"),"Month").as("Month_Trunc") 55 | ).show() 56 | 57 | Seq(("2019-01-23"),("2019-06-24"),("2019-09-20")).toDF("date").select( 58 | col("date"), 59 | add_months(col("date"),3).as("add_months"), 60 | add_months(col("date"),-3).as("sub_months"), 61 | date_add(col("date"),4).as("date_add"), 62 | date_sub(col("date"),4).as("date_sub") 63 | ).show() 64 | 65 | Seq(("2019-01-23"),("2019-06-24"),("2019-09-20")).toDF("date").select( 66 | col("date"), 67 | year(col("date")).as("year"), 68 | month(col("date")).as("month"), 69 | dayofweek(col("date")).as("dayofweek"), 70 | dayofmonth(col("date")).as("dayofmonth"), 71 | dayofyear(col("date")).as("dayofyear"), 72 | next_day(col("date"),"Sunday").as("next_day"), 73 | weekofyear(col("date")).as("weekofyear") 74 | ).show() 75 | 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DateLastDay.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{col, last_day, to_date} 5 | 6 | object DateLastDay extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.sqlContext.implicits._ 15 | 16 | Seq(("2019-01-01"),("2020-02-24"),("2019-02-24"), 17 | ("2019-05-01"),("2018-03-24"),("2007-12-19")) 18 | .toDF("Date").select( 19 | col("Date"), 20 | last_day(col("Date")).as("last_day") 21 | ).show() 22 | 23 | 24 | Seq(("06-03-2009"),("07-24-2009")).toDF("Date").select( 25 | col("Date"), 26 | last_day(to_date(col("Date"),"MM-dd-yyyy")).as("last_day") 27 | ).show() 28 | 29 | } 30 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DateToString.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import com.sparkbyexamples.spark.dataframe.functions.datetime.DateFormat.spark 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions.{current_date, current_timestamp, date_format} 6 | 7 | object DateToString extends App { 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | import spark.sqlContext.implicits._ 16 | 17 | Seq(1).toDF("seq").select( 18 | current_date().as("current_date"), 19 | date_format(current_timestamp(),"yyyy MM 
dd").as("yyyy MM dd"), 20 | date_format(current_timestamp(),"MM/dd/yyyy hh:mm").as("MM/dd/yyyy"), 21 | date_format(current_timestamp(),"yyyy MMM dd").as("yyyy MMMM dd"), 22 | date_format(current_timestamp(),"yyyy MMMM dd E").as("yyyy MMMM dd E") 23 | ).show(false) 24 | 25 | } 26 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DayAndWeekOfYear.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{col, date_format, to_timestamp} 5 | 6 | 7 | object DayAndWeekOfYear extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | import spark.sqlContext.implicits._ 16 | 17 | val df = Seq(("2019-01-03 12:01:19.000"), 18 | ("2019-02-01 12:01:19.000"), 19 | ("2019-7-16 16:44:55.406"), 20 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp") 21 | 22 | df.withColumn("input_timestamp", 23 | to_timestamp(col("input_timestamp"))) 24 | .withColumn("day_of_year", date_format(col("input_timestamp"), "D")) 25 | .withColumn("week_of_year", date_format(col("input_timestamp"), "w")) 26 | 27 | .show(false) 28 | } -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DayWeekAndWeekMonth.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{col, to_timestamp,date_format} 5 | 6 | 7 | object DayWeekAndWeekMonth extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | import spark.sqlContext.implicits._ 16 | 17 | val df = Seq(("2019-07-01 12:01:19.000"), 18 | ("2019-06-24 12:01:19.000"), 19 | ("2019-11-16 16:44:55.406"), 20 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp") 21 | 22 | df.withColumn("input_timestamp", 23 | to_timestamp(col("input_timestamp"))) 24 | .withColumn("week_day_number", date_format(col("input_timestamp"), "u")) 25 | .withColumn("week_day_abb", date_format(col("input_timestamp"), "E")) 26 | .withColumn("week_day_full", date_format(col("input_timestamp"), "EEEE")) 27 | .withColumn("week_of_month", date_format(col("input_timestamp"), "W")) 28 | .show(false) 29 | } -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/GetTimeFromTimestamp.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{col,hour,minute,second} 5 | 6 | object GetTimeFromTimestamp extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.sqlContext.implicits._ 15 
| 16 | val df = Seq(("2019-07-01 12:01:19.000"), 17 | ("2019-06-24 12:01:19.000"), 18 | ("2019-11-16 16:44:55.406"), 19 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp") 20 | 21 | 22 | df.withColumn("hour", hour(col("input_timestamp"))) 23 | .withColumn("minute", minute(col("input_timestamp"))) 24 | .withColumn("second", second(col("input_timestamp"))) 25 | .show(false) 26 | 27 | } 28 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/StringToDate.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{col, to_date} 5 | 6 | object StringToDate extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.sqlContext.implicits._ 15 | 16 | Seq(("06-03-2009"),("07-24-2009")).toDF("Date").select( 17 | col("Date"), 18 | to_date(col("Date"),"MM-dd-yyyy").as("to_date") 19 | ).show() 20 | } 21 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/StringToTimestamp.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | import org.apache.spark.sql.types.LongType 6 | 7 | object StringToTimestamp extends App { 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | import spark.sqlContext.implicits._ 16 | 17 | //String to timestamps 18 | val df = Seq(("2019-07-01 12:01:19.000"), 19 | ("2019-06-24 12:01:19.000"), 20 | ("2019-11-16 16:44:55.406"), 21 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp") 22 | 23 | df.withColumn("datetype_timestamp", 24 | to_timestamp(col("input_timestamp"))) 25 | .printSchema() 26 | 27 | 28 | //Convert string to timestamp when input string has just time 29 | val df1 = Seq(("12:01:19.345"), 30 | ("12:01:20.567"), 31 | ("16:02:44.406"), 32 | ("16:50:59.406")) 33 | .toDF("input_timestamp") 34 | 35 | df1.withColumn("datetype_timestamp", 36 | to_timestamp(col("input_timestamp"),"HH:mm:ss.SSS")) 37 | .show(false) 38 | 39 | //when dates are not in Spark DateType format 'yyyy-MM-dd HH:mm:ss.SSS'. 
40 | //Note that when timestamps are not in this format, Spark's timestamp functions return null. 41 | //Hence, first convert the input strings to TimestampType using the to_timestamp function 42 | val dfDate = Seq(("07-01-2019 12 01 19 406"), 43 | ("06-24-2019 12 01 19 406"), 44 | ("11-16-2019 16 44 55 406"), 45 | ("11-16-2019 16 50 59 406")).toDF("input_timestamp") 46 | 47 | dfDate.withColumn("datetype_timestamp", 48 | to_timestamp(col("input_timestamp"),"MM-dd-yyyy HH mm ss SSS")) 49 | .show(false) 50 | 51 | 52 | } 53 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/TimestampToDate.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{col, to_date, to_timestamp} 5 | import org.apache.spark.sql.types.DateType 6 | 7 | object TimestampToDate extends App { 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | import spark.sqlContext.implicits._ 16 | 17 | val df = Seq(("2019-07-01 12:01:19.000"), 18 | ("2019-06-24 12:01:19.000"), 19 | ("2019-11-16 16:44:55.406"), 20 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp") 21 | 22 | //Timestamp String to DateType 23 | df.withColumn("datetype", 24 | to_date(col("input_timestamp"),"yyyy-MM-dd")) 25 | .show(false) 26 | 27 | //Timestamp type to DateType 28 | df.withColumn("ts",to_timestamp(col("input_timestamp"))) 29 | .withColumn("datetype",to_date(col("ts"))) 30 | .show(false) 31 | 32 | //Using Cast 33 | df.withColumn("ts",to_timestamp(col("input_timestamp"))) 34 | .withColumn("datetype",col("ts").cast(DateType)) 35 | .show(false) 36 | } 37 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/TimestampToString.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{current_date, current_timestamp, date_format} 5 | 6 | object TimestampToString extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.sqlContext.implicits._ 15 | 16 | 17 | Seq(1).toDF("seq").select( 18 | current_timestamp().as("current_timestamp"), 19 | date_format(current_timestamp(),"yyyy MM dd").as("yyyy MM dd"), 20 | date_format(current_timestamp(),"MM/dd/yyyy hh:mm").as("MM/dd/yyyy"), 21 | date_format(current_timestamp(),"yyyy MMM dd").as("yyyy MMMM dd"), 22 | date_format(current_timestamp(),"yyyy MMMM dd E").as("yyyy MMMM dd E") 23 | ).show(false) 24 | 25 | } 26 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/unixtimeExample.scala: -------------------------------------------------------------------------------- 1 | // 2 | // 3 | //package com.sparkbyexamples.spark.dataframe.functions.datetime 4 | // 5 | //import org.apache.spark.sql.SparkSession 6 | //import org.apache.spark.sql.functions.{col, from_unixtime, unix_timestamp} 7 | // 8 | // 9 | //object unixtimeExample extends App { 10 | // 11 | // val spark:SparkSession = SparkSession.builder() 12 | // .master("local") 13 | // .appName("SparkByExamples.com") 14 | // .getOrCreate() 15 | // spark.sparkContext.setLogLevel("ERROR") 16 | // 17 | // import spark.sqlContext.implicits._ 18 | // 19 | // val df = Seq(("2019-07-01 12:01:19"), 20 | // ("2019-06-24 12:01:19"), 21 | // ("2019-11-16 16:44:55"), 22 | // ("2019-11-16 16:50:59")).toDF("input_timestamp") 23 | // 24 | // 25 | // val df2 = df.withColumn("unix_timestamp", unix_timestamp(col("input_timestamp"))) 26 | // .withColumn("current_unix_timestamp", unix_timestamp()) 27 | // df2.show(false) 28 | // 29 | // df2.withColumn("from_unixtime",from_unixtime(col("unix_timestamp"))) 30 | // .show(false) 31 | // 32 | //} 33 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/from_json.scala: -------------------------------------------------------------------------------- 1 | //package com.sparkbyexamples.spark.dataframe.functions 2 | // 3 | //import org.apache.spark.sql.SparkSession 4 | //import org.apache.spark.sql.functions.col 5 | //import org.apache.spark.sql.types.{StringType, StructType} 6 | // 7 | //object from_json { 8 | // def main(args:Array[String]):Unit= { 9 | // 10 | // val spark: SparkSession = SparkSession.builder() 11 | // .master("local[1]") 12 | // .appName("SparkByExample") 13 | // .getOrCreate() 14 | // 15 | // 16 | // val data = Seq(("1","{\"name\":\"Anne\",\"Age\":\"12\",\"country\":\"Denmark\"}"), 17 | // ("2","{\"name\":\"Zen\",\"Age\":\"24\"}"), 18 | // ("3","{\"name\":\"Fred\",\"Age\":\"20\",\"country\":\"France\"}"), 19 | // ("4","{\"name\":\"Mona\",\"Age\":\"18\",\"country\":\"Denmark\"}") 20 | // ) 21 | // 22 | // import spark.sqlContext.implicits._ 23 | // val df = data.toDF("ID","details_Json") 24 | // 25 | // val schema = (new StructType()).add("name",StringType,true) 26 | // .add("Age",StringType,true) 27 | // .add("country",StringType,true) 28 | // 29 | // val df2 = df.withColumn("details_Struct", from_json($"details_Json", schema)) 30 | // .withColumn("country",col("details_Struct").getField("country")) 31 | // .filter(col("country").equalTo("Denmark")) 32 | // 33 | // 34 | // df2.printSchema() 35 | // df2.show(false) 36 | // } 37 | //} 38 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/litTypeLit.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.types.IntegerType 5 | 6 | object litTypeLit extends App { 7 | 8 | 9 | 10 | val spark = SparkSession.builder() 11 | .appName("sparkbyexamples.com") 12 | .master("local") 13 | .getOrCreate() 14 | 15 | import spark.sqlContext.implicits._ 16 | import org.apache.spark.sql.functions._ 17 | 18 | val data = Seq(("111",50000),("222",60000),("333",40000)) 19 | val df = data.toDF("EmpId","Salary") 20 | val df2 = df.select(col("EmpId"),col("Salary"),lit("1").as("lit_value1")) 21 | df2.show() 22 | 23 | val df3 = df2.withColumn("lit_value2", 24 | when(col("Salary") >=40000 && col("Salary") <= 50000, lit("100").cast(IntegerType)) 25 |
.otherwise(lit("200").cast(IntegerType)) 26 | ) 27 | 28 | df3.show() 29 | 30 | val df4 = df3.withColumn("typedLit_seq",typedLit(Seq(1, 2, 3))) 31 | .withColumn("typedLit_map",typedLit(Map("a" -> 1, "b" -> 2))) 32 | .withColumn("typedLit_struct",typedLit(("a", 2, 1.0))) 33 | 34 | df4.printSchema() 35 | df4.show() 36 | 37 | } 38 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/xml/PersonsComplexXML.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.xml 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType} 5 | 6 | object PersonsComplexXML { 7 | 8 | def main(args: Array[String]): Unit = { 9 | val spark = SparkSession.builder().master("local[1]") 10 | .appName("SparkByExample") 11 | .getOrCreate() 12 | 13 | /* 14 | Read XML File 15 | */ 16 | val df = spark.read 17 | .format("xml") 18 | .option("rowTag", "person") 19 | .load("src/main/resources/persons_complex.xml") 20 | 21 | df.printSchema() 22 | 23 | df.show() 24 | val schema = new StructType() 25 | .add("_id",StringType) 26 | .add("firstname",StringType) 27 | .add("middlename",StringType) 28 | .add("lastname",StringType) 29 | .add("dob_year",StringType) 30 | .add("dob_month",StringType) 31 | .add("gender",StringType) 32 | .add("salary",StringType) 33 | .add("addresses", new StructType() 34 | .add("address",ArrayType( 35 | new StructType() 36 | .add("_type",StringType) 37 | .add("addressLine",StringType) 38 | .add("city",StringType) 39 | .add("state",StringType) 40 | ) 41 | ) 42 | ) 43 | 44 | val df2 = spark.read 45 | .format("xml") 46 | .option("rowTag", "person") 47 | .schema(schema) 48 | .load("src/main/resources/persons.xml") 49 | 50 | // df.foreach(row=>{ 51 | // println("ID:"+row.getAs("_id") ) 52 | // println("ID:"+row(0)) 53 | // println("ID:"+row.get(0)) 54 | // println(row.getAs("addresses")) 55 | // // println("ID:"+row.getString(0)) 56 | // }) 57 | // 58 | df2.write 59 | .format("com.databricks.spark.xml") 60 | .option("rootTag", "persons") 61 | .option("rowTag", "person") 62 | .save("src/main/resources/persons_new.xml") 63 | 64 | } 65 | } 66 | 67 | 68 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/xml/PersonsXML.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.xml 2 | 3 | import org.apache.spark.sql.{SparkSession, types} 4 | import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType} 5 | 6 | object PersonsXML { 7 | 8 | def main(args: Array[String]): Unit = { 9 | val spark = SparkSession.builder().master("local[1]") 10 | .appName("SparkByExample") 11 | .getOrCreate() 12 | 13 | /* 14 | Read XML File 15 | */ 16 | val df = spark.read 17 | .format("xml") 18 | .option("rowTag", "person") 19 | .load("src/main/resources/persons.xml") 20 | 21 | df.printSchema() 22 | df.show() 23 | 24 | val schema = new StructType() 25 | .add("_id",StringType) 26 | .add("firstname",StringType) 27 | .add("middlename",StringType) 28 | .add("lastname",StringType) 29 | .add("dob_year",StringType) 30 | .add("dob_month",StringType) 31 | .add("gender",StringType) 32 | .add("salary",StringType) 33 | 34 | val df2 = spark.read 35 | .format("xml") 36 | .option("rowTag", "person") 37 | 
.schema(schema) 38 | .load("src/main/resources/persons.xml") 39 | 40 | df2.write 41 | .format("com.databricks.spark.xml") 42 | .option("rootTag", "persons") 43 | .option("rowTag", "person") 44 | .save("src/main/resources/persons_new.xml") 45 | 46 | } 47 | } 48 | 49 | 50 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/xml/ReadBooksXMLWithNestedArray.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.xml 2 | 3 | import com.sparkbyexamples.spark.beans.BooksWithArray 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 6 | import org.apache.spark.sql.types.StructType 7 | 8 | object ReadBooksXMLWithNestedArray { 9 | 10 | def main(args: Array[String]): Unit = { 11 | val spark = SparkSession.builder().master("local[1]") 12 | .appName("SparkByExample") 13 | .getOrCreate() 14 | 15 | val df = spark.sqlContext.read 16 | .format("com.databricks.spark.xml") 17 | .option("rowTag", "book") 18 | .load("src/main/resources/books_withnested_array.xml") 19 | 20 | df.printSchema() 21 | df.show() 22 | 23 | df.foreach(row=>{ 24 | println(""+row.getAs("author")+","+row.getAs("_id")) 25 | println(row.getStruct(4).getAs("country")) 26 | println(row.getStruct(4).getClass) 27 | val arr = row.getStruct(7).getList(0) 28 | for (i<-0 to arr.size-1){ 29 | val b = arr.get(i).asInstanceOf[GenericRowWithSchema] 30 | println(""+b.getAs("name") +","+b.getAs("location")) 31 | } 32 | }) 33 | 34 | } 35 | } 36 | 37 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/xml/ReadBooksXMLWithNestedArrayStruct.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.xml 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 5 | import org.apache.spark.sql.types._ 6 | 7 | object ReadBooksXMLWithNestedArrayStruct { 8 | 9 | def main(args: Array[String]): Unit = { 10 | val spark = SparkSession.builder().master("local[1]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | val customSchema = StructType(Array( 15 | StructField("_id", StringType, nullable = true), 16 | StructField("author", StringType, nullable = true), 17 | StructField("description", StringType, nullable = true), 18 | StructField("genre", StringType ,nullable = true), 19 | StructField("price", DoubleType, nullable = true), 20 | StructField("publish_date", StringType, nullable = true), 21 | StructField("title", StringType, nullable = true), 22 | StructField("otherInfo",StructType(Array( 23 | StructField("pagesCount", StringType, nullable = true), 24 | StructField("language", StringType, nullable = true), 25 | StructField("country", StringType, nullable = true), 26 | StructField("address", StructType(Array( 27 | StructField("addressline1", StringType, nullable = true), 28 | StructField("city", StringType, nullable = true), 29 | StructField("state", StringType, nullable = true) 30 | )) 31 | )) 32 | )), 33 | StructField("stores",StructType(Array( 34 | StructField("store",ArrayType( 35 | StructType(Array( 36 | StructField("location",StringType,true), 37 | StructField("name",StringType,true) 38 | )) 39 | )) 40 | ))) 41 | )) 42 | 43 | val df = spark.sqlContext.read 44 | 
.format("com.databricks.spark.xml") 45 | .option("rowTag", "book") 46 | .schema(customSchema) 47 | .load("src/main/resources/books_withnested_array.xml") 48 | 49 | df.printSchema() 50 | df.show() 51 | 52 | df.foreach(row=>{ 53 | println(""+row.getAs("author")+","+row.getAs("_id")) 54 | println(row.getStruct(4).getAs("country")) 55 | println(row.getStruct(4).getClass) 56 | val arr = row.getStruct(7).getList(0) 57 | for (i<-0 to arr.size-1){ 58 | val b = arr.get(i).asInstanceOf[GenericRowWithSchema] 59 | println(""+b.getAs("name") +","+b.getAs("location")) 60 | } 61 | }) 62 | 63 | } 64 | } 65 | 66 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataframe/xml/xstream/WriteXML.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.xml.xstream 2 | 3 | import com.thoughtworks.xstream.XStream 4 | import com.thoughtworks.xstream.io.xml.DomDriver 5 | import org.apache.spark.sql.types.{StringType, StructType} 6 | import org.apache.spark.sql.{Row, SparkSession} 7 | 8 | object WriteXML { 9 | def main(args: Array[String]): Unit = { 10 | 11 | val spark:SparkSession = SparkSession.builder() 12 | .master("local") 13 | .appName("SparkByExample") 14 | .getOrCreate() 15 | 16 | val sc = spark.sparkContext 17 | 18 | val data = Seq(Row("1",Row("James ","","Smith"),"36636","M","3000"), 19 | Row("2",Row("Michael ","Rose",""),"40288","M","4000"), 20 | Row("3",Row("Robert ","","Williams"),"42114","M","4000"), 21 | Row("4",Row("Maria ","Anne","Jones"),"39192","F","4000"), 22 | Row("5",Row("Jen","Mary","Brown"),"","F","-1") 23 | ) 24 | 25 | val schema = new StructType() 26 | .add("id",StringType) 27 | .add("name",new StructType() 28 | .add("firstName",StringType) 29 | .add("middleName",StringType) 30 | .add("lastName",StringType)) 31 | .add("ssn",StringType) 32 | .add("gender",StringType) 33 | .add("salary",StringType) 34 | 35 | case class Name(firstName:String,middleName:String,lastName:String) 36 | case class Person(id:String,name:Name,ssn:String,gender:String,salary:String) 37 | import spark.sqlContext.implicits._ 38 | 39 | val df = spark.createDataFrame(spark.sparkContext.parallelize(data),schema)//.as[Person] 40 | 41 | val ds = df.mapPartitions(part=>{ 42 | val xstream = new XStream(new DomDriver) 43 | val data = part.map(ite=>{ 44 | val nameRow:Row = ite.getAs[Row]("name") 45 | val name= Name(nameRow.getAs("firstName"),nameRow.getAs("firstName"),nameRow.getAs("firstName")) 46 | val person = Person(ite.getAs("id"),name,ite.getAs("ssn"),ite.getAs("gender"),ite.getAs("salary")) 47 | //xstream.aliasType("Person",Class[String]) 48 | val xmlString = xstream.toXML(person) 49 | xmlString 50 | }) 51 | data 52 | }) 53 | 54 | ds.write.text("c:/tmp/xstream.xml") 55 | 56 | // val df2 = spark.createDataFrame(spark.sparkContext.parallelize(data),schema).as[Person] 57 | // 58 | // val ds2 = df2.mapPartitions(part=>{ 59 | // val xstream = new XStream(new DomDriver) 60 | // val person = part.map(ite=>{ 61 | // val xmlString = xstream.toXML(person) 62 | // xmlString 63 | // }) 64 | // person 65 | // }) 66 | // ds2.write.text("c:/tmp/xstream_2.xml") 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataset/DataSetFromData.scala: -------------------------------------------------------------------------------- 1 | package 
com.sparkbyexamples.spark.dataset 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object DataSetFromData { 6 | 7 | def main(args:Array[String]):Unit= { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | val data = Seq((1,2),(3,4),(5,6)) 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataset/DataSetWithCustomClass.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataset 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | class Test(field1:String,field2:String,field3:String) extends Serializable{ 6 | 7 | 8 | } 9 | 10 | object TestEncoders { 11 | implicit def testEncoder: org.apache.spark.sql.Encoder[Test] = 12 | org.apache.spark.sql.Encoders.kryo[Test] 13 | } 14 | object DataSetWithCustomClass { 15 | 16 | def main(args:Array[String]):Unit= { 17 | 18 | val spark: SparkSession = SparkSession.builder() 19 | .master("local[1]") 20 | .appName("SparkByExample") 21 | .getOrCreate() 22 | 23 | val test:Test = new Test("Field1","Field2","Field3") 24 | 25 | import spark.sqlContext.implicits._ 26 | import org.apache.spark.sql.Encoders 27 | import TestEncoders._ 28 | // implicit val encoder = Encoders.bean[Test](classOf[Test]) 29 | 30 | val data = Seq(test) 31 | val rdd = spark.sparkContext.parallelize(data) 32 | val ds = spark.createDataset(rdd) 33 | 34 | val ds2 = ds.selectExpr("CAST(value AS String)") 35 | .as[(String)] 36 | 37 | 38 | ds.printSchema() 39 | ds2.show(false) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataset/xml/ReadBooksXML.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataset.xml 2 | 3 | import com.sparkbyexamples.spark.beans.{Books, BooksDiscounted} 4 | import org.apache.spark.sql.{Encoders, SparkSession} 5 | 6 | object ReadBooksXML { 7 | 8 | def main(args: Array[String]): Unit = { 9 | val spark = SparkSession.builder().master("local[1]") 10 | .appName("SparkByExample") 11 | .getOrCreate() 12 | 13 | import spark.implicits._ 14 | 15 | val ds = spark.sqlContext.read 16 | .format("com.databricks.spark.xml") 17 | .option("rowTag", "book") 18 | .load("src/main/resources/books.xml").as[Books] 19 | 20 | 21 | val newds = ds.map(f=>{ 22 | BooksDiscounted(f._id,f.author,f.description,f.price,f.publish_date,f.title, f.price - f.price*20/100) 23 | }) 24 | 25 | newds.printSchema() 26 | newds.show() 27 | 28 | newds.foreach(f=>{ 29 | println("Price :"+f.price + ", Discounted Price :"+f.discountPrice) 30 | }) 31 | 32 | //First element 33 | println("First Element" +newds.first()._id) 34 | 35 | } 36 | } 37 | 38 | 39 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataset/xml/ReadBooksXMLWithNestedArray.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataset.xml 2 | 3 | import com.sparkbyexamples.spark.beans.{Books, BooksWithArray} 4 | import org.apache.spark.sql.{SparkSession, functions} 5 | 6 | object ReadBooksXMLWithNestedArray { 7 | 8 | def main(args: Array[String]): Unit = { 9 | val spark = SparkSession.builder().master("local[1]") 10 | 
.appName("SparkByExample") 11 | .getOrCreate() 12 | 13 | import spark.implicits._ 14 | val ds = spark.sqlContext.read 15 | .format("com.databricks.spark.xml") 16 | .option("rowTag", "book") 17 | .load("src/main/resources/books_withnested_array.xml").as[BooksWithArray] 18 | 19 | ds.printSchema() 20 | ds.show() 21 | 22 | ds.foreach(f=>{ 23 | println(f.author+","+f.otherInfo.country+","+f.otherInfo.address.addressline1) 24 | for(s<-f.stores.store){ 25 | println(s.name) 26 | } 27 | 28 | }) 29 | 30 | } 31 | } 32 | 33 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataset/xml/ReadBooksXMLWithNestedArrayDSL.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataset.xml 2 | 3 | 4 | 5 | import com.sparkbyexamples.spark.beans.Books 6 | import org.apache.spark.sql.{Encoders, SparkSession, functions} 7 | 8 | object ReadBooksXMLWithNestedArrayDSL { 9 | 10 | def main(args: Array[String]): Unit = { 11 | val spark = SparkSession.builder().master("local[1]") 12 | .appName("SparkByExample") 13 | .getOrCreate() 14 | 15 | import spark.implicits._ 16 | val xmlDF = spark.sqlContext.read 17 | .format("com.databricks.spark.xml") 18 | .option("rowTag", "book") 19 | .load("src/main/resources/books_withnested_array.xml") 20 | 21 | xmlDF.printSchema() 22 | println(xmlDF.count()) 23 | 24 | xmlDF.show() 25 | 26 | xmlDF.select(xmlDF("title"),xmlDF("price")*100).show() 27 | 28 | xmlDF.select("author").show() 29 | 30 | 31 | xmlDF.select("stores").show() 32 | 33 | xmlDF.withColumn("store", functions.explode(xmlDF("stores.store"))).show() 34 | 35 | val df = xmlDF.withColumn("store", functions.explode(xmlDF("stores.store"))) 36 | .select("_id","author","stores.country","store.name") 37 | 38 | val storeDF = xmlDF.select("stores.store") 39 | storeDF.printSchema() 40 | 41 | df.foreach(f=>{ 42 | println(f.getAs("_id")) 43 | }) 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | } 52 | } 53 | 54 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataset/xml/SparkXMLUsingXstream.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.xml 2 | 3 | import com.thoughtworks.xstream.XStream 4 | import com.thoughtworks.xstream.io.xml.DomDriver 5 | import org.apache.spark.sql.SparkSession 6 | 7 | case class Animal(cri:String,taille:Int) 8 | 9 | object SparkXMLUsingXStream{ 10 | def main(args: Array[String]): Unit = { 11 | val spark = SparkSession. 
12 | builder.master ("local[*]") 13 | .appName ("sparkbyexamples.com") 14 | .getOrCreate () 15 | 16 | var animal:Animal = Animal("Rugissement",150) 17 | val xstream1 = new XStream(new DomDriver()) 18 | xstream1.alias("testAni",classOf[Animal]) 19 | xstream1.aliasField("cricri",classOf[Animal],"cri") 20 | val xmlString = Seq(xstream1.toXML(animal)) 21 | 22 | import spark.implicits._ 23 | val newDf = xmlString.toDF() 24 | newDf.show(false) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/dataset/xml/sparkXml.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataset.xml 2 | 3 | import org.apache.spark.sql.functions.{col, explode} 4 | import org.apache.spark.sql.{SQLContext, SparkSession} 5 | 6 | object sparkXml { 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark = SparkSession. 10 | builder.master("local[*]") 11 | //.config("spark.debug.maxToStringFields", "100") 12 | .appName("Insight Application Big Data") 13 | .getOrCreate() 14 | 15 | val df = spark.read 16 | .format("com.databricks.spark.xml") 17 | .option("rowTag", "row") 18 | .load("src/main/resources/input.xml") 19 | df.createOrReplaceTempView("categ_entry") 20 | 21 | df.printSchema() 22 | spark.sql("Select c26['_VALUE'] as value, c26['_m'] as option from categ_entry").show(false) 23 | 24 | val df2 = df.withColumn("c26Struct",explode(df("c26"))) 25 | df2.select(col("c26Struct._VALUE").alias("value"),col("c26Struct._m").alias("option") ).show(false) 26 | 27 | 28 | 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/CreateEmptyRDD.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object CreateEmptyRDD extends App{ 6 | 7 | val spark:SparkSession = SparkSession.builder() 8 | .master("local[3]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | val rdd = spark.sparkContext.emptyRDD 13 | val rddString = spark.sparkContext.emptyRDD[String] 14 | 15 | println(rdd) 16 | println(rddString) 17 | println("Num of Partitions: "+rdd.getNumPartitions) 18 | 19 | //rddString.saveAsTextFile("test.txt") // returns error 20 | 21 | val rdd2 = spark.sparkContext.parallelize(Seq.empty[String]) 22 | println(rdd2) 23 | println("Num of Partitions: "+rdd2.getNumPartitions) 24 | 25 | //rdd2.saveAsTextFile("test3.txt") 26 | 27 | // Pair RDD 28 | 29 | type dataType = (String,Int) 30 | var pairRDD = spark.sparkContext.emptyRDD[dataType] 31 | println(pairRDD) 32 | 33 | } 34 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/CreateRDD.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object CreateRDD { 6 | 7 | def main(args:Array[String]): Unit ={ 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local[3]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | //Create RDD from collection 15 | val rdd=spark.sparkContext.parallelize( 16 | Seq(("Java", 20000), ("Python", 100000), ("Scala", 3000)) 17 | ) 18 | 19 | //Create RDD from another RDD 20 | val 
rdd2 = rdd.map(row=>{ 21 | (row._1,row._2+100) 22 | }) 23 | 24 | rdd2.foreach(println) 25 | 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/OperationsOnRDD.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object OperationsOnRDD { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark = SparkSession.builder() 10 | .appName("SparkByExample") 11 | .master("local") 12 | .getOrCreate() 13 | 14 | spark.sparkContext.setLogLevel("ERROR") 15 | 16 | val rdd = spark.sparkContext.parallelize( 17 | List("Germany India USA","USA London Russia","Mexico Brazil Canada China") 18 | ) 19 | 20 | val listRdd = spark.sparkContext.parallelize(List(9,2,3,4,5,6,7,8)) 21 | 22 | //reduce 23 | println("Minimum :"+listRdd.reduce((a,b)=> a min b)) 24 | println("Maximum :"+listRdd.reduce((a,b)=> a max b)) 25 | println("Sum :"+listRdd.reduce((a,b)=> a + b)) 26 | 27 | //flatMap 28 | val wordsRdd = rdd.flatMap(_.split(" ")) 29 | wordsRdd.foreach(println) 30 | 31 | //sortBy 32 | println("Sort by word name") 33 | val sortRdd = wordsRdd.sortBy(f=>f) // also can write f=>f 34 | 35 | //GroupBy 36 | val groupRdd = wordsRdd.groupBy(word=>word.length) 37 | groupRdd.foreach(println) 38 | 39 | //map 40 | val tupp2Rdd = wordsRdd.map(f=>(f,1)) 41 | tupp2Rdd.foreach(println) 42 | 43 | 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/PartitionBy.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.HashPartitioner 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.SparkSession 6 | 7 | object PartitionBy { 8 | 9 | 10 | def main(args:Array[String]): Unit = { 11 | 12 | val spark:SparkSession = SparkSession.builder() 13 | .master("local[3]") 14 | .appName("SparkByExample") 15 | .getOrCreate() 16 | 17 | val sc = spark.sparkContext 18 | 19 | val rdd = sc.textFile("C://000_Projects/opt/BigData/zipcodes.csv") 20 | 21 | val rdd2:RDD[Array[String]] = rdd.map(m=>m.split(",")) 22 | 23 | 24 | val rdd3 = rdd2.map(a=>(a(1),a.mkString(","))) 25 | 26 | val rdd4 = rdd3.partitionBy(new HashPartitioner(3)) 27 | 28 | rdd4.saveAsTextFile("c:/tmp/output/partition") 29 | 30 | 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDAccumulator.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | object RDDAccumulator_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDBroadcast.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | object RDDBroadcast_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDCache.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | object RDDCache_ { 4 | 
5 | } 6 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDFromCSVFile.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object RDDFromCSVFile { 7 | 8 | def main(args:Array[String]): Unit ={ 9 | 10 | def splitString(row:String):Array[String]={ 11 | row.split(",") 12 | } 13 | 14 | val spark:SparkSession = SparkSession.builder() 15 | .master("local[3]") 16 | .appName("SparkByExample") 17 | .getOrCreate() 18 | val sc = spark.sparkContext 19 | 20 | val rdd = sc.textFile("src/main/resources/zipcodes-noheader.csv") 21 | 22 | val rdd2:RDD[ZipCode] = rdd.map(row=>{ 23 | val strArray = splitString(row) 24 | ZipCode(strArray(0).toInt,strArray(1),strArray(3),strArray(4)) 25 | }) 26 | 27 | rdd2.foreach(a=>println(a.city)) 28 | } 29 | 30 | } 31 | 32 | 33 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDFromDataUsingParallelize.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.SQLContext 6 | 7 | object RDDFromDataUsingParallelize { 8 | 9 | def main(args: Array[String]): Unit = { 10 | val spark:SparkSession = SparkSession.builder() 11 | .master("local[3]") 12 | .appName("SparkByExample") 13 | .getOrCreate() 14 | val rdd:RDD[Int] = spark.sparkContext.parallelize(List(1,2,3,4,5)) 15 | val rddCollect:Array[Int] = rdd.collect() 16 | println("Number of Partitions: "+rdd.getNumPartitions) 17 | println("Action: First element: "+rdd.first()) 18 | println("Action: RDD converted to Array[Int] : ") 19 | rddCollect.foreach(println) 20 | 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDFromParallelizeRange.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object RDDFromParallelizeRange { 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local[3]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | val sc = spark.sparkContext 15 | 16 | val rdd4:RDD[Range] = sc.parallelize(List(1 to 1000)) 17 | println("Number of Partitions : "+rdd4.getNumPartitions) 18 | 19 | val rdd5 = rdd4.repartition(5) 20 | println("Number of Partitions : "+rdd5.getNumPartitions) 21 | 22 | val rdd6:Array[Range] = rdd5.collect() 23 | println(rdd6.mkString(",")) 24 | 25 | val rdd7:Array[Array[Range]] = rdd5.glom().collect() 26 | println("After glom"); 27 | rdd7.foreach(f=>{ 28 | println("For each partition") 29 | f.foreach(f1=>println(f1)) 30 | }) 31 | 32 | 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDFromWholeTextFile.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import 
org.apache.spark.sql.SparkSession 5 | 6 | object RDDFromWholeTextFile { 7 | 8 | def main(args:Array[String]): Unit = { 9 | 10 | val spark:SparkSession = SparkSession.builder() 11 | .master("local[3]") 12 | .appName("SparkByExamples.com") 13 | .getOrCreate() 14 | val sc = spark.sparkContext 15 | 16 | val rdd = sc.wholeTextFiles("C://000_Projects/opt/BigData/alice.txt") 17 | rdd.foreach(a=>println(a._1+"---->"+a._2)) 18 | 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDHadoopInputFormat.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | object RDDHadoopInputFormat_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDPersist.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | object RDDPersist_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDReadFilesFromDirectory.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | object RDDReadFilesFromDirectory_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDSaveAsObjectFile.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | object RDDSaveAsObjectFile_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/RDDSequenceFiles.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | object RDDSequenceFiles_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/ReadMultipleCSVFiles.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object ReadMultipleCSVFiles extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | println("spark read csv files from a directory into RDD") 16 | val rddFromFile = spark.sparkContext.textFile("C:/tmp/files/text01.csv") 17 | println(rddFromFile.getClass) 18 | 19 | val rdd = rddFromFile.map(f=>{ 20 | f.split(",") 21 | }) 22 | 23 | println("Iterate RDD") 24 | rdd.foreach(f=>{ 25 | println("Col1:"+f(0)+",Col2:"+f(1)) 26 | }) 27 | println(rdd) 28 | 29 | println("Get data Using collect") 30 | rdd.collect().foreach(f=>{ 31 | println("Col1:"+f(0)+",Col2:"+f(1)) 32 | }) 33 | 34 | println("read all csv files from a directory to single RDD") 35 | val rdd2 = spark.sparkContext.textFile("C:/tmp/files/*") 36 | rdd2.foreach(f=>{ 37 | println(f) 38 | }) 39 | 40 | println("read csv files base on wildcard character") 41 | val rdd3 = 
spark.sparkContext.textFile("C:/tmp/files/text*.csv") 42 | rdd3.foreach(f=>{ 43 | println(f) 44 | }) 45 | 46 | println("read multiple csv files into a RDD") 47 | val rdd4 = spark.sparkContext.textFile("C:/tmp/files/text01.csv,C:/tmp/files/text02.csv") 48 | rdd4.foreach(f=>{ 49 | println(f) 50 | }) 51 | 52 | } 53 | 54 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/ReadMultipleFiles.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object ReadMultipleFiles extends App { 6 | 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | println("read all text files from a directory to single RDD") 16 | val rdd = spark.sparkContext.textFile("C:/tmp/files/*") 17 | rdd.foreach(f=>{ 18 | println(f) 19 | }) 20 | 21 | println("read text files base on wildcard character") 22 | val rdd2 = spark.sparkContext.textFile("C:/tmp/files/text*.txt") 23 | rdd2.foreach(f=>{ 24 | println(f) 25 | }) 26 | 27 | println("read multiple text files into a RDD") 28 | val rdd3 = spark.sparkContext.textFile("C:/tmp/files/text01.txt,C:/tmp/files/text02.txt") 29 | rdd3.foreach(f=>{ 30 | println(f) 31 | }) 32 | 33 | println("Read files and directory together") 34 | val rdd4 = spark.sparkContext.textFile("C:/tmp/files/text01.txt,C:/tmp/files/text02.txt,C:/tmp/files/*") 35 | rdd4.foreach(f=>{ 36 | println(f) 37 | }) 38 | 39 | 40 | val rddWhole = spark.sparkContext.wholeTextFiles("C:/tmp/files/*") 41 | rddWhole.foreach(f=>{ 42 | println(f._1+"=>"+f._2) 43 | }) 44 | 45 | val rdd5 = spark.sparkContext.textFile("C:/tmp/files/*") 46 | val rdd6 = rdd5.map(f=>{ 47 | f.split(",") 48 | }) 49 | 50 | rdd6.foreach(f => { 51 | println("Col1:"+f(0)+",Col2:"+f(1)) 52 | }) 53 | 54 | } 55 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/SortBy.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} 6 | 7 | object SortBy { 8 | 9 | def main(args: Array[String]): Unit = { 10 | 11 | val spark:SparkSession = SparkSession.builder() 12 | .master("local[3]") 13 | .appName("SparkByExample") 14 | .getOrCreate() 15 | 16 | val sc = spark.sparkContext 17 | 18 | val rdd:RDD[String] = sc.textFile("C://000_Projects/opt/BigData/zipcodes-noheader.csv") 19 | 20 | val rddZip:RDD[ZipCode] = rdd.map(f=>{ 21 | val arr = split(f) 22 | ZipCode(arr(0).toInt,arr(1),arr(3),arr(4)) 23 | }) 24 | 25 | //SortBy 26 | val rddSort = rddZip.sortBy(f=>f.recordNumber) 27 | rddSort.collect().foreach(f=>println(f.toString)) 28 | 29 | //SorybyKey 30 | //First create pairRDD 31 | val rddTuple=rddZip.map(f=>{ 32 | Tuple2(f.recordNumber,f.toString) 33 | }) 34 | rddTuple.sortByKey().collect().foreach(f=>println(f._2)) 35 | } 36 | 37 | def split(str:String): Array[String] ={ 38 | str.split(",") 39 | } 40 | 41 | } 42 | 43 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/WordCount.scala: 
-------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object WordCount { 7 | 8 | 9 | def main(args:Array[String]): Unit = { 10 | 11 | val spark:SparkSession = SparkSession.builder() 12 | .master("local[3]") 13 | .appName("SparkByExample") 14 | .getOrCreate() 15 | 16 | val sc = spark.sparkContext 17 | 18 | val rdd:RDD[String] = sc.textFile("src/main/scala/test.txt") 19 | 20 | // rdd.collect 21 | rdd.collect().foreach(println) 22 | 23 | // rdd flatMap 24 | val rdd2 = rdd.flatMap(f=>f.split(" ")) 25 | rdd2.foreach(f=>println(f)) 26 | 27 | //Create a Tuple by adding 1 to each word 28 | val rdd3 = rdd2.map(m=>(m,1)) 29 | rdd3.foreach(println) 30 | 31 | //Filter 32 | val rdd4 = rdd3.filter(a=> a._1.startsWith("a")) 33 | rdd4.foreach(println) 34 | 35 | //ReduceBy 36 | val rdd5 = rdd3.reduceByKey(_ + _) 37 | rdd5.foreach(println) 38 | 39 | //Swap word,count and sort by key 40 | rdd5.map(a=>(a._2,a._1)).sortByKey().foreach(println) 41 | 42 | 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/ZipCode.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | case class ZipCode(recordNumber:Int,zipCode:String,city:String,state:String) 4 | 5 | 6 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/rdd/xml/XmlRecordReader.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd.xml 2 | 3 | import com.databricks.spark.xml.XmlInputFormat 4 | import org.apache.hadoop.conf.Configuration 5 | import org.apache.hadoop.io.{LongWritable, Text} 6 | import org.apache.spark.api.java.JavaSparkContext 7 | import org.apache.spark.api.java.function.VoidFunction 8 | import org.apache.spark.sql.SparkSession 9 | 10 | import scala.xml.XML 11 | 12 | 13 | object XmlRecordReader { 14 | def main(args: Array[String]): Unit = { 15 | val sparkSession = SparkSession.builder.appName("XmlRecordReader").master("local").getOrCreate 16 | val javaSparkContext = new JavaSparkContext(sparkSession.sparkContext) 17 | val configuration = new Configuration 18 | configuration.set("xmlinput.start", "") 19 | configuration.set("xmlinput.end", "") 20 | configuration.set("mapreduce.input.fileinputformat.inputdir", "src/main/resources/records.xml") 21 | val javaPairRDD = javaSparkContext.newAPIHadoopRDD(configuration, classOf[XmlInputFormat], classOf[LongWritable], classOf[Text]) 22 | javaPairRDD.foreach(new VoidFunction[Tuple2[LongWritable, Text]]() { 23 | @throws[Exception] 24 | override def call(tuple: Tuple2[LongWritable, Text]): Unit = { // TODO Auto-generated method stub 25 | 26 | val xml = XML.loadString(tuple._2.toString) 27 | val forecast = (xml \ "Name") text 28 | 29 | println("forecast" + forecast) 30 | 31 | } 32 | }) 33 | } 34 | } 35 | 36 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/stackoverflow/AddingLiterral.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.stackoverflow 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import 
org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} 5 | case class Employee(EmpId: String, Experience: Double, Salary: Double) 6 | 7 | case class Employee2(EmpId: EmpData, Experience: EmpData, Salary: EmpData) 8 | case class EmpData(key: String,value:String) 9 | object AddingLiterral { 10 | def main(args: Array[String]): Unit = { 11 | 12 | val spark = SparkSession.builder() 13 | .master("local[1]") 14 | .appName("SparkByExample") 15 | .getOrCreate(); 16 | import spark.sqlContext.implicits._ 17 | import org.apache.spark.sql.functions._ 18 | val data = Seq(("111",5,50000),("222",6,60000),("333",7,60000)) 19 | val df = data.toDF("EmpId","Experience","Salary") 20 | 21 | val newdf = df.withColumn("EmpId", struct(lit("1").as("key"),col("EmpId").as("value"))) 22 | .withColumn("Experience", struct(lit("2").as("key"),col("Experience").as("value"))) 23 | .withColumn("Salary", struct(lit("3").as("key"),col("Salary").as("value"))) 24 | .show(false) 25 | 26 | val ds = df.as[Employee] 27 | val newDS = ds.map(rec=>{ 28 | (EmpData("1",rec.EmpId), EmpData("2",rec.Experience.toString),EmpData("3",rec.Salary.toString)) 29 | }) 30 | val finalDS = newDS.toDF("EmpId","Experience","Salary").as[Employee2] 31 | finalDS.show(false) 32 | // newDS.withColumnRenamed("_1","EmpId") 33 | // .withColumnRenamed("_2","Experience") 34 | // .withColumnRenamed("_3","Salary") 35 | // .show(false) 36 | 37 | 38 | 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/stackoverflow/Test.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.stackoverflow 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | import org.apache.spark.sql.functions._ 5 | object Test { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExample") 12 | .getOrCreate(); 13 | import spark.sqlContext.implicits._ 14 | 15 | val df1:DataFrame = Seq( 16 | ("Mark", "2018-02-20 00:00:00"), 17 | ("Alex", "2018-03-01 00:00:00"), 18 | ("Bob", "2018-03-01 00:00:00"), 19 | ("Mark", "2018-07-01 00:00:00"), 20 | ("Kate", "2018-07-01 00:00:00") 21 | ).toDF("USER_NAME", "REQUEST_DATE") 22 | 23 | df1.show() 24 | 25 | val df2: DataFrame = Seq( 26 | ("Alex", "2018-01-01 00:00:00", "2018-02-01 00:00:00", "OUT"), 27 | ("Bob", "2018-02-01 00:00:00", "2018-02-05 00:00:00", "IN"), 28 | ("Mark", "2018-02-01 00:00:00", "2018-03-01 00:00:00", "IN"), 29 | ("Mark", "2018-05-01 00:00:00", "2018-08-01 00:00:00", "OUT"), 30 | ("Meggy", "2018-02-01 00:00:00", "2018-02-01 00:00:00", "OUT") 31 | ).toDF("NAME", "START_DATE", "END_DATE", "STATUS") 32 | 33 | df2.show() 34 | 35 | val df3 = df1.join(df2, col("USER_NAME") === col("NAME"), "left_outer") 36 | 37 | 38 | df3.groupBy("USER_NAME","REQUEST_DATE") 39 | 40 | val df4 = df3.withColumn("USER_STATUS", when($"REQUEST_DATE" > $"START_DATE" and $"REQUEST_DATE" < $"END_DATE", "Our user") otherwise ("Not our user")) 41 | 42 | df4.select("USER_NAME","REQUEST_DATE","USER_STATUS").distinct()show(false) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /spark-sql-examples/src/main/scala/com/sparkbyexamples/spark/stackoverflow/Test2.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.stackoverflow 2 | 3 | import 
org.apache.spark.sql.SparkSession 4 | 5 | object Test2 { 6 | 7 | // def main(args: Array[String]): Unit = { 8 | // 9 | // val spark = SparkSession.builder() 10 | // .master("local[1]") 11 | // .appName("SparkByExample") 12 | // .getOrCreate(); 13 | // 14 | // val peopleDFCsv = spark.read.format("csv") 15 | // .load("src/main/resources/stack.csv") 16 | // 17 | // val d = peopleDFCsv.map(row=>{ 18 | // val col1=row.get(1) 19 | // val col2=row.get(1) 20 | // (col1,col2) 21 | // }).toDF() 22 | // 23 | // } 24 | } 25 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/folder_streaming/zipcode1.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":1,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Parc Parque, PR","Location":"NA-US-PR-PARC PARQUE","Decommisioned":false} 2 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/folder_streaming/zipcode10.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":1,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Parc Parque, PR","Location":"NA-US-PR-PARC PARQUE","Decommisioned":false} 2 | {"RecordNumber":2,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PASEO COSTA DEL SUR","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Paseo Costa Del Sur, PR","Location":"NA-US-PR-PASEO COSTA DEL SUR","Decommisioned":false} 3 | {"RecordNumber":10,"Zipcode":709,"ZipCodeType":"STANDARD","City":"BDA SAN LUIS","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":18.14,"Long":-66.26,"Xaxis":0.38,"Yaxis":-0.86,"Zaxis":0.31,"WorldRegion":"NA","Country":"US","LocationText":"Bda San Luis, PR","Location":"NA-US-PR-BDA SAN LUIS","Decommisioned":false} 4 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/folder_streaming/zipcode11.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":1,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Parc Parque, PR","Location":"NA-US-PR-PARC PARQUE","Decommisioned":false} 2 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/folder_streaming/zipcode2.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":2,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PASEO COSTA DEL SUR","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Paseo Costa Del Sur, PR","Location":"NA-US-PR-PASEO COSTA DEL SUR","Decommisioned":false} 2 | {"RecordNumber":10,"Zipcode":709,"ZipCodeType":"STANDARD","City":"BDA SAN LUIS","State":"PR","LocationType":"NOT 
ACCEPTABLE","Lat":18.14,"Long":-66.26,"Xaxis":0.38,"Yaxis":-0.86,"Zaxis":0.31,"WorldRegion":"NA","Country":"US","LocationText":"Bda San Luis, PR","Location":"NA-US-PR-BDA SAN LUIS","Decommisioned":false} 3 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/folder_streaming/zipcode3.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":61391,"Zipcode":76166,"ZipCodeType":"UNIQUE","City":"CINGULAR WIRELESS","State":"TX","LocationType":"NOT ACCEPTABLE","Lat":32.72,"Long":-97.31,"Xaxis":-0.1,"Yaxis":-0.83,"Zaxis":0.54,"WorldRegion":"NA","Country":"US","LocationText":"Cingular Wireless, TX","Location":"NA-US-TX-CINGULAR WIRELESS","Decommisioned":false} 2 | {"RecordNumber":61392,"Zipcode":76177,"ZipCodeType":"STANDARD","City":"FORT WORTH","State":"TX","LocationType":"PRIMARY","Lat":32.75,"Long":-97.33,"Xaxis":-0.1,"Yaxis":-0.83,"Zaxis":0.54,"WorldRegion":"NA","Country":"US","LocationText":"Fort Worth, TX","Location":"NA-US-TX-FORT WORTH","Decommisioned":false,"TaxReturnsFiled":2126,"EstimatedPopulation":4053,"TotalWages":122396986} 3 | {"RecordNumber":61393,"Zipcode":76177,"ZipCodeType":"STANDARD","City":"FT WORTH","State":"TX","LocationType":"ACCEPTABLE","Lat":32.75,"Long":-97.33,"Xaxis":-0.1,"Yaxis":-0.83,"Zaxis":0.54,"WorldRegion":"NA","Country":"US","LocationText":"Ft Worth, TX","Location":"NA-US-TX-FT WORTH","Decommisioned":false,"TaxReturnsFiled":2126,"EstimatedPopulation":4053,"TotalWages":122396986} 4 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/folder_streaming/zipcode4.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":4,"Zipcode":704,"ZipCodeType":"STANDARD","City":"URB EUGENE RICE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Urb Eugene Rice, PR","Location":"NA-US-PR-URB EUGENE RICE","Decommisioned":false} 2 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/folder_streaming/zipcode5.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":39827,"Zipcode":85209,"ZipCodeType":"STANDARD","City":"MESA","State":"AZ","LocationType":"PRIMARY","Lat":33.37,"Long":-111.64,"Xaxis":-0.3,"Yaxis":-0.77,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Mesa, AZ","Location":"NA-US-AZ-MESA","Decommisioned":false,"TaxReturnsFiled":14962,"EstimatedPopulation":26883,"TotalWages":563792730,"Notes":"no NWS data, "} 2 | {"RecordNumber":39828,"Zipcode":85210,"ZipCodeType":"STANDARD","City":"MESA","State":"AZ","LocationType":"PRIMARY","Lat":33.38,"Long":-111.84,"Xaxis":-0.31,"Yaxis":-0.77,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Mesa, AZ","Location":"NA-US-AZ-MESA","Decommisioned":false,"TaxReturnsFiled":14374,"EstimatedPopulation":25446,"TotalWages":471000465} 3 | {"RecordNumber":49345,"Zipcode":32046,"ZipCodeType":"STANDARD","City":"HILLIARD","State":"FL","LocationType":"PRIMARY","Lat":30.69,"Long":-81.92,"Xaxis":0.12,"Yaxis":-0.85,"Zaxis":0.51,"WorldRegion":"NA","Country":"US","LocationText":"Hilliard, FL","Location":"NA-US-FL-HILLIARD","Decommisioned":false,"TaxReturnsFiled":3922,"EstimatedPopulation":7443,"TotalWages":133112149} 4 | {"RecordNumber":49346,"Zipcode":34445,"ZipCodeType":"PO 
BOX","City":"HOLDER","State":"FL","LocationType":"PRIMARY","Lat":28.96,"Long":-82.41,"Xaxis":0.11,"Yaxis":-0.86,"Zaxis":0.48,"WorldRegion":"NA","Country":"US","LocationText":"Holder, FL","Location":"NA-US-FL-HOLDER","Decommisioned":false} 5 | {"RecordNumber":49347,"Zipcode":32564,"ZipCodeType":"STANDARD","City":"HOLT","State":"FL","LocationType":"PRIMARY","Lat":30.72,"Long":-86.67,"Xaxis":0.04,"Yaxis":-0.85,"Zaxis":0.51,"WorldRegion":"NA","Country":"US","LocationText":"Holt, FL","Location":"NA-US-FL-HOLT","Decommisioned":false,"TaxReturnsFiled":1207,"EstimatedPopulation":2190,"TotalWages":36395913} 6 | {"RecordNumber":49348,"Zipcode":34487,"ZipCodeType":"PO BOX","City":"HOMOSASSA","State":"FL","LocationType":"PRIMARY","Lat":28.78,"Long":-82.61,"Xaxis":0.11,"Yaxis":-0.86,"Zaxis":0.48,"WorldRegion":"NA","Country":"US","LocationText":"Homosassa, FL","Location":"NA-US-FL-HOMOSASSA","Decommisioned":false} 7 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/folder_streaming/zipcode6.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":10,"Zipcode":708,"ZipCodeType":"STANDARD","City":"BDA SAN LUIS","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":18.14,"Long":-66.26,"Xaxis":0.38,"Yaxis":-0.86,"Zaxis":0.31,"WorldRegion":"NA","Country":"US","LocationText":"Bda San Luis, PR","Location":"NA-US-PR-BDA SAN LUIS","Decommisioned":false} 2 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/folder_streaming/zipcode7.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":54354,"Zipcode":36275,"ZipCodeType":"PO BOX","City":"SPRING GARDEN","State":"AL","LocationType":"PRIMARY","Lat":33.97,"Long":-85.55,"Xaxis":0.06,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Spring Garden, AL","Location":"NA-US-AL-SPRING GARDEN","Decommisioned":false} 2 | {"RecordNumber":54355,"Zipcode":35146,"ZipCodeType":"STANDARD","City":"SPRINGVILLE","State":"AL","LocationType":"PRIMARY","Lat":33.77,"Long":-86.47,"Xaxis":0.05,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Springville, AL","Location":"NA-US-AL-SPRINGVILLE","Decommisioned":false,"TaxReturnsFiled":4046,"EstimatedPopulation":7845,"TotalWages":172127599} 3 | {"RecordNumber":54356,"Zipcode":35585,"ZipCodeType":"STANDARD","City":"SPRUCE PINE","State":"AL","LocationType":"PRIMARY","Lat":34.37,"Long":-87.69,"Xaxis":0.03,"Yaxis":-0.82,"Zaxis":0.56,"WorldRegion":"NA","Country":"US","LocationText":"Spruce Pine, AL","Location":"NA-US-AL-SPRUCE PINE","Decommisioned":false,"TaxReturnsFiled":610,"EstimatedPopulation":1209,"TotalWages":18525517} 4 | {"RecordNumber":76511,"Zipcode":27007,"ZipCodeType":"STANDARD","City":"ASH HILL","State":"NC","LocationType":"NOT ACCEPTABLE","Lat":36.4,"Long":-80.56,"Xaxis":0.13,"Yaxis":-0.79,"Zaxis":0.59,"WorldRegion":"NA","Country":"US","LocationText":"Ash Hill, NC","Location":"NA-US-NC-ASH HILL","Decommisioned":false,"TaxReturnsFiled":842,"EstimatedPopulation":1666,"TotalWages":28876493} 5 | {"RecordNumber":76512,"Zipcode":27203,"ZipCodeType":"STANDARD","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, 
NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":8355,"EstimatedPopulation":15228,"TotalWages":215474318} 6 | {"RecordNumber":76513,"Zipcode":27204,"ZipCodeType":"PO BOX","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":1035,"EstimatedPopulation":1816,"TotalWages":30322473} 7 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/folder_streaming/zipcode8.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":54354,"Zipcode":36275,"ZipCodeType":"PO BOX","City":"SPRING GARDEN","State":"AL","LocationType":"PRIMARY","Lat":33.97,"Long":-85.55,"Xaxis":0.06,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Spring Garden, AL","Location":"NA-US-AL-SPRING GARDEN","Decommisioned":false} 2 | {"RecordNumber":54355,"Zipcode":35146,"ZipCodeType":"STANDARD","City":"SPRINGVILLE","State":"AL","LocationType":"PRIMARY","Lat":33.77,"Long":-86.47,"Xaxis":0.05,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Springville, AL","Location":"NA-US-AL-SPRINGVILLE","Decommisioned":false,"TaxReturnsFiled":4046,"EstimatedPopulation":7845,"TotalWages":172127599} 3 | {"RecordNumber":54356,"Zipcode":35585,"ZipCodeType":"STANDARD","City":"SPRUCE PINE","State":"AL","LocationType":"PRIMARY","Lat":34.37,"Long":-87.69,"Xaxis":0.03,"Yaxis":-0.82,"Zaxis":0.56,"WorldRegion":"NA","Country":"US","LocationText":"Spruce Pine, AL","Location":"NA-US-AL-SPRUCE PINE","Decommisioned":false,"TaxReturnsFiled":610,"EstimatedPopulation":1209,"TotalWages":18525517} 4 | {"RecordNumber":76511,"Zipcode":27007,"ZipCodeType":"STANDARD","City":"ASH HILL","State":"NC","LocationType":"NOT ACCEPTABLE","Lat":36.4,"Long":-80.56,"Xaxis":0.13,"Yaxis":-0.79,"Zaxis":0.59,"WorldRegion":"NA","Country":"US","LocationText":"Ash Hill, NC","Location":"NA-US-NC-ASH HILL","Decommisioned":false,"TaxReturnsFiled":842,"EstimatedPopulation":1666,"TotalWages":28876493} 5 | {"RecordNumber":76512,"Zipcode":27203,"ZipCodeType":"STANDARD","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":8355,"EstimatedPopulation":15228,"TotalWages":215474318} 6 | {"RecordNumber":76513,"Zipcode":27204,"ZipCodeType":"PO BOX","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":1035,"EstimatedPopulation":1816,"TotalWages":30322473} 7 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/folder_streaming/zipcode9.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":76511,"Zipcode":27007,"ZipCodeType":"STANDARD","City":"ASH HILL","State":"NC","LocationType":"NOT ACCEPTABLE","Lat":36.4,"Long":-80.56,"Xaxis":0.13,"Yaxis":-0.79,"Zaxis":0.59,"WorldRegion":"NA","Country":"US","LocationText":"Ash Hill, NC","Location":"NA-US-NC-ASH 
HILL","Decommisioned":false,"TaxReturnsFiled":842,"EstimatedPopulation":1666,"TotalWages":28876493} 2 | {"RecordNumber":76512,"Zipcode":27203,"ZipCodeType":"STANDARD","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":8355,"EstimatedPopulation":15228,"TotalWages":215474318} 3 | {"RecordNumber":76513,"Zipcode":27204,"ZipCodeType":"PO BOX","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":1035,"EstimatedPopulation":1816,"TotalWages":30322473} 4 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/person.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "Person", 4 | "namespace": "com.sparkbyexamples", 5 | "fields": [ 6 | {"name": "id","type": ["int", "null"]}, 7 | {"name": "firstname","type": ["string", "null"]}, 8 | {"name": "middlename","type": ["string", "null"]}, 9 | {"name": "lastname","type": ["string", "null"]}, 10 | {"name": "dob_year","type": ["int", "null"]}, 11 | {"name": "dob_month","type": ["int", "null"]}, 12 | {"name": "gender","type": ["string", "null"]}, 13 | {"name": "salary","type": ["int", "null"]} 14 | ] 15 | } -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/person.json: -------------------------------------------------------------------------------- 1 | {"id":1,"firstname":"James ","middlename":"","lastname":"Smith","dob_year":2018,"dob_month":1,"gender":"M","salary":3000} 2 | {"id":2,"firstname":"Michael ","middlename":"Rose","lastname":"","dob_year":2010,"dob_month":3,"gender":"M","salary":4000} 3 | {"id":3,"firstname":"Robert ","middlename":"","lastname":"Williams","dob_year":2010,"dob_month":3,"gender":"M","salary":4000} 4 | {"id":4,"firstname":"Maria ","middlename":"Anne","lastname":"Jones","dob_year":2005,"dob_month":5,"gender":"F","salary":4000} 5 | {"id":5,"firstname":"Jen","middlename":"Mary","lastname":"Brown","dob_year":2010,"dob_month":7,"gender":"","salary":-1} 6 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/SparkStreamingFromDirectory.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} 5 | 6 | object SparkStreamingFromDirectory { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val spark:SparkSession = SparkSession.builder() 11 | .master("local[3]") 12 | .appName("SparkByExample") 13 | .getOrCreate() 14 | 15 | spark.sparkContext.setLogLevel("ERROR") 16 | 17 | val schema = StructType( 18 | List( 19 | StructField("RecordNumber", IntegerType, true), 20 | StructField("Zipcode", StringType, true), 21 | StructField("ZipCodeType", StringType, true), 22 | StructField("City", StringType, true), 23 | StructField("State", StringType, true), 24 | StructField("LocationType", StringType, true), 25 | StructField("Lat", 
StringType, true), 26 | StructField("Long", StringType, true), 27 | StructField("Xaxis", StringType, true), 28 | StructField("Yaxis", StringType, true), 29 | StructField("Zaxis", StringType, true), 30 | StructField("WorldRegion", StringType, true), 31 | StructField("Country", StringType, true), 32 | StructField("LocationText", StringType, true), 33 | StructField("Location", StringType, true), 34 | StructField("Decommisioned", StringType, true) 35 | ) 36 | ) 37 | 38 | val df = spark.readStream 39 | .schema(schema) 40 | .json("c:/tmp/stream_folder") 41 | 42 | df.printSchema() 43 | 44 | val groupDF = df.select("Zipcode") 45 | .groupBy("Zipcode").count() 46 | groupDF.printSchema() 47 | 48 | groupDF.writeStream 49 | .format("console") 50 | .outputMode("complete") 51 | .option("truncate",false) 52 | .option("newRows",30) 53 | .start() 54 | .awaitTermination() 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/SparkStreamingFromSocket.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{explode, split} 5 | 6 | object SparkStreamingFromSocket { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val spark:SparkSession = SparkSession.builder() 11 | .master("local[3]") 12 | .appName("SparkByExample") 13 | .getOrCreate() 14 | 15 | spark.sparkContext.setLogLevel("ERROR") 16 | 17 | val df = spark.readStream 18 | .format("socket") 19 | .option("host","192.168.1.100") 20 | .option("port","7890") 21 | .load() 22 | 23 | df.printSchema() 24 | 25 | val wordsDF = df.select(explode(split(df("value")," ")).alias("word")) 26 | 27 | val count = wordsDF.groupBy("word").count() 28 | 29 | val query = count.writeStream 30 | .format("console") 31 | .outputMode("complete") 32 | .start() 33 | .awaitTermination() 34 | 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/batch/SparkBatchConsumeFromKafka.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.batch 2 | 3 | import org.apache.spark.sql.SparkSession 4 | //https://spark.apache.org/docs/2.3.0/structured-streaming-kafka-integration.html 5 | object SparkBatchConsumeFromKafka { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("https://SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | spark.sparkContext.setLogLevel("ERROR") 15 | 16 | val df = spark 17 | .read 18 | .format("kafka") 19 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 20 | .option("subscribe", "text_topic6") 21 | // .option("startingOffsets", """{"topic1":{"0":23,"1":-2},"topic2":{"0":-2}}""") 22 | // .option("endingOffsets", """{"topic1":{"0":50,"1":-1},"topic2":{"0":-1}}""") 23 | 24 | // .option("subscribePattern", "topic.*") 25 | // .option("startingOffsets", "earliest") 26 | // .option("endingOffsets", "latest") 27 | .load() 28 | 29 | df.printSchema() 30 | 31 | // Displays Data in Binary 32 | df.show() 33 | 34 | //// Displays Data in String 35 | val df2 = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)","topic") 36 | df2.show(false) 37 | } 38 | } 39 | 
-------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/batch/SparkBatchConsumeFromKafkaAvro.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.batch 2 | 3 | import java.io.File 4 | 5 | import org.apache.avro.Schema 6 | import org.apache.spark.sql.SparkSession 7 | import org.apache.spark.sql.functions.col 8 | import org.apache.spark.sql.avro.from_avro 9 | 10 | //https://spark.apache.org/docs/2.3.0/structured-streaming-kafka-integration.html 11 | object SparkBatchConsumeFromKafkaAvro { 12 | 13 | def main(args: Array[String]): Unit = { 14 | 15 | val spark: SparkSession = SparkSession.builder() 16 | .master("local[1]") 17 | .appName("https://SparkByExamples.com") 18 | .getOrCreate() 19 | 20 | spark.sparkContext.setLogLevel("ERROR") 21 | 22 | val df = spark 23 | .read 24 | .format("kafka") 25 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 26 | .option("subscribe", "avro_topic1") 27 | .load() 28 | 29 | df.printSchema() 30 | 31 | /* 32 | Displays Data in Binary 33 | */ 34 | 35 | val schemaAvro = new Schema.Parser() 36 | .parse(new File("src/main/resources/person.avsc")) 37 | 38 | /* 39 | Displays Data in String 40 | */ 41 | val df2 = df.select(from_avro(col("value"),schemaAvro.toString ).as("value")) 42 | .selectExpr("CAST(value AS STRING)") 43 | 44 | df2.select("value.gender").show() 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/batch/SparkBatchProduceToKafka.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.batch 2 | import org.apache.spark.sql.SparkSession 3 | //https://spark.apache.org/docs/2.3.0/structured-streaming-kafka-integration.html 4 | object SparkBatchProduceToKafka { 5 | 6 | def main(args: Array[String]): Unit = { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExample") 11 | .getOrCreate() 12 | 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | val data = Seq (("iphone", "2007"),("iphone 3G","2008"), 16 | ("iphone 3GS","2009"), 17 | ("iphone 4","2010"), 18 | ("iphone 4S","2011"), 19 | ("iphone 5","2012"), 20 | ("iphone 8","2014"), 21 | ("iphone 10","2017")) 22 | 23 | val df = spark.createDataFrame(data).toDF("key","value") 24 | 25 | /* 26 | since we are using dataframe which is already in text, 27 | selectExpr is optional. 28 | If the bytes of the Kafka records represent UTF8 strings, 29 | we can simply use a cast to convert the binary data 30 | into the correct type. 
31 | 32 | df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") 33 | */ 34 | df.write 35 | .format("kafka") 36 | .option("kafka.bootstrap.servers","192.168.1.100:9092") 37 | .option("topic","text_topic6") 38 | .save() 39 | 40 | 41 | val data2 = Seq((1,"James ","","Smith",2018,1,"M",3000), 42 | (2,"Michael ","Rose","",2010,3,"M",4000), 43 | (3,"Robert ","","Williams",2010,3,"M",4000), 44 | (4,"Maria ","Anne","Jones",2005,5,"F",4000), 45 | (5,"Jen","Mary","Brown",2010,7,"",-1) 46 | ) 47 | 48 | val columns = Seq("id","firstname","middlename","lastname","dob_year", 49 | "dob_month","gender","salary") 50 | import spark.sqlContext.implicits._ 51 | val df2 = data2.toDF(columns:_*) 52 | 53 | /* 54 | Writing Json as a Value to Kafka topic 55 | */ 56 | df2.toJSON.write 57 | .format("kafka") 58 | .option("kafka.bootstrap.servers","192.168.1.100:9092") 59 | .option("topic","text_topic6") 60 | .save() 61 | 62 | /* 63 | Another way of Writing Json 64 | By sending key and value to Kafka 65 | using to_json() 66 | */ 67 | df2.selectExpr("CAST(id AS STRING) AS key", "to_json(struct(*)) AS value") 68 | .write 69 | .format("kafka") 70 | .option("kafka.bootstrap.servers","192.168.1.100:9092") 71 | .option("topic","text_topic6") 72 | .save() 73 | 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/batch/SparkBatchProduceToKafkaAvro.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.batch 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.col 5 | import org.apache.spark.sql.avro.to_avro 6 | //https://spark.apache.org/docs/2.3.0/structured-streaming-kafka-integration.html 7 | object SparkBatchProduceToKafkaAvro { 8 | 9 | def main(args: Array[String]): Unit = { 10 | 11 | val spark: SparkSession = SparkSession.builder() 12 | .master("local[1]") 13 | .appName("SparkByExample") 14 | .getOrCreate() 15 | 16 | spark.sparkContext.setLogLevel("ERROR") 17 | 18 | /* 19 | Write Avro to Kafka 20 | */ 21 | val data2 = Seq((1,"James ","","Smith",2018,1,"M",3000), 22 | (2,"Michael ","Rose","",2010,3,"M",4000), 23 | (3,"Robert ","","Williams",2010,3,"M",4000), 24 | (4,"Maria ","Anne","Jones",2005,5,"F",4000), 25 | (5,"Jen","Mary","Brown",2010,7,"",-1) 26 | ) 27 | 28 | val columns = Seq("id","firstname","middlename","lastname","dob_year", 29 | "dob_month","gender","salary") 30 | import spark.sqlContext.implicits._ 31 | val df2 = data2.toDF(columns:_*) 32 | 33 | df2.toJSON.select(to_avro(col("value")).as("value")) 34 | .write 35 | .format("kafka") 36 | .option("kafka.bootstrap.servers","192.168.1.100:9092") 37 | .option("topic","avro_topic1") 38 | .save() 39 | 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/inprogress/SparkStreamingForeachRDD.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.inprogress 2 | 3 | object SparkStreamingForeachRDD_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/inprogress/SparkStreamingForeachWriter.scala: -------------------------------------------------------------------------------- 1 | package 
com.sparkbyexamples.spark.streaming.inprogress 2 | 3 | object SparkStreamingForeachWriter_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/inprogress/SparkStreamingFromDirectoryTmp.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.inprogress 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} 5 | 6 | object SparkStreamingFromDirectoryTmp { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val spark:SparkSession = SparkSession.builder() 11 | .master("local[3]") 12 | .appName("SparkByExample") 13 | .getOrCreate() 14 | 15 | spark.sparkContext.setLogLevel("ERROR") 16 | 17 | val schema = StructType( 18 | List( 19 | StructField("RecordNumber", IntegerType, true), 20 | StructField("Zipcode", StringType, true), 21 | StructField("ZipCodeType", StringType, true), 22 | StructField("City", StringType, true), 23 | StructField("State", StringType, true), 24 | StructField("LocationType", StringType, true), 25 | StructField("Lat", StringType, true), 26 | StructField("Long", StringType, true), 27 | StructField("Xaxis", StringType, true), 28 | StructField("Yaxis", StringType, true), 29 | StructField("Zaxis", StringType, true), 30 | StructField("WorldRegion", StringType, true), 31 | StructField("Country", StringType, true), 32 | StructField("LocationText", StringType, true), 33 | StructField("Location", StringType, true), 34 | StructField("Decommisioned", StringType, true) 35 | ) 36 | ) 37 | 38 | val df = spark.readStream 39 | //.option("header","true") 40 | //.option("maxFilesPerTrigger",3) 41 | .schema(schema) 42 | .json("c:/tmp/stream_folder") 43 | //.text("c:/tmp/stream_folder") 44 | 45 | df.printSchema() 46 | 47 | // val groupDF = df.select( 48 | // get_json_object(col("value").cast("string"),"$.Zipcode") 49 | // .alias("Zipcode")).groupBy("Zipcode").count() 50 | 51 | val groupDF = df.select("Zipcode") 52 | .groupBy("Zipcode").count() 53 | groupDF.printSchema() 54 | 55 | groupDF.writeStream 56 | .format("console") 57 | .outputMode("complete") 58 | .option("truncate",false) 59 | .option("newRows",30) 60 | .start() 61 | .awaitTermination() 62 | 63 | 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/inprogress/SparkStreamingKafkaProducerZipcodeObject.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.inprogress 2 | 3 | import org.apache.spark.sql.SparkSession 4 | object SparkStreamingKafkaUserObject { 5 | 6 | def main(args: Array[String]): Unit = { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .master("local[3]") 10 | .appName("SparkByExample") 11 | .getOrCreate() 12 | 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | val df = spark.readStream 16 | .option("header", "true") 17 | .option("maxFilesPerTrigger", 3) 18 | .text("c:/tmp/stream_folder") 19 | 20 | df.printSchema() 21 | 22 | df.writeStream 23 | .format("kafka") 24 | .outputMode("append") 25 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 26 | .option("topic", "topic_text") 27 | .option("checkpointLocation", "c:/tmp/checkpoint") 28 | .start() 29 | .awaitTermination() 30 | } 31 | } 32 | 
-------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/inprogress/SparkStreamingToHDFS.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.inprogress 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object SparkStreamingToHDFS_ { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[3]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | spark.sparkContext.setLogLevel("ERROR") 15 | 16 | val df = spark.readStream 17 | .format("socket") 18 | .option("host","localhost") 19 | .option("port","9090") 20 | .load() 21 | 22 | df.printSchema() 23 | 24 | 25 | 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/inprogress/SparkStreamingToJDBC.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.inprogress 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object SparkStreamingToJDBC_ { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[3]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | spark.sparkContext.setLogLevel("ERROR") 15 | 16 | val df = spark.readStream 17 | .format("socket") 18 | .option("host","localhost") 19 | .option("port","9090") 20 | .load() 21 | 22 | df.printSchema() 23 | 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/inprogress/SparkStreamingToParquetFile.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.inprogress 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object SparkStreamingToParquetFile_ { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[3]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | spark.sparkContext.setLogLevel("ERROR") 15 | 16 | val df = spark.readStream 17 | .format("socket") 18 | .option("host","localhost") 19 | .option("port","9090") 20 | .load() 21 | 22 | //Parquet doesn't support complete mode hence, 23 | //we can't write aggregated output 24 | df.writeStream 25 | .format("parquet") 26 | .outputMode("append") 27 | .option("path","c:/tmp/spark_out/parquet") 28 | .option("checkpointLocation", "c:/tmp/checkpoint") 29 | .start() 30 | .awaitTermination() 31 | 32 | 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/inprogress/SparkStreamingToS3.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.inprogress 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object SparkStreamingToS3_ { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[3]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | spark.sparkContext.setLogLevel("ERROR") 15 | 16 | val df = spark.readStream 17 | .format("socket") 18 | .option("host","localhost") 19 | 
.option("port","9090") 20 | .load() 21 | 22 | df.printSchema() 23 | 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/inprogress/SparkStreamingTwitter.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.inprogress 2 | 3 | object SparkStreamingTwitter_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/kafka/KafkaProduceAvro.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.kafka 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object KafkaProduceAvro { 6 | 7 | def main(args:Array[String]): Unit ={ 8 | 9 | 10 | val spark: SparkSession = SparkSession.builder().master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | spark.sparkContext.setLogLevel("ERROR") 15 | 16 | val data = Seq((1,"James ","","Smith",2018,1,"M",3000), 17 | (2,"Michael ","Rose","",2010,3,"M",4000), 18 | (3,"Robert ","","Williams",2010,3,"M",4000), 19 | (4,"Maria ","Anne","Jones",2005,5,"F",4000), 20 | (5,"Jen","Mary","Brown",2010,7,"",-1) 21 | ) 22 | 23 | val columns = Seq("id","firstname","middlename","lastname","dob_year", 24 | "dob_month","gender","salary") 25 | import spark.sqlContext.implicits._ 26 | val df = data.toDF(columns:_*) 27 | 28 | df.write.json("c:/tmp/person.json") 29 | 30 | 31 | 32 | // // `from_avro` requires Avro schema in JSON string format. 33 | // val jsonFormatSchema = new String(Files.readAllBytes(Paths.get("src/main/resources/person.avsc"))) 34 | // 35 | // val df = spark 36 | // .readStream 37 | // .format("kafka") 38 | // .option("kafka.bootstrap.servers", "host1:port1,host2:port2") 39 | // .option("subscribe", "topic1") 40 | // .load() 41 | // 42 | // // 1. Decode the Avro data into a struct; 43 | // // 2. Filter by column `favorite_color`; 44 | // // 3. Encode the column `name` in Avro format. 
45 | // val output = df 46 | // .select(from_avro(col("value"), jsonFormatSchema) as "user") 47 | // .where(col("user.favorite_color") === "red") 48 | // .select(to_avro(col("user.name")) as "value") 49 | 50 | // val data = Seq (("iphone", "2007"),("iphone 3G","2008"), 51 | // ("iphone 3GS","2009"), 52 | // ("iphone 4","2010"), 53 | // ("iphone 4S","2011"), 54 | // ("iphone 5","2012"), 55 | // ("iphone 8","2014"), 56 | // ("iphone 10","2017")) 57 | // 58 | // val df = spark.createDataFrame(data).toDF("key","value") 59 | 60 | val ds = df.toJSON 61 | ds.printSchema() 62 | 63 | val query = ds 64 | .writeStream 65 | .format("kafka") 66 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 67 | .option("topic", "text_topic") 68 | .start() 69 | 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/kafka/SparkStreamingConsumeKafka.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.kafka 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{explode, split} 5 | 6 | object SparkStreamingConsumeKafka { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val spark:SparkSession = SparkSession.builder() 11 | .master("local[3]") 12 | .appName("SparkByExample") 13 | .getOrCreate() 14 | 15 | spark.sparkContext.setLogLevel("ERROR") 16 | 17 | val df = spark.readStream 18 | .format("kafka") 19 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 20 | .option("subscribe", "topic_text") 21 | //.option("subscribePattern", "topic.*") 22 | .option("startingOffsets", "earliest") // Other possible values assign and latest 23 | .load() 24 | 25 | df.printSchema() 26 | 27 | val groupCount = df.select(explode(split(df("value")," ")).alias("word")) 28 | .groupBy("word").count() 29 | 30 | groupCount.writeStream 31 | .format("console") 32 | .outputMode("complete") 33 | .start() 34 | .awaitTermination() 35 | 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/kafka/avro/KafkaConsumerAvro.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.kafka.avro 2 | import java.nio.file.{Files, Paths} 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.avro._ 5 | import org.apache.spark.sql.functions.col 6 | 7 | object KafkaConsumerAvro { 8 | def main(args: Array[String]): Unit = { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .master("local") 12 | .appName("SparkByExample.com") 13 | .getOrCreate() 14 | 15 | spark.sparkContext.setLogLevel("ERROR") 16 | 17 | val df = spark.readStream 18 | .format("kafka") 19 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 20 | .option("subscribe", "avro_topic") 21 | .option("startingOffsets", "earliest") // From starting 22 | .load() 23 | 24 | /* 25 | Prints Kafka schema with columns (topic, offset, partition e.t.c) 26 | */ 27 | df.printSchema() 28 | 29 | /* 30 | Read schema to convert Avro data to DataFrame 31 | */ 32 | val jsonFormatSchema = new String( 33 | Files.readAllBytes(Paths.get("./src/main/resources/person.avsc"))) 34 | 35 | val personDF = df.select(from_avro(col("value"), jsonFormatSchema).as("person")) 36 | .select("person.*") 37 | 38 | personDF.printSchema() 39 | 40 | /* 41 | Stream data to 
Console for testing 42 | */ 43 | personDF.writeStream 44 | .format("console") 45 | .outputMode("append") 46 | .start() 47 | .awaitTermination() 48 | 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/kafka/avro/KafkaProduceAvro.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.kafka.avro 2 | import org.apache.spark.sql.SparkSession 3 | import org.apache.spark.sql.functions.{col, from_json,to_json,struct} 4 | import org.apache.spark.sql.avro.to_avro 5 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType} 6 | object KafkaProduceAvro { 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExample.com") 12 | .getOrCreate() 13 | 14 | /* 15 | Disable logging as it writes too much log 16 | */ 17 | spark.sparkContext.setLogLevel("ERROR") 18 | 19 | /* 20 | This consumes JSON data from Kafka 21 | */ 22 | val df = spark.readStream 23 | .format("kafka") 24 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 25 | .option("subscribe", "json_topic") 26 | .option("startingOffsets", "earliest") // From starting 27 | .load() 28 | 29 | /* 30 | Prints Kafka schema with columns (topic, offset, partition e.t.c) 31 | */ 32 | df.printSchema() 33 | 34 | val schema = new StructType() 35 | .add("id",IntegerType) 36 | .add("firstname",StringType) 37 | .add("middlename",StringType) 38 | .add("lastname",StringType) 39 | .add("dob_year",IntegerType) 40 | .add("dob_month",IntegerType) 41 | .add("gender",StringType) 42 | .add("salary",IntegerType) 43 | 44 | /* 45 | Converts JSON string to DataFrame 46 | */ 47 | val personDF = df.selectExpr("CAST(value AS STRING)") // First convert binary to string 48 | .select(from_json(col("value"), schema).as("data")) 49 | 50 | 51 | personDF.printSchema() 52 | /* 53 | *uncomment below code if you want to write it to console for testing. 
54 | */ 55 | // person.select(to_json(struct("data.*")).as("value")) 56 | // .writeStream 57 | // .format("console") 58 | // .outputMode("append") 59 | // .start() 60 | // .awaitTermination() 61 | 62 | /* 63 | * Convert DataFrame columns to Avro format and name it as "value" 64 | * And send this Avro data to Kafka topic 65 | */ 66 | 67 | personDF.select(to_avro(struct("data.*")) as "value") 68 | .writeStream 69 | .format("kafka") 70 | .outputMode("append") 71 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 72 | .option("topic", "avro_topic") 73 | .option("checkpointLocation","c:/tmp") 74 | .start() 75 | .awaitTermination() 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/com/sparkbyexamples/spark/streaming/kafka/json/SparkStreamingConsumerKafkaJson.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.streaming.kafka.json 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{col, from_json} 5 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType} 6 | 7 | object SparkStreamingConsumerKafkaJson { 8 | 9 | def main(args: Array[String]): Unit = { 10 | 11 | val spark: SparkSession = SparkSession.builder() 12 | .master("local[3]") 13 | .appName("SparkByExample") 14 | .getOrCreate() 15 | 16 | spark.sparkContext.setLogLevel("ERROR") 17 | 18 | val df = spark.readStream 19 | .format("kafka") 20 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 21 | .option("subscribe", "json_topic") 22 | .option("startingOffsets", "earliest") // From starting 23 | .load() 24 | 25 | df.printSchema() 26 | 27 | //df.show(false) 28 | //org.apache.spark.sql.AnalysisException: Queries with streaming sources must be executed with writeStream.start();; 29 | 30 | val schema = new StructType() 31 | .add("id",IntegerType) 32 | .add("firstname",StringType) 33 | .add("middlename",StringType) 34 | .add("lastname",StringType) 35 | .add("dob_year",IntegerType) 36 | .add("dob_month",IntegerType) 37 | .add("gender",StringType) 38 | .add("salary",IntegerType) 39 | 40 | val person = df.selectExpr("CAST(value AS STRING)") 41 | .select(from_json(col("value"), schema).as("data")) 42 | .select("data.*") 43 | 44 | /** 45 | *uncomment below code if you want to write it to console for testing. 46 | */ 47 | // val query = person.writeStream 48 | // .format("console") 49 | // .outputMode("append") 50 | // .start() 51 | // .awaitTermination() 52 | 53 | /** 54 | *uncomment below code if you want to write it to kafka topic. 
55 | */ 56 | person.selectExpr("CAST(id AS STRING) AS key", "to_json(struct(*)) AS value") 57 | .writeStream 58 | .format("kafka") 59 | .outputMode("append") 60 | .option("kafka.bootstrap.servers", "192.168.1.100:9092") 61 | .option("topic", "josn_data_topic") 62 | .option("checkpointLocation", "c:/tmp/checkpoint") 63 | .start() 64 | .awaitTermination() 65 | 66 | 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /spark2.3-avro-examples/src/main/resources/person.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "Person", 4 | "namespace": "com.sparkbyexamples", 5 | "fields": [ 6 | {"name": "id","type": "int"}, 7 | {"name": "firstname","type": "string"}, 8 | {"name": "middlename","type": "string"}, 9 | {"name": "lastname","type": "string"}, 10 | {"name": "dob_year","type": "int"}, 11 | {"name": "dob_month","type": "int"}, 12 | {"name": "gender","type": "string"}, 13 | {"name": "salary","type": "int"} 14 | ] 15 | } --------------------------------------------------------------------------------
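The person.avsc above belongs to the spark2.3-avro-examples module; on Spark 2.3 the built-in Avro data source and the from_avro/to_avro helpers used in the spark-streaming module are not available (they arrived in Spark 2.4), so a schema like this is typically paired with the external Databricks spark-avro package. As a minimal, hypothetical sketch (not a file from this repository; the object name, master setting, and output path are assumptions), a DataFrame matching those fields could be round-tripped like this:

package com.sparkbyexamples.spark.dataframe.avro

import org.apache.spark.sql.SparkSession

// Hypothetical sketch for Spark 2.3: round-trip a Person DataFrame through
// the external Databricks Avro data source ("com.databricks.spark.avro").
object Spark23AvroRoundTrip {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .appName("SparkByExamples.com")
      .getOrCreate()

    import spark.sqlContext.implicits._

    // Columns mirror the fields declared in person.avsc.
    val df = Seq((1, "James ", "", "Smith", 2018, 1, "M", 3000))
      .toDF("id", "firstname", "middlename", "lastname",
        "dob_year", "dob_month", "gender", "salary")

    // Write Avro files, then read them back and print the result.
    df.write.format("com.databricks.spark.avro").mode("overwrite").save("c:/tmp/person_avro")
    spark.read.format("com.databricks.spark.avro").load("c:/tmp/person_avro").show(false)
  }
}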