├── .gitignore ├── README.md ├── pom.xml └── src └── main ├── resources ├── address-multiline.csv ├── address.csv ├── books.xml ├── books_withnested_array.xml ├── csv │ ├── invalid.txt │ ├── text01.txt │ ├── text02.txt │ ├── text03.txt │ └── text04.txt ├── free-zipcode-database.csv ├── kv.csv ├── multiline-zipcode.json ├── persons.xml ├── persons_complex.xml ├── records.xml ├── schema.json ├── simple_zipcodes.csv ├── simple_zipcodes.json ├── simple_zipcodes.txt ├── small_zipcode.csv ├── stream.csv ├── test.txt ├── txt │ ├── alice.txt │ ├── datasets.csv │ └── holmes.txt ├── zipcodes-noheader.csv ├── zipcodes.avro ├── zipcodes.csv ├── zipcodes.json ├── zipcodes.parquet ├── zipcodes20.csv └── zipcodes_streaming │ ├── zipcode1.json │ ├── zipcode10.json │ ├── zipcode11.json │ ├── zipcode12.json │ ├── zipcode2.json │ ├── zipcode3.json │ ├── zipcode4.json │ ├── zipcode5.json │ ├── zipcode6.json │ ├── zipcode7.json │ ├── zipcode8.json │ └── zipcode9.json └── scala └── com └── sparkbyexamples └── spark ├── SQLContextExample.scala ├── SparkContextExample.scala ├── SparkSessionTest.scala ├── SparkSessionWrapper.scala ├── beans ├── Books.scala ├── BooksDiscounted.scala ├── BooksStruct.scala ├── BooksWithArray.scala ├── User.scala └── Zipcode.scala ├── dataframe ├── ArrayToColumn.scala ├── AvroExample.scala ├── AvroToJson.scala ├── AvroToParquet.scala ├── BroadcastExample.scala ├── CaseClassSparkSchema.scala ├── CastColumnType.scala ├── ColumnTruncate.scala ├── CreateDataFrame.scala ├── CreateEmptyDataFrameExample.scala ├── CreateEmptyDatasetExample.scala ├── CsvToAvroParquetJson.scala ├── DataFrameFromCSVFile.scala ├── DataFrameWithComplexDSL.scala ├── DataFrameWithSimpleDSL.scala ├── DataTypeExample.scala ├── FilterExample.scala ├── FilterNullRowsExample.scala ├── FlattenNestedStruct.scala ├── FromCSVFile2.scala ├── FromCSVMultiline.scala ├── FromJsonFile.scala ├── FromTextFile.scala ├── GroupbyExample.scala ├── HandleNullExample.scala ├── JsonFromMultiline.scala ├── JsonToAvroCsvParquet.scala ├── ParquetAWSExample.scala ├── ParquetExample.scala ├── ParquetToAvro.scala ├── ParquetToCsv.scala ├── ParquetToJson.scala ├── ReadJsonFromString.scala ├── RemoveNullRowsExample.scala ├── RenameColDataFrame.scala ├── SQLExample.scala ├── SaveDataFrame.scala ├── SparkUDF.scala ├── StructTypeUsage.scala ├── UDFDataFrame.scala ├── UnionExample.scala ├── WhereExample.scala ├── WithColumn.scala ├── examples │ ├── CacheExample.scala │ ├── CastStringToInt.scala │ ├── CollectExample.scala │ ├── DataFrameComplex.scala │ ├── DataFrameEmptyCheck.scala │ ├── DropColumn.scala │ ├── ForEachExample.scala │ ├── ForEachPartExample.scala │ ├── MapFlatMap.scala │ ├── MapTransformation.scala │ ├── RangePartition.scala │ ├── ReadORCFile.scala │ ├── RenameDeleteFile.scala │ ├── RepartitionExample.scala │ ├── SaveSingleFile.scala │ ├── SelectExamples.scala │ ├── SelectSelectExpr.scala │ ├── ShuffleExample.scala │ └── Util.scala ├── functions │ ├── AddColumn.scala │ ├── AnotherExample.scala │ ├── PivotExample.scala │ ├── RemoveDuplicate.scala │ ├── SortExample.scala │ ├── WhenOtherwise.scala │ ├── WindowGroupbyFirst.scala │ ├── aggregate │ │ ├── AggregateFunctions.scala │ │ ├── DistinctCount.scala │ │ └── SQLDistinct.scala │ ├── collection │ │ ├── ArrayContainsExample.scala │ │ ├── ArrayOfArrayType.scala │ │ ├── ArrayOfMapType.scala │ │ ├── ArrayOfString.scala │ │ ├── ArrayOfStructType.scala │ │ ├── ArrayTypeExample.scala │ │ ├── CollectListExample.scala │ │ ├── ExplodeArrayAndMap.scala │ │ ├── MapFunctions.scala │ │ ├── 
MapToColumn.scala │ │ ├── MapTypeExample.scala │ │ ├── SliceArray.scala │ │ └── StringToArray.scala │ ├── datetime │ │ ├── AddTime.scala │ │ ├── CurrentDateAndTime.scala │ │ ├── DateAddMonths.scala │ │ ├── DateDiff.scala │ │ ├── DateExamples.scala │ │ ├── DateFormat.scala │ │ ├── DateInMilli.scala │ │ ├── DateLastDay.scala │ │ ├── DateToString.scala │ │ ├── DateTrunc.scala │ │ ├── DayAndWeekOfYear.scala │ │ ├── DayWeekAndWeekMonth.scala │ │ ├── GetTimeFromTimestamp.scala │ │ ├── Spark3Date.scala │ │ ├── StringToDate.scala │ │ ├── StringToTimestamp.scala │ │ ├── TimeInMilli.scala │ │ ├── TimestampDiff.scala │ │ ├── TimestampToDate.scala │ │ ├── TimestampToString.scala │ │ └── UnixTimestamp.scala │ ├── from_json.scala │ ├── litTypeLit.scala │ ├── string │ │ ├── ConcatExample.scala │ │ └── SplitExample.scala │ └── window │ │ ├── RowNumber.scala │ │ ├── WindowFunctions.scala │ │ └── WindowGroupbyFirst.scala ├── join │ ├── CrossJoinExample.scala │ ├── InnerJoinExample.scala │ ├── JoinExample.scala │ ├── JoinMultipleColumns.scala │ ├── JoinMultipleDataFrames.scala │ └── SelfJoinExample.scala └── xml │ ├── PersonsComplexXML.scala │ ├── PersonsXML.scala │ ├── ReadBooksXMLWithNestedArray.scala │ ├── ReadBooksXMLWithNestedArrayStruct.scala │ └── xstream │ └── WriteXML.scala ├── dataset ├── DataSetFromData.scala ├── DataSetWithCustomClass.scala └── xml │ ├── ReadBooksXML.scala │ ├── ReadBooksXMLWithNestedArray.scala │ ├── ReadBooksXMLWithNestedArrayDSL.scala │ ├── SparkXMLUsingXstream.scala │ └── sparkXml.scala ├── rdd ├── CreateEmptyRDD.scala ├── CreateRDD.scala ├── OperationOnPairRDDComplex.scala ├── OperationsOnPairRDD.scala ├── OperationsOnRDD.scala ├── PartitionBy.scala ├── RDDAccumulator.scala ├── RDDActions.scala ├── RDDBroadcast.scala ├── RDDCache.scala ├── RDDFromCSVFile.scala ├── RDDFromDataUsingParallelize.scala ├── RDDFromParallelizeRange.scala ├── RDDFromWholeTextFile.scala ├── RDDHadoopInputFormat.scala ├── RDDPrint.scala ├── RDDReadFilesFromDirectory.scala ├── RDDRepartitionExample.scala ├── RDDSaveAsObjectFile.scala ├── RDDSequenceFiles.scala ├── RDDShuffleExample.scala ├── ReadMultipleCSVFiles.scala ├── ReadMultipleFiles.scala ├── ReadTextFiles.scala ├── SortBy.scala ├── WordCountExample.scala ├── ZipCode.scala ├── functions │ ├── FlatMapExample.scala │ ├── MapExample.scala │ ├── ReduceByKeyExample.scala │ ├── SortByKeyExample.scala │ ├── aggregateExample.scala │ ├── foldExample.scala │ └── reduceExample.scala └── xml │ └── XmlRecordReader.scala ├── spark30 ├── ADQExample.scala └── ReadBinary.scala └── stackoverflow ├── AddingLiterral.scala ├── SparkContextOld.scala ├── Test.scala └── Test2.scala /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .metadata 3 | .cache-main 4 | .classpath 5 | .project 6 | .settings 7 | *.class 8 | *.orig 9 | *.log 10 | target/ 11 | .DS_Store 12 | *.iml 13 | scalastyle-output.xml 14 | 15 | -------------------------------------------------------------------------------- /src/main/resources/address-multiline.csv: -------------------------------------------------------------------------------- 1 | Id,Address Line1,City,State,Zipcode 2 | 1,9182 Clear Water Rd,Fayetteville,AR,72704 3 | 2,"9920 State 4 | Highway 89",Ringling,OK,73456 5 | 3,9724 E Landon Ln,Kennewick,WA,99338 6 | 7 | 8 | -------------------------------------------------------------------------------- /src/main/resources/address.csv: -------------------------------------------------------------------------------- 1 | Id,Address 
Line1,City,State,Zipcode 2 | 1,9182 Clear Water Rd,Fayetteville,AR,72704 3 | 2,9724 E Landon Ln,Kennewick,WA,99338 4 | 3,9509 Clay Creek Ln,Fort Worth,TX,76177 5 | 4,98016 S Garnsey St,Santa Ana,CA,92707 6 | 5,9920 State Highway 89,Ringling,OK,73456 -------------------------------------------------------------------------------- /src/main/resources/csv/invalid.txt: -------------------------------------------------------------------------------- 1 | Invalid,I -------------------------------------------------------------------------------- /src/main/resources/csv/text01.txt: -------------------------------------------------------------------------------- 1 | One,1 2 | Eleven,11 -------------------------------------------------------------------------------- /src/main/resources/csv/text02.txt: -------------------------------------------------------------------------------- 1 | Two,2 -------------------------------------------------------------------------------- /src/main/resources/csv/text03.txt: -------------------------------------------------------------------------------- 1 | Three,3 -------------------------------------------------------------------------------- /src/main/resources/csv/text04.txt: -------------------------------------------------------------------------------- 1 | Four,4 -------------------------------------------------------------------------------- /src/main/resources/kv.csv: -------------------------------------------------------------------------------- 1 | key,value 2 | record1,My Name is Naveen 3 | record2,My Name is Praveen 4 | record3,My Name is Prabha -------------------------------------------------------------------------------- /src/main/resources/multiline-zipcode.json: -------------------------------------------------------------------------------- 1 | [{ 2 | "RecordNumber": 2, 3 | "Zipcode": 704, 4 | "ZipCodeType": "STANDARD", 5 | "City": "PASEO COSTA DEL SUR", 6 | "State": "PR" 7 | }, 8 | { 9 | "RecordNumber": 10, 10 | "Zipcode": 709, 11 | "ZipCodeType": "STANDARD", 12 | "City": "BDA SAN LUIS", 13 | "State": "PR" 14 | }] 15 | -------------------------------------------------------------------------------- /src/main/resources/persons.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | James 4 | Smith 5 | 6 | 1980 7 | 1 8 | M 9 | 10000 10 | 11 |
12 | 123 ABC street 13 | NewJersy 14 | NJ 15 |
16 |
17 | 456 apple street 18 | newark 19 | DE 20 |
21 |
22 |
23 | 24 | Michael 25 | 26 | Rose 27 | 1990 28 | 6 29 | M 30 | 10000 31 | 32 |
33 | 4512 main st 34 | new york 35 | NY 36 |
37 |
38 | 4367 orange st 39 | sandiago 40 | CA 41 |
42 |
43 |
44 |
45 | -------------------------------------------------------------------------------- /src/main/resources/persons_complex.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | James 4 | Smith 5 | 6 | 1980 7 | 1 8 | M 9 | 10000 10 | 11 |
12 | 1 capler dr 13 | new york 14 | NY 15 |
16 |
17 | 455 catalina dr 18 | chicago 19 | IL 20 |
21 |
22 |
23 | 24 | Michael 25 | 26 | Rose 27 | 1990 28 | 6 29 | M 30 | 10000 31 | 32 |
33 | 2345 pasadena village 34 | orlando 35 | FL 36 |
37 |
38 | 3 walnut dr 39 | wilmington 40 | DE 41 |
42 |
43 |
44 |
-------------------------------------------------------------------------------- /src/main/resources/records.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | John 4 | 10 5 | M 6 | 7 | 8 | Jenny 9 | 12 10 | F 11 | 12 | 13 | Janardhan 14 | 14 15 | M 16 | 17 | 18 | -------------------------------------------------------------------------------- /src/main/resources/schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "type" : "struct", 3 | "fields" : [ { 4 | "name" : "name", 5 | "type" : { 6 | "type" : "struct", 7 | "fields" : [ { 8 | "name" : "firstname", 9 | "type" : "string", 10 | "nullable" : true, 11 | "metadata" : { } 12 | }, { 13 | "name" : "middlename", 14 | "type" : "string", 15 | "nullable" : true, 16 | "metadata" : { } 17 | }, { 18 | "name" : "lastname", 19 | "type" : "string", 20 | "nullable" : true, 21 | "metadata" : { } 22 | } ] 23 | }, 24 | "nullable" : true, 25 | "metadata" : { } 26 | }, { 27 | "name" : "dob", 28 | "type" : "string", 29 | "nullable" : true, 30 | "metadata" : { } 31 | }, { 32 | "name" : "gender", 33 | "type" : "string", 34 | "nullable" : true, 35 | "metadata" : { } 36 | }, { 37 | "name" : "salary", 38 | "type" : "integer", 39 | "nullable" : true, 40 | "metadata" : { } 41 | } ] 42 | } -------------------------------------------------------------------------------- /src/main/resources/simple_zipcodes.csv: -------------------------------------------------------------------------------- 1 | Id,JsonValue 2 | 1,"{\"Zipcode\":704,\"ZipCodeType\":\"STANDARD\",\"City\":\"PARC PARQUE\",\"State\":\"PR\"}" 3 | 2,"{\"Zipcode\":704,\"ZipCodeType\":\"STANDARD\",\"City\":\"PASEO COSTA DEL SUR\",\"State\":\"PR\"}" 4 | 3,"{\"Zipcode\":709,\"ZipCodeType\":\"STANDARD\",\"City\":\"BDA SAN LUIS\",\"State\":\"PR\"}" 5 | 4,"{\"Zipcode\":76166,\"ZipCodeType\":\"UNIQUE\",\"City\":\"CINGULAR WIRELESS\",\"State\":\"TX\"}" 6 | 5,"{\"Zipcode\":76177,\"ZipCodeType\":\"STANDARD\",\"City\":\"FORT WORTH\",\"State\":\"TX\"}" 7 | 6,"{\"Zipcode\":76177,\"ZipCodeType\":\"STANDARD\",\"City\":\"FT WORTH\",\"State\":\"TX\"}" 8 | 7,"{\"Zipcode\":704,\"ZipCodeType\":\"STANDARD\",\"City\":\"URB EUGENE RICE\",\"State\":\"PR\"}" 9 | 8,"{\"Zipcode\":85209,\"ZipCodeType\":\"STANDARD\",\"City\":\"MESA\",\"State\":\"AZ\"}" 10 | 9,"{\"Zipcode\":85210,\"ZipCodeType\":\"STANDARD\",\"City\":\"MESA\",\"State\":\"AZ\"}" 11 | 10,"{\"Zipcode\":32046,\"ZipCodeType\":\"STANDARD\",\"City\":\"HILLIARD\",\"State\":\"FL\"}" 12 | -------------------------------------------------------------------------------- /src/main/resources/simple_zipcodes.json: -------------------------------------------------------------------------------- 1 | {"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR"} 2 | {"Zipcode":704,"ZipCodeType":"STANDARD","City":"PASEO COSTA DEL SUR","State":"PR"} 3 | {"Zipcode":709,"ZipCodeType":"STANDARD","City":"BDA SAN LUIS","State":"PR"} 4 | {"Zipcode":76166,"ZipCodeType":"UNIQUE","City":"CINGULAR WIRELESS","State":"TX"} 5 | {"Zipcode":76177,"ZipCodeType":"STANDARD","City":"FORT WORTH","State":"TX"} 6 | {"Zipcode":76177,"ZipCodeType":"STANDARD","City":"FT WORTH","State":"TX"} 7 | {"Zipcode":704,"ZipCodeType":"STANDARD","City":"URB EUGENE RICE","State":"PR"} 8 | {"Zipcode":85209,"ZipCodeType":"STANDARD","City":"MESA","State":"AZ"} 9 | {"Zipcode":85210,"ZipCodeType":"STANDARD","City":"MESA","State":"AZ"} 10 | {"Zipcode":32046,"ZipCodeType":"STANDARD","City":"HILLIARD","State":"FL"} 11 | 
-------------------------------------------------------------------------------- /src/main/resources/simple_zipcodes.txt: -------------------------------------------------------------------------------- 1 | {"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR"} 2 | {"Zipcode":704,"ZipCodeType":"STANDARD","City":"PASEO COSTA DEL SUR","State":"PR"} 3 | {"Zipcode":709,"ZipCodeType":"STANDARD","City":"BDA SAN LUIS","State":"PR"} 4 | {"Zipcode":76166,"ZipCodeType":"UNIQUE","City":"CINGULAR WIRELESS","State":"TX"} 5 | {"Zipcode":76177,"ZipCodeType":"STANDARD","City":"FORT WORTH","State":"TX"} 6 | {"Zipcode":76177,"ZipCodeType":"STANDARD","City":"FT WORTH","State":"TX"} 7 | {"Zipcode":704,"ZipCodeType":"STANDARD","City":"URB EUGENE RICE","State":"PR"} 8 | {"Zipcode":85209,"ZipCodeType":"STANDARD","City":"MESA","State":"AZ"} 9 | {"Zipcode":85210,"ZipCodeType":"STANDARD","City":"MESA","State":"AZ"} 10 | {"Zipcode":32046,"ZipCodeType":"STANDARD","City":"HILLIARD","State":"FL"} 11 | -------------------------------------------------------------------------------- /src/main/resources/small_zipcode.csv: -------------------------------------------------------------------------------- 1 | id,zipcode,type,city,state,population 2 | 1,704,STANDARD,,PR,30100 3 | 2,704,,PASEO COSTA DEL SUR,PR, 4 | 3,709,,BDA SAN LUIS,PR,3700 5 | 4,76166,UNIQUE,CINGULAR WIRELESS,TX,84000 6 | 5,76177,STANDARD,,TX, -------------------------------------------------------------------------------- /src/main/resources/stream.csv: -------------------------------------------------------------------------------- 1 | TotalCost|BirthDate|Gender|TotalChildren|ProductCategoryName 2 | 1000||Male|2|Technology 3 | 2000|1957-03-06||3|Beauty 4 | 3000|1959-03-06|Male||Car 5 | 4000|1953-03-06|Male|2| 6 | 5000|1957-03-06|Female|3|Beauty 7 | 6000|1959-03-06|Male|4|Car -------------------------------------------------------------------------------- /src/main/resources/zipcodes.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spark-examples/spark-scala-examples/e4f18c30dec398bb6ca110f98272b20b461f3310/src/main/resources/zipcodes.avro -------------------------------------------------------------------------------- /src/main/resources/zipcodes.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spark-examples/spark-scala-examples/e4f18c30dec398bb6ca110f98272b20b461f3310/src/main/resources/zipcodes.parquet -------------------------------------------------------------------------------- /src/main/resources/zipcodes20.csv: -------------------------------------------------------------------------------- 1 | 1,US,PARC PARQUE,704,PR 2 | 2,US,PASEO COSTA DEL SUR,704,PR 3 | 10,US,BDA SAN LUIS,709,PR 4 | 61391,US,CINGULAR WIRELESS,76166,TX 5 | 61392,US,FORT WORTH,76177,TX 6 | 61393,US,FT WORTH,76177,TX 7 | 4,US,URB EUGENE RICE,704,PR 8 | 39827,US,MESA,85209,AZ 9 | 39828,US,MESA,85210,AZ 10 | 49345,US,HILLIARD,32046,FL 11 | 49346,US,HOLDER,34445,FL 12 | 49347,US,HOLT,32564,FL 13 | 49348,US,HOMOSASSA,34487,FL 14 | 3,US,SECT LANAUSSE,704,PR 15 | 54354,US,SPRING GARDEN,36275,AL 16 | 54355,US,SPRINGVILLE,35146,AL 17 | 54356,US,SPRUCE PINE,35585,AL 18 | 76511,US,ASH HILL,27007,NC 19 | 76512,US,ASHEBORO,27203,NC 20 | 76513,US,ASHEBORO,27204,NC 21 | -------------------------------------------------------------------------------- /src/main/resources/zipcodes_streaming/zipcode1.json: 
-------------------------------------------------------------------------------- 1 | {"RecordNumber":1,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Parc Parque, PR","Location":"NA-US-PR-PARC PARQUE","Decommisioned":false} 2 | -------------------------------------------------------------------------------- /src/main/resources/zipcodes_streaming/zipcode10.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":1,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Parc Parque, PR","Location":"NA-US-PR-PARC PARQUE","Decommisioned":false} 2 | {"RecordNumber":2,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PASEO COSTA DEL SUR","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Paseo Costa Del Sur, PR","Location":"NA-US-PR-PASEO COSTA DEL SUR","Decommisioned":false} 3 | {"RecordNumber":10,"Zipcode":709,"ZipCodeType":"STANDARD","City":"BDA SAN LUIS","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":18.14,"Long":-66.26,"Xaxis":0.38,"Yaxis":-0.86,"Zaxis":0.31,"WorldRegion":"NA","Country":"US","LocationText":"Bda San Luis, PR","Location":"NA-US-PR-BDA SAN LUIS","Decommisioned":false} 4 | -------------------------------------------------------------------------------- /src/main/resources/zipcodes_streaming/zipcode11.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":1,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Parc Parque, PR","Location":"NA-US-PR-PARC PARQUE","Decommisioned":false} 2 | -------------------------------------------------------------------------------- /src/main/resources/zipcodes_streaming/zipcode12.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":1,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Parc Parque, PR","Location":"NA-US-PR-PARC PARQUE","Decommisioned":false} -------------------------------------------------------------------------------- /src/main/resources/zipcodes_streaming/zipcode2.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":2,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PASEO COSTA DEL SUR","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Paseo Costa Del Sur, PR","Location":"NA-US-PR-PASEO COSTA DEL SUR","Decommisioned":false} 2 | {"RecordNumber":10,"Zipcode":709,"ZipCodeType":"STANDARD","City":"BDA SAN LUIS","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":18.14,"Long":-66.26,"Xaxis":0.38,"Yaxis":-0.86,"Zaxis":0.31,"WorldRegion":"NA","Country":"US","LocationText":"Bda San Luis, PR","Location":"NA-US-PR-BDA SAN LUIS","Decommisioned":false} 3 | 
-------------------------------------------------------------------------------- /src/main/resources/zipcodes_streaming/zipcode3.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":61391,"Zipcode":76166,"ZipCodeType":"UNIQUE","City":"CINGULAR WIRELESS","State":"TX","LocationType":"NOT ACCEPTABLE","Lat":32.72,"Long":-97.31,"Xaxis":-0.1,"Yaxis":-0.83,"Zaxis":0.54,"WorldRegion":"NA","Country":"US","LocationText":"Cingular Wireless, TX","Location":"NA-US-TX-CINGULAR WIRELESS","Decommisioned":false} 2 | {"RecordNumber":61392,"Zipcode":76177,"ZipCodeType":"STANDARD","City":"FORT WORTH","State":"TX","LocationType":"PRIMARY","Lat":32.75,"Long":-97.33,"Xaxis":-0.1,"Yaxis":-0.83,"Zaxis":0.54,"WorldRegion":"NA","Country":"US","LocationText":"Fort Worth, TX","Location":"NA-US-TX-FORT WORTH","Decommisioned":false,"TaxReturnsFiled":2126,"EstimatedPopulation":4053,"TotalWages":122396986} 3 | {"RecordNumber":61393,"Zipcode":76177,"ZipCodeType":"STANDARD","City":"FT WORTH","State":"TX","LocationType":"ACCEPTABLE","Lat":32.75,"Long":-97.33,"Xaxis":-0.1,"Yaxis":-0.83,"Zaxis":0.54,"WorldRegion":"NA","Country":"US","LocationText":"Ft Worth, TX","Location":"NA-US-TX-FT WORTH","Decommisioned":false,"TaxReturnsFiled":2126,"EstimatedPopulation":4053,"TotalWages":122396986} 4 | -------------------------------------------------------------------------------- /src/main/resources/zipcodes_streaming/zipcode4.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":4,"Zipcode":704,"ZipCodeType":"STANDARD","City":"URB EUGENE RICE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Urb Eugene Rice, PR","Location":"NA-US-PR-URB EUGENE RICE","Decommisioned":false} 2 | -------------------------------------------------------------------------------- /src/main/resources/zipcodes_streaming/zipcode5.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":39827,"Zipcode":85209,"ZipCodeType":"STANDARD","City":"MESA","State":"AZ","LocationType":"PRIMARY","Lat":33.37,"Long":-111.64,"Xaxis":-0.3,"Yaxis":-0.77,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Mesa, AZ","Location":"NA-US-AZ-MESA","Decommisioned":false,"TaxReturnsFiled":14962,"EstimatedPopulation":26883,"TotalWages":563792730,"Notes":"no NWS data, "} 2 | {"RecordNumber":39828,"Zipcode":85210,"ZipCodeType":"STANDARD","City":"MESA","State":"AZ","LocationType":"PRIMARY","Lat":33.38,"Long":-111.84,"Xaxis":-0.31,"Yaxis":-0.77,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Mesa, AZ","Location":"NA-US-AZ-MESA","Decommisioned":false,"TaxReturnsFiled":14374,"EstimatedPopulation":25446,"TotalWages":471000465} 3 | {"RecordNumber":49345,"Zipcode":32046,"ZipCodeType":"STANDARD","City":"HILLIARD","State":"FL","LocationType":"PRIMARY","Lat":30.69,"Long":-81.92,"Xaxis":0.12,"Yaxis":-0.85,"Zaxis":0.51,"WorldRegion":"NA","Country":"US","LocationText":"Hilliard, FL","Location":"NA-US-FL-HILLIARD","Decommisioned":false,"TaxReturnsFiled":3922,"EstimatedPopulation":7443,"TotalWages":133112149} 4 | {"RecordNumber":49346,"Zipcode":34445,"ZipCodeType":"PO BOX","City":"HOLDER","State":"FL","LocationType":"PRIMARY","Lat":28.96,"Long":-82.41,"Xaxis":0.11,"Yaxis":-0.86,"Zaxis":0.48,"WorldRegion":"NA","Country":"US","LocationText":"Holder, FL","Location":"NA-US-FL-HOLDER","Decommisioned":false} 5 | 
{"RecordNumber":49347,"Zipcode":32564,"ZipCodeType":"STANDARD","City":"HOLT","State":"FL","LocationType":"PRIMARY","Lat":30.72,"Long":-86.67,"Xaxis":0.04,"Yaxis":-0.85,"Zaxis":0.51,"WorldRegion":"NA","Country":"US","LocationText":"Holt, FL","Location":"NA-US-FL-HOLT","Decommisioned":false,"TaxReturnsFiled":1207,"EstimatedPopulation":2190,"TotalWages":36395913} 6 | {"RecordNumber":49348,"Zipcode":34487,"ZipCodeType":"PO BOX","City":"HOMOSASSA","State":"FL","LocationType":"PRIMARY","Lat":28.78,"Long":-82.61,"Xaxis":0.11,"Yaxis":-0.86,"Zaxis":0.48,"WorldRegion":"NA","Country":"US","LocationText":"Homosassa, FL","Location":"NA-US-FL-HOMOSASSA","Decommisioned":false} 7 | -------------------------------------------------------------------------------- /src/main/resources/zipcodes_streaming/zipcode6.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":10,"Zipcode":708,"ZipCodeType":"STANDARD","City":"BDA SAN LUIS","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":18.14,"Long":-66.26,"Xaxis":0.38,"Yaxis":-0.86,"Zaxis":0.31,"WorldRegion":"NA","Country":"US","LocationText":"Bda San Luis, PR","Location":"NA-US-PR-BDA SAN LUIS","Decommisioned":false} 2 | -------------------------------------------------------------------------------- /src/main/resources/zipcodes_streaming/zipcode7.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":54354,"Zipcode":36275,"ZipCodeType":"PO BOX","City":"SPRING GARDEN","State":"AL","LocationType":"PRIMARY","Lat":33.97,"Long":-85.55,"Xaxis":0.06,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Spring Garden, AL","Location":"NA-US-AL-SPRING GARDEN","Decommisioned":false} 2 | {"RecordNumber":54355,"Zipcode":35146,"ZipCodeType":"STANDARD","City":"SPRINGVILLE","State":"AL","LocationType":"PRIMARY","Lat":33.77,"Long":-86.47,"Xaxis":0.05,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Springville, AL","Location":"NA-US-AL-SPRINGVILLE","Decommisioned":false,"TaxReturnsFiled":4046,"EstimatedPopulation":7845,"TotalWages":172127599} 3 | {"RecordNumber":54356,"Zipcode":35585,"ZipCodeType":"STANDARD","City":"SPRUCE PINE","State":"AL","LocationType":"PRIMARY","Lat":34.37,"Long":-87.69,"Xaxis":0.03,"Yaxis":-0.82,"Zaxis":0.56,"WorldRegion":"NA","Country":"US","LocationText":"Spruce Pine, AL","Location":"NA-US-AL-SPRUCE PINE","Decommisioned":false,"TaxReturnsFiled":610,"EstimatedPopulation":1209,"TotalWages":18525517} 4 | {"RecordNumber":76511,"Zipcode":27007,"ZipCodeType":"STANDARD","City":"ASH HILL","State":"NC","LocationType":"NOT ACCEPTABLE","Lat":36.4,"Long":-80.56,"Xaxis":0.13,"Yaxis":-0.79,"Zaxis":0.59,"WorldRegion":"NA","Country":"US","LocationText":"Ash Hill, NC","Location":"NA-US-NC-ASH HILL","Decommisioned":false,"TaxReturnsFiled":842,"EstimatedPopulation":1666,"TotalWages":28876493} 5 | {"RecordNumber":76512,"Zipcode":27203,"ZipCodeType":"STANDARD","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":8355,"EstimatedPopulation":15228,"TotalWages":215474318} 6 | {"RecordNumber":76513,"Zipcode":27204,"ZipCodeType":"PO BOX","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, 
NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":1035,"EstimatedPopulation":1816,"TotalWages":30322473} 7 | -------------------------------------------------------------------------------- /src/main/resources/zipcodes_streaming/zipcode8.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":54354,"Zipcode":36275,"ZipCodeType":"PO BOX","City":"SPRING GARDEN","State":"AL","LocationType":"PRIMARY","Lat":33.97,"Long":-85.55,"Xaxis":0.06,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Spring Garden, AL","Location":"NA-US-AL-SPRING GARDEN","Decommisioned":false} 2 | {"RecordNumber":54355,"Zipcode":35146,"ZipCodeType":"STANDARD","City":"SPRINGVILLE","State":"AL","LocationType":"PRIMARY","Lat":33.77,"Long":-86.47,"Xaxis":0.05,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Springville, AL","Location":"NA-US-AL-SPRINGVILLE","Decommisioned":false,"TaxReturnsFiled":4046,"EstimatedPopulation":7845,"TotalWages":172127599} 3 | {"RecordNumber":54356,"Zipcode":35585,"ZipCodeType":"STANDARD","City":"SPRUCE PINE","State":"AL","LocationType":"PRIMARY","Lat":34.37,"Long":-87.69,"Xaxis":0.03,"Yaxis":-0.82,"Zaxis":0.56,"WorldRegion":"NA","Country":"US","LocationText":"Spruce Pine, AL","Location":"NA-US-AL-SPRUCE PINE","Decommisioned":false,"TaxReturnsFiled":610,"EstimatedPopulation":1209,"TotalWages":18525517} 4 | {"RecordNumber":76511,"Zipcode":27007,"ZipCodeType":"STANDARD","City":"ASH HILL","State":"NC","LocationType":"NOT ACCEPTABLE","Lat":36.4,"Long":-80.56,"Xaxis":0.13,"Yaxis":-0.79,"Zaxis":0.59,"WorldRegion":"NA","Country":"US","LocationText":"Ash Hill, NC","Location":"NA-US-NC-ASH HILL","Decommisioned":false,"TaxReturnsFiled":842,"EstimatedPopulation":1666,"TotalWages":28876493} 5 | {"RecordNumber":76512,"Zipcode":27203,"ZipCodeType":"STANDARD","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":8355,"EstimatedPopulation":15228,"TotalWages":215474318} 6 | {"RecordNumber":76513,"Zipcode":27204,"ZipCodeType":"PO BOX","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":1035,"EstimatedPopulation":1816,"TotalWages":30322473} 7 | -------------------------------------------------------------------------------- /src/main/resources/zipcodes_streaming/zipcode9.json: -------------------------------------------------------------------------------- 1 | {"RecordNumber":76511,"Zipcode":27007,"ZipCodeType":"STANDARD","City":"ASH HILL","State":"NC","LocationType":"NOT ACCEPTABLE","Lat":36.4,"Long":-80.56,"Xaxis":0.13,"Yaxis":-0.79,"Zaxis":0.59,"WorldRegion":"NA","Country":"US","LocationText":"Ash Hill, NC","Location":"NA-US-NC-ASH HILL","Decommisioned":false,"TaxReturnsFiled":842,"EstimatedPopulation":1666,"TotalWages":28876493} 2 | {"RecordNumber":76512,"Zipcode":27203,"ZipCodeType":"STANDARD","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, 
NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":8355,"EstimatedPopulation":15228,"TotalWages":215474318} 3 | {"RecordNumber":76513,"Zipcode":27204,"ZipCodeType":"PO BOX","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":1035,"EstimatedPopulation":1816,"TotalWages":30322473} 4 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/SQLContextExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark 2 | 3 | import org.apache.spark.sql.{SQLContext, SparkSession} 4 | 5 | object SQLContextExample extends App { 6 | 7 | val spark = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate(); 11 | 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | 15 | val sqlContext:SQLContext = spark.sqlContext 16 | 17 | //read csv with options 18 | val df = sqlContext.read.options(Map("inferSchema"->"true","delimiter"->",","header"->"true")) 19 | .csv("src/main/resources/zipcodes.csv") 20 | df.show() 21 | df.printSchema() 22 | 23 | df.createOrReplaceTempView("TAB") 24 | sqlContext.sql("select * from TAB") 25 | .show(false) 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/SparkContextExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark 2 | 3 | import com.sparkbyexamples.spark.dataframe.functions.SortExample.spark 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.sql.{SQLContext, SparkSession} 6 | 7 | object SparkContextExample extends App{ 8 | 9 | val spark = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate(); 13 | 14 | spark.sparkContext.setLogLevel("ERROR") 15 | 16 | 17 | val sparkContext:SparkContext = spark.sparkContext 18 | val sqlCon:SQLContext = spark.sqlContext 19 | 20 | val sqlContext = new org.apache.spark.sql.SQLContext(spark.sparkContext) 21 | 22 | println("First SparkContext:") 23 | println("APP Name :"+spark.sparkContext.appName); 24 | println("Deploy Mode :"+spark.sparkContext.deployMode); 25 | println("Master :"+spark.sparkContext.master); 26 | 27 | val sparkSession2 = SparkSession.builder() 28 | .master("local[1]") 29 | .appName("SparkByExample-test") 30 | .getOrCreate(); 31 | 32 | println("Second SparkContext:") 33 | println("APP Name :"+sparkSession2.sparkContext.appName); 34 | println("Deploy Mode :"+sparkSession2.sparkContext.deployMode); 35 | println("Master :"+sparkSession2.sparkContext.master); 36 | 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/SparkSessionTest.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object SparkSessionTest { 6 | 7 | def main(args:Array[String]): Unit ={ 8 | 9 | 10 | val spark = SparkSession.builder() 11 | .master("local[1]") 12 | .appName("SparkByExample") 13 | .getOrCreate(); 14 | 15 | println("First SparkContext:") 16 | println("APP Name :"+spark.sparkContext.appName); 17 | 
println("Deploy Mode :"+spark.sparkContext.deployMode); 18 | println("Master :"+spark.sparkContext.master); 19 | 20 | val sparkSession2 = SparkSession.builder() 21 | .master("local[1]") 22 | .appName("SparkByExample-test") 23 | .getOrCreate(); 24 | 25 | println("Second SparkContext:") 26 | println("APP Name :"+sparkSession2.sparkContext.appName); 27 | println("Deploy Mode :"+sparkSession2.sparkContext.deployMode); 28 | println("Master :"+sparkSession2.sparkContext.master); 29 | 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/SparkSessionWrapper.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | trait SparkSessionWrapper { 6 | lazy val spark: SparkSession = { 7 | SparkSession 8 | .builder() 9 | .master("local") 10 | .appName("spark session") 11 | .config("spark.sql.shuffle.partitions", "1") 12 | .getOrCreate() 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/beans/Books.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.beans 2 | 3 | case class Books(_id:String, author:String, description:String, price:Double, publish_date:String, title:String) -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/beans/BooksDiscounted.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.beans 2 | 3 | case class BooksDiscounted(_id:String, author:String, description:String, price:Double, publish_date:String, title:String, discountPrice:Double) 4 | 5 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/beans/BooksStruct.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.beans 2 | 3 | class BooksStruct { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/beans/BooksWithArray.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.beans 2 | 3 | case class BooksWithArray(_id:String, author:String, description:String, price:Double, publish_date:String, title:String,otherInfo:OtherInfo,stores:Stores) 4 | case class OtherInfo(pagesCount:String,language:String,country:String,address:Address) 5 | case class Address(addressline1:String,city:String,state:String) 6 | case class Stores(store:Array[Store]) 7 | case class Store(name:String) 8 | 9 | 10 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/beans/User.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.beans 2 | 3 | class User() { 4 | private var name:String = "" 5 | private var age:Int = 0 6 | 7 | def this(name: String, age: Int) { 8 | this() 9 | this.name =name 10 | this.age = age 11 | } 12 | 13 | def getName: String = this.name 14 | 15 | def getAge: Int = this.age 16 | 17 | override def toString: String = "User(" + name + ", " + age + ")" 18 | } 19 | 
-------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/beans/Zipcode.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.beans 2 | 3 | import scala.beans.BeanProperty 4 | 5 | class Zipcode { 6 | 7 | @BeanProperty 8 | var RecordNumber = -1 9 | @BeanProperty 10 | var Zipcode="" 11 | @BeanProperty 12 | var ZipCodeType="" 13 | @BeanProperty 14 | var City="" 15 | @BeanProperty 16 | var State="" 17 | @BeanProperty 18 | var LocationType="" 19 | @BeanProperty 20 | var Lat="" 21 | @BeanProperty 22 | var Long="" 23 | @BeanProperty 24 | var Xaxis="" 25 | @BeanProperty 26 | var Yaxis="" 27 | @BeanProperty 28 | var Zaxis="" 29 | @BeanProperty 30 | var WorldRegion="" 31 | @BeanProperty 32 | var Country="" 33 | @BeanProperty 34 | var LocationText="" 35 | @BeanProperty 36 | var Location="" 37 | @BeanProperty 38 | var Decommisioned="" 39 | } 40 | 41 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/ArrayToColumn.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.types.{ArrayType, StringType, StructType} 4 | import org.apache.spark.sql.{Row, SparkSession} 5 | 6 | object ArrayToColumn extends App { 7 | 8 | val spark = SparkSession.builder().appName("SparkByExamples.com") 9 | .master("local[1]") 10 | .getOrCreate() 11 | 12 | val arrayData = Seq( 13 | Row("James",List("Java","Scala","C++")), 14 | Row("Michael",List("Spark","Java","C++")), 15 | Row("Robert",List("CSharp","VB","")) 16 | ) 17 | 18 | val arraySchema = new StructType().add("name",StringType) 19 | .add("subjects",ArrayType(StringType)) 20 | 21 | val arrayDF = spark.createDataFrame(spark.sparkContext.parallelize(arrayData),arraySchema) 22 | arrayDF.printSchema() 23 | arrayDF.show() 24 | 25 | // val arrayDFColumn = df.select( 26 | // df("name") +: (0 until 2).map(i => df("subjects")(i).alias(s"LanguagesKnown$i")): _* 27 | // ) 28 | // 29 | // arrayDFColumn.show(false) 30 | 31 | //How to convert Array of Array to column 32 | val arrayArrayData = Seq( 33 | Row("James",List(List("Java","Scala","C++"),List("Spark","Java"))), 34 | Row("Michael",List(List("Spark","Java","C++"),List("Spark","Java"))), 35 | Row("Robert",List(List("CSharp","VB"),List("Spark","Python"))) 36 | ) 37 | 38 | val arrayArraySchema = new StructType().add("name",StringType) 39 | .add("subjects",ArrayType(ArrayType(StringType))) 40 | 41 | val df = spark.createDataFrame(spark.sparkContext.parallelize(arrayArrayData),arrayArraySchema) 42 | df.printSchema() 43 | df.show() 44 | 45 | val df2 = df.select( 46 | df("name") +: (0 until 2).map(i => df("subjects")(i).alias(s"LanguagesKnown$i")): _* 47 | ) 48 | 49 | df2.show(false) 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/AvroToJson.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{SaveMode, SparkSession} 4 | 5 | object AvroToJson extends App { 6 | 7 | val spark: SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExample") 10 | .getOrCreate() 11 | 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | //read avro file 15 | val df = 
spark.read.format("avro") 16 | .load("src/main/resources/zipcodes.avro") 17 | df.show() 18 | df.printSchema() 19 | 20 | //convert to json 21 | df.write.mode(SaveMode.Overwrite) 22 | .json("/tmp/json/zipcodes.json") 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/AvroToParquet.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{SaveMode, SparkSession} 4 | 5 | object AvroToParquet extends App { 6 | 7 | val spark: SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExample") 10 | .getOrCreate() 11 | 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | //read avro file 15 | val df = spark.read.format("avro") 16 | .load("src/main/resources/zipcodes.avro") 17 | df.show() 18 | df.printSchema() 19 | 20 | //convert to parquet 21 | df.write.mode(SaveMode.Overwrite) 22 | .parquet("/tmp/parquet/zipcodes.parquet") 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/BroadcastExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object BroadcastExample extends App{ 6 | 7 | val spark = SparkSession.builder() 8 | .appName("SparkByExamples.com") 9 | .master("local") 10 | .getOrCreate() 11 | 12 | val states = Map(("NY","New York"),("CA","California"),("FL","Florida")) 13 | val countries = Map(("USA","United States of America"),("IN","India")) 14 | 15 | val broadcastStates = spark.sparkContext.broadcast(states) 16 | val broadcastCountries = spark.sparkContext.broadcast(countries) 17 | 18 | val data = Seq(("James","Smith","USA","CA"), 19 | ("Michael","Rose","USA","NY"), 20 | ("Robert","Williams","USA","CA"), 21 | ("Maria","Jones","USA","FL") 22 | ) 23 | 24 | val columns = Seq("firstname","lastname","country","state") 25 | import spark.sqlContext.implicits._ 26 | val df = data.toDF(columns:_*) 27 | 28 | val df2 = df.map(row=>{ 29 | val country = row.getString(2) 30 | val state = row.getString(3) 31 | 32 | val fullCountry = broadcastCountries.value.get(country).get 33 | val fullState = broadcastStates.value.get(state).get 34 | (row.getString(0),row.getString(1),fullCountry,fullState) 35 | }).toDF(columns:_*) 36 | 37 | df2.show(false) 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/CaseClassSparkSchema.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.Encoders 4 | import org.apache.spark.sql.types.StructType 5 | 6 | object CaseClassSparkSchema extends App{ 7 | 8 | case class Name(first:String,last:String,middle:String) 9 | case class Employee(fullName:Name,age:Integer,gender:String) 10 | 11 | val encoderSchema = Encoders.product[Employee].schema 12 | encoderSchema.printTreeString() 13 | 14 | import org.apache.spark.sql.catalyst.ScalaReflection 15 | val schema = ScalaReflection.schemaFor[Employee].dataType.asInstanceOf[StructType] 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/CastColumnType.scala: 
-------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.types._ 5 | import org.apache.spark.sql.functions._ 6 | 7 | import org.apache.spark.sql.{Row, SparkSession} 8 | import org.apache.spark.sql.types._ 9 | import org.apache.spark.sql.functions._ 10 | 11 | object CastColumnType extends App { 12 | val spark: SparkSession = SparkSession.builder() 13 | .master("local[1]") 14 | .appName("SparkByExamples.com") 15 | .getOrCreate() 16 | 17 | val simpleData = Seq(Row("James", 34, "2006-01-01", "true", "M", 3000.60), 18 | Row("Michael", 33, "1980-01-10", "true", "F", 3300.80), 19 | Row("Robert", 37, "06-01-1992", "false", "M", 5000.50) 20 | ) 21 | 22 | val simpleSchema = StructType(Array( 23 | StructField("firstName", StringType, true), 24 | StructField("age", IntegerType, true), 25 | StructField("jobStartDate", StringType, true), 26 | StructField("isGraduated", StringType, true), 27 | StructField("gender", StringType, true), 28 | StructField("salary", DoubleType, true) 29 | )) 30 | 31 | val df = spark.createDataFrame( 32 | spark.sparkContext.parallelize(simpleData), simpleSchema) 33 | df.printSchema() 34 | df.show(false) 35 | 36 | val df2 = df.withColumn("age", col("age").cast(StringType)) 37 | .withColumn("isGraduated", col("isGraduated").cast(BooleanType)) 38 | .withColumn("jobStartDate", col("jobStartDate").cast(DateType)) 39 | df2.printSchema() 40 | 41 | val df3 = df2.selectExpr("cast(age as int) age", 42 | "cast(isGraduated as string) isGraduated", 43 | "cast(jobStartDate as string) jobStartDate") 44 | df3.printSchema() 45 | df3.show(false) 46 | 47 | df3.createOrReplaceTempView("CastExample") 48 | val df4 = spark.sql("SELECT STRING(age),BOOLEAN(isGraduated), " + 49 | "DATE(jobStartDate) from CastExample") 50 | df4.printSchema() 51 | df4.show(false) 52 | 53 | 54 | val cast_df = df.select(df.columns.map { 55 | case column@"age" => 56 | col(column).cast("String").as(column) 57 | case column@"salary" => 58 | col(column).cast("String").as(column) 59 | case column => 60 | col(column) 61 | }: _*) 62 | 63 | cast_df.printSchema() 64 | 65 | } -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/ColumnTruncate.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import com.sparkbyexamples.spark.SQLContextExample.spark 4 | import org.apache.log4j.lf5.LogLevel 5 | import org.apache.spark.sql.SparkSession 6 | 7 | object ColumnTruncate extends App { 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | 15 | 16 | import spark.implicits._ 17 | val columns = Seq("Seqno","Quote") 18 | val data = Seq(("1", "Be the change that you wish to see in the world"), 19 | ("2", "Everyone thinks of changing the world, but no one thinks of changing himself."), 20 | ("3", "The purpose of our lives is to be happy.")) 21 | val df = data.toDF(columns:_*) 22 | df.show(false) 23 | 24 | 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/CreateDataFrame.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import 
org.apache.spark.sql.types.{StringType, StructField, StructType} 4 | import org.apache.spark.sql.{Row, SparkSession} 5 | 6 | object CreateDataFrame { 7 | 8 | def main(args:Array[String]):Unit={ 9 | 10 | val spark:SparkSession = SparkSession.builder() 11 | .master("local[1]").appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | import spark.implicits._ 15 | val columns = Seq("language","users_count") 16 | val data = Seq(("Java", "20000"), ("Python", "100000"), ("Scala", "3000")) 17 | val rdd = spark.sparkContext.parallelize(data) 18 | 19 | 20 | //From RDD (USING toDF()) 21 | val dfFromRDD1 = rdd.toDF("language","users") 22 | dfFromRDD1.printSchema() 23 | //From RDD (USING createDataFrame) 24 | val dfFromRDD2 = spark.createDataFrame(rdd).toDF(columns:_*) 25 | dfFromRDD2.printSchema() 26 | //From RDD (USING createDataFrame and Adding schema using StructType) 27 | //convert RDD[T] to RDD[Row] 28 | val schema = StructType( Array(StructField("language", StringType, true), 29 | StructField("language", StringType, true))) 30 | 31 | val rowRDD = rdd.map(attributes => Row(attributes._1, attributes._2)) 32 | val dfFromRDD3 = spark.createDataFrame(rowRDD,schema) 33 | 34 | 35 | //From Data (USING toDF()) 36 | val dfFromData1 = data.toDF() 37 | 38 | //From Data (USING createDataFrame) 39 | var dfFromData2 = spark.createDataFrame(data).toDF(columns:_*) 40 | 41 | //From Data (USING createDataFrame and Adding schema using StructType) 42 | import scala.collection.JavaConversions._ 43 | val rowData = data 44 | .map(attributes => Row(attributes._1, attributes._2)) 45 | var dfFromData3 = spark.createDataFrame(rowData,schema) 46 | 47 | //From Data (USING createDataFrame and Adding bean class) 48 | //To-DO 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/CreateEmptyDataFrameExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} 5 | 6 | object CreateEmptyDataFrameExample extends App { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | import spark.implicits._ 13 | 14 | 15 | val schema = StructType( 16 | StructField("firstName", StringType, true) :: 17 | StructField("lastName", IntegerType, false) :: 18 | StructField("middleName", IntegerType, false) :: Nil) 19 | 20 | val colSeq = Seq("firstName","lastName","middleName") 21 | 22 | case class Name(firstName: String, lastName: String, middleName:String) 23 | 24 | // Create empty dataframe using StructType schema 25 | val df = spark.createDataFrame(spark.sparkContext 26 | .emptyRDD[Row], schema) 27 | 28 | // Using implicit encoder 29 | Seq.empty[(String,String,String)].toDF(colSeq:_*) 30 | 31 | //Using case class 32 | 33 | Seq.empty[Name].toDF().printSchema() 34 | 35 | //Using emptyDataFrame 36 | spark.emptyDataFrame 37 | 38 | 39 | //Using emptyDataset 40 | 41 | 42 | } 43 | 44 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/CreateEmptyDatasetExample.scala: -------------------------------------------------------------------------------- 1 | 2 | package com.sparkbyexamples.spark.dataframe 3 | 4 | import org.apache.spark.sql.SparkSession 5 | import 
org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} 6 | 7 | object CreateEmptyDatasetExample extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | spark.sparkContext.setLogLevel("ERROR"); 14 | import spark.implicits._ 15 | 16 | val schema = StructType( 17 | StructField("firstName", StringType, true) :: 18 | StructField("lastName", IntegerType, false) :: 19 | StructField("middleName", IntegerType, false) :: Nil) 20 | 21 | val colSeq = Seq("firstName","lastName","middleName") 22 | 23 | case class Name(firstName: String, lastName: String, middleName:String) 24 | case class Empty() 25 | val ds0 = spark.emptyDataset[Empty] 26 | ds0.printSchema() 27 | 28 | val ds1=spark.emptyDataset[Name] 29 | ds1.printSchema() 30 | 31 | val ds2 = spark.createDataset(Seq.empty[Name]) 32 | ds2.printSchema() 33 | 34 | val ds4=spark.createDataset(spark.sparkContext.emptyRDD[Name]) 35 | ds4.printSchema() 36 | 37 | val ds3=spark.createDataset(Seq.empty[(String,String,String)]) 38 | ds3.printSchema() 39 | val ds5=Seq.empty[(String,String,String)].toDS() 40 | ds5.printSchema() 41 | 42 | val ds6=Seq.empty[Name].toDS() 43 | ds6.printSchema() 44 | } -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/CsvToAvroParquetJson.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{SaveMode, SparkSession} 4 | 5 | object CsvToAvroParquetJson extends App { 6 | 7 | val spark: SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExample") 10 | .getOrCreate() 11 | 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | //read csv with options 15 | val df = spark.read.options(Map("inferSchema"->"true","delimiter"->",","header"->"true")) 16 | .csv("src/main/resources/zipcodes.csv") 17 | df.show() 18 | df.printSchema() 19 | 20 | //convert to avro 21 | df.write.format("avro").mode(SaveMode.Overwrite) 22 | .save("/tmp/avro/zipcodes.avro") 23 | 24 | //convert to avro by partition 25 | df.write.partitionBy("State","Zipcode") 26 | .format("avro") 27 | .mode(SaveMode.Overwrite) 28 | .save("/tmp/avro/zipcodes_partition.avro") 29 | 30 | //convert to parquet 31 | df.write.mode(SaveMode.Overwrite).parquet("/tmp/parquet/zipcodes.parquet") 32 | 33 | //convert to csv 34 | df.write.mode(SaveMode.Overwrite).json("/tmp/json/zipcodes.json") 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/DataFrameFromCSVFile.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{SaveMode, SparkSession} 4 | 5 | object DataFrameFromCSVFile { 6 | 7 | def main(args:Array[String]):Unit= { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | spark.sparkContext.setLogLevel("ERROR") 15 | 16 | //spark read csv file 17 | val df = spark.read.csv("src/main/resources/zipcodes.csv") 18 | df.show() 19 | df.printSchema() 20 | 21 | //read csv with options 22 | val df2 = spark.read.options(Map("inferSchema"->"true","delimiter"->",","header"->"true")).csv("src/main/resources/zipcodes.csv") 23 | df2.show() 24 | df2.printSchema() 25 | 26 | 
//read with custom schema 27 | import org.apache.spark.sql.types._ 28 | val schema = new StructType() 29 | .add("RecordNumber",IntegerType,true) 30 | .add("Zipcode",IntegerType,true) 31 | .add("ZipCodeType",StringType,true) 32 | .add("City",StringType,true) 33 | .add("State",StringType,true) 34 | .add("LocationType",StringType,true) 35 | .add("Lat",DoubleType,true) 36 | .add("Long",DoubleType,true) 37 | .add("Xaxis",DoubleType,true) 38 | .add("Yaxis",DoubleType,true) 39 | .add("Zaxis",DoubleType,true) 40 | .add("WorldRegion",StringType,true) 41 | .add("Country",StringType,true) 42 | .add("LocationText",StringType,true) 43 | .add("Location",StringType,true) 44 | .add("Decommisioned",BooleanType,true) 45 | .add("TaxReturnsFiled",IntegerType,true) 46 | .add("EstimatedPopulation",IntegerType,true) 47 | .add("TotalWages",IntegerType,true) 48 | .add("Notes",StringType,true) 49 | 50 | //Write dataframe back to csv file 51 | val df_with_schema = spark.read.format("csv") 52 | .option("header", "true") 53 | .schema(schema) 54 | .load("src/main/resources/zipcodes.csv") 55 | 56 | df_with_schema.printSchema() 57 | df_with_schema.show(false) 58 | 59 | 60 | //Write a csv file 61 | df_with_schema.write.mode(SaveMode.Overwrite) 62 | .csv("c:/tmp/spark_output/zipcodes") 63 | 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/DataFrameWithSimpleDSL.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | 5 | object DataFrameWithSimpleDSL { 6 | 7 | def main(args:Array[String]):Unit= { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | val filePath = "C://000_Projects/opt/BigData/zipcodes.csv" 15 | 16 | var df:DataFrame = spark.read.option("header","true").csv(filePath) 17 | df.printSchema() 18 | 19 | // Where 20 | df.select("*").where(df("RecordNumber") < 10).show() 21 | //Filter 22 | df.filter(df("State")==="PR").select("State").show() 23 | //Distinct 24 | df.select(df("State")).distinct().show() 25 | //Count 26 | println("Number of records"+df.count()) 27 | 28 | //When Otherwise 29 | //df.select(df("State"), case df("State") when "PR" then "PR123" 30 | 31 | // where with and and or conditions 32 | df.where(df("State") === "PR" && df("City").contains("DEL")).show() 33 | 34 | //Order or Sort by 35 | df.orderBy(df("RecordNumber").desc, df("State").asc).show() 36 | 37 | 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/FilterExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.types.{ArrayType, StringType, StructType} 5 | import org.apache.spark.sql.functions.array_contains 6 | object FilterExample extends App{ 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | val arrayStructureData = Seq( 16 | Row(Row("James","","Smith"),List("Java","Scala","C++"),"OH","M"), 17 | Row(Row("Anna","Rose",""),List("Spark","Java","C++"),"NY","F"), 18 | 
Row(Row("Julia","","Williams"),List("CSharp","VB"),"OH","F"), 19 | Row(Row("Maria","Anne","Jones"),List("CSharp","VB"),"NY","M"), 20 | Row(Row("Jen","Mary","Brown"),List("CSharp","VB"),"NY","M"), 21 | Row(Row("Mike","Mary","Williams"),List("Python","VB"),"OH","M") 22 | ) 23 | 24 | val arrayStructureSchema = new StructType() 25 | .add("name",new StructType() 26 | .add("firstname",StringType) 27 | .add("middlename",StringType) 28 | .add("lastname",StringType)) 29 | .add("languages", ArrayType(StringType)) 30 | .add("state", StringType) 31 | .add("gender", StringType) 32 | 33 | val df = spark.createDataFrame( 34 | spark.sparkContext.parallelize(arrayStructureData),arrayStructureSchema) 35 | df.printSchema() 36 | df.show() 37 | 38 | //Condition 39 | df.filter(df("state") === "OH") 40 | .show(false) 41 | 42 | //SQL Expression 43 | df.filter("gender == 'M'") 44 | .show(false) 45 | 46 | //multiple condition 47 | df.filter(df("state") === "OH" && df("gender") === "M") 48 | .show(false) 49 | 50 | //Array condition 51 | df.filter(array_contains(df("languages"),"Java")) 52 | .show(false) 53 | 54 | //Struct condition 55 | df.filter(df("name.lastname") === "Williams") 56 | .show(false) 57 | 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/FilterNullRowsExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{SparkSession} 4 | import org.apache.spark.sql.functions.col 5 | 6 | object FilterNullRowsExample extends App{ 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | 13 | spark.sparkContext.setLogLevel("ERROR") 14 | val data = Seq( 15 | ("James",null,"M"), 16 | ("Anna","NY","F"), 17 | ("Julia",null,null) 18 | ) 19 | import spark.implicits._ 20 | val columns = Seq("name","state","gender") 21 | val df = data.toDF(columns:_*) 22 | 23 | df.printSchema() 24 | df.show() 25 | 26 | df.filter("state is NULL").show(false) 27 | df.filter(df("state").isNull).show(false) 28 | df.filter(col("state").isNull).show(false) 29 | 30 | df.filter("state is not NULL").show(false) 31 | df.filter("NOT state is NULL").show(false) 32 | df.filter(df("state").isNotNull).show(false) 33 | 34 | df.filter("state is NULL AND gender is NULL").show(false) 35 | df.filter(df("state").isNull && df("gender").isNull).show(false) 36 | 37 | df.createOrReplaceTempView("DATA") 38 | spark.sql("SELECT * FROM DATA where STATE IS NULL").show(false) 39 | spark.sql("SELECT * FROM DATA where STATE IS NULL AND GENDER IS NULL").show(false) 40 | spark.sql("SELECT * FROM DATA where STATE IS NOT NULL").show(false) 41 | 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/FromCSVFile2.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object FromCSVFile2 { 6 | 7 | def main(args:Array[String]):Unit= { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | val filePath="src/main/resources/stream.csv" 15 | 16 | val df3 = spark.read.option("header",true).csv("src/main/resources/zipcodes.csv") 17 | df3.show(false) 18 | 19 | 20 | val 
df = spark.read.options(Map("inferSchema"->"true","delimiter"->"|","header"->"true")).csv(filePath) 21 | 22 | val df2 = df.select("Gender", "BirthDate", "TotalCost", "TotalChildren", "ProductCategoryName") 23 | .filter("Gender is not null") 24 | .filter("BirthDate is not null") 25 | .filter("TotalChildren is not null") 26 | .filter("ProductCategoryName is not null") 27 | df2.show() 28 | 29 | df.select("Gender", "BirthDate", "TotalCost", "TotalChildren", "ProductCategoryName") 30 | .where(df("Gender").isNotNull && df("BirthDate").isNotNull && df("TotalChildren").isNotNull && df("ProductCategoryName").isNotNull ).show() 31 | 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/FromCSVMultiline.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object FromCSVMultiline extends App { 6 | 7 | val spark:SparkSession = SparkSession.builder() 8 | .master("local[3]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | 13 | val df = spark.read 14 | .option("header",true) 15 | .option("delimiter",",") 16 | .option("multiLine",true) 17 | .option("quote","\"") 18 | .csv("src/main/resources/address-multiline.csv") 19 | 20 | df.show(false) 21 | } 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/FromTextFile.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} 4 | 5 | object FromTextFile { 6 | 7 | def main(args:Array[String]):Unit= { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | //returns DataFrame 15 | val df:DataFrame = spark.read.text("src/main/resources/csv/text01.txt") 16 | df.printSchema() 17 | df.show(false) 18 | 19 | //converting to columns by splitting 20 | import spark.implicits._ 21 | val df2 = df.map(f=>{ 22 | val elements = f.getString(0).split(",") 23 | (elements(0),elements(1)) 24 | }) 25 | 26 | df2.printSchema() 27 | df2.show(false) 28 | 29 | // returns Dataset[String] 30 | val ds:Dataset[String] = spark.read.textFile("src/main/resources/csv/text01.txt") 31 | ds.printSchema() 32 | ds.show(false) 33 | 34 | //converting to columns by splitting 35 | import spark.implicits._ 36 | val ds2 = ds.map(f=> { 37 | val elements = f.split(",") 38 | (elements(0),elements(1)) 39 | }) 40 | 41 | ds2.printSchema() 42 | ds2.show(false) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/HandleNullExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object HandleNullExample extends App{ 6 | 7 | val spark: SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | val filePath="src/main/resources/small_zipcode.csv" 13 | 14 | val df = spark.read.options(Map("inferSchema"->"true","delimiter"->",","header"->"true")).csv(filePath) 15 | df.printSchema() 16 |
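//Editorial note (sketch, not in the original file): the na.fill calls below are type-scoped,
//fill(0) only touches numeric columns and fill("") only string columns. A single Map-based
//call can target specific columns of either type; the column names reused here are the same
//ones the calls further down in this file rely on.
//df.na.fill(Map("population" -> 0, "city" -> "unknown", "type" -> "")).show(false)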
df.show(false) 17 | 18 | df.na.fill(0) 19 | .show(false) 20 | 21 | df.na.fill(0,Array("population")) 22 | .show(false) 23 | 24 | df.na.fill("") 25 | .show(false) 26 | 27 | df.na.fill("unknown",Array("city")) 28 | .na.fill("",Array("type")) 29 | .show(false) 30 | 31 | // Array and map columns 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/JsonFromMultiline.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object JsonFromMultiline extends App { 6 | 7 | val spark: SparkSession = SparkSession.builder() 8 | .master("local[3]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | //read multiline json file 13 | val multiline_df = spark.read.option("multiline", "true") 14 | .json("src/main/resources/multiline-zipcode.json") 15 | multiline_df.printSchema() 16 | multiline_df.show(false) 17 | 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/JsonToAvroCsvParquet.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object JsonToAvroCsvParquet extends App { 6 | 7 | val spark: SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExample") 10 | .getOrCreate() 11 | 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | //read json file into dataframe 15 | val df = spark.read.json("src/main/resources/zipcodes.json") 16 | df.printSchema() 17 | df.show(false) 18 | 19 | //convert to avro 20 | df.write.format("avro").save("/tmp/avro/zipcodes.avro") 21 | 22 | //convert to avro by partition 23 | df.write.partitionBy("State","Zipcode") 24 | .format("avro").save("/tmp/avro/zipcodes_partition.avro") 25 | 26 | //convert to parquet 27 | df.write.parquet("/tmp/parquet/zipcodes.parquet") 28 | 29 | //convert to csv 30 | df.write.option("header","true").csv("/tmp/csv/zipcodes.csv") 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/ParquetExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object ParquetExample { 6 | 7 | def main(args:Array[String]):Unit= { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | val data = Seq(("James ","","Smith","36636","M",3000), 15 | ("Michael ","Rose","","40288","M",4000), 16 | ("Robert ","","Williams","42114","M",4000), 17 | ("Maria ","Anne","Jones","39192","F",4000), 18 | ("Jen","Mary","Brown","","F",-1) 19 | ) 20 | 21 | val columns = Seq("firstname","middlename","lastname","dob","gender","salary") 22 | import spark.sqlContext.implicits._ 23 | val df = data.toDF(columns:_*) 24 | 25 | df.show() 26 | df.printSchema() 27 | 28 | df.write 29 | .parquet("C:\\tmp\\output\\people.parquet") 30 | 31 | val parqDF = spark.read.parquet("C:\\tmp\\output\\people.parquet") 32 | parqDF.createOrReplaceTempView("ParquetTable") 33 | 34 | spark.sql("select * from ParquetTable where salary >= 4000").explain() 35 | val parkSQL = spark.sql("select * from 
ParquetTable where salary >= 4000 ") 36 | 37 | parkSQL.show() 38 | parkSQL.printSchema() 39 | 40 | df.write 41 | .partitionBy("gender","salary") 42 | .parquet("C:\\tmp\\output\\people2.parquet") 43 | 44 | val parqDF2 = spark.read.parquet("C:\\tmp\\output\\people2.parquet") 45 | parqDF2.createOrReplaceTempView("ParquetTable2") 46 | 47 | val df3 = spark.sql("select * from ParquetTable2 where gender='M' and salary >= 4000") 48 | df3.explain() 49 | df3.printSchema() 50 | df3.show() 51 | 52 | val parqDF3 = spark.read 53 | .parquet("C:\\tmp\\output\\people2.parquet\\gender=M") 54 | parqDF3.show() 55 | 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/ParquetToAvro.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{SaveMode, SparkSession} 4 | 5 | object ParquetToAvro extends App { 6 | 7 | val spark: SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExample") 10 | .getOrCreate() 11 | 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | //read parquet file 15 | val df = spark.read.format("parquet") 16 | .load("src/main/resources/zipcodes.parquet") 17 | df.show() 18 | df.printSchema() 19 | 20 | //convert to avro 21 | df.write.format("avro") 22 | .mode(SaveMode.Overwrite) 23 | .save("/tmp/avro/zipcodes.avro") 24 | 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/ParquetToCsv.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{SaveMode, SparkSession} 4 | 5 | object ParquetToCsv extends App { 6 | 7 | val spark: SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | //read parquet file 15 | val df = spark.read.format("parquet") 16 | .load("src/main/resources/zipcodes.parquet") 17 | df.show() 18 | df.printSchema() 19 | 20 | //convert to csv 21 | df.write.mode(SaveMode.Overwrite) 22 | .csv("/tmp/csv/zipcodes.csv") 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/ParquetToJson.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{SaveMode, SparkSession} 4 | 5 | object ParquetToJson extends App { 6 | 7 | val spark: SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | //read parquet file 15 | val df = spark.read.format("parquet") 16 | .load("src/main/resources/zipcodes.parquet") 17 | df.show() 18 | df.printSchema() 19 | 20 | //convert to json 21 | df.write.mode(SaveMode.Overwrite) 22 | .json("/tmp/json/zipcodes.json") 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/ReadJsonFromString.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | 
import org.apache.spark.sql.functions._ 5 | import org.apache.spark.sql.types._ 6 | 7 | object ReadJsonFromString extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | spark.sparkContext.setLogLevel("ERROR") 15 | 16 | //Read JSON string from text file 17 | val dfFromText:DataFrame = spark.read.text("src/main/resources/simple_zipcodes.txt") 18 | dfFromText.printSchema() 19 | 20 | val schema = new StructType() 21 | .add("Zipcode", StringType, true) 22 | .add("ZipCodeType", StringType, true) 23 | .add("City", StringType, true) 24 | .add("State", StringType, true) 25 | 26 | val dfJSON = dfFromText.withColumn("jsonData",from_json(col("value"),schema)) 27 | .select("jsonData.*") 28 | dfJSON.printSchema() 29 | dfJSON.show(false) 30 | 31 | //alternatively using select 32 | val dfJSON2 = dfFromText.select(from_json(col("value"), schema).as("jsonData")) 33 | .select("jsonData.*") 34 | 35 | //Read JSON string from CSV file 36 | val dfFromCSV:DataFrame = spark.read.option("header",true) 37 | .csv("src/main/resources/simple_zipcodes.csv") 38 | dfFromCSV.printSchema() 39 | dfFromCSV.show(false) 40 | 41 | val dfFromCSVJSON = dfFromCSV.select(col("Id"), 42 | from_json(col("JsonValue"),schema).as("jsonData")) 43 | .select("Id","jsonData.*") 44 | dfFromCSVJSON.printSchema() 45 | dfFromCSVJSON.show(false) 46 | 47 | //Read json from string 48 | import spark.implicits._ 49 | val jsonStr = """{"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR"}""" 50 | val df = spark.read.json(Seq(jsonStr).toDS()) 51 | df.show(false) 52 | 53 | // from RDD[String] 54 | // deprecated 55 | val rdd = spark.sparkContext.parallelize( 56 | """ {"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR"} """ :: Nil) 57 | val df2 = spark.read.json(rdd) 58 | df2.show() 59 | 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/RemoveNullRowsExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object RemoveNullRowsExample extends App{ 6 | 7 | val spark: SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | spark.sparkContext.setLogLevel("ERROR") 13 | val filePath="src/main/resources/small_zipcode.csv" 14 | 15 | val df = spark.read.options(Map("inferSchema"->"true","delimiter"->",","header"->"true")).csv(filePath) 16 | df.printSchema() 17 | df.show(false) 18 | 19 | df.na.drop().show(false) 20 | 21 | //all/any 22 | df.na.drop("any").show(false) 23 | 24 | df.na.drop(Seq("population","type")).show(false) 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/RenameColDataFrame.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType} 5 | import org.apache.spark.sql.functions.{col, _} 6 | 7 | object RenameColDataFrame { 8 | 9 | def main(args:Array[String]):Unit= { 10 | 11 | val spark: SparkSession = SparkSession.builder() 12 | .master("local[1]") 13 | .appName("SparkByExamples.com") 14 | 
.getOrCreate() 15 | 16 | val data = Seq(Row(Row("James ","","Smith"),"36636","M",3000), 17 | Row(Row("Michael ","Rose",""),"40288","M",4000), 18 | Row(Row("Robert ","","Williams"),"42114","M",4000), 19 | Row(Row("Maria ","Anne","Jones"),"39192","F",4000), 20 | Row(Row("Jen","Mary","Brown"),"","F",-1) 21 | ) 22 | 23 | val schema = new StructType() 24 | .add("name",new StructType() 25 | .add("firstname",StringType) 26 | .add("middlename",StringType) 27 | .add("lastname",StringType)) 28 | .add("dob",StringType) 29 | .add("gender",StringType) 30 | .add("salary",IntegerType) 31 | 32 | val df = spark.createDataFrame(spark.sparkContext.parallelize(data),schema) 33 | 34 | df.printSchema() 35 | 36 | df.withColumnRenamed("dob","DateOfBirth") 37 | .printSchema() 38 | 39 | val schema2 = new StructType() 40 | .add("fname",StringType) 41 | .add("middlename",StringType) 42 | .add("lname",StringType) 43 | 44 | df.select(col("name").cast(schema2), 45 | col("dob"), 46 | col("gender"), 47 | col("salary")) 48 | .printSchema() 49 | 50 | df.select(col("name.firstname").as("fname"), 51 | col("name.middlename").as("mname"), 52 | col("name.lastname").as("lname"), 53 | col("dob"),col("gender"),col("salary")) 54 | .printSchema() 55 | 56 | df.withColumnRenamed("name.firstname","fname") 57 | .withColumnRenamed("name.middlename","mname") 58 | .withColumnRenamed("name.lastname","lname") 59 | .drop("name") 60 | .printSchema() 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/SQLExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | object DataFrameWithSQL_ { 6 | 7 | def main(args:Array[String]):Unit= { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | val data = Seq(1,2,3) 15 | 16 | import spark.sqlContext.implicits._ 17 | 18 | val df = data.toDF("field1") 19 | 20 | df.createOrReplaceTempView("table1") 21 | 22 | val df2 = spark.sql("select tb1.field1 as field1,tb2.field1 as field2 from table1 tb1, table1 tb2 where tb1.field1 <> tb2.field1") 23 | df2.printSchema() 24 | df2.show(false) 25 | 26 | df2.createOrReplaceTempView("table2") 27 | 28 | val df3 = spark.sql("select distinct tb1.field1,tb1.field2 from table2 tb1, table2 tb2 where tb1.field1 == tb2.field2 and tb1.field2 == tb2.field1") 29 | 30 | df3.show(false) 31 | 32 | 33 | 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/SaveDataFrame.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | 5 | object SaveDataFrame { 6 | 7 | def main(args: Array[String]): Unit = { 8 | val spark: SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExample") 11 | .getOrCreate() 12 | 13 | val filePath = "C://000_Projects/opt/BigData/zipcodes.csv" 14 | 15 | var df:DataFrame = spark.read.option("header","true").csv(filePath) 16 | 17 | df.repartition(5).write.option("header","true").csv("c:/tmp/output/df1") 18 | } 19 | } 20 | -------------------------------------------------------------------------------- 
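//Editorial note on SaveDataFrame above (sketch, not part of the repo): repartition(5) writes
//five part-* files under c:/tmp/output/df1. A single CSV output needs one partition, for
//example the hypothetical variant below; SaveSingleFile.scala later in this listing shows
//the full merge-and-rename approach with the Hadoop FileSystem API.
//df.coalesce(1).write.option("header","true").csv("c:/tmp/output/df1_single")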
/src/main/scala/com/sparkbyexamples/spark/dataframe/SparkUDF.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.functions.udf 4 | import org.apache.spark.sql.functions.col 5 | import org.apache.spark.sql.{Row, SparkSession} 6 | 7 | object SparkUDF extends App{ 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | import spark.implicits._ 15 | val columns = Seq("Seqno","Quote") 16 | val data = Seq(("1", "Be the change that you wish to see in the world"), 17 | ("2", "Everyone thinks of changing the world, but no one thinks of changing himself."), 18 | ("3", "The purpose of our lives is to be happy.") 19 | 20 | ) 21 | val df = data.toDF(columns:_*) 22 | df.show(false) 23 | 24 | val convertCase = (str:String) => { 25 | val arr = str.split(" ") 26 | arr.map(f=> f.substring(0,1).toUpperCase + f.substring(1,f.length)).mkString(" ") 27 | } 28 | 29 | //Using with DataFrame 30 | val convertUDF = udf(convertCase) 31 | df.select(col("Seqno"), 32 | convertUDF(col("Quote")).as("Quote") ).show(false) 33 | 34 | // Using it on SQL 35 | spark.udf.register("convertUDF", convertCase) 36 | df.createOrReplaceTempView("QUOTE_TABLE") 37 | spark.sql("select Seqno, convertUDF(Quote) from QUOTE_TABLE").show(false) 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/UDFDataFrame.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | object UDFDataFrame { 6 | def main(args:Array[String]): Unit = { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local[3]") 10 | .appName("SparkByExample") 11 | .getOrCreate() 12 | 13 | val data = Seq(("2018/01/23",23),("2018/01/24",24),("2018/02/20",25)) 14 | 15 | import spark.sqlContext.implicits._ 16 | val df = data.toDF("date1","day") 17 | 18 | val replace: String => String = _.replace("/","-") 19 | import org.apache.spark.sql.functions.udf 20 | val replaceUDF = udf(replace) 21 | val minDate = df.agg(min($"date1")).collect()(0).get(0) 22 | 23 | val df2 = df.select("*").filter( to_date(replaceUDF($"date1")) > date_add(to_date(replaceUDF(lit(minDate))),7 )) 24 | df2.show() 25 | } 26 | 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/UnionExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object UnionExample extends App{ 6 | 7 | val spark: SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.implicits._ 15 | 16 | val simpleData = Seq(("James","Sales","NY",90000,34,10000), 17 | ("Michael","Sales","NY",86000,56,20000), 18 | ("Robert","Sales","CA",81000,30,23000), 19 | ("Maria","Finance","CA",90000,24,23000) 20 | ) 21 | val df = simpleData.toDF("employee_name","department","state","salary","age","bonus") 22 | df.printSchema() 23 | df.show() 24 | 25 | val simpleData2 = 
Seq(("James","Sales","NY",90000,34,10000), 26 | ("Maria","Finance","CA",90000,24,23000), 27 | ("Jen","Finance","NY",79000,53,15000), 28 | ("Jeff","Marketing","CA",80000,25,18000), 29 | ("Kumar","Marketing","NY",91000,50,21000) 30 | ) 31 | val df2 = simpleData2.toDF("employee_name","department","state","salary","age","bonus") 32 | df2.show(false) 33 | 34 | val df3 = df.union(df2) 35 | df3.show(false) 36 | df3.distinct().show(false) 37 | 38 | val df4 = df.unionAll(df2) 39 | df4.show(false) 40 | 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/WhereExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe 2 | 3 | import org.apache.spark.sql.functions.array_contains 4 | import org.apache.spark.sql.types.{ArrayType, StringType, StructType} 5 | import org.apache.spark.sql.{Row, SparkSession} 6 | 7 | object WhereExample extends App{ 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | spark.sparkContext.setLogLevel("ERROR") 15 | 16 | val arrayStructureData = Seq( 17 | Row(Row("James","","Smith"),List("Java","Scala","C++"),"OH","M"), 18 | Row(Row("Anna","Rose",""),List("Spark","Java","C++"),"NY","F"), 19 | Row(Row("Julia","","Williams"),List("CSharp","VB"),"OH","F"), 20 | Row(Row("Maria","Anne","Jones"),List("CSharp","VB"),"NY","M"), 21 | Row(Row("Jen","Mary","Brown"),List("CSharp","VB"),"NY","M"), 22 | Row(Row("Mike","Mary","Williams"),List("Python","VB"),"OH","M") 23 | ) 24 | 25 | val arrayStructureSchema = new StructType() 26 | .add("name",new StructType() 27 | .add("firstname",StringType) 28 | .add("middlename",StringType) 29 | .add("lastname",StringType)) 30 | .add("languages", ArrayType(StringType)) 31 | .add("state", StringType) 32 | .add("gender", StringType) 33 | 34 | val df = spark.createDataFrame( 35 | spark.sparkContext.parallelize(arrayStructureData),arrayStructureSchema) 36 | df.printSchema() 37 | df.show() 38 | 39 | //Condition 40 | df.where(df("state") === "OH") 41 | .show(false) 42 | 43 | //SQL Expression 44 | df.where("gender == 'M'") 45 | .show(false) 46 | 47 | //multiple condition 48 | df.where(df("state") === "OH" && df("gender") === "M") 49 | .show(false) 50 | 51 | //Array condition 52 | df.where(array_contains(df("languages"),"Java")) 53 | .show(false) 54 | 55 | //Struct condition 56 | df.where(df("name.lastname") === "Williams") 57 | .show(false) 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/examples/CacheExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.examples 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | object CacheExample extends App { 6 | 7 | val spark:SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | //read csv with options 13 | val df = spark.read.options(Map("inferSchema"->"true","delimiter"->",","header"->"true")) 14 | .csv("src/main/resources/zipcodes.csv") 15 | 16 | val df2 = df.where(col("State") === "PR").cache() 17 | df2.show(false) 18 | 19 | println(df2.count()) 20 | 21 | val df3 = df2.where(col("Zipcode") === 704) 22 | 23 | 24 | println(df2.count()) 25 | 26 
| } 27 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/examples/CastStringToInt.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.examples 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object CastStringToInt extends App { 6 | 7 | val spark = SparkSession.builder 8 | .master("local[1]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | val simpleData = Seq(("James",34,"true","M","3000.6089"), 13 | ("Michael",33,"true","F","3300.8067"), 14 | ("Robert",37,"false","M","5000.5034") 15 | ) 16 | 17 | import spark.implicits._ 18 | val df = simpleData.toDF("firstname","age","isGraduated","gender","salary") 19 | df.printSchema() 20 | 21 | import org.apache.spark.sql.functions.col 22 | import org.apache.spark.sql.types.IntegerType 23 | // Convert String to Integer Type 24 | val df2= df.withColumn("salary",col("salary").cast(IntegerType)) 25 | df2.printSchema() 26 | df2.show() 27 | 28 | df.withColumn("salary",col("salary").cast("int")).printSchema() 29 | df.withColumn("salary",col("salary").cast("integer")).printSchema() 30 | 31 | // Using select 32 | df.select(col("salary").cast("int").as("salary")).printSchema() 33 | 34 | //Using selectExpr() 35 | df.selectExpr("cast(salary as int) salary","isGraduated").printSchema() 36 | df.selectExpr("INT(salary)","isGraduated").printSchema() 37 | 38 | //Using with spark.sql() 39 | df.createOrReplaceTempView("CastExample") 40 | spark.sql("SELECT INT(salary),BOOLEAN(isGraduated),gender from CastExample").printSchema() 41 | spark.sql("SELECT cast(salary as int) salary, BOOLEAN(isGraduated),gender from CastExample").printSchema() 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/examples/CollectExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.examples 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType} 5 | 6 | object CollectExample extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | 13 | val data = Seq(Row(Row("James ","","Smith"),"36636","M",3000), 14 | Row(Row("Michael ","Rose",""),"40288","M",4000), 15 | Row(Row("Robert ","","Williams"),"42114","M",4000), 16 | Row(Row("Maria ","Anne","Jones"),"39192","F",4000), 17 | Row(Row("Jen","Mary","Brown"),"","F",-1) 18 | ) 19 | 20 | val schema = new StructType() 21 | .add("name",new StructType() 22 | .add("firstname",StringType) 23 | .add("middlename",StringType) 24 | .add("lastname",StringType)) 25 | .add("id",StringType) 26 | .add("gender",StringType) 27 | .add("salary",IntegerType) 28 | 29 | val df = spark.createDataFrame(spark.sparkContext.parallelize(data),schema) 30 | df.printSchema() 31 | df.show(false) 32 | 33 | val colList = df.collectAsList() 34 | val colData = df.collect() 35 | 36 | colData.foreach(row=> 37 | { 38 | val salary = row.getInt(3)//Index starts from zero 39 | println(salary) 40 | }) 41 | 42 | //Retrieving data from Struct column 43 | colData.foreach(row=> 44 | { 45 | val salary = row.getInt(3) 46 | val fullName:Row = row.getStruct(0) //Index starts from zero 47 | val firstName = fullName.getString(0)//In struct row, again index starts 
from zero 48 | val middleName = fullName.get(1).toString 49 | val lastName = fullName.getAs[String]("lastname") 50 | println(firstName+","+middleName+","+lastName+","+salary) 51 | }) 52 | 53 | } 54 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/examples/DataFrameComplex.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.examples 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.types._ 5 | 6 | object DataFrameComplex extends App { 7 | 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local[5]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | val structureData = Seq( 15 | Row(Row("James","","Smith"),"36636","NewYork",3100, List("Java","Scala"),Map("hair"->"black","eye"->"brown")), 16 | Row(Row("Michael","Rose",""),"40288","California",4300,List("Python","PHP"),Map("hair"->"black","eye"->"brown")), 17 | Row(Row("Robert","","Williams"),"42114","Florida",1400,List("C++","C#"),Map("hair"->"black","eye"->"brown")), 18 | Row(Row("Maria","Anne","Jones"),"39192","Florida",5500,List("Python","Scala"),Map("hair"->"black","eye"->"brown")), 19 | Row(Row("Jen","Mary","Brown"),"34561","NewYork",3000,List("R","Scala"),Map("hair"->"black","eye"->"brown")) 20 | ) 21 | 22 | val structureSchema = new StructType() 23 | .add("name",new StructType() 24 | .add("firstname",StringType) 25 | .add("middlename",StringType) 26 | .add("lastname",StringType)) 27 | .add("id",StringType) 28 | .add("location",StringType) 29 | .add("salary",IntegerType) 30 | .add("languagesKnown",ArrayType(StringType)) 31 | .add("properties",MapType(StringType,StringType)) 32 | 33 | 34 | val df2 = spark.createDataFrame(spark.sparkContext.parallelize(structureData),structureSchema) 35 | df2.printSchema() 36 | df2.show(false) 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/examples/DataFrameEmptyCheck.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.examples 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object DataFrameEmptyCheck extends App { 6 | 7 | val spark:SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExample") 10 | .getOrCreate() 11 | 12 | val df = spark.emptyDataFrame 13 | 14 | println(df.isEmpty) 15 | println(df.rdd.isEmpty()) 16 | println(df.head(1).isEmpty) //head() with no arguments would throw NoSuchElementException on an empty DataFrame 17 | println() 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/examples/DropColumn.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.examples 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType} 5 | import org.apache.spark.sql.functions.col 6 | object DropColumn extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local[5]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | 13 | val data = Seq( 14 | Row("James","","Smith","36636","NewYork",3100), 15 | Row("Michael","Rose","","40288","California",4300), 16 | Row("Robert","","Williams","42114","Florida",1400), 17 | Row("Maria","Anne","Jones","39192","Florida",5500),
18 | Row("Jen","Mary","Brown","34561","NewYork",3000) 19 | ) 20 | 21 | val schema = new StructType() 22 | .add("firstname",StringType) 23 | .add("middlename",StringType) 24 | .add("lastname",StringType) 25 | .add("id",StringType) 26 | .add("location",StringType) 27 | .add("salary",IntegerType) 28 | 29 | val df = spark.createDataFrame( 30 | spark.sparkContext.parallelize(data),schema) 31 | df.printSchema() 32 | df.show(false) 33 | 34 | df.drop(df("firstname")) 35 | .printSchema() 36 | 37 | df.drop(col("firstname")) 38 | .printSchema() 39 | 40 | val df2 = df.drop("firstname") 41 | df2.printSchema() 42 | 43 | df.drop("firstname","middlename","lastname") 44 | .printSchema() 45 | 46 | val cols = Seq("firstname","middlename","lastname") 47 | df.drop(cols:_*) 48 | .printSchema() 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/examples/ForEachExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.examples 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object ForEachExample extends App { 6 | 7 | val spark: SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | val data = Seq(("Banana",1000,"USA"), ("Carrots",1500,"USA"), ("Beans",1600,"USA"), 13 | ("Orange",2000,"USA"),("Orange",2000,"USA"),("Banana",400,"China"), 14 | ("Carrots",1200,"China"),("Beans",1500,"China")) 15 | 16 | //DataFrame 17 | val df = spark.createDataFrame(data).toDF("Product","Amount","Country") 18 | df.foreach(f=> println(f)) 19 | 20 | val longAcc = spark.sparkContext.longAccumulator("SumAccumulator") 21 | df.foreach(f=> { 22 | longAcc.add(f.getInt(1)) 23 | }) 24 | println("Accumulator value:"+longAcc.value) 25 | //rdd 26 | val rdd = spark.sparkContext.parallelize(Seq(1,2,3,4,5,6,7,8,9)) 27 | rdd.foreach(print) 28 | 29 | //rdd accumulator 30 | val rdd2 = spark.sparkContext.parallelize(Seq(1,2,3,4,5,6,7,8,9)) 31 | val longAcc2 = spark.sparkContext.longAccumulator("SumAccumulator2") 32 | rdd .foreach(f=> { 33 | longAcc2.add(f) 34 | }) 35 | println("Accumulator value:"+longAcc2.value) 36 | } -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/examples/ForEachPartExample.scala: -------------------------------------------------------------------------------- 1 | //package com.sparkbyexamples.spark.dataframe.examples 2 | // 3 | //import org.apache.spark.sql.SparkSession 4 | // 5 | //object ForEachPartExample extends App { 6 | // 7 | // val spark: SparkSession = SparkSession.builder() 8 | // .master("local[1]") 9 | // .appName("SparkByExamples.com") 10 | // .getOrCreate() 11 | // 12 | // val data = Seq(("Banana",1000,"USA"), ("Carrots",1500,"USA"), ("Beans",1600,"USA"), 13 | // ("Orange",2000,"USA"),("Orange",2000,"USA"),("Banana",400,"China"), 14 | // ("Carrots",1200,"China"),("Beans",1500,"China")) 15 | // 16 | // // foreachPartition DataFrame 17 | // val df = spark.createDataFrame(data).toDF("Product","Amount","Country") 18 | // df.foreachPartition(partition => { 19 | // //Initialize any database connection 20 | // partition.foreach(fun=>{ 21 | // //apply the function 22 | // }) 23 | // }) 24 | // 25 | // //rdd 26 | // val rdd = spark.sparkContext.parallelize(Seq(1,2,3,4,5,6,7,8,9)) 27 | // rdd.foreachPartition(partition => { 28 | // //Initialize any database connection 29 | // 
partition.foreach(fun=>{ 30 | // //apply the function 31 | // }) 32 | // }) 33 | //} -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/examples/MapFlatMap.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.examples 2 | 3 | import com.sparkbyexamples.spark.rdd.functions.FlatMapExample.spark 4 | import org.apache.spark.sql.{Row, SparkSession} 5 | import org.apache.spark.sql.types.{ArrayType, StringType, StructType} 6 | 7 | object MapFlatMap extends App{ 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | val data = Seq("Project Gutenberg’s", 15 | "Alice’s Adventures in Wonderland", 16 | "Project Gutenberg’s", 17 | "Adventures in Wonderland", 18 | "Project Gutenberg’s") 19 | 20 | import spark.sqlContext.implicits._ 21 | val df = data.toDF("data") 22 | df.show(false) 23 | 24 | //Map Transformation 25 | val mapDF=df.map(fun=> { 26 | fun.getString(0).split(" ") 27 | }) 28 | mapDF.show(false) 29 | 30 | //Flat Map Transformation 31 | val flatMapDF=df.flatMap(fun=> 32 | { 33 | fun.getString(0).split(" ") 34 | }) 35 | flatMapDF.show() 36 | 37 | val arrayStructureData = Seq( 38 | Row("James,,Smith",List("Java","Scala","C++"),"CA"), 39 | Row("Michael,Rose,",List("Spark","Java","C++"),"NJ"), 40 | Row("Robert,,Williams",List("CSharp","VB","R"),"NV") 41 | ) 42 | 43 | val arrayStructureSchema = new StructType() 44 | .add("name",StringType) 45 | .add("languagesAtSchool", ArrayType(StringType)) 46 | .add("currentState", StringType) 47 | 48 | val df1 = spark.createDataFrame( 49 | spark.sparkContext.parallelize(arrayStructureData),arrayStructureSchema) 50 | 51 | 52 | //flatMap() Usage 53 | val df2=df1.flatMap(f => { 54 | val lang=f.getSeq[String](1) 55 | lang.map((f.getString(0),_,f.getString(2))) 56 | }) 57 | 58 | val df3=df2.toDF("Name","language","State") 59 | df3.show(false) 60 | 61 | 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/examples/MapTransformation.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.examples 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType,ArrayType,MapType} 5 | 6 | object MapTransformation extends App{ 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local[5]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | 13 | val structureData = Seq( 14 | Row("James","","Smith","36636","NewYork",3100), 15 | Row("Michael","Rose","","40288","California",4300), 16 | Row("Robert","","Williams","42114","Florida",1400), 17 | Row("Maria","Anne","Jones","39192","Florida",5500), 18 | Row("Jen","Mary","Brown","34561","NewYork",3000) 19 | ) 20 | 21 | val structureSchema = new StructType() 22 | .add("firstname",StringType) 23 | .add("middlename",StringType) 24 | .add("lastname",StringType) 25 | .add("id",StringType) 26 | .add("location",StringType) 27 | .add("salary",IntegerType) 28 | 29 | val df2 = spark.createDataFrame( 30 | spark.sparkContext.parallelize(structureData),structureSchema) 31 | df2.printSchema() 32 | df2.show(false) 33 | 34 | import spark.implicits._ 35 | val util = new Util() 36 | val df3 = df2.map(row=>{ 37 | 38 | val fullName = 
util.combine(row.getString(0),row.getString(1),row.getString(2)) 39 | (fullName, row.getString(3),row.getInt(5)) 40 | }) 41 | val df3Map = df3.toDF("fullName","id","salary") 42 | 43 | df3Map.printSchema() 44 | df3Map.show(false) 45 | 46 | val df4 = df2.mapPartitions(iterator => { 47 | val util = new Util() 48 | val res = iterator.map(row=>{ 49 | val fullName = util.combine(row.getString(0),row.getString(1),row.getString(2)) 50 | (fullName, row.getString(3),row.getInt(5)) 51 | }) 52 | res 53 | }) 54 | val df4part = df4.toDF("fullName","id","salary") 55 | df4part.printSchema() 56 | df4part.show(false) 57 | 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/examples/RangePartition.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.examples 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.col 5 | object RangePartition extends App{ 6 | 7 | val spark: SparkSession = SparkSession.builder() .master("local[1]") 8 | .appName("SparkByExamples.com") 9 | .getOrCreate() 10 | 11 | /** 12 | * Simple using columns list 13 | */ 14 | val data = Seq((1,10),(2,20),(3,10),(4,20),(5,10), 15 | (6,30),(7,50),(8,50),(9,50),(10,30), 16 | (11,10),(12,10),(13,40),(14,40),(15,40), 17 | (16,40),(17,50),(18,10),(19,40),(20,40) 18 | ) 19 | 20 | import spark.sqlContext.implicits._ 21 | val dfRange = data.toDF("id","count") 22 | .repartitionByRange(5,col("count")) 23 | 24 | dfRange.write.option("header",true).csv("c:/tmp/range-partition") 25 | //dfRange.write.partitionBy() // left commented out: partitionBy() needs column names and a save action to do anything 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/examples/ReadORCFile.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.examples 2 | 3 | import org.apache.spark.sql.{SparkSession} 4 | 5 | object ReadORCFile extends App{ 6 | 7 | val spark: SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | val data =Seq(("James ","","Smith","36636","M",3000), 13 | ("Michael ","Rose","","40288","M",4000), 14 | ("Robert ","","Williams","42114","M",4000), 15 | ("Maria ","Anne","Jones","39192","F",4000), 16 | ("Jen","Mary","Brown","","F",-1)) 17 | val columns=Seq("firstname","middlename","lastname","dob","gender","salary") 18 | val df=spark.createDataFrame(data).toDF(columns:_*) 19 | 20 | df.write.mode("overwrite") 21 | .orc("/tmp/orc/data.orc") 22 | 23 | df.write.mode("overwrite") 24 | .option("compression","none") 25 | .orc("/tmp/orc/data-nocomp.orc") 26 | 27 | df.write.mode("overwrite") 28 | .option("compression","zlib") 29 | .orc("/tmp/orc/data-zlib.orc") 30 | 31 | val df2=spark.read.orc("/tmp/orc/data.orc") 32 | df2.show(false) 33 | 34 | df2.createOrReplaceTempView("ORCTable") 35 | val orcSQL = spark.sql("select firstname,dob from ORCTable where salary >= 4000 ") 36 | orcSQL.show(false) 37 | 38 | spark.sql("CREATE TEMPORARY VIEW PERSON USING orc OPTIONS (path \"/tmp/orc/data.orc\")") 39 | spark.sql("SELECT * FROM PERSON").show() 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/examples/RenameDeleteFile.scala: -------------------------------------------------------------------------------- 1 | package 
com.sparkbyexamples.spark.dataframe.examples 2 | 3 | import org.apache.hadoop.conf.Configuration 4 | import org.apache.hadoop.fs.{FileSystem, FileUtil, Path} 5 | import org.apache.spark.sql.SparkSession 6 | 7 | object RenameDeleteFile extends App{ 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local[3]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | //Create Hadoop Configuration from Spark 15 | val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) 16 | 17 | val srcPath=new Path("/tmp/address_rename_merged.csv") 18 | val destPath= new Path("/tmp/address_merged.csv") 19 | 20 | //Rename a File 21 | if(fs.exists(srcPath) && fs.isFile(srcPath)) 22 | fs.rename(srcPath,destPath) 23 | 24 | //Alternatively, you can also create Hadoop configuration 25 | val hadoopConfig = new Configuration() 26 | val hdfs = FileSystem.get(hadoopConfig) 27 | if(hdfs.isFile(srcPath)) 28 | hdfs.rename(srcPath,destPath) 29 | 30 | 31 | //Delete a File 32 | if(hdfs.isDirectory(srcPath)) 33 | hdfs.delete(new Path("/tmp/.address_merged2.csv.crc"),true) 34 | 35 | import scala.sys.process._ 36 | //Delete a File 37 | s"hdfs dfs -rm /tmp/.address_merged2.csv.crc" ! 38 | 39 | //Delete a Directory 40 | s"hdfs dfs -rm -r /tmp/.address_merged2.csv.crc" ! 41 | 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/examples/RepartitionExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.examples 2 | 3 | import org.apache.spark.sql.{SaveMode, SparkSession} 4 | 5 | object RepartitionExample extends App { 6 | 7 | val spark:SparkSession = SparkSession.builder() 8 | .master("local[5]") 9 | .appName("SparkByExamples.com") 10 | // .config("spark.default.parallelism", "500") 11 | .getOrCreate() 12 | 13 | // spark.sqlContext.setConf("spark.default.parallelism", "500") 14 | //spark.conf.set("spark.default.parallelism", "500") 15 | val df = spark.range(0,20) 16 | df.printSchema() 17 | println(df.rdd.partitions.length) 18 | 19 | df.write.mode(SaveMode.Overwrite).csv("c:/tmp/df-partition.csv") 20 | 21 | val df2 = df.repartition(10) 22 | 23 | println(df2.rdd.partitions.length) 24 | 25 | val df3 = df.coalesce(2) 26 | println(df3.rdd.partitions.length) 27 | 28 | val df4 = df.groupBy("id").count() 29 | println(df4.rdd.getNumPartitions) 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/examples/SaveSingleFile.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.examples 2 | 3 | import java.io.File 4 | 5 | import org.apache.hadoop.conf.Configuration 6 | import org.apache.hadoop.fs.{FileSystem, FileUtil, Path} 7 | import org.apache.spark.sql.{SaveMode, SparkSession} 8 | 9 | object SaveSingleFile extends App{ 10 | 11 | val spark:SparkSession = SparkSession.builder() 12 | .master("local[3]") 13 | .appName("SparkByExamples.com") 14 | .getOrCreate() 15 | 16 | val df = spark.read.option("header",true) 17 | .csv("src/main/resources/address.csv") 18 | df.repartition(1) 19 | .write.mode(SaveMode.Overwrite).csv("/tmp/address") 20 | 21 | 22 | val hadoopConfig = new Configuration() 23 | val hdfs = FileSystem.get(hadoopConfig) 24 | 25 | val srcPath=new Path("/tmp/address") 26 | val destPath= new Path("/tmp/address_merged.csv") 27 | val 
srcFile=FileUtil.listFiles(new File("c:/tmp/address")) 28 | .filterNot(f=>f.getPath.endsWith(".csv"))(0) 29 | //Copy the CSV file outside of Directory and rename 30 | FileUtil.copy(srcFile,hdfs,destPath,true,hadoopConfig) 31 | //Remove Directory created by df.write() 32 | hdfs.delete(srcPath,true) 33 | //Removes CRC File 34 | hdfs.delete(new Path("/tmp/.address_merged.csv.crc"),true) 35 | 36 | // Merge Using Hadoop API 37 | df.repartition(1).write.mode(SaveMode.Overwrite) 38 | .csv("/tmp/address-tmp") 39 | val srcFilePath=new Path("/tmp/address-tmp") 40 | val destFilePath= new Path("/tmp/address_merged2.csv") 41 | FileUtil.copyMerge(hdfs, srcFilePath, hdfs, destFilePath, true, hadoopConfig, null) 42 | //Remove hidden CRC file if not needed. 43 | hdfs.delete(new Path("/tmp/.address_merged2.csv.crc"),true) 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/examples/SelectSelectExpr.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.examples 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.col 5 | object SelectSelectExpr extends App { 6 | 7 | val spark:SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | val data = Seq(("Java", "20000"), ("Python", "100000"), ("Scala", "3000")) 13 | val df = spark.createDataFrame(data).toDF("language","users_count") 14 | df.select("language","users_count").show() //Example 1 - select() takes column names only; expressions like "users_count as count" need selectExpr() 15 | df.select(df("language"),df("users_count").as("count")).show() //Example 2 16 | df.select(col("language"),col("users_count")).show() //Example 3 17 | //df.select("language",col("users_count")).show() //Example 3 18 | 19 | df.selectExpr("language","users_count as count").show() //Example 1 20 | //df.selectExpr(df("language"),df("users_count").as("count")).show() //Example 2 21 | //df.selectExpr(col("language"),col("users_count")).show() //Example 3 22 | 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/examples/ShuffleExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.examples 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object ShuffleExample extends App { 6 | 7 | val spark:SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | import spark.implicits._ 13 | 14 | val simpleData = Seq(("James","Sales","NY",90000,34,10000), 15 | ("Michael","Sales","NY",86000,56,20000), 16 | ("Robert","Sales","CA",81000,30,23000), 17 | ("Maria","Finance","CA",90000,24,23000), 18 | ("Raman","Finance","CA",99000,40,24000), 19 | ("Scott","Finance","NY",83000,36,19000), 20 | ("Jen","Finance","NY",79000,53,15000), 21 | ("Jeff","Marketing","CA",80000,25,18000), 22 | ("Kumar","Marketing","NY",91000,50,21000) 23 | ) 24 | val df = simpleData.toDF("employee_name","department","state","salary","age","bonus") 25 | 26 | val df2 = df.groupBy("state").count() 27 | df2.show(false) 28 | println(df2.rdd.getNumPartitions) 29 | 30 | 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/examples/Util.scala: 
-------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.examples 2 | 3 | class Util extends Serializable { 4 | def combine(fname:String,mname:String,lname:String):String = { 5 | fname+","+mname+","+lname 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/AddColumn.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{col, lit, typedLit, when} 5 | import org.apache.spark.sql.types.IntegerType 6 | 7 | object AddColumn extends App { 8 | 9 | val spark = SparkSession.builder() 10 | .appName("SparkByExamples.com") 11 | .master("local") 12 | .getOrCreate() 13 | 14 | import spark.sqlContext.implicits._ 15 | 16 | val data = Seq(("111",50000),("222",60000),("333",40000)) 17 | val df = data.toDF("EmpId","Salary") 18 | df.show(false) 19 | 20 | //Derive a new column from existing 21 | df.withColumn("CopiedColumn",df("salary")* -1) 22 | .show(false) 23 | 24 | //Using select 25 | df.select($"EmpId",$"Salary", ($"salary"* -1).as("CopiedColumn") ) 26 | .show(false) 27 | 28 | //Adding a literal 29 | val df2 = df.select(col("EmpId"),col("Salary"),lit("1").as("lit_value1")) 30 | df2.show() 31 | 32 | val df3 = df2.withColumn("lit_value2", 33 | when(col("Salary") >=40000 && col("Salary") <= 50000, lit("100").cast(IntegerType)) 34 | .otherwise(lit("200").cast(IntegerType)) 35 | ) 36 | df3.show(false) 37 | 38 | //Adding a list column 39 | val df4 = df3.withColumn("typedLit_seq",typedLit(Seq(1, 2, 3))) 40 | .withColumn("typedLit_map",typedLit(Map("a" -> 1, "b" -> 2))) 41 | .withColumn("typedLit_struct",typedLit(("a", 2, 1.0))) 42 | 43 | df4.printSchema() 44 | df4.show() 45 | 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/PivotExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | object PivotExample { 6 | def main(args:Array[String]):Unit= { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | 13 | spark.sparkContext.setLogLevel("ERROR") 14 | val data = Seq(("Banana",1000,"USA"), ("Carrots",1500,"USA"), ("Beans",1600,"USA"), 15 | ("Orange",2000,"USA"),("Orange",2000,"USA"),("Banana",400,"China"), 16 | ("Carrots",1200,"China"),("Beans",1500,"China"),("Orange",4000,"China"), 17 | ("Banana",2000,"Canada"),("Carrots",2000,"Canada"),("Beans",2000,"Mexico")) 18 | 19 | 20 | 21 | import spark.sqlContext.implicits._ 22 | val df = data.toDF("Product","Amount","Country") 23 | df.show() 24 | 25 | //pivot 26 | val pivotDF = df.groupBy("Product","Country") 27 | .sum("Amount") 28 | .groupBy("Product") 29 | .pivot("Country") 30 | .sum("sum(Amount)") 31 | pivotDF.show() 32 | 33 | val countries = Seq("USA","China","Canada","Mexico") 34 | val pivotDF2 = df.groupBy("Product").pivot("Country", countries).sum("Amount") 35 | pivotDF2.show() 36 | 37 | //unpivot 38 | // val unPivotDF = pivotDF.select($"Product",expr("stack(3, 'Canada', Canada, 'China', China, 'Mexico', Mexico) " + 39 | // 
"as (Country,Total)")) //.where("Total is not null") 40 | // unPivotDF.show() 41 | 42 | df.select(collect_list("")) 43 | 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/RemoveDuplicate.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions 2 | 3 | object RemoveDuplicate extends App { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/SortExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | object SortExample extends App { 6 | 7 | val spark: SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.implicits._ 15 | 16 | val simpleData = Seq(("James","Sales","NY",90000,34,10000), 17 | ("Michael","Sales","NY",86000,56,20000), 18 | ("Robert","Sales","CA",81000,30,23000), 19 | ("Maria","Finance","CA",90000,24,23000), 20 | ("Raman","Finance","CA",99000,40,24000), 21 | ("Scott","Finance","NY",83000,36,19000), 22 | ("Jen","Finance","NY",79000,53,15000), 23 | ("Jeff","Marketing","CA",80000,25,18000), 24 | ("Kumar","Marketing","NY",91000,50,21000) 25 | ) 26 | val df = simpleData.toDF("employee_name","department","state","salary","age","bonus") 27 | df.printSchema() 28 | df.show() 29 | 30 | df.sort("department","state").show(false) 31 | df.sort(col("department"),col("state")).show(false) 32 | 33 | df.orderBy("department","state").show(false) 34 | df.orderBy(col("department"),col("state")).show(false) 35 | 36 | df.sort(col("department").asc,col("state").asc).show(false) 37 | df.orderBy(col("department").asc,col("state").asc).show(false) 38 | 39 | df.sort(col("department").asc,col("state").desc).show(false) 40 | df.orderBy(col("department").asc,col("state").desc).show(false) 41 | 42 | df.select($"employee_name",asc("department"),desc("state"),$"salary",$"age",$"bonus").show(false) 43 | df.createOrReplaceTempView("EMP") 44 | spark.sql(" select employee_name,asc('department'),desc('state'),salary,age,bonus from EMP").show(false) 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/aggregate/DistinctCount.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.aggregate 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | 6 | object DistinctCount extends App { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | import spark.implicits._ 16 | 17 | val simpleData = Seq(("James", "Sales", 3000), 18 | ("Michael", "Sales", 4600), 19 | ("Robert", "Sales", 4100), 20 | ("Maria", "Finance", 3000), 21 | ("James", "Sales", 3000), 22 | ("Scott", "Finance", 3300), 23 | ("Jen", "Finance", 3900), 24 | ("Jeff", "Marketing", 3000), 25 | ("Kumar", "Marketing", 2000), 26 | ("Saif", "Sales", 4100) 27 | ) 28 | val df 
= simpleData.toDF("employee_name", "department", "salary") 29 | df.show() 30 | 31 | println("Distinct Count: " + df.distinct().count()) 32 | 33 | val df2 = df.select(countDistinct("department", "salary")) 34 | df2.show(false) 35 | println("Distinct Count of Department & Salary: "+df2.collect()(0)(0)) 36 | 37 | } -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/aggregate/SQLDistinct.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.aggregate 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | 6 | object SQLDistinct extends App { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | import spark.implicits._ 16 | 17 | val simpleData = Seq(("James", "Sales", 3000), 18 | ("Michael", "Sales", 4600), 19 | ("Robert", "Sales", 4100), 20 | ("Maria", "Finance", 3000), 21 | ("James", "Sales", 3000), 22 | ("Scott", "Finance", 3300), 23 | ("Jen", "Finance", 3900), 24 | ("Jeff", "Marketing", 3000), 25 | ("Kumar", "Marketing", 2000), 26 | ("Saif", "Sales", 4100) 27 | ) 28 | val df = simpleData.toDF("employee_name", "department", "salary") 29 | df.show() 30 | 31 | //Distinct all columns 32 | val distinctDF = df.distinct() 33 | println("Distinct count: "+distinctDF.count()) 34 | distinctDF.show(false) 35 | 36 | val df2 = df.dropDuplicates() 37 | println("Distinct count: "+df2.count()) 38 | df2.show(false) 39 | 40 | //Distinct using dropDuplicates 41 | val dropDisDF = df.dropDuplicates("department","salary") 42 | println("Distinct count of department & salary : "+dropDisDF.count()) 43 | dropDisDF.show(false) 44 | 45 | } -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/ArrayContainsExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.collection 2 | 3 | import org.apache.spark.sql.functions.{array_contains,col} 4 | import org.apache.spark.sql.types.{ArrayType, StringType, StructType} 5 | import org.apache.spark.sql.{Row, SparkSession} 6 | 7 | object ArrayContainsExample extends App { 8 | 9 | val spark = SparkSession.builder().appName("SparkByExamples.com") 10 | .master("local[1]") 11 | .getOrCreate() 12 | 13 | val data = Seq( 14 | Row("James,,Smith",List("Java","Scala","C++"),"CA"), 15 | Row("Michael,Rose,",List("Spark","Java","C++"),"NJ"), 16 | Row("Robert,,Williams",null,"NV") 17 | ) 18 | 19 | val schema = new StructType() 20 | .add("name",StringType) 21 | .add("languagesAtSchool", ArrayType(StringType)) 22 | .add("currentState", StringType) 23 | 24 | val df = spark.createDataFrame( 25 | spark.sparkContext.parallelize(data),schema) 26 | df.printSchema() 27 | df.show(false) 28 | 29 | val df2=df.withColumn("Java Present", 30 | array_contains(col("languagesAtSchool"),"Java")) 31 | df2.show(false) 32 | 33 | val df3=df.where(array_contains(col("languagesAtSchool"),"Java")) 34 | df3.show(false) 35 | } -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/ArrayOfArrayType.scala: 
-------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.collection 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.functions.{explode, flatten} 5 | import org.apache.spark.sql.types.{ArrayType, StringType, StructType} 6 | 7 | object ArrayOfArrayType extends App { 8 | 9 | val spark = SparkSession.builder().appName("SparkByExamples.com") 10 | .master("local[1]") 11 | .getOrCreate() 12 | 13 | val arrayArrayData = Seq( 14 | Row("James",List(List("Java","Scala","C++"),List("Spark","Java"))), 15 | Row("Michael",List(List("Spark","Java","C++"),List("Spark","Java"))), 16 | Row("Robert",List(List("CSharp","VB"),List("Spark","Python"))) 17 | ) 18 | 19 | val arrayArraySchema = new StructType().add("name",StringType) 20 | .add("subjects",ArrayType(ArrayType(StringType))) 21 | 22 | val df = spark.createDataFrame( 23 | spark.sparkContext.parallelize(arrayArrayData),arrayArraySchema) 24 | df.printSchema() 25 | df.show(false) 26 | 27 | import spark.implicits._ 28 | val df2 = df.select($"name",explode($"subjects")) 29 | 30 | 31 | df2.printSchema() 32 | df2.show(false) 33 | 34 | //Convert Array of Array into Single array 35 | df.select($"name",flatten($"subjects")).show(false) 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/ArrayOfMapType.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.collection 2 | 3 | 4 | import org.apache.spark.sql.{Row, SparkSession} 5 | import org.apache.spark.sql.functions.{explode} 6 | import org.apache.spark.sql.types._ 7 | 8 | object ArrayOfMapType extends App { 9 | val spark = SparkSession.builder().appName("SparkByExamples.com") 10 | .master("local[1]") 11 | .getOrCreate() 12 | 13 | val arrayMapSchema = new StructType().add("name",StringType) 14 | .add("properties", 15 | ArrayType(new MapType(StringType,StringType,true))) 16 | 17 | val arrayMapData = Seq( 18 | Row("James",List(Map("hair"->"black","eye"->"brown"), Map("height"->"5.9"))), 19 | Row("Michael",List(Map("hair"->"brown","eye"->"black"),Map("height"->"6"))), 20 | Row("Robert",List(Map("hair"->"red","eye"->"gray"),Map("height"->"6.3"))) 21 | ) 22 | 23 | val df = spark.createDataFrame( 24 | spark.sparkContext.parallelize(arrayMapData),arrayMapSchema) 25 | df.printSchema() 26 | df.show(false) 27 | 28 | import spark.implicits._ 29 | 30 | val df2 = df.select($"name",explode($"properties")) 31 | df2.printSchema() 32 | df2.show(false) 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/ArrayOfString.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.collection 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.types.{ArrayType, StringType, StructType} 5 | import org.apache.spark.sql.functions.{col,concat_ws} 6 | 7 | object ArrayOfString extends App{ 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | val arrayStructureData = Seq( 15 | Row("James,,Smith",List("Java","Scala","C++"),"CA"), 16 | Row("Michael,Rose,",List("Spark","Java","C++"),"NJ"), 17 | 
Row("Robert,,Williams",List("CSharp","VB"),"NV") 18 | ) 19 | 20 | val arrayStructureSchema = new StructType() 21 | .add("name",StringType) 22 | .add("languagesAtSchool", ArrayType(StringType)) 23 | .add("currentState", StringType) 24 | 25 | 26 | val df = spark.createDataFrame( 27 | spark.sparkContext.parallelize(arrayStructureData),arrayStructureSchema) 28 | df.printSchema() 29 | df.show() 30 | 31 | val df2 = df.withColumn("languagesAtSchool", 32 | concat_ws(",",col("languagesAtSchool"))) 33 | df2.printSchema() 34 | df2.show() 35 | 36 | import spark.implicits._ 37 | val df3 = df.map(f=>{ 38 | val name = f.getString(0) 39 | val lang = f.getList(1).toArray.mkString(",") 40 | (name,lang,f.getString(2)) 41 | }) 42 | 43 | df3.toDF("Name","Languages","currentState") 44 | .show(false) 45 | 46 | df.createOrReplaceTempView("ARRAY_STRING") 47 | spark.sql("select name, concat_ws(',',languagesAtSchool) as languagesAtSchool," + 48 | " currentState from ARRAY_STRING") 49 | .show(false) 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/ArrayOfStructType.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.collection 2 | 3 | import org.apache.spark.sql.functions._ 4 | import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType} 5 | import org.apache.spark.sql.{Row, SparkSession} 6 | 7 | object ArrayOfStructType extends App{ 8 | 9 | val spark = SparkSession.builder().appName("SparkByExamples.com") 10 | .master("local[1]") 11 | .getOrCreate() 12 | 13 | val arrayStructData = Seq( 14 | Row("James",List(Row("Java","XX",120),Row("Scala","XA",300))), 15 | Row("Michael",List(Row("Java","XY",200),Row("Scala","XB",500))), 16 | Row("Robert",List(Row("Java","XZ",400),Row("Scala","XC",250))), 17 | Row("Washington",null) 18 | ) 19 | 20 | val arrayStructSchema = new StructType().add("name",StringType) 21 | .add("booksIntersted",ArrayType(new StructType() 22 | .add("name",StringType) 23 | .add("author",StringType) 24 | .add("pages",IntegerType))) 25 | 26 | val df = spark.createDataFrame( 27 | spark.sparkContext.parallelize(arrayStructData),arrayStructSchema) 28 | df.printSchema() 29 | df.show(false) 30 | 31 | import spark.implicits._ 32 | val df2 = df.select($"name",explode($"booksIntersted")) 33 | df2.printSchema() 34 | df2.show(false) 35 | 36 | df2.groupBy($"name").agg(collect_list($"col").as("booksIntersted")) 37 | .show(false) 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/CollectListExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.collection 2 | 3 | import org.apache.spark.sql.functions._ 4 | import org.apache.spark.sql.types.{StringType, StructType} 5 | import org.apache.spark.sql.{Row, SparkSession} 6 | 7 | object CollectListExample extends App { 8 | 9 | val spark = SparkSession.builder().appName("SparkByExamples.com") 10 | .master("local[1]") 11 | .getOrCreate() 12 | 13 | val arrayStructData = Seq( 14 | Row("James", "Java"), Row("James", "C#"),Row("James", "Python"), 15 | Row("Michael", "Java"),Row("Michael", "PHP"),Row("Michael", "PHP"), 16 | Row("Robert", "Java"),Row("Robert", "Java"),Row("Robert", "Java"), 17 | Row("Washington", null) 18 | ) 19 | val 
arrayStructSchema = new StructType().add("name", StringType) 20 | .add("booksIntersted", StringType) 21 | 22 | val df = spark.createDataFrame( 23 | spark.sparkContext.parallelize(arrayStructData),arrayStructSchema) 24 | df.printSchema() 25 | df.show(false) 26 | 27 | val df2 = df.groupBy("name").agg(collect_list("booksIntersted") 28 | .as("booksIntersted")) 29 | df2.printSchema() 30 | df2.show(false) 31 | 32 | df.groupBy("name").agg(collect_set("booksIntersted") 33 | .as("booksIntersted")) 34 | .show(false) 35 | } -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/MapToColumn.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.collection 2 | 3 | import org.apache.spark.sql.functions._ 4 | import org.apache.spark.sql.{Row, SparkSession} 5 | import org.apache.spark.sql.types._ 6 | 7 | object MapToColumn extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | val arrayStructureData = Seq( 15 | Row("James",Map("hair"->"black","eye"->"brown")), 16 | Row("Michael",Map("hair"->"gray","eye"->"black")), 17 | Row("Robert",Map("hair"->"brown")) 18 | ) 19 | 20 | val mapType = DataTypes.createMapType(StringType,StringType) 21 | 22 | val arrayStructureSchema = new StructType() 23 | .add("name",StringType) 24 | .add("property", MapType(StringType,StringType)) 25 | 26 | val mapTypeDF = spark.createDataFrame( 27 | spark.sparkContext.parallelize(arrayStructureData),arrayStructureSchema) 28 | mapTypeDF.printSchema() 29 | mapTypeDF.show(false) 30 | 31 | mapTypeDF.select(col("name"), 32 | col("property").getItem("hair").as("hair_color"), 33 | col("property").getItem("eye").as("eye_color")) 34 | .show(false) 35 | 36 | import spark.implicits._ 37 | val keysDF = mapTypeDF.select(explode(map_keys($"property"))).distinct() 38 | val keys = keysDF.collect().map(f=>f.get(0)) 39 | val keyCols = keys.map(f=> col("property").getItem(f).as(f.toString)) 40 | mapTypeDF.select(col("name") +: keyCols:_*).show(false) 41 | } -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/MapTypeExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.collection 2 | import org.apache.spark.sql.functions.{col, explode, lit, map, map_concat, map_from_entries, map_keys, map_values} 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.types._ 5 | 6 | object MapTypeExample extends App { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | 13 | //Creating DF with MapType 14 | val arrayStructureData = Seq( 15 | Row("James",List(Row("Newark","NY"),Row("Brooklyn","NY")), 16 | Map("hair"->"black","eye"->"brown"), Map("height"->"5.9")), 17 | Row("Michael",List(Row("SanJose","CA"),Row("Sandiago","CA")), 18 | Map("hair"->"brown","eye"->"black"),Map("height"->"6")), 19 | Row("Robert",List(Row("LasVegas","NV")), 20 | Map("hair"->"red","eye"->"gray"),Map("height"->"6.3")), 21 | Row("Maria",null,Map("hair"->"blond","eye"->"red"), 22 | Map("height"->"5.6")), 23 | Row("Jen",List(Row("LAX","CA"),Row("Orange","CA")), 24 | 
Map("white"->"black","eye"->"black"),Map("height"->"5.2")) 25 | ) 26 | 27 | 28 | val mapType = DataTypes.createMapType(StringType,StringType) 29 | 30 | val arrayStructureSchema = new StructType() 31 | .add("name",StringType) 32 | .add("addresses", ArrayType(new StructType() 33 | .add("city",StringType) 34 | .add("state",StringType))) 35 | .add("properties", mapType) 36 | .add("secondProp", MapType(StringType,StringType)) 37 | 38 | val mapTypeDF = spark.createDataFrame( 39 | spark.sparkContext.parallelize(arrayStructureData),arrayStructureSchema) 40 | mapTypeDF.printSchema() 41 | mapTypeDF.show(false) 42 | 43 | mapTypeDF.select(col("name"),map_keys(col("properties"))).show(false) 44 | mapTypeDF.select(col("name"),map_values(col("properties"))).show(false) 45 | mapTypeDF.select(col("name"),map_concat(col("properties"),col("secondProp"))).show(false) 46 | 47 | 48 | 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/SliceArray.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.collection 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.functions.{array_join, col, slice, split} 5 | import org.apache.spark.sql.types.{ArrayType, StringType, StructType} 6 | 7 | object SliceArray extends App { 8 | 9 | 10 | val spark = SparkSession.builder() 11 | .appName("SparkByExamples.com") 12 | .master("local") 13 | .getOrCreate() 14 | 15 | val arrayStructureData = Seq( 16 | Row("James,,Smith",List("Java","Scala","C++","Pascal","Spark")), 17 | Row("Michael,Rose,",List("Spark","Java","C++","Scala","PHP")), 18 | Row("Robert,,Williams",List("CSharp","VB",".Net","C#.net","")) 19 | ) 20 | 21 | val arrayStructureSchema = new StructType() 22 | .add("name",StringType) 23 | .add("languagesAtSchool", ArrayType(StringType)) 24 | 25 | val df = spark.createDataFrame( 26 | spark.sparkContext.parallelize(arrayStructureData),arrayStructureSchema) 27 | df.show(false) 28 | df.printSchema() 29 | 30 | 31 | val splitDF2 = df.withColumn("languages", 32 | slice(col("languagesAtSchool"),2,3)) 33 | .drop("languagesAtSchool") 34 | splitDF2.printSchema() 35 | splitDF2.show(false) 36 | 37 | df.createOrReplaceTempView("PERSON") 38 | spark.sql("select name, slice(languagesAtSchool,2,3) as NameArray from PERSON") 39 | .show(false) 40 | 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/StringToArray.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.collection 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{col, split} 5 | 6 | object StringToArray extends App { 7 | 8 | val spark = SparkSession.builder() 9 | .appName("SparkByExamples.com") 10 | .master("local") 11 | .getOrCreate() 12 | 13 | val data = Seq(("James, A, Smith","2018","M",3000), 14 | ("Michael, Rose, Jones","2010","M",4000), 15 | ("Robert,K,Williams","2010","M",4000), 16 | ("Maria,Anne,Jones","2005","F",4000), 17 | ("Jen,Mary,Brown","2010","",-1) 18 | ) 19 | 20 | import spark.sqlContext.implicits._ 21 | val df = data.toDF("name","dob_year","gender","salary") 22 | df.printSchema() 23 | df.show(false) 24 | 25 | val df2 = df.select(split(col("name"),",").as("NameArray")) 26 | 
.drop("name") 27 | 28 | df2.printSchema() 29 | df2.show(false) 30 | 31 | df.createOrReplaceTempView("PERSON") 32 | spark.sql("select SPLIT(name,',') as NameArray from PERSON") 33 | .show(false) 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/AddTime.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | object AddTime extends App { 6 | 7 | val spark:SparkSession = SparkSession.builder() 8 | .master("local") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | spark.sparkContext.setLogLevel("ERROR") 12 | 13 | import spark.sqlContext.implicits._ 14 | 15 | spark.sql( "select current_timestamp," + 16 | "cast(current_timestamp as TIMESTAMP) + INTERVAL 2 hours as added_hours," + 17 | "cast(current_timestamp as TIMESTAMP) + INTERVAL 5 minutes as added_minutes," + 18 | "cast(current_timestamp as TIMESTAMP) + INTERVAL 55 seconds as added_seconds" 19 | ).show(false) 20 | 21 | 22 | val df = Seq(("2019-07-01 12:01:19.101"), 23 | ("2019-06-24 12:01:19.222"), 24 | ("2019-11-16 16:44:55.406"), 25 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp") 26 | 27 | 28 | df.createOrReplaceTempView("AddTimeExample") 29 | 30 | val df2 = spark.sql("select input_timestamp, " + 31 | "cast(input_timestamp as TIMESTAMP) + INTERVAL 2 hours as added_hours," + 32 | "cast(input_timestamp as TIMESTAMP) + INTERVAL 5 minutes as added_minutes," + 33 | "cast(input_timestamp as TIMESTAMP) + INTERVAL 55 seconds as added_seconds from AddTimeExample" 34 | ) 35 | df2.show(false) 36 | 37 | df.withColumn("added_hours",col("input_timestamp") + expr("INTERVAL 2 HOURS")) 38 | .withColumn("added_minutes",col("input_timestamp") + expr("INTERVAL 2 minutes")) 39 | .withColumn("added_seconds",col("input_timestamp") + expr("INTERVAL 2 seconds")) 40 | .show(false) 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/CurrentDateAndTime.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | 6 | object CurrentDateAndTime extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.sqlContext.implicits._ 15 | 16 | //Get current Date & Time 17 | val df = Seq((1)).toDF("seq") 18 | 19 | val curDate = df.withColumn("current_date",current_date().as("current_date")) 20 | .withColumn("current_timestamp",current_timestamp().as("current_timestamp")) 21 | curDate.show(false) 22 | 23 | 24 | curDate.select(date_format(col("current_timestamp"),"MM-dd-yyyy").as("date"), 25 | date_format(col("current_timestamp"),"HH:mm:ss.SSS").as("time"), 26 | date_format(col("current_date"), "MM-dd-yyyy").as("current_date_formateed")) 27 | .show(false) 28 | 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DateAddMonths.scala: 
-------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | import org.apache.spark.sql.types.IntegerType 6 | 7 | object DateAddMonths extends App { 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | import spark.sqlContext.implicits._ 16 | 17 | Seq(("2019-01-23"),("2019-06-24"),("2019-09-20")).toDF("date").select( 18 | col("date"), 19 | add_months(col("date"),3).as("add_months"), 20 | add_months(col("date"),-3).as("sub_months"), 21 | date_add(col("date"),4).as("date_add"), 22 | date_sub(col("date"),4).as("date_sub") 23 | ).show() 24 | 25 | Seq(("06-03-2009"),("07-24-2009")).toDF("date").select( 26 | col("Date"), 27 | add_months(to_date(col("Date"),"MM-dd-yyyy"),3).as("add_months"), 28 | add_months(to_date(col("Date"),"MM-dd-yyyy"),-3).as("add_months2"), 29 | date_add(to_date(col("Date"),"MM-dd-yyyy"),3).as("date_add"), 30 | date_add(to_date(col("Date"),"MM-dd-yyyy"),-3).as("date_add2"), 31 | date_sub(to_date(col("Date"),"MM-dd-yyyy"),3).as("date_sub") 32 | ).show() 33 | 34 | // Seq(("2019-01-23",1),("2019-06-24",2),("2019-09-20",3)).toDF("date","increment").select( 35 | // col("date"), 36 | // add_months(to_date(col("date"),"yyyy-MM-dd"),col("increment").cast(IntegerType).).as("date_inc") 37 | // ).show() 38 | 39 | Seq(("2019-01-23",1),("2019-06-24",2),("2019-09-20",3)) 40 | .toDF("date","increment") 41 | .select(col("date"),col("increment"), 42 | expr("add_months(to_date(date,'yyyy-MM-dd'),cast(increment as int))").as("inc_date")) 43 | .show() 44 | 45 | Seq(("2019-01-23",1),("2019-06-24",2),("2019-09-20",3)) 46 | .toDF("date","increment") 47 | .selectExpr("date","increment","add_months(to_date(date,'yyyy-MM-dd'),cast(increment as int)) as inc_date") 48 | .show() 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DateDiff.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.functions._ 4 | import org.apache.spark.sql.{DataFrame, SparkSession} 5 | 6 | object DateDiff extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.sqlContext.implicits._ 15 | 16 | //Difference between two dates in days 17 | Seq(("2019-07-01"),("2019-06-24"),("2019-08-24"),("2018-07-23")).toDF("date") 18 | .select( 19 | col("date"), 20 | current_date().as("current_date"), 21 | datediff(current_date(),col("date")).as("datediff") 22 | ).show() 23 | 24 | // Difference between two dates in Months and Years 25 | val df = Seq(("2019-07-01"),("2019-06-24"),("2019-08-24"),("2018-12-23"),("2018-07-20")) 26 | .toDF("startDate").select( 27 | col("startDate"),current_date().as("endDate") 28 | ) 29 | 30 | calculateDiff(df) 31 | 32 | //Difference between two dates when dates are not in Spark DateType format 'yyyy-MM-dd'. 
33 | //Note that when dates are not in Spark DateType format, these Spark functions return null 34 | //Hence, first convert the input dates to Spark DateType using to_date function 35 | val dfDate = Seq(("07-01-2019"),("06-24-2019"),("08-24-2019"),("12-23-2018"),("07-20-2018")) 36 | .toDF("startDate").select( 37 | to_date(col("startDate"),"MM-dd-yyyy").as("startDate"), 38 | current_date().as("endDate") 39 | ) 40 | 41 | calculateDiff(dfDate) 42 | 43 | def calculateDiff(df:DataFrame): Unit ={ 44 | df.withColumn("datesDiff", datediff(col("endDate"),col("startDate"))) 45 | .withColumn("monthsDiff", months_between( 46 | col("endDate"),col("startDate"))) 47 | .withColumn("monthsDiff_round",round(months_between( 48 | col("endDate"),col("startDate")),2)) 49 | .withColumn("yearsDiff",months_between( 50 | col("endDate"),col("startDate"),true).divide(12)) 51 | .withColumn("yearsDiff_round",round(months_between( 52 | col("endDate"),col("startDate"),true).divide(12),2)) 53 | .show() 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DateInMilli.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{unix_timestamp, _} 5 | import org.apache.spark.sql.types.{DateType, LongType, TimestampType} 6 | 7 | object DateInMilli extends App{ 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | import spark.sqlContext.implicits._ 16 | 17 | val df = Seq(1).toDF("seq").select( 18 | current_date().as("current_date"), 19 | unix_timestamp().as("unix_timestamp_seconds") 20 | ) 21 | 22 | df.printSchema() 23 | df.show(false) 24 | 25 | //Convert unix seconds to date 26 | df.select( 27 | to_date(col("unix_timestamp_seconds").cast(TimestampType)).as("current_date") 28 | ).show(false) 29 | 30 | //convert date to unix seconds; the pattern must use MM (month) and dd (day of month), not mm/DD 31 | df.select( 32 | unix_timestamp(col("current_date")).as("unix_seconds"), 33 | unix_timestamp(lit("12-21-2019"),"MM-dd-yyyy").as("unix_seconds2") 34 | ).show(false) 35 | 36 | } -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DateLastDay.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{col, last_day, to_date} 5 | 6 | object DateLastDay extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.sqlContext.implicits._ 15 | 16 | Seq(("2019-01-01"),("2020-02-24"),("2019-02-24"), 17 | ("2019-05-01"),("2018-03-24"),("2007-12-19")) 18 | .toDF("Date").select( 19 | col("Date"), 20 | last_day(col("Date")).as("last_day") 21 | ).show() 22 | 23 | 24 | Seq(("06-03-2009"),("07-24-2009")).toDF("Date").select( 25 | col("Date"), 26 | last_day(to_date(col("Date"),"MM-dd-yyyy")).as("last_day") 27 | ).show() 28 | 29 | } 30 | --------------------------------------------------------------------------------
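DateInMilli above stops at whole epoch seconds from unix_timestamp(); a minimal sketch of deriving epoch milliseconds with the same kind of local session (EpochMillisSketch is a hypothetical name, not a file in this repository):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

// Hypothetical sketch, not part of the repository.
object EpochMillisSketch extends App {
  val spark: SparkSession = SparkSession.builder()
    .master("local")
    .appName("SparkByExamples.com")
    .getOrCreate()
  spark.sparkContext.setLogLevel("ERROR")

  import spark.sqlContext.implicits._

  // unix_timestamp() returns whole seconds, so multiplying by 1000 yields epoch
  // milliseconds; sub-second precision of the clock reading is not recovered.
  Seq(1).toDF("seq").select(
    current_timestamp().as("current_time"),
    (unix_timestamp() * 1000).as("epoch_milliseconds")
  ).show(false)
}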
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DateToString.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import com.sparkbyexamples.spark.dataframe.functions.datetime.DateFormat.spark 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions.{current_date, current_timestamp, date_format} 6 | 7 | object DateToString extends App { 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | import spark.sqlContext.implicits._ 16 | 17 | Seq(1).toDF("seq").select( 18 | current_date().as("current_date"), 19 | date_format(current_timestamp(),"yyyy MM dd").as("yyyy MM dd"), 20 | date_format(current_timestamp(),"MM/dd/yyyy hh:mm").as("MM/dd/yyyy"), 21 | date_format(current_timestamp(),"yyyy MMM dd").as("yyyy MMMM dd"), 22 | date_format(current_timestamp(),"yyyy MMMM dd E").as("yyyy MMMM dd E") 23 | ).show(false) 24 | 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DayAndWeekOfYear.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{col, date_format, to_timestamp} 5 | 6 | 7 | object DayAndWeekOfYear extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | import spark.sqlContext.implicits._ 16 | 17 | val df = Seq(("2019-01-03 12:01:19.000"), 18 | ("2019-02-01 12:01:19.000"), 19 | ("2019-7-16 16:44:55.406"), 20 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp") 21 | 22 | //Get Day of the Year example 23 | df.withColumn("input_timestamp", 24 | to_timestamp(col("input_timestamp"))) 25 | .withColumn("day_of_year", date_format(col("input_timestamp"), "D")) 26 | .show(false) 27 | 28 | //Get Week of the Year example 29 | df.withColumn("input_timestamp", 30 | to_timestamp(col("input_timestamp"))) 31 | .withColumn("week_of_year", date_format(col("input_timestamp"), "w")) 32 | .show(false) 33 | } -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DayWeekAndWeekMonth.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{col, to_timestamp,date_format} 5 | 6 | 7 | object DayWeekAndWeekMonth extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | import spark.sqlContext.implicits._ 16 | 17 | val df = Seq(("2019-07-01 12:01:19.000"), 18 | ("2019-06-24 12:01:19.000"), 19 | ("2019-11-16 16:44:55.406"), 20 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp") 21 | 22 | df.withColumn("input_timestamp", 23 | to_timestamp(col("input_timestamp"))) 24 | .withColumn("week_day_number", date_format(col("input_timestamp"), "u")) 25 | 
.withColumn("week_day_abb", date_format(col("input_timestamp"), "E")) 26 | .show(false) 27 | 28 | df.withColumn("input_timestamp", 29 | to_timestamp(col("input_timestamp"))) 30 | .withColumn("week_day_full", date_format(col("input_timestamp"), "EEEE")) 31 | .withColumn("week_of_month", date_format(col("input_timestamp"), "W")) 32 | .show(false) 33 | } -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/GetTimeFromTimestamp.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{col,hour,minute,second} 5 | 6 | object GetTimeFromTimestamp extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.sqlContext.implicits._ 15 | 16 | val df = Seq(("2019-07-01 12:01:19.000"), 17 | ("2019-06-24 12:01:19.000"), 18 | ("2019-11-16 16:44:55.406"), 19 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp") 20 | 21 | 22 | df.withColumn("hour", hour(col("input_timestamp"))) 23 | .withColumn("minute", minute(col("input_timestamp"))) 24 | .withColumn("second", second(col("input_timestamp"))) 25 | .show(false) 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/Spark3Date.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{to_timestamp, _} 5 | 6 | object Spark3Date extends App{ 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.sqlContext.implicits._ 15 | 16 | val df3 = Seq(1).toDF("seq").select( 17 | current_date().as("current_date"), 18 | current_timestamp().as("current_time"), 19 | unix_timestamp().as("epoch_time_seconds") 20 | ) 21 | // 22 | // val data2 = df.collect() 23 | // data2.foreach(println) 24 | // 25 | // val df2 = Seq(("06-03-2009","07-01-2009 12:01:19.000")).toDF("Date","Time").select( 26 | // col("Date"),col("Time"), 27 | // to_date(col("Date"),"MM-dd-yyyy").as("to_date"), 28 | // to_timestamp(col("Time"),"MM-dd-yyyy HH:mm:ss.SSS").as("to_timestamp") 29 | // ) 30 | // df2.show(false) 31 | // 32 | // val df3 = Seq(("06-03-1500","07-01-1500 12:01:19.000")).toDF("Date","Time").select( 33 | // col("Date"),col("Time"), 34 | // to_date(col("Date"),"MM-dd-yyyy").as("to_date"), 35 | // to_timestamp(col("Time"),"MM-dd-yyyy HH:mm:ss.SSS").as("to_timestamp") 36 | // 37 | // ) 38 | val df=spark.range(1,10000).toDF("num") 39 | println("Before re-partition :"+df.rdd.getNumPartitions) 40 | df.createOrReplaceTempView("RANGE_TABLE") 41 | val df2=spark.sql("SELECT /*+ REPARTITION(20) */ * FROM RANGE_TABLE") 42 | println("After re-partition :"+df2.rdd.getNumPartitions) 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/StringToDate.scala: 
-------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{col, to_date} 5 | 6 | object StringToDate extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.sqlContext.implicits._ 15 | 16 | Seq(("06-03-2009"),("07-24-2009")).toDF("Date").select( 17 | col("Date"), 18 | to_date(col("Date"),"MM-dd-yyyy").as("to_date") 19 | ).show() 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/StringToTimestamp.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | import org.apache.spark.sql.types.LongType 6 | 7 | object StringToTimestamp extends App { 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | import spark.sqlContext.implicits._ 16 | 17 | //String to timestamps 18 | val df = Seq(("2019-07-01 12:01:19.000"), 19 | ("2019-06-24 12:01:19.000"), 20 | ("2019-11-16 16:44:55.406"), 21 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp") 22 | 23 | df.withColumn("datetype_timestamp", 24 | to_timestamp(col("input_timestamp"))) 25 | .printSchema() 26 | 27 | 28 | //Convert string to timestamp when input string has just time 29 | val df1 = Seq(("12:01:19.345"), 30 | ("12:01:20.567"), 31 | ("16:02:44.406"), 32 | ("16:50:59.406")) 33 | .toDF("input_timestamp") 34 | 35 | df1.withColumn("datetype_timestamp", 36 | to_timestamp(col("input_timestamp"),"HH:mm:ss.SSS")) 37 | .show(false) 38 | 39 | //when dates are not in Spark DateType format 'yyyy-MM-dd HH:mm:ss.SSS'. 
40 | //Note that when dates are not in Spark DateType format, all Spark functions returns null 41 | //Hence, first convert the input dates to Spark DateType using to_timestamp function 42 | val dfDate = Seq(("07-01-2019 12 01 19 406"), 43 | ("06-24-2019 12 01 19 406"), 44 | ("11-16-2019 16 44 55 406"), 45 | ("11-16-2019 16 50 59 406")).toDF("input_timestamp") 46 | 47 | dfDate.withColumn("datetype_timestamp", 48 | to_timestamp(col("input_timestamp"),"MM-dd-yyyy HH mm ss SSS")) 49 | .show(false) 50 | 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/TimeInMilli.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | import org.apache.spark.sql.types.{LongType, TimestampType} 6 | 7 | object TimeInMilli extends App{ 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | import spark.sqlContext.implicits._ 16 | 17 | val df = Seq(1).toDF("seq").select( 18 | current_timestamp().as("current_time"), 19 | unix_timestamp().as("epoch_time_seconds") 20 | ) 21 | 22 | df.printSchema() 23 | df.show(false) 24 | 25 | //Convert epoch_time to timestamp 26 | df.select( 27 | col("epoch_time_seconds").cast(TimestampType).as("current_time"), 28 | col("epoch_time_seconds").cast("timestamp").as("current_time2") 29 | ).show(false) 30 | 31 | //convert timestamp to Unix epoch time 32 | df.select( 33 | unix_timestamp(col("current_time")).as("unix_epoch_time"), 34 | col("current_time").cast(LongType).as("unix_epoch_time2") 35 | ).show(false) 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/TimestampToDate.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{col, to_date, to_timestamp} 5 | import org.apache.spark.sql.types.DateType 6 | 7 | object TimestampToDate extends App { 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | import spark.sqlContext.implicits._ 16 | 17 | val df = Seq(("2019-07-01 12:01:19.000"), 18 | ("2019-06-24 12:01:19.000"), 19 | ("2019-11-16 16:44:55.406"), 20 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp") 21 | 22 | //Timestamp String to DateType 23 | df.withColumn("datetype", 24 | to_date(col("input_timestamp"),"yyyy-MM-dd")) 25 | .show(false) 26 | 27 | //Timestamp type to DateType 28 | df.withColumn("ts",to_timestamp(col("input_timestamp"))) 29 | .withColumn("datetype",to_date(col("ts"))) 30 | .show(false) 31 | 32 | //Using Cast 33 | df.withColumn("ts",to_timestamp(col("input_timestamp"))) 34 | .withColumn("datetype",col("ts").cast(DateType)) 35 | .show(false) 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/TimestampToString.scala: 
-------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{current_date, current_timestamp, date_format} 5 | 6 | object TimestampToString extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.sqlContext.implicits._ 15 | 16 | import spark.sqlContext.implicits._ 17 | Seq(1).toDF("seq").select( 18 | current_timestamp().as("current_date"), 19 | date_format(current_timestamp(),"yyyy MM dd").as("yyyy MM dd"), 20 | date_format(current_timestamp(),"MM/dd/yyyy hh:mm").as("MM/dd/yyyy"), 21 | date_format(current_timestamp(),"yyyy MMM dd").as("yyyy MMMM dd"), 22 | date_format(current_timestamp(),"yyyy MMMM dd E").as("yyyy MMMM dd E") 23 | ).show(false) 24 | 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/UnixTimestamp.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.datetime 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{from_unixtime, unix_timestamp, _} 5 | 6 | object UnixTimestamp extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.sqlContext.implicits._ 15 | 16 | //Convert Timestamp to Unix timestamp 17 | val inputDF = Seq(("2019-07-01 12:01:19.000","07-01-2019 12:01:19.000", "07-01-2019")) 18 | .toDF("timestamp_1","timestamp_2","timestamp_3") 19 | inputDF.printSchema() 20 | inputDF.show(false) 21 | 22 | //Convert timestamp to unix timestamp 23 | val df = inputDF.select( 24 | unix_timestamp(col("timestamp_1")).as("timestamp_1"), 25 | unix_timestamp(col("timestamp_2"),"MM-dd-yyyy HH:mm:ss").as("timestamp_2"), 26 | unix_timestamp(col("timestamp_3"),"MM-dd-yyyy").as("timestamp_3"), 27 | unix_timestamp().as("timestamp_4") 28 | ) 29 | df.printSchema() 30 | df.show(false) 31 | 32 | // Convert Unix timestamp to timestamp 33 | val df2 = df.select( 34 | from_unixtime(col("timestamp_1")).as("timestamp_1"), 35 | from_unixtime(col("timestamp_2"),"MM-dd-yyyy HH:mm:ss").as("timestamp_2"), 36 | from_unixtime(col("timestamp_3"),"MM-dd-yyyy").as("timestamp_3"), 37 | from_unixtime(col("timestamp_4")).as("timestamp_4") 38 | ) 39 | df2.printSchema() 40 | df2.show(false) 41 | 42 | //Convert unix timestamp to timestamp 43 | val timeDF = Seq(1).toDF("seq").select( 44 | from_unixtime(unix_timestamp()).as("timestamp_1"), 45 | from_unixtime(unix_timestamp(),"MM-dd-yyyy HH:mm:ss").as("timestamp_2"), 46 | from_unixtime(unix_timestamp(),"dd-MM-yyyy HH:mm:ss").as("timestamp_3"), 47 | from_unixtime(unix_timestamp(),"HH:mm:ss").as("timestamp_4") 48 | ).show() 49 | 50 | //Convert unix timestamp to date 51 | val dateDF = Seq(1).toDF("seq").select( 52 | from_unixtime(unix_timestamp(),"MM-dd-yyyy").as("date_1"), 53 | from_unixtime(unix_timestamp(),"dd-MM-yyyy HH:mm:ss").as("date_2"), 54 | from_unixtime(unix_timestamp(),"yyyy-MM-dd").as("date_3") 55 | ).show(false) 56 | } 57 | -------------------------------------------------------------------------------- 
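The from_json.scala file below is checked in fully commented out and, if uncommented, would also need from_json added to its imports; a minimal working sketch of the same idea (FromJsonSketch is a hypothetical name; the sample rows mirror the commented code):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, from_json}
import org.apache.spark.sql.types.{StringType, StructType}

// Hypothetical sketch, not part of the repository.
object FromJsonSketch extends App {
  val spark: SparkSession = SparkSession.builder()
    .master("local[1]")
    .appName("SparkByExample")
    .getOrCreate()

  import spark.sqlContext.implicits._

  val data = Seq(
    ("1", "{\"name\":\"Anne\",\"Age\":\"12\",\"country\":\"Denmark\"}"),
    ("2", "{\"name\":\"Zen\",\"Age\":\"24\"}"),
    ("3", "{\"name\":\"Fred\",\"Age\":\"20\",\"country\":\"France\"}")
  )
  val df = data.toDF("ID", "details_Json")

  // Schema describing the JSON document stored in the details_Json string column
  val schema = new StructType()
    .add("name", StringType, true)
    .add("Age", StringType, true)
    .add("country", StringType, true)

  // Parse the JSON string into a struct column, then read one field out of it
  val df2 = df.withColumn("details_Struct", from_json(col("details_Json"), schema))
    .withColumn("country", col("details_Struct").getField("country"))
    .filter(col("country").equalTo("Denmark"))

  df2.printSchema()
  df2.show(false)
}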
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/from_json.scala: -------------------------------------------------------------------------------- 1 | //package com.sparkbyexamples.spark.dataframe.functions 2 | // 3 | //import org.apache.spark.sql.SparkSession 4 | //import org.apache.spark.sql.functions.col 5 | //import org.apache.spark.sql.types.{StringType, StructType} 6 | // 7 | //object from_json { 8 | // def main(args:Array[String]):Unit= { 9 | // 10 | // val spark: SparkSession = SparkSession.builder() 11 | // .master("local[1]") 12 | // .appName("SparkByExample") 13 | // .getOrCreate() 14 | // 15 | // 16 | // val data = Seq(("1","{\"name\":\"Anne\",\"Age\":\"12\",\"country\":\"Denmark\"}"), 17 | // ("2","{\"name\":\"Zen\",\"Age\":\"24\"}"), 18 | // ("3","{\"name\":\"Fred\",\"Age\":\"20\",\"country\":\"France\"}"), 19 | // ("4","{\"name\":\"Mona\",\"Age\":\"18\",\"country\":\"Denmark\"}") 20 | // ) 21 | // 22 | // import spark.sqlContext.implicits._ 23 | // val df = data.toDF("ID","details_Json") 24 | // 25 | // val schema = (new StructType()).add("name",StringType,true) 26 | // .add("Age",StringType,true) 27 | // .add("country",StringType,true) 28 | // 29 | // val df2 = df.withColumn("details_Struct", from_json($"details_Json", schema)) 30 | // .withColumn("country",col("details_Struct").getField("country")) 31 | // .filter(col("country").equalTo("Denmark")) 32 | // 33 | // 34 | // df2.printSchema() 35 | // df2.show(false) 36 | // } 37 | //} 38 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/litTypeLit.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.types.IntegerType 5 | 6 | object litTypeLit extends App { 7 | 8 | 9 | 10 | val spark = SparkSession.builder() 11 | .appName("sparkbyexamples.com") 12 | .master("local") 13 | .getOrCreate() 14 | 15 | import spark.sqlContext.implicits._ 16 | import org.apache.spark.sql.functions._ 17 | 18 | val data = Seq(("111",50000),("222",60000),("333",40000)) 19 | val df = data.toDF("EmpId","Salary") 20 | val df2 = df.select(col("EmpId"),col("Salary"),lit("1").as("lit_value1")) 21 | df2.show() 22 | 23 | val df3 = df2.withColumn("lit_value2", 24 | when(col("Salary") >=40000 && col("Salary") <= 50000, lit("100").cast(IntegerType)) 25 | .otherwise(lit("200").cast(IntegerType)) 26 | ) 27 | 28 | df3.show() 29 | 30 | val df4 = df3.withColumn("typedLit_seq",typedLit(Seq(1, 2, 3))) 31 | .withColumn("typedLit_map",typedLit(Map("a" -> 1, "b" -> 2))) 32 | .withColumn("typedLit_struct",typedLit(("a", 2, 1.0))) 33 | 34 | df4.printSchema() 35 | df4.show() 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/string/ConcatExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.string 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.{lit, _} 5 | object ConcatExample extends App { 6 | 7 | val spark = SparkSession.builder() 8 | .appName("SparkByExamples.com") 9 | .master("local") 10 | .getOrCreate() 11 | 12 | val data = Seq(("James","A","Smith","2018","M",3000), 13 | ("Michael","Rose","Jones","2010","M",4000), 14 | 
("Robert","K","Williams","2010","M",4000), 15 | ("Maria","Anne","Jones","2005","F",4000), 16 | ("Jen","Mary","Brown","2010","",-1) 17 | ) 18 | 19 | val columns = Seq("fname","mname","lname","dob_year","gender","salary") 20 | import spark.sqlContext.implicits._ 21 | val df = data.toDF(columns:_*) 22 | df.printSchema() 23 | df.show(false) 24 | 25 | df.select(concat(col("fname"),lit(','), 26 | col("mname"),lit(','),col("lname")).as("FullName")) 27 | .show(false) 28 | 29 | df.withColumn("FullName",concat(col("fname"),lit(','), 30 | col("mname"),lit(','),col("lname"))) 31 | .drop("fname") 32 | .drop("mname") 33 | .drop("lname") 34 | .show(false) 35 | 36 | df.withColumn("FullName",concat_ws(",",col("fname"),col("mname"),col("lname"))) 37 | .drop("fname") 38 | .drop("mname") 39 | .drop("lname") 40 | .show(false) 41 | 42 | df.createOrReplaceTempView("EMP") 43 | 44 | spark.sql("select CONCAT(fname,' ',lname,' ',mname) as FullName from EMP") 45 | .show(false) 46 | 47 | spark.sql("select fname ||' '|| lname ||' '|| mname as FullName from EMP") 48 | .show(false) 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/string/SplitExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.string 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes} 6 | 7 | object SplitExample extends App{ 8 | 9 | val spark = SparkSession.builder() 10 | .appName("SparkByExamples.com") 11 | .master("local") 12 | .getOrCreate() 13 | 14 | val data = Seq(("James, A, Smith","2018","M",3000), 15 | ("Michael, Rose, Jones","2010","M",4000), 16 | ("Robert,K,Williams","2010","M",4000), 17 | ("Maria,Anne,Jones","2005","F",4000), 18 | ("Jen,Mary,Brown","2010","",-1) 19 | ) 20 | 21 | import spark.sqlContext.implicits._ 22 | val df = data.toDF("name","dob_year","gender","salary") 23 | df.printSchema() 24 | df.show(false) 25 | 26 | val df2 = df.select(split(col("name"),",").getItem(0).as("FirstName"), 27 | split(col("name"),",").getItem(1).as("MiddleName"), 28 | split(col("name"),",").getItem(2).as("LastName")) 29 | .drop("name") 30 | 31 | df2.printSchema() 32 | df2.show(false) 33 | 34 | 35 | val splitDF = df.withColumn("FirstName",split(col("name"),",").getItem(0)) 36 | .withColumn("MiddleName",split(col("name"),",").getItem(1)) 37 | .withColumn("LastName",split(col("name"),",").getItem(2)) 38 | .withColumn("NameArray",split(col("name"),",")) 39 | .drop("name") 40 | splitDF.printSchema() 41 | splitDF.show(false) 42 | 43 | df.createOrReplaceTempView("PERSON") 44 | spark.sql("select SPLIT(name,',') as NameArray from PERSON") 45 | .show(false) 46 | 47 | 48 | val splitDF2 = df.withColumn("FirstName",split(col("name"),",").getItem(0)) 49 | .withColumn("MiddleName",array_join(slice(split(col("name"),","),2,3),"/")) 50 | 51 | .withColumn("NameArray",split(col("name"),",")) 52 | .drop("name") 53 | splitDF2.printSchema() 54 | splitDF2.show(false) 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/functions/window/RowNumber.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.functions.window 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import 
org.apache.spark.sql.expressions.Window 5 | import org.apache.spark.sql.functions.row_number 6 | 7 | object RowNumber extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | spark.sparkContext.setLogLevel("ERROR") 15 | 16 | import spark.implicits._ 17 | 18 | val simpleData = Seq(("James", "Sales", 3000), 19 | ("Michael", "Sales", 4600), 20 | ("Robert", "Sales", 4100), 21 | ("Maria", "Finance", 3000), 22 | ("James", "Sales", 3000), 23 | ("Scott", "Finance", 3300), 24 | ("Jen", "Finance", 3900), 25 | ("Jeff", "Marketing", 3000), 26 | ("Kumar", "Marketing", 2000), 27 | ("Saif", "Sales", 4100) 28 | ) 29 | val df = simpleData.toDF("employee_name", "department", "salary") 30 | df.show() 31 | 32 | //row_number 33 | val windowSpec = Window.partitionBy("department").orderBy("salary") 34 | df.withColumn("row_number",row_number.over(windowSpec)) 35 | .show() 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/join/CrossJoinExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.join 2 | 3 | class CrossJoinExample { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/join/InnerJoinExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.join 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.catalyst.plans.Inner 5 | 6 | object InnerJoinExample extends App { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | val emp = Seq((1,"Smith",-1,"2018","10","M",3000), 16 | (2,"Rose",1,"2010","20","M",4000), 17 | (3,"Williams",1,"2010","10","M",1000), 18 | (4,"Jones",2,"2005","10","F",2000), 19 | (5,"Brown",2,"2010","40","",-1), 20 | (6,"Brown",2,"2010","50","",-1) 21 | ) 22 | val empColumns = Seq("emp_id","name","superior_emp_id","year_joined","emp_dept_id","gender","salary") 23 | import spark.sqlContext.implicits._ 24 | val empDF = emp.toDF(empColumns:_*) 25 | empDF.show(false) 26 | 27 | val dept = Seq(("Finance",10), 28 | ("Marketing",20), 29 | ("Sales",30), 30 | ("IT",40) 31 | ) 32 | 33 | val deptColumns = Seq("dept_name","dept_id") 34 | val deptDF = dept.toDF(deptColumns:_*) 35 | deptDF.show(false) 36 | 37 | 38 | println("Inner join") 39 | empDF.join(deptDF,empDF("emp_dept_id") === deptDF("dept_id"),"inner") 40 | .show(false) 41 | empDF.join(deptDF,empDF("emp_dept_id") === deptDF("dept_id")) 42 | .show(false) 43 | 44 | empDF.join(deptDF,empDF("emp_dept_id") === deptDF("dept_id"),Inner.sql) 45 | .show(false) 46 | 47 | empDF.join(deptDF).where(empDF("emp_dept_id") === deptDF("dept_id")) 48 | .show(false) 49 | 50 | empDF.join(deptDF).filter(empDF("emp_dept_id") === deptDF("dept_id")) 51 | .show(false) 52 | 53 | empDF.createOrReplaceTempView("EMP") 54 | deptDF.createOrReplaceTempView("DEPT") 55 | 56 | val joinDF2 = spark.sql("select * from EMP e INNER JOIN DEPT d ON e.emp_dept_id == d.dept_id") 57 | joinDF2.show(false) 58 | } 59 | -------------------------------------------------------------------------------- 
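CrossJoinExample.scala above is an empty class in this repository; a minimal cross-join sketch in the style of the other join examples (CrossJoinSketch and its rows are illustrative only, not part of the repo):

import org.apache.spark.sql.SparkSession

// Hypothetical sketch, not part of the repository.
object CrossJoinSketch extends App {
  val spark: SparkSession = SparkSession.builder()
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()

  spark.sparkContext.setLogLevel("ERROR")

  import spark.sqlContext.implicits._

  val empDF = Seq((1, "Smith"), (2, "Rose"), (3, "Williams")).toDF("emp_id", "name")
  val deptDF = Seq(("Finance", 10), ("Marketing", 20)).toDF("dept_name", "dept_id")

  // crossJoin produces the Cartesian product: every emp row paired with every dept row
  empDF.crossJoin(deptDF).show(false)

  // Equivalent SQL form
  empDF.createOrReplaceTempView("EMP")
  deptDF.createOrReplaceTempView("DEPT")
  spark.sql("select * from EMP CROSS JOIN DEPT").show(false)
}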
/src/main/scala/com/sparkbyexamples/spark/dataframe/join/JoinMultipleColumns.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.join 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object JoinMultipleColumns extends App { 6 | 7 | val spark: SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | val emp = Seq((1,"Smith",-1,"2018",10,"M",3000), 15 | (2,"Rose",1,"2010",20,"M",4000), 16 | (3,"Williams",1,"2010",10,"M",1000), 17 | (4,"Jones",2,"2005",10,"F",2000), 18 | (5,"Brown",2,"2010",30,"",-1), 19 | (6,"Brown",2,"2010",50,"",-1) 20 | ) 21 | val empColumns = Seq("emp_id","name","superior_emp_id","branch_id","dept_id","gender","salary") 22 | import spark.sqlContext.implicits._ 23 | val empDF = emp.toDF(empColumns:_*) 24 | empDF.show(false) 25 | 26 | val dept = Seq(("Finance",10,"2018"), 27 | ("Marketing",20,"2010"), 28 | ("Marketing",20,"2018"), 29 | ("Sales",30,"2005"), 30 | ("Sales",30,"2010"), 31 | ("IT",50,"2010") 32 | ) 33 | 34 | val deptColumns = Seq("dept_name","dept_id","branch_id") 35 | val deptDF = dept.toDF(deptColumns:_*) 36 | deptDF.show(false) 37 | 38 | //Using multiple columns on join expression 39 | empDF.join(deptDF, empDF("dept_id") === deptDF("dept_id") && 40 | empDF("branch_id") === deptDF("branch_id"),"inner") 41 | .show(false) 42 | 43 | //Using Join with multiple columns on where clause 44 | empDF.join(deptDF).where(empDF("dept_id") === deptDF("dept_id") && 45 | empDF("branch_id") === deptDF("branch_id")) 46 | .show(false) 47 | 48 | //Using Join with multiple columns on filter clause 49 | empDF.join(deptDF).filter(empDF("dept_id") === deptDF("dept_id") && 50 | empDF("branch_id") === deptDF("branch_id")) 51 | .show(false) 52 | 53 | //Using SQL & multiple columns on join expression 54 | empDF.createOrReplaceTempView("EMP") 55 | deptDF.createOrReplaceTempView("DEPT") 56 | 57 | val resultDF = spark.sql("select e.* from EMP e, DEPT d " + 58 | "where e.dept_id == d.dept_id and e.branch_id == d.branch_id") 59 | resultDF.show(false) 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/join/JoinMultipleDataFrames.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.join 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object JoinMultipleDataFrames extends App { 6 | 7 | val spark: SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | val emp = Seq((1,"Smith","10"), 15 | (2,"Rose","20"), 16 | (3,"Williams","10"), 17 | (4,"Jones","10"), 18 | (5,"Brown","40"), 19 | (6,"Brown","50") 20 | ) 21 | val empColumns = Seq("emp_id","name","emp_dept_id") 22 | import spark.sqlContext.implicits._ 23 | val empDF = emp.toDF(empColumns:_*) 24 | empDF.show(false) 25 | 26 | val dept = Seq(("Finance",10), 27 | ("Marketing",20), 28 | ("Sales",30), 29 | ("IT",40) 30 | ) 31 | val deptColumns = Seq("dept_name","dept_id") 32 | val deptDF = dept.toDF(deptColumns:_*) 33 | deptDF.show(false) 34 | 35 | val address = Seq((1,"1523 Main St","SFO","CA"), 36 | (2,"3453 Orange St","SFO","NY"), 37 | (3,"34 Warner St","Jersey","NJ"), 38 | (4,"221 Cavalier St","Newark","DE"), 39 | (5,"789 Walnut 
St","Sandiago","CA") 40 | ) 41 | val addColumns = Seq("emp_id","addline1","city","state") 42 | val addDF = address.toDF(addColumns:_*) 43 | addDF.show(false) 44 | 45 | //Using Join expression 46 | empDF.join(deptDF,empDF("emp_dept_id") === deptDF("dept_id"),"inner" ) 47 | .join(addDF,empDF("emp_id") === addDF("emp_id"),"inner") 48 | .show(false) 49 | 50 | //Using where 51 | empDF.join(deptDF).where(empDF("emp_dept_id") === deptDF("dept_id")) 52 | .join(addDF).where(empDF("emp_id") === addDF("emp_id")) 53 | .show(false) 54 | 55 | //Using Filter 56 | empDF.join(deptDF).filter(empDF("emp_dept_id") === deptDF("dept_id")) 57 | .join(addDF).filter(empDF("emp_id") === addDF("emp_id")) 58 | .show(false) 59 | 60 | //Using SQL expression 61 | empDF.createOrReplaceTempView("EMP") 62 | deptDF.createOrReplaceTempView("DEPT") 63 | addDF.createOrReplaceTempView("ADD") 64 | 65 | spark.sql("select * from EMP e, DEPT d, ADD a " + 66 | "where e.emp_dept_id == d.dept_id and e.emp_id == a.emp_id") 67 | .show(false) 68 | 69 | 70 | } 71 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/join/SelfJoinExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.join 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.col 5 | 6 | object SelfJoinExample extends App { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | val emp = Seq((1,"Smith",1,"10",3000), 16 | (2,"Rose",1,"20",4000), 17 | (3,"Williams",1,"10",1000), 18 | (4,"Jones",2,"10",2000), 19 | (5,"Brown",2,"40",-1), 20 | (6,"Brown",2,"50",-1) 21 | ) 22 | val empColumns = Seq("emp_id","name","superior_emp_id","emp_dept_id","salary") 23 | import spark.sqlContext.implicits._ 24 | val empDF = emp.toDF(empColumns:_*) 25 | empDF.show(false) 26 | 27 | println("self join") 28 | val selfDF = empDF.as("emp1").join(empDF.as("emp2"), 29 | col("emp1.superior_emp_id") === col("emp2.emp_id"),"inner") 30 | selfDF.show(false) 31 | 32 | selfDF.select(col("emp1.emp_id"),col("emp1.name"), 33 | col("emp2.emp_id").as("superior_emp_id"), 34 | col("emp2.name").as("superior_emp_name")) 35 | .show(false) 36 | 37 | //Spark SQL self join with where clause 38 | empDF.as("emp1").join(empDF.as("emp2")).where( 39 | col("emp1.superior_emp_id") === col("emp2.emp_id")) 40 | .select(col("emp1.emp_id"),col("emp1.name"), 41 | col("emp2.emp_id").as("superior_emp_id"), 42 | col("emp2.name").as("superior_emp_name")) 43 | .show(false) 44 | 45 | //Spark SQL self join with filter clause 46 | empDF.as("emp1").join(empDF.as("emp2")).filter( 47 | col("emp1.superior_emp_id") === col("emp2.emp_id")) 48 | .select(col("emp1.emp_id"),col("emp1.name"), 49 | col("emp2.emp_id").as("superior_emp_id"), 50 | col("emp2.name").as("superior_emp_name")) 51 | .show(false) 52 | 53 | 54 | empDF.createOrReplaceTempView("EMP") 55 | spark.sql("select emp1.emp_id,emp1.name," + 56 | "emp2.emp_id as superior_emp_id, emp2.name as superior_emp_name " + 57 | "from EMP emp1 INNER JOIN EMP emp2 on emp1.superior_emp_id == emp2.emp_id") 58 | .show(false) 59 | 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/xml/PersonsComplexXML.scala: 
-------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.xml 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType} 5 | 6 | object PersonsComplexXML { 7 | 8 | def main(args: Array[String]): Unit = { 9 | val spark = SparkSession.builder().master("local[1]") 10 | .appName("SparkByExample") 11 | .getOrCreate() 12 | 13 | /* 14 | Read XML File 15 | */ 16 | val df = spark.read 17 | .format("xml") 18 | .option("rowTag", "person") 19 | .load("src/main/resources/persons_complex.xml") 20 | 21 | df.printSchema() 22 | 23 | df.show() 24 | val schema = new StructType() 25 | .add("_id",StringType) 26 | .add("firstname",StringType) 27 | .add("middlename",StringType) 28 | .add("lastname",StringType) 29 | .add("dob_year",StringType) 30 | .add("dob_month",StringType) 31 | .add("gender",StringType) 32 | .add("salary",StringType) 33 | .add("addresses", new StructType() 34 | .add("address",ArrayType( 35 | new StructType() 36 | .add("_type",StringType) 37 | .add("addressLine",StringType) 38 | .add("city",StringType) 39 | .add("state",StringType) 40 | ) 41 | ) 42 | ) 43 | 44 | val df2 = spark.read 45 | .format("xml") 46 | .option("rowTag", "person") 47 | .schema(schema) 48 | .load("src/main/resources/persons.xml") 49 | 50 | // df.foreach(row=>{ 51 | // println("ID:"+row.getAs("_id") ) 52 | // println("ID:"+row(0)) 53 | // println("ID:"+row.get(0)) 54 | // println(row.getAs("addresses")) 55 | // // println("ID:"+row.getString(0)) 56 | // }) 57 | // 58 | df2.write 59 | .format("com.databricks.spark.xml") 60 | .option("rootTag", "persons") 61 | .option("rowTag", "person") 62 | .save("src/main/resources/persons_new.xml") 63 | 64 | } 65 | } 66 | 67 | 68 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/xml/PersonsXML.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.xml 2 | 3 | import org.apache.spark.sql.{SparkSession, types} 4 | import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType} 5 | 6 | object PersonsXML { 7 | 8 | def main(args: Array[String]): Unit = { 9 | val spark = SparkSession.builder().master("local[1]") 10 | .appName("SparkByExample") 11 | .getOrCreate() 12 | 13 | /* 14 | Read XML File 15 | */ 16 | val df = spark.read 17 | .format("xml") 18 | .option("rowTag", "person") 19 | .load("src/main/resources/persons.xml") 20 | 21 | df.printSchema() 22 | df.show() 23 | 24 | val schema = new StructType() 25 | .add("_id",StringType) 26 | .add("firstname",StringType) 27 | .add("middlename",StringType) 28 | .add("lastname",StringType) 29 | .add("dob_year",StringType) 30 | .add("dob_month",StringType) 31 | .add("gender",StringType) 32 | .add("salary",StringType) 33 | 34 | val df2 = spark.read 35 | .format("xml") 36 | .option("rowTag", "person") 37 | .schema(schema) 38 | .load("src/main/resources/persons.xml") 39 | 40 | df2.write 41 | .format("com.databricks.spark.xml") 42 | .option("rootTag", "persons") 43 | .option("rowTag", "person") 44 | .save("src/main/resources/persons_new.xml") 45 | 46 | } 47 | } 48 | 49 | 50 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataframe/xml/ReadBooksXMLWithNestedArray.scala: -------------------------------------------------------------------------------- 1 | 
package com.sparkbyexamples.spark.dataframe.xml 2 | 3 | import com.sparkbyexamples.spark.beans.BooksWithArray 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 6 | import org.apache.spark.sql.types.StructType 7 | 8 | object ReadBooksXMLWithNestedArray { 9 | 10 | def main(args: Array[String]): Unit = { 11 | val spark = SparkSession.builder().master("local[1]") 12 | .appName("SparkByExample") 13 | .getOrCreate() 14 | 15 | val df = spark.sqlContext.read 16 | .format("com.databricks.spark.xml") 17 | .option("rowTag", "book") 18 | .load("src/main/resources/books_withnested_array.xml") 19 | 20 | df.printSchema() 21 | df.show() 22 | 23 | df.foreach(row=>{ 24 | println(""+row.getAs("author")+","+row.getAs("_id")) 25 | println(row.getStruct(4).getAs("country")) 26 | println(row.getStruct(4).getClass) 27 | val arr = row.getStruct(7).getList(0) 28 | for (i<-0 to arr.size-1){ 29 | val b = arr.get(i).asInstanceOf[GenericRowWithSchema] 30 | println(""+b.getAs("name") +","+b.getAs("location")) 31 | } 32 | }) 33 | 34 | } 35 | } 36 | 37 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataset/DataSetFromData.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataset 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object DataSetFromData { 6 | 7 | def main(args:Array[String]):Unit= { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | val data = Seq((1,2),(3,4),(5,6)) 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataset/DataSetWithCustomClass.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataset 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | class Test(field1:String,field2:String,field3:String) extends Serializable{ 6 | 7 | 8 | } 9 | 10 | object TestEncoders { 11 | implicit def testEncoder: org.apache.spark.sql.Encoder[Test] = 12 | org.apache.spark.sql.Encoders.kryo[Test] 13 | } 14 | object DataSetWithCustomClass { 15 | 16 | def main(args:Array[String]):Unit= { 17 | 18 | val spark: SparkSession = SparkSession.builder() 19 | .master("local[1]") 20 | .appName("SparkByExample") 21 | .getOrCreate() 22 | 23 | val test:Test = new Test("Field1","Field2","Field3") 24 | 25 | import spark.sqlContext.implicits._ 26 | import org.apache.spark.sql.Encoders 27 | import TestEncoders._ 28 | // implicit val encoder = Encoders.bean[Test](classOf[Test]) 29 | 30 | val data = Seq(test) 31 | val rdd = spark.sparkContext.parallelize(data) 32 | val ds = spark.createDataset(rdd) 33 | 34 | val ds2 = ds.selectExpr("CAST(value AS String)") 35 | .as[(String)] 36 | 37 | 38 | ds.printSchema() 39 | ds2.show(false) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataset/xml/ReadBooksXML.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataset.xml 2 | 3 | import com.sparkbyexamples.spark.beans.{Books, BooksDiscounted} 4 | import org.apache.spark.sql.{Encoders, SparkSession} 5 | 6 | object ReadBooksXML { 7 | 8 | def main(args: Array[String]): Unit = { 9 | val spark = 
SparkSession.builder().master("local[1]") 10 | .appName("SparkByExample") 11 | .getOrCreate() 12 | 13 | import spark.implicits._ 14 | 15 | val ds = spark.sqlContext.read 16 | .format("com.databricks.spark.xml") 17 | .option("rowTag", "book") 18 | .load("src/main/resources/books.xml").as[Books] 19 | 20 | 21 | val newds = ds.map(f=>{ 22 | BooksDiscounted(f._id,f.author,f.description,f.price,f.publish_date,f.title, f.price - f.price*20/100) 23 | }) 24 | 25 | newds.printSchema() 26 | newds.show() 27 | 28 | newds.foreach(f=>{ 29 | println("Price :"+f.price + ", Discounted Price :"+f.discountPrice) 30 | }) 31 | 32 | //First element 33 | println("First Element" +newds.first()._id) 34 | 35 | } 36 | } 37 | 38 | 39 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataset/xml/ReadBooksXMLWithNestedArray.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataset.xml 2 | 3 | import com.sparkbyexamples.spark.beans.{Books, BooksWithArray} 4 | import org.apache.spark.sql.{SparkSession, functions} 5 | 6 | object ReadBooksXMLWithNestedArray { 7 | 8 | def main(args: Array[String]): Unit = { 9 | val spark = SparkSession.builder().master("local[1]") 10 | .appName("SparkByExample") 11 | .getOrCreate() 12 | 13 | import spark.implicits._ 14 | val ds = spark.sqlContext.read 15 | .format("com.databricks.spark.xml") 16 | .option("rowTag", "book") 17 | .load("src/main/resources/books_withnested_array.xml").as[BooksWithArray] 18 | 19 | ds.printSchema() 20 | ds.show() 21 | 22 | ds.foreach(f=>{ 23 | println(f.author+","+f.otherInfo.country+","+f.otherInfo.address.addressline1) 24 | for(s<-f.stores.store){ 25 | println(s.name) 26 | } 27 | 28 | }) 29 | 30 | } 31 | } 32 | 33 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataset/xml/ReadBooksXMLWithNestedArrayDSL.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataset.xml 2 | 3 | 4 | 5 | import com.sparkbyexamples.spark.beans.Books 6 | import org.apache.spark.sql.{Encoders, SparkSession, functions} 7 | 8 | object ReadBooksXMLWithNestedArrayDSL { 9 | 10 | def main(args: Array[String]): Unit = { 11 | val spark = SparkSession.builder().master("local[1]") 12 | .appName("SparkByExample") 13 | .getOrCreate() 14 | 15 | import spark.implicits._ 16 | val xmlDF = spark.sqlContext.read 17 | .format("com.databricks.spark.xml") 18 | .option("rowTag", "book") 19 | .load("src/main/resources/books_withnested_array.xml") 20 | 21 | xmlDF.printSchema() 22 | println(xmlDF.count()) 23 | 24 | xmlDF.show() 25 | 26 | xmlDF.select(xmlDF("title"),xmlDF("price")*100).show() 27 | 28 | xmlDF.select("author").show() 29 | 30 | 31 | xmlDF.select("stores").show() 32 | 33 | xmlDF.withColumn("store", functions.explode(xmlDF("stores.store"))).show() 34 | 35 | val df = xmlDF.withColumn("store", functions.explode(xmlDF("stores.store"))) 36 | .select("_id","author","stores.country","store.name") 37 | 38 | val storeDF = xmlDF.select("stores.store") 39 | storeDF.printSchema() 40 | 41 | df.foreach(f=>{ 42 | println(f.getAs("_id")) 43 | }) 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | } 52 | } 53 | 54 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataset/xml/SparkXMLUsingXstream.scala: 
-------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataframe.xml 2 | 3 | import com.thoughtworks.xstream.XStream 4 | import com.thoughtworks.xstream.io.xml.DomDriver 5 | import org.apache.spark.sql.SparkSession 6 | 7 | case class Animal(cri:String,taille:Int) 8 | 9 | object SparkXMLUsingXStream{ 10 | def main(args: Array[String]): Unit = { 11 | val spark = SparkSession. 12 | builder.master ("local[*]") 13 | .appName ("sparkbyexamples.com") 14 | .getOrCreate () 15 | 16 | var animal:Animal = Animal("Rugissement",150) 17 | val xstream1 = new XStream(new DomDriver()) 18 | xstream1.alias("testAni",classOf[Animal]) 19 | xstream1.aliasField("cricri",classOf[Animal],"cri") 20 | val xmlString = Seq(xstream1.toXML(animal)) 21 | 22 | import spark.implicits._ 23 | val newDf = xmlString.toDF() 24 | newDf.show(false) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/dataset/xml/sparkXml.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.dataset.xml 2 | 3 | import org.apache.spark.sql.functions.{col, explode} 4 | import org.apache.spark.sql.{SQLContext, SparkSession} 5 | 6 | object sparkXml { 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark = SparkSession. 10 | builder.master("local[*]") 11 | //.config("spark.debug.maxToStringFields", "100") 12 | .appName("Insight Application Big Data") 13 | .getOrCreate() 14 | 15 | val df = spark.read 16 | .format("com.databricks.spark.xml") 17 | .option("rowTag", "row") 18 | .load("src/main/resources/input.xml") 19 | df.createOrReplaceTempView("categ_entry") 20 | 21 | df.printSchema() 22 | spark.sql("Select c26['_VALUE'] as value, c26['_m'] as option from categ_entry").show(false) 23 | 24 | val df2 = df.withColumn("c26Struct",explode(df("c26"))) 25 | df2.select(col("c26Struct._VALUE").alias("value"),col("c26Struct._m").alias("option") ).show(false) 26 | 27 | 28 | 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/CreateEmptyRDD.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object CreateEmptyRDD extends App{ 6 | 7 | val spark:SparkSession = SparkSession.builder() 8 | .master("local[3]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | val rdd = spark.sparkContext.emptyRDD 13 | val rddString = spark.sparkContext.emptyRDD[String] 14 | 15 | println(rdd) 16 | println(rddString) 17 | println("Num of Partitions: "+rdd.getNumPartitions) 18 | 19 | rddString.saveAsTextFile("c:/tmp/test5.txt") 20 | 21 | val rdd2 = spark.sparkContext.parallelize(Seq.empty[String]) 22 | println(rdd2) 23 | println("Num of Partitions: "+rdd2.getNumPartitions) 24 | 25 | rdd2.saveAsTextFile("c:/tmp/test3.txt") 26 | 27 | // Pair RDD 28 | 29 | type dataType = (String,Int) 30 | var pairRDD = spark.sparkContext.emptyRDD[dataType] 31 | println(pairRDD) 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/CreateRDD.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object CreateRDD { 6 | 7 | 
def main(args:Array[String]): Unit ={ 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local[3]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | val rdd=spark.sparkContext.parallelize(Seq(("Java", 20000), ("Python", 100000), ("Scala", 3000))) 15 | rdd.foreach(println) 16 | 17 | val rdd1 = spark.sparkContext.textFile("/path/textFile.txt") 18 | 19 | val rdd2 = spark.sparkContext.wholeTextFiles("/path/textFile.txt") 20 | rdd2.foreach(record=>println("FileName : "+record._1+", FileContents :"+record._2)) 21 | 22 | val rdd3 = rdd.map(row=>{(row._1,row._2+100)}) 23 | rdd3.foreach(println) 24 | 25 | val myRdd2 = spark.range(20).toDF().rdd 26 | myRdd2.foreach(println) 27 | 28 | 29 | 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/OperationsOnPairRDD.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | import scala.collection.mutable 6 | 7 | object OperationsOnPairRDD { 8 | 9 | def main(args: Array[String]): Unit = { 10 | 11 | val spark = SparkSession.builder() 12 | .appName("SparkByExample") 13 | .master("local") 14 | .getOrCreate() 15 | 16 | spark.sparkContext.setLogLevel("ERROR") 17 | 18 | val rdd = spark.sparkContext.parallelize( 19 | List("Germany India USA","USA India Russia","India Brazil Canada China") 20 | ) 21 | 22 | val wordsRdd = rdd.flatMap(_.split(" ")) 23 | val pairRDD = wordsRdd.map(f=>(f,1)) 24 | pairRDD.foreach(println) 25 | 26 | println("Distinct ==>") 27 | pairRDD.distinct().foreach(println) 28 | 29 | 30 | //SortByKey 31 | println("Sort by Key ==>") 32 | val sortRDD = pairRDD.sortByKey() 33 | sortRDD.foreach(println) 34 | 35 | //reduceByKey 36 | println("Reduce by Key ==>") 37 | val wordCount = pairRDD.reduceByKey((a,b)=>a+b) 38 | wordCount.foreach(println) 39 | 40 | def param1= (accu:Int,v:Int) => accu + v 41 | def param2= (accu1:Int,accu2:Int) => accu1 + accu2 42 | println("Aggregate by Key ==> wordcount") 43 | val wordCount2 = pairRDD.aggregateByKey(0)(param1,param2) 44 | wordCount2.foreach(println) 45 | 46 | //keys 47 | println("Keys ==>") 48 | wordCount2.keys.foreach(println) 49 | 50 | //values 51 | println("values ==>") 52 | wordCount2.values.foreach(println) 53 | 54 | println("Count :"+wordCount2.count()) 55 | 56 | println("collectAsMap ==>") 57 | pairRDD.collectAsMap().foreach(println) 58 | 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/OperationsOnRDD.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object OperationsOnRDD { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark = SparkSession.builder() 10 | .appName("SparkByExample") 11 | .master("local") 12 | .getOrCreate() 13 | 14 | spark.sparkContext.setLogLevel("ERROR") 15 | 16 | val rdd = spark.sparkContext.parallelize( 17 | List("Germany India USA","USA London Russia","Mexico Brazil Canada China") 18 | ) 19 | 20 | val listRdd = spark.sparkContext.parallelize(List(9,2,3,4,5,6,7,8)) 21 | 22 | //reduce 23 | println("Minimum :"+listRdd.reduce((a,b)=> a min b)) 24 | println("Maximum :"+listRdd.reduce((a,b)=> a max b)) 25 | println("Sum :"+listRdd.reduce((a,b)=> a + b)) 26 | 27 | //flatMap 28 | val wordsRdd = 
rdd.flatMap(_.split(" ")) 29 | wordsRdd.foreach(println) 30 | 31 | //sortBy 32 | println("Sort by word name") 33 | val sortRdd = wordsRdd.sortBy(f=>f) // also can write f=>f 34 | 35 | //GroupBy 36 | val groupRdd = wordsRdd.groupBy(word=>word.length) 37 | groupRdd.foreach(println) 38 | 39 | //map 40 | val tupp2Rdd = wordsRdd.map(f=>(f,1)) 41 | tupp2Rdd.foreach(println) 42 | 43 | 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/PartitionBy.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.HashPartitioner 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.SparkSession 6 | 7 | object PartitionBy { 8 | 9 | 10 | def main(args:Array[String]): Unit = { 11 | 12 | val spark:SparkSession = SparkSession.builder() 13 | .master("local[3]") 14 | .appName("SparkByExample") 15 | .getOrCreate() 16 | 17 | val sc = spark.sparkContext 18 | 19 | val rdd = sc.textFile("C://000_Projects/opt/BigData/zipcodes.csv") 20 | 21 | val rdd2:RDD[Array[String]] = rdd.map(m=>m.split(",")) 22 | 23 | 24 | val rdd3 = rdd2.map(a=>(a(1),a.mkString(","))) 25 | 26 | val rdd4 = rdd3.partitionBy(new HashPartitioner(3)) 27 | 28 | rdd4.saveAsTextFile("c:/tmp/output/partition") 29 | 30 | 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/RDDAccumulator.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object RDDAccumulator extends App { 7 | 8 | val spark = SparkSession.builder() 9 | .appName("SparkByExample") 10 | .master("local") 11 | .getOrCreate() 12 | 13 | val longAcc = spark.sparkContext.longAccumulator("SumAccumulator") 14 | 15 | val rdd = spark.sparkContext.parallelize(Array(1, 2, 3)) 16 | 17 | rdd.foreach(x => longAcc.add(x)) 18 | println(longAcc.value) 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/RDDBroadcast.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object RDDBroadcast extends App { 6 | 7 | val spark = SparkSession.builder() 8 | .appName("SparkByExamples.com") 9 | .master("local") 10 | .getOrCreate() 11 | 12 | val states = Map(("NY","New York"),("CA","California"),("FL","Florida")) 13 | val countries = Map(("USA","United States of America"),("IN","India")) 14 | 15 | val broadcastStates = spark.sparkContext.broadcast(states) 16 | val broadcastCountries = spark.sparkContext.broadcast(countries) 17 | 18 | val data = Seq(("James","Smith","USA","CA"), 19 | ("Michael","Rose","USA","NY"), 20 | ("Robert","Williams","USA","CA"), 21 | ("Maria","Jones","USA","FL") 22 | ) 23 | 24 | val rdd = spark.sparkContext.parallelize(data) 25 | 26 | val rdd2 = rdd.map(f=>{ 27 | val country = f._3 28 | val state = f._4 29 | val fullCountry = broadcastCountries.value.get(country).get 30 | val fullState = broadcastStates.value.get(state).get 31 | (f._1,f._2,fullCountry,fullState) 32 | }) 33 | 34 | println(rdd2.collect().mkString("\n")) 35 | 36 | } 37 | 
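// Added annotation (not part of the original RDDBroadcast.scala above): the lookups
// broadcastCountries.value.get(country).get and broadcastStates.value.get(state).get throw a
// NoSuchElementException when a code is missing from the broadcast maps. A minimal, hedged
// sketch of a safer lookup with getOrElse, reusing the rdd and broadcast values defined in
// that file; illustrative only.
val rdd3 = rdd.map(f => {
  val fullCountry = broadcastCountries.value.getOrElse(f._3, f._3) // fall back to the raw code
  val fullState = broadcastStates.value.getOrElse(f._4, f._4)
  (f._1, f._2, fullCountry, fullState)
})
println(rdd3.collect().mkString("\n"))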
-------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/RDDCache.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object RDDCache extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | val sc = spark.sparkContext 13 | 14 | val rdd = sc.textFile("src/main/resources/zipcodes-noheader.csv") 15 | 16 | val rdd2:RDD[ZipCode] = rdd.map(row=>{ 17 | val strArray = row.split(",") 18 | ZipCode(strArray(0).toInt,strArray(1),strArray(3),strArray(4)) 19 | }) 20 | 21 | rdd2.cache() 22 | 23 | 24 | println(rdd2.count()) 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/RDDFromCSVFile.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object RDDFromCSVFile { 7 | 8 | def main(args:Array[String]): Unit ={ 9 | 10 | def splitString(row:String):Array[String]={ 11 | row.split(",") 12 | } 13 | 14 | val spark:SparkSession = SparkSession.builder() 15 | .master("local[3]") 16 | .appName("SparkByExample") 17 | .getOrCreate() 18 | val sc = spark.sparkContext 19 | 20 | val rdd = sc.textFile("src/main/resources/zipcodes-noheader.csv") 21 | 22 | val rdd2:RDD[ZipCode] = rdd.map(row=>{ 23 | val strArray = splitString(row) 24 | ZipCode(strArray(0).toInt,strArray(1),strArray(3),strArray(4)) 25 | }) 26 | 27 | rdd2.foreach(a=>println(a.city)) 28 | } 29 | 30 | } 31 | 32 | 33 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/RDDFromDataUsingParallelize.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.SQLContext 6 | 7 | object RDDFromDataUsingParallelize { 8 | 9 | def main(args: Array[String]): Unit = { 10 | val spark:SparkSession = SparkSession.builder() 11 | .master("local[3]") 12 | .appName("SparkByExample") 13 | .getOrCreate() 14 | val rdd:RDD[Int] = spark.sparkContext.parallelize(List(1,2,3,4,5)) 15 | val rddCollect:Array[Int] = rdd.collect() 16 | println("Number of Partitions: "+rdd.getNumPartitions) 17 | println("Action: First element: "+rdd.first()) 18 | println("Action: RDD converted to Array[Int] : ") 19 | rddCollect.foreach(println) 20 | 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/RDDFromParallelizeRange.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object RDDFromParallelizeRange { 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local[3]") 11 | .appName("SparkByExample") 12 | .getOrCreate() 13 | 14 | val sc = spark.sparkContext 15 | 16 | val rdd4:RDD[Range] = sc.parallelize(List(1 to 1000)) 17 | println("Number of 
Partitions : "+rdd4.getNumPartitions) 18 | 19 | val rdd5 = rdd4.repartition(5) 20 | println("Number of Partitions : "+rdd5.getNumPartitions) 21 | 22 | val rdd6:Array[Range] = rdd5.collect() 23 | println(rdd6.mkString(",")) 24 | 25 | val rdd7:Array[Array[Range]] = rdd5.glom().collect() 26 | println("After glom"); 27 | rdd7.foreach(f=>{ 28 | println("For each partition") 29 | f.foreach(f1=>println(f1)) 30 | }) 31 | 32 | 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/RDDFromWholeTextFile.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object RDDFromWholeTextFile { 7 | 8 | def main(args:Array[String]): Unit = { 9 | 10 | val spark:SparkSession = SparkSession.builder() 11 | .master("local[3]") 12 | .appName("SparkByExamples.com") 13 | .getOrCreate() 14 | val sc = spark.sparkContext 15 | 16 | val rdd = sc.wholeTextFiles("C://000_Projects/opt/BigData/alice.txt") 17 | rdd.foreach(a=>println(a._1+"---->"+a._2)) 18 | 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/RDDHadoopInputFormat.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | object RDDHadoopInputFormat_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/RDDPrint.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object RDDPrint extends App{ 6 | 7 | val spark:SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExample") 10 | .getOrCreate() 11 | val dept = List(("Finance",10),("Marketing",20), 12 | ("Sales",30), ("IT",40)) 13 | val rdd=spark.sparkContext.parallelize(dept) 14 | println(rdd) 15 | val dataColl=rdd.collect() 16 | println(dataColl) 17 | dataColl.foreach(println) 18 | 19 | dataColl.foreach(f=>println(f._1 +","+f._2)) 20 | val dataCollLis=rdd.collectAsMap() 21 | dataCollLis.foreach(f=>println(f._1 +","+f._2)) 22 | 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/RDDReadFilesFromDirectory.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | object RDDReadFilesFromDirectory_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/RDDRepartitionExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object RDDRepartitionExample extends App { 6 | 7 | val spark:SparkSession = SparkSession.builder() 8 | .master("local[5]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | val rdd = spark.sparkContext.parallelize(Range(0,20)) 13 | println("From local[5]"+rdd.partitions.size) 14 | 15 | val rdd1 = spark.sparkContext.parallelize(Range(0,20), 6) 16 | println("parallelize : "+rdd1.partitions.size) 17 | 18 | 
rdd1.partitions.foreach(f=> f.toString) 19 | val rddFromFile = spark.sparkContext.textFile("src/main/resources/test.txt",9) 20 | 21 | println("TextFile : "+rddFromFile.partitions.size) 22 | 23 | rdd1.saveAsTextFile("c:/tmp/partition") 24 | val rdd2 = rdd1.repartition(4) 25 | println("Repartition size : "+rdd2.partitions.size) 26 | 27 | rdd2.saveAsTextFile("c:/tmp/re-partition") 28 | 29 | val rdd3 = rdd1.coalesce(4) 30 | println("Repartition size : "+rdd3.partitions.size) 31 | 32 | rdd3.saveAsTextFile("c:/tmp/coalesce") 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/RDDSaveAsObjectFile.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | object RDDSaveAsObjectFile_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/RDDSequenceFiles.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | object RDDSequenceFiles_ { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/RDDShuffleExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object RDDShuffleExample extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local[5]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | 13 | val sc = spark.sparkContext 14 | 15 | val rdd:RDD[String] = sc.textFile("src/main/resources/test.txt") 16 | 17 | println(rdd.getNumPartitions) 18 | val rdd2 = rdd.flatMap(f=>f.split(" ")) 19 | .map(m=>(m,1)) 20 | 21 | //ReduceBy transformation 22 | val rdd5 = rdd2.reduceByKey(_ + _) 23 | 24 | println(rdd5.getNumPartitions) 25 | 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/ReadMultipleCSVFiles.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object ReadMultipleCSVFiles extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | println("spark read csv files from a directory into RDD") 16 | val rddFromFile = spark.sparkContext.textFile("C:/tmp/files/text01.csv") 17 | println(rddFromFile.getClass) 18 | 19 | val rdd = rddFromFile.map(f=>{ 20 | f.split(",") 21 | }) 22 | 23 | println("Iterate RDD") 24 | rdd.foreach(f=>{ 25 | println("Col1:"+f(0)+",Col2:"+f(1)) 26 | }) 27 | println(rdd) 28 | 29 | println("Get data Using collect") 30 | rdd.collect().foreach(f=>{ 31 | println("Col1:"+f(0)+",Col2:"+f(1)) 32 | }) 33 | 34 | println("read all csv files from a directory to single RDD") 35 | val rdd2 = spark.sparkContext.textFile("C:/tmp/files/*") 36 | rdd2.foreach(f=>{ 37 | println(f) 38 | }) 39 | 40 | println("read csv files base on wildcard character") 41 | val rdd3 = spark.sparkContext.textFile("C:/tmp/files/text*.csv") 42 | rdd3.foreach(f=>{ 43 | println(f) 44 | }) 45 | 46 | 
println("read multiple csv files into a RDD") 47 | val rdd4 = spark.sparkContext.textFile("C:/tmp/files/text01.csv,C:/tmp/files/text02.csv") 48 | rdd4.foreach(f=>{ 49 | println(f) 50 | }) 51 | 52 | } 53 | 54 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/ReadMultipleFiles.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | object ReadMultipleFiles extends App { 4 | 5 | import org.apache.spark.sql.SparkSession 6 | 7 | object ReadMultipleFiles extends App { 8 | 9 | val spark:SparkSession = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExamples.com") 12 | .getOrCreate() 13 | 14 | spark.sparkContext.setLogLevel("ERROR") 15 | 16 | println("read all text files from a directory to single RDD") 17 | val rdd = spark.sparkContext.textFile("C:/tmp/files/*") 18 | rdd.foreach(f=>{ 19 | println(f) 20 | }) 21 | 22 | println("read text files base on wildcard character") 23 | val rdd2 = spark.sparkContext.textFile("C:/tmp/files/text*.txt") 24 | rdd2.foreach(f=>{ 25 | println(f) 26 | }) 27 | 28 | println("read multiple text files into a RDD") 29 | val rdd3 = spark.sparkContext.textFile("C:/tmp/files/text01.txt,C:/tmp/files/text02.txt") 30 | rdd3.foreach(f=>{ 31 | println(f) 32 | }) 33 | 34 | println("Read files and directory together") 35 | val rdd4 = spark.sparkContext.textFile("C:/tmp/files/text01.txt,C:/tmp/files/text02.txt,C:/tmp/files/*") 36 | rdd4.foreach(f=>{ 37 | println(f) 38 | }) 39 | 40 | 41 | val rddWhole = spark.sparkContext.wholeTextFiles("C:/tmp/files/*") 42 | rddWhole.foreach(f=>{ 43 | println(f._1+"=>"+f._2) 44 | }) 45 | 46 | val rdd5 = spark.sparkContext.textFile("C:/tmp/files/*") 47 | val rdd6 = rdd5.map(f=>{ 48 | f.split(",") 49 | }) 50 | 51 | rdd6.foreach(f => { 52 | println("Col1:"+f(0)+",Col2:"+f(1)) 53 | }) 54 | 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/ReadTextFiles.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object ReadTextFiles extends App { 7 | 8 | val spark:SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | println("##spark read text files from a directory into RDD") 16 | val rddFromFile = spark.sparkContext.textFile("src/main/resources/csv/text01.txt") 17 | println(rddFromFile.getClass) 18 | 19 | println("##Get data Using collect") 20 | rddFromFile.collect().foreach(f=>{ 21 | println(f) 22 | }) 23 | 24 | println("##read multiple text files into a RDD") 25 | val rdd4 = spark.sparkContext.textFile("src/main/resources/csv/text01.txt," + 26 | "src/main/resources/csv/text02.txt") 27 | rdd4.foreach(f=>{ 28 | println(f) 29 | }) 30 | 31 | println("##read text files base on wildcard character") 32 | val rdd3 = spark.sparkContext.textFile("src/main/resources/csv/text*.txt") 33 | rdd3.foreach(f=>{ 34 | println(f) 35 | }) 36 | 37 | println("##read all text files from a directory to single RDD") 38 | val rdd2 = spark.sparkContext.textFile("src/main/resources/csv/*") 39 | rdd2.foreach(f=>{ 40 | println(f) 41 | }) 42 | 43 | println("##read whole text files") 44 | val rddWhole:RDD[(String,String)] = 
spark.sparkContext.wholeTextFiles("src/main/resources/csv/text01.txt") 45 | println(rddWhole.getClass) 46 | rddWhole.foreach(f=>{ 47 | println(f._1+"=>"+f._2) 48 | }) 49 | } 50 | 51 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/SortBy.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} 6 | 7 | object SortBy { 8 | 9 | def main(args: Array[String]): Unit = { 10 | 11 | val spark:SparkSession = SparkSession.builder() 12 | .master("local[3]") 13 | .appName("SparkByExample") 14 | .getOrCreate() 15 | 16 | val sc = spark.sparkContext 17 | 18 | val rdd:RDD[String] = sc.textFile("C://000_Projects/opt/BigData/zipcodes-noheader.csv") 19 | 20 | val rddZip:RDD[ZipCode] = rdd.map(f=>{ 21 | val arr = split(f) 22 | ZipCode(arr(0).toInt,arr(1),arr(3),arr(4)) 23 | }) 24 | 25 | //SortBy 26 | val rddSort = rddZip.sortBy(f=>f.recordNumber) 27 | rddSort.collect().foreach(f=>println(f.toString)) 28 | 29 | //SorybyKey 30 | //First create pairRDD 31 | val rddTuple=rddZip.map(f=>{ 32 | Tuple2(f.recordNumber,f.toString) 33 | }) 34 | rddTuple.sortByKey().collect().foreach(f=>println(f._2)) 35 | } 36 | 37 | def split(str:String): Array[String] ={ 38 | str.split(",") 39 | } 40 | 41 | } 42 | 43 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/ZipCode.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd 2 | 3 | case class ZipCode(recordNumber:Int,zipCode:String,city:String,state:String) 4 | 5 | 6 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/functions/FlatMapExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd.functions 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | import org.apache.spark.sql.types.{ArrayType, StringType, StructType} 5 | 6 | object FlatMapExample extends App{ 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .master("local[1]") 10 | .appName("SparkByExamples.com") 11 | .getOrCreate() 12 | 13 | val data = Seq("Project Gutenberg’s", 14 | "Alice’s Adventures in Wonderland", 15 | "Project Gutenberg’s", 16 | "Adventures in Wonderland", 17 | "Project Gutenberg’s") 18 | val rdd=spark.sparkContext.parallelize(data) 19 | rdd.foreach(println) 20 | 21 | val rdd1 = rdd.flatMap(f=>f.split(" ")) 22 | rdd1.foreach(println) 23 | 24 | val arrayStructureData = Seq( 25 | Row("James,,Smith",List("Java","Scala","C++"),"CA"), 26 | Row("Michael,Rose,",List("Spark","Java","C++"),"NJ"), 27 | Row("Robert,,Williams",List("CSharp","VB","R"),"NV") 28 | ) 29 | 30 | val arrayStructureSchema = new StructType() 31 | .add("name",StringType) 32 | .add("languagesAtSchool", ArrayType(StringType)) 33 | .add("currentState", StringType) 34 | 35 | 36 | val df = spark.createDataFrame( 37 | spark.sparkContext.parallelize(arrayStructureData),arrayStructureSchema) 38 | import spark.implicits._ 39 | val df2=df.flatMap(f=> f.getSeq[String](1).map((f.getString(0),_,f.getString(2)))) 40 | .toDF("Name","Language","State") 41 | df2.show(false) 42 | 43 | } 44 | 
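// Added annotation (not part of the original FlatMapExample.scala above): map keeps one output
// element per input element while flatMap flattens the results. A hedged sketch reusing the rdd
// and rdd1 values defined in that file; the expected counts assume the five input strings shown above.
val mappedRdd = rdd.map(f => f.split(" ")) // RDD[Array[String]]: one array per input line
println(mappedRdd.count()) // should print 5 (one array per line)
println(rdd1.count())      // should print 13 (flatMap flattens the arrays into individual words)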
-------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/functions/MapExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd.functions 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object MapExample extends App{ 6 | 7 | val spark: SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | val data = Seq("Project", 13 | "Gutenberg’s", 14 | "Alice’s", 15 | "Adventures", 16 | "in", 17 | "Wonderland", 18 | "Project", 19 | "Gutenberg’s", 20 | "Adventures", 21 | "in", 22 | "Wonderland", 23 | "Project", 24 | "Gutenberg’s") 25 | 26 | val rdd=spark.sparkContext.parallelize(data) 27 | 28 | val rdd2=rdd.map(f=> (f,1)) 29 | rdd2.foreach(println) 30 | 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/functions/ReduceByKeyExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd.functions 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object ReduceByKeyExample extends App{ 6 | 7 | val spark: SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | val data = Seq(("Project", 1), 13 | ("Gutenberg’s", 1), 14 | ("Alice’s", 1), 15 | ("Adventures", 1), 16 | ("in", 1), 17 | ("Wonderland", 1), 18 | ("Project", 1), 19 | ("Gutenberg’s", 1), 20 | ("Adventures", 1), 21 | ("in", 1), 22 | ("Wonderland", 1), 23 | ("Project", 1), 24 | ("Gutenberg’s", 1)) 25 | 26 | val rdd=spark.sparkContext.parallelize(data) 27 | 28 | val rdd2=rdd.reduceByKey(_ + _) 29 | 30 | rdd2.foreach(println) 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/functions/SortByKeyExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd.functions 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object SortByKeyExample extends App{ 6 | 7 | val spark: SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | val data = Seq(("Project","A", 1), 13 | ("Gutenberg’s", "X",3), 14 | ("Alice’s", "C",5), 15 | ("Adventures","B", 1) 16 | ) 17 | 18 | val rdd=spark.sparkContext.parallelize(data) 19 | rdd.foreach(println) 20 | val rdd2=rdd.map(f=>{(f._2, (f._1,f._2,f._3))}) 21 | rdd2.foreach(println) 22 | val rdd3= rdd2.sortByKey() 23 | val rdd4= rdd2.sortByKey(false) 24 | rdd4.foreach(println) 25 | 26 | val rdd5 = rdd.sortBy(f=>(f._3,f._2),false) 27 | rdd5.foreach(println) 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/functions/aggregateExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd.functions 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object aggregateExample extends App { 6 | 7 | val spark = SparkSession.builder() 8 | .appName("SparkByExamples.com") 9 | .master("local[3]") 10 | .getOrCreate() 11 | 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | //aggregate example 15 | val listRdd = spark.sparkContext.parallelize(List(1,2,3,4,5,3,2)) 16 | def 
param0= (accu:Int, v:Int) => accu + v 17 | def param1= (accu1:Int,accu2:Int) => accu1 + accu2 18 | println("output 1 : "+listRdd.aggregate(0)(param0,param1)) 19 | 20 | 21 | val inputRDD = spark.sparkContext.parallelize(List(("Z", 1),("A", 20),("B", 30),("C", 40),("B", 30),("B", 60))) 22 | def param3= (accu:Int, v:(String,Int)) => accu + v._2 23 | def param4= (accu1:Int,accu2:Int) => accu1 + accu2 24 | println("output 2 : "+inputRDD.aggregate(0)(param3,param4)) 25 | 26 | println("Number fo Partitions :"+listRdd.getNumPartitions) 27 | //aggregate example 28 | println("output 1 : "+listRdd.aggregate(1)(param0,param1)) 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/functions/foldExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd.functions 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object foldExample extends App { 6 | 7 | val spark = SparkSession.builder() 8 | .appName("SparkByExamples.com") 9 | .master("local[3]") 10 | .getOrCreate() 11 | 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | //fold example 15 | val listRdd = spark.sparkContext.parallelize(List(1,2,3,4,5,3,2)) 16 | println("Partitions : "+listRdd.getNumPartitions) 17 | println("Total : "+listRdd.fold(0)((acc,ele) => {acc + ele})) 18 | println("Total with init value 2 : "+listRdd.fold(2)((acc,ele) => {acc + ele})) 19 | println("Min : "+listRdd.fold(0)((acc,ele) => {acc min ele})) 20 | println("Max : "+listRdd.fold(0)((acc,ele) => {acc max ele})) 21 | 22 | val inputRDD = spark.sparkContext.parallelize(List(("Z", 1),("A", 20),("B", 30),("C", 40),("B", 30),("B", 60))) 23 | 24 | println("Total : "+inputRDD.fold(("",0))( (acc,ele)=>{ ("Total", acc._2 + ele._2) })) 25 | println("Min : "+inputRDD.fold(("",0))( (acc,ele)=>{ ("Min", acc._2 min ele._2) })) 26 | println("Max : "+inputRDD.fold(("",0))( (acc,ele)=>{ ("Max", acc._2 max ele._2) })) 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/functions/reduceExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd.functions 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object reduceExample extends App { 6 | 7 | val spark = SparkSession.builder() 8 | .appName("SparkByExamples.com") 9 | .master("local[3]") 10 | .getOrCreate() 11 | 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | val listRdd = spark.sparkContext.parallelize(List(1,2,3,4,5,3,2)) 15 | 16 | println("output min using binary : "+listRdd.reduce(_ min _)) 17 | println("output max using binary : "+listRdd.reduce(_ max _)) 18 | println("output sum using binary : "+listRdd.reduce(_ + _)) 19 | 20 | 21 | // Alternatively you can write 22 | println("output min : "+listRdd.reduce( (a,b) => a min b)) 23 | println("output max : "+listRdd.reduce( (a,b) => a max b)) 24 | println("output sum : "+listRdd.reduce( (a,b) => a + b)) 25 | 26 | 27 | val inputRDD = spark.sparkContext.parallelize(List(("Z", 1),("A", 20),("B", 30), 28 | ("C", 40),("B", 30),("B", 60))) 29 | 30 | println("output min : "+inputRDD.reduce( (a,b)=> ("max",a._2 min b._2))._2) 31 | println("output max : "+inputRDD.reduce( (a,b)=> ("max",a._2 max b._2))._2) 32 | println("output sum : "+inputRDD.reduce( (a,b)=> ("Sum",a._2 + b._2))._2) 33 | } 34 | 
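// Added annotation (not part of the original files above): the three neighbouring examples
// (aggregateExample, foldExample, reduceExample) differ mainly in how an initial value is handled.
// reduce takes no initial value; fold and aggregate apply the zero value once per partition and
// once more when merging partition results, so with master local[3] (3 partitions) the call
// listRdd.fold(2)((acc,ele) => acc + ele) in foldExample should print 20 + 2 * (3 + 1) = 28, not 22.
// aggregate additionally lets the result type differ from the element type, as in this hedged
// sketch (reuses the spark session from reduceExample above; the data is hypothetical):
val pairs = spark.sparkContext.parallelize(List(("a", 1), ("b", 2), ("a", 3)))
val sumOfValues = pairs.aggregate(0)((acc, v) => acc + v._2, (a, b) => a + b)
println(sumOfValues) // should print 6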
-------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/rdd/xml/XmlRecordReader.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.rdd.xml 2 | 3 | import com.databricks.spark.xml.XmlInputFormat 4 | import org.apache.hadoop.conf.Configuration 5 | import org.apache.hadoop.io.{LongWritable, Text} 6 | import org.apache.spark.api.java.JavaSparkContext 7 | import org.apache.spark.api.java.function.VoidFunction 8 | import org.apache.spark.sql.SparkSession 9 | 10 | import scala.xml.XML 11 | 12 | 13 | object XmlRecordReader { 14 | def main(args: Array[String]): Unit = { 15 | val sparkSession = SparkSession.builder.appName("XmlRecordReader").master("local").getOrCreate 16 | val javaSparkContext = new JavaSparkContext(sparkSession.sparkContext) 17 | val configuration = new Configuration 18 | configuration.set("xmlinput.start", "") 19 | configuration.set("xmlinput.end", "") 20 | configuration.set("mapreduce.input.fileinputformat.inputdir", "src/main/resources/records.xml") 21 | val javaPairRDD = javaSparkContext.newAPIHadoopRDD(configuration, classOf[XmlInputFormat], classOf[LongWritable], classOf[Text]) 22 | javaPairRDD.foreach(new VoidFunction[Tuple2[LongWritable, Text]]() { 23 | @throws[Exception] 24 | override def call(tuple: Tuple2[LongWritable, Text]): Unit = { // TODO Auto-generated method stub 25 | 26 | val xml = XML.loadString(tuple._2.toString) 27 | val forecast = (xml \ "Name").text 28 | 29 | println("forecast: " + forecast) 30 | 31 | } 32 | }) 33 | } 34 | } 35 | 36 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/spark30/ADQExample.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.spark30 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object ADQExample extends App{ 6 | 7 | val spark: SparkSession = SparkSession.builder() 8 | .master("local[5]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | spark.sparkContext.setLogLevel("ERROR") 13 | 14 | import spark.implicits._ 15 | val simpleData = Seq(("James","Sales","NY",90000,34,10000), 16 | ("Michael","Sales","NY",86000,56,20000), 17 | ("Robert","Sales","CA",81000,30,23000), 18 | ("Maria","Finance","CA",90000,24,23000), 19 | ("Raman","Finance","CA",99000,40,24000), 20 | ("Scott","Finance","NY",83000,36,19000), 21 | ("Jen","Finance","NY",79000,53,15000), 22 | ("Jeff","Marketing","CA",80000,25,18000), 23 | ("Kumar","Marketing","NY",91000,50,21000) 24 | ) 25 | val df = simpleData.toDF("employee_name","department","state","salary","age","bonus") 26 | 27 | val df1=df.groupBy("department").count() 28 | println(df1.rdd.getNumPartitions) 29 | 30 | spark.conf.set("spark.sql.adaptive.enabled",true) 31 | val df2=df.groupBy("department").count() 32 | println(df2.rdd.getNumPartitions) 33 | 34 | 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/spark30/ReadBinary.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.spark30 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object ReadBinary extends App{ 6 | 7 | val spark: SparkSession = SparkSession.builder() 8 | .master("local[1]") 9 | .appName("SparkByExamples.com") 10 | .getOrCreate() 11 | 12 | 
//spark.sparkContext.setLogLevel("ERROR") 13 | 14 | val df = spark.read.format("binaryFile").load("C:\\tmp\\binary\\spark.png") 15 | df.printSchema() 16 | df.show() 17 | 18 | val df2 = spark.read.format("binaryFile").load("C:\\tmp\\binary\\") 19 | df2.printSchema() 20 | //df2.show(false) 21 | 22 | val df3 = spark.read.format("binaryFile").load("C:\\tmp\\binary\\*.png") 23 | df3.printSchema() 24 | df3.show(false) 25 | 26 | // To load files with paths matching a given glob pattern while keeping the behavior of partition discovery 27 | val df4 = spark.read.format("binaryFile") 28 | .option("pathGlobFilter", "*.png") 29 | .load("C:\\tmp\\binary\\") 30 | df4.printSchema() 31 | //df4.show(false) 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/stackoverflow/AddingLiterral.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.stackoverflow 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} 5 | case class Employee(EmpId: String, Experience: Double, Salary: Double) 6 | 7 | case class Employee2(EmpId: EmpData, Experience: EmpData, Salary: EmpData) 8 | case class EmpData(key: String,value:String) 9 | object AddingLiterral { 10 | def main(args: Array[String]): Unit = { 11 | 12 | val spark = SparkSession.builder() 13 | .master("local[1]") 14 | .appName("SparkByExample") 15 | .getOrCreate(); 16 | import spark.sqlContext.implicits._ 17 | import org.apache.spark.sql.functions._ 18 | val data = Seq(("111",5,50000),("222",6,60000),("333",7,60000)) 19 | val df = data.toDF("EmpId","Experience","Salary") 20 | 21 | val newdf = df.withColumn("EmpId", struct(lit("1").as("key"),col("EmpId").as("value"))) 22 | .withColumn("Experience", struct(lit("2").as("key"),col("Experience").as("value"))) 23 | .withColumn("Salary", struct(lit("3").as("key"),col("Salary").as("value"))) 24 | .show(false) 25 | 26 | val ds = df.as[Employee] 27 | val newDS = ds.map(rec=>{ 28 | (EmpData("1",rec.EmpId), EmpData("2",rec.Experience.toString),EmpData("3",rec.Salary.toString)) 29 | }) 30 | val finalDS = newDS.toDF("EmpId","Experience","Salary").as[Employee2] 31 | finalDS.show(false) 32 | // newDS.withColumnRenamed("_1","EmpId") 33 | // .withColumnRenamed("_2","Experience") 34 | // .withColumnRenamed("_3","Salary") 35 | // .show(false) 36 | 37 | 38 | 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/stackoverflow/SparkContextOld.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.stackoverflow 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | object SparkContextOld extends App{ 6 | 7 | val conf = new SparkConf().setAppName("sparkbyexamples.com").setMaster("local[1]") 8 | val sparkContext = new SparkContext(conf) 9 | val rdd = sparkContext.textFile("/src/main/resources/text/alice.txt") 10 | 11 | sparkContext.setLogLevel("ERROR") 12 | 13 | println("First SparkContext:") 14 | println("APP Name :"+sparkContext.appName) 15 | println("Deploy Mode :"+sparkContext.deployMode) 16 | println("Master :"+sparkContext.master) 17 | println("Master :"+sparkContext.applicationId) 18 | // sparkContext.stop() 19 | 20 | val conf2 = new SparkConf().setAppName("sparkbyexamples.com-2").setMaster("local[1]") 21 | val sparkContext2 = new 
SparkContext(conf2) 22 | 23 | println("Second SparkContext:") 24 | println("APP Name :"+sparkContext2.appName) 25 | println("Deploy Mode :"+sparkContext2.deployMode) 26 | println("Master :"+sparkContext2.master) 27 | 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/stackoverflow/Test.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.stackoverflow 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | import org.apache.spark.sql.functions._ 5 | object Test { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val spark = SparkSession.builder() 10 | .master("local[1]") 11 | .appName("SparkByExample") 12 | .getOrCreate(); 13 | import spark.sqlContext.implicits._ 14 | 15 | val df1:DataFrame = Seq( 16 | ("Mark", "2018-02-20 00:00:00"), 17 | ("Alex", "2018-03-01 00:00:00"), 18 | ("Bob", "2018-03-01 00:00:00"), 19 | ("Mark", "2018-07-01 00:00:00"), 20 | ("Kate", "2018-07-01 00:00:00") 21 | ).toDF("USER_NAME", "REQUEST_DATE") 22 | 23 | df1.show() 24 | 25 | val df2: DataFrame = Seq( 26 | ("Alex", "2018-01-01 00:00:00", "2018-02-01 00:00:00", "OUT"), 27 | ("Bob", "2018-02-01 00:00:00", "2018-02-05 00:00:00", "IN"), 28 | ("Mark", "2018-02-01 00:00:00", "2018-03-01 00:00:00", "IN"), 29 | ("Mark", "2018-05-01 00:00:00", "2018-08-01 00:00:00", "OUT"), 30 | ("Meggy", "2018-02-01 00:00:00", "2018-02-01 00:00:00", "OUT") 31 | ).toDF("NAME", "START_DATE", "END_DATE", "STATUS") 32 | 33 | df2.show() 34 | 35 | val df3 = df1.join(df2, col("USER_NAME") === col("NAME"), "left_outer") 36 | 37 | 38 | df3.groupBy("USER_NAME","REQUEST_DATE") 39 | 40 | val df4 = df3.withColumn("USER_STATUS", when($"REQUEST_DATE" > $"START_DATE" and $"REQUEST_DATE" < $"END_DATE", "Our user") otherwise ("Not our user")) 41 | 42 | df4.select("USER_NAME","REQUEST_DATE","USER_STATUS").distinct()show(false) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/com/sparkbyexamples/spark/stackoverflow/Test2.scala: -------------------------------------------------------------------------------- 1 | package com.sparkbyexamples.spark.stackoverflow 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object Test2 { 6 | 7 | // def main(args: Array[String]): Unit = { 8 | // 9 | // val spark = SparkSession.builder() 10 | // .master("local[1]") 11 | // .appName("SparkByExample") 12 | // .getOrCreate(); 13 | // 14 | // val peopleDFCsv = spark.read.format("csv") 15 | // .load("src/main/resources/stack.csv") 16 | // 17 | // val d = peopleDFCsv.map(row=>{ 18 | // val col1=row.get(1) 19 | // val col2=row.get(1) 20 | // (col1,col2) 21 | // }).toDF() 22 | // 23 | // } 24 | } 25 | --------------------------------------------------------------------------------