├── .gitignore
├── README.md
├── pom.xml
└── src
└── main
├── resources
├── address-multiline.csv
├── address.csv
├── books.xml
├── books_withnested_array.xml
├── csv
│ ├── invalid.txt
│ ├── text01.txt
│ ├── text02.txt
│ ├── text03.txt
│ └── text04.txt
├── free-zipcode-database.csv
├── kv.csv
├── multiline-zipcode.json
├── persons.xml
├── persons_complex.xml
├── records.xml
├── schema.json
├── simple_zipcodes.csv
├── simple_zipcodes.json
├── simple_zipcodes.txt
├── small_zipcode.csv
├── stream.csv
├── test.txt
├── txt
│ ├── alice.txt
│ ├── datasets.csv
│ └── holmes.txt
├── zipcodes-noheader.csv
├── zipcodes.avro
├── zipcodes.csv
├── zipcodes.json
├── zipcodes.parquet
├── zipcodes20.csv
└── zipcodes_streaming
│ ├── zipcode1.json
│ ├── zipcode10.json
│ ├── zipcode11.json
│ ├── zipcode12.json
│ ├── zipcode2.json
│ ├── zipcode3.json
│ ├── zipcode4.json
│ ├── zipcode5.json
│ ├── zipcode6.json
│ ├── zipcode7.json
│ ├── zipcode8.json
│ └── zipcode9.json
└── scala
└── com
└── sparkbyexamples
└── spark
├── SQLContextExample.scala
├── SparkContextExample.scala
├── SparkSessionTest.scala
├── SparkSessionWrapper.scala
├── beans
├── Books.scala
├── BooksDiscounted.scala
├── BooksStruct.scala
├── BooksWithArray.scala
├── User.scala
└── Zipcode.scala
├── dataframe
├── ArrayToColumn.scala
├── AvroExample.scala
├── AvroToJson.scala
├── AvroToParquet.scala
├── BroadcastExample.scala
├── CaseClassSparkSchema.scala
├── CastColumnType.scala
├── ColumnTruncate.scala
├── CreateDataFrame.scala
├── CreateEmptyDataFrameExample.scala
├── CreateEmptyDatasetExample.scala
├── CsvToAvroParquetJson.scala
├── DataFrameFromCSVFile.scala
├── DataFrameWithComplexDSL.scala
├── DataFrameWithSimpleDSL.scala
├── DataTypeExample.scala
├── FilterExample.scala
├── FilterNullRowsExample.scala
├── FlattenNestedStruct.scala
├── FromCSVFile2.scala
├── FromCSVMultiline.scala
├── FromJsonFile.scala
├── FromTextFile.scala
├── GroupbyExample.scala
├── HandleNullExample.scala
├── JsonFromMultiline.scala
├── JsonToAvroCsvParquet.scala
├── ParquetAWSExample.scala
├── ParquetExample.scala
├── ParquetToAvro.scala
├── ParquetToCsv.scala
├── ParquetToJson.scala
├── ReadJsonFromString.scala
├── RemoveNullRowsExample.scala
├── RenameColDataFrame.scala
├── SQLExample.scala
├── SaveDataFrame.scala
├── SparkUDF.scala
├── StructTypeUsage.scala
├── UDFDataFrame.scala
├── UnionExample.scala
├── WhereExample.scala
├── WithColumn.scala
├── examples
│ ├── CacheExample.scala
│ ├── CastStringToInt.scala
│ ├── CollectExample.scala
│ ├── DataFrameComplex.scala
│ ├── DataFrameEmptyCheck.scala
│ ├── DropColumn.scala
│ ├── ForEachExample.scala
│ ├── ForEachPartExample.scala
│ ├── MapFlatMap.scala
│ ├── MapTransformation.scala
│ ├── RangePartition.scala
│ ├── ReadORCFile.scala
│ ├── RenameDeleteFile.scala
│ ├── RepartitionExample.scala
│ ├── SaveSingleFile.scala
│ ├── SelectExamples.scala
│ ├── SelectSelectExpr.scala
│ ├── ShuffleExample.scala
│ └── Util.scala
├── functions
│ ├── AddColumn.scala
│ ├── AnotherExample.scala
│ ├── PivotExample.scala
│ ├── RemoveDuplicate.scala
│ ├── SortExample.scala
│ ├── WhenOtherwise.scala
│ ├── WindowGroupbyFirst.scala
│ ├── aggregate
│ │ ├── AggregateFunctions.scala
│ │ ├── DistinctCount.scala
│ │ └── SQLDistinct.scala
│ ├── collection
│ │ ├── ArrayContainsExample.scala
│ │ ├── ArrayOfArrayType.scala
│ │ ├── ArrayOfMapType.scala
│ │ ├── ArrayOfString.scala
│ │ ├── ArrayOfStructType.scala
│ │ ├── ArrayTypeExample.scala
│ │ ├── CollectListExample.scala
│ │ ├── ExplodeArrayAndMap.scala
│ │ ├── MapFunctions.scala
│ │ ├── MapToColumn.scala
│ │ ├── MapTypeExample.scala
│ │ ├── SliceArray.scala
│ │ └── StringToArray.scala
│ ├── datetime
│ │ ├── AddTime.scala
│ │ ├── CurrentDateAndTime.scala
│ │ ├── DateAddMonths.scala
│ │ ├── DateDiff.scala
│ │ ├── DateExamples.scala
│ │ ├── DateFormat.scala
│ │ ├── DateInMilli.scala
│ │ ├── DateLastDay.scala
│ │ ├── DateToString.scala
│ │ ├── DateTrunc.scala
│ │ ├── DayAndWeekOfYear.scala
│ │ ├── DayWeekAndWeekMonth.scala
│ │ ├── GetTimeFromTimestamp.scala
│ │ ├── Spark3Date.scala
│ │ ├── StringToDate.scala
│ │ ├── StringToTimestamp.scala
│ │ ├── TimeInMilli.scala
│ │ ├── TimestampDiff.scala
│ │ ├── TimestampToDate.scala
│ │ ├── TimestampToString.scala
│ │ └── UnixTimestamp.scala
│ ├── from_json.scala
│ ├── litTypeLit.scala
│ ├── string
│ │ ├── ConcatExample.scala
│ │ └── SplitExample.scala
│ └── window
│ │ ├── RowNumber.scala
│ │ ├── WindowFunctions.scala
│ │ └── WindowGroupbyFirst.scala
├── join
│ ├── CrossJoinExample.scala
│ ├── InnerJoinExample.scala
│ ├── JoinExample.scala
│ ├── JoinMultipleColumns.scala
│ ├── JoinMultipleDataFrames.scala
│ └── SelfJoinExample.scala
└── xml
│ ├── PersonsComplexXML.scala
│ ├── PersonsXML.scala
│ ├── ReadBooksXMLWithNestedArray.scala
│ ├── ReadBooksXMLWithNestedArrayStruct.scala
│ └── xstream
│ └── WriteXML.scala
├── dataset
├── DataSetFromData.scala
├── DataSetWithCustomClass.scala
└── xml
│ ├── ReadBooksXML.scala
│ ├── ReadBooksXMLWithNestedArray.scala
│ ├── ReadBooksXMLWithNestedArrayDSL.scala
│ ├── SparkXMLUsingXstream.scala
│ └── sparkXml.scala
├── rdd
├── CreateEmptyRDD.scala
├── CreateRDD.scala
├── OperationOnPairRDDComplex.scala
├── OperationsOnPairRDD.scala
├── OperationsOnRDD.scala
├── PartitionBy.scala
├── RDDAccumulator.scala
├── RDDActions.scala
├── RDDBroadcast.scala
├── RDDCache.scala
├── RDDFromCSVFile.scala
├── RDDFromDataUsingParallelize.scala
├── RDDFromParallelizeRange.scala
├── RDDFromWholeTextFile.scala
├── RDDHadoopInputFormat.scala
├── RDDPrint.scala
├── RDDReadFilesFromDirectory.scala
├── RDDRepartitionExample.scala
├── RDDSaveAsObjectFile.scala
├── RDDSequenceFiles.scala
├── RDDShuffleExample.scala
├── ReadMultipleCSVFiles.scala
├── ReadMultipleFiles.scala
├── ReadTextFiles.scala
├── SortBy.scala
├── WordCountExample.scala
├── ZipCode.scala
├── functions
│ ├── FlatMapExample.scala
│ ├── MapExample.scala
│ ├── ReduceByKeyExample.scala
│ ├── SortByKeyExample.scala
│ ├── aggregateExample.scala
│ ├── foldExample.scala
│ └── reduceExample.scala
└── xml
│ └── XmlRecordReader.scala
├── spark30
├── ADQExample.scala
└── ReadBinary.scala
└── stackoverflow
├── AddingLiterral.scala
├── SparkContextOld.scala
├── Test.scala
└── Test2.scala
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | .metadata
3 | .cache-main
4 | .classpath
5 | .project
6 | .settings
7 | *.class
8 | *.orig
9 | *.log
10 | target/
11 | .DS_Store
12 | *.iml
13 | scalastyle-output.xml
14 |
15 |
--------------------------------------------------------------------------------
/src/main/resources/address-multiline.csv:
--------------------------------------------------------------------------------
1 | Id,Address Line1,City,State,Zipcode
2 | 1,9182 Clear Water Rd,Fayetteville,AR,72704
3 | 2,"9920 State
4 | Highway 89",Ringling,OK,73456
5 | 3,9724 E Landon Ln,Kennewick,WA,99338
6 |
7 |
8 |
--------------------------------------------------------------------------------
/src/main/resources/address.csv:
--------------------------------------------------------------------------------
1 | Id,Address Line1,City,State,Zipcode
2 | 1,9182 Clear Water Rd,Fayetteville,AR,72704
3 | 2,9724 E Landon Ln,Kennewick,WA,99338
4 | 3,9509 Clay Creek Ln,Fort Worth,TX,76177
5 | 4,98016 S Garnsey St,Santa Ana,CA,92707
6 | 5,9920 State Highway 89,Ringling,OK,73456
--------------------------------------------------------------------------------
/src/main/resources/csv/invalid.txt:
--------------------------------------------------------------------------------
1 | Invalid,I
--------------------------------------------------------------------------------
/src/main/resources/csv/text01.txt:
--------------------------------------------------------------------------------
1 | One,1
2 | Eleven,11
--------------------------------------------------------------------------------
/src/main/resources/csv/text02.txt:
--------------------------------------------------------------------------------
1 | Two,2
--------------------------------------------------------------------------------
/src/main/resources/csv/text03.txt:
--------------------------------------------------------------------------------
1 | Three,3
--------------------------------------------------------------------------------
/src/main/resources/csv/text04.txt:
--------------------------------------------------------------------------------
1 | Four,4
--------------------------------------------------------------------------------
/src/main/resources/kv.csv:
--------------------------------------------------------------------------------
1 | key,value
2 | record1,My Name is Naveen
3 | record2,My Name is Praveen
4 | record3,My Name is Prabha
--------------------------------------------------------------------------------
/src/main/resources/multiline-zipcode.json:
--------------------------------------------------------------------------------
1 | [{
2 | "RecordNumber": 2,
3 | "Zipcode": 704,
4 | "ZipCodeType": "STANDARD",
5 | "City": "PASEO COSTA DEL SUR",
6 | "State": "PR"
7 | },
8 | {
9 | "RecordNumber": 10,
10 | "Zipcode": 709,
11 | "ZipCodeType": "STANDARD",
12 | "City": "BDA SAN LUIS",
13 | "State": "PR"
14 | }]
15 |
--------------------------------------------------------------------------------
/src/main/resources/persons.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | James
4 | Smith
5 |
6 | 1980
7 | 1
8 | M
9 | 10000
10 |
11 |
12 | 123 ABC street
13 | NewJersy
14 | NJ
15 |
16 |
17 | 456 apple street
18 | newark
19 | DE
20 |
21 |
22 |
23 |
24 | Michael
25 |
26 | Rose
27 | 1990
28 | 6
29 | M
30 | 10000
31 |
32 |
33 | 4512 main st
34 | new york
35 | NY
36 |
37 |
38 | 4367 orange st
39 | sandiago
40 | CA
41 |
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
/src/main/resources/persons_complex.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | James
4 | Smith
5 |
6 | 1980
7 | 1
8 | M
9 | 10000
10 |
11 |
12 | 1 capler dr
13 | new york
14 | NY
15 |
16 |
17 | 455 catalina dr
18 | chicago
19 | IL
20 |
21 |
22 |
23 |
24 | Michael
25 |
26 | Rose
27 | 1990
28 | 6
29 | M
30 | 10000
31 |
32 |
33 | 2345 pasadena village
34 | orlando
35 | FL
36 |
37 |
38 | 3 walnut dr
39 | wilmington
40 | DE
41 |
42 |
43 |
44 |
--------------------------------------------------------------------------------
/src/main/resources/records.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | John
4 | 10
5 | M
6 |
7 |
8 | Jenny
9 | 12
10 | F
11 |
12 |
13 | Janardhan
14 | 14
15 | M
16 |
17 |
18 |
--------------------------------------------------------------------------------
/src/main/resources/schema.json:
--------------------------------------------------------------------------------
1 | {
2 | "type" : "struct",
3 | "fields" : [ {
4 | "name" : "name",
5 | "type" : {
6 | "type" : "struct",
7 | "fields" : [ {
8 | "name" : "firstname",
9 | "type" : "string",
10 | "nullable" : true,
11 | "metadata" : { }
12 | }, {
13 | "name" : "middlename",
14 | "type" : "string",
15 | "nullable" : true,
16 | "metadata" : { }
17 | }, {
18 | "name" : "lastname",
19 | "type" : "string",
20 | "nullable" : true,
21 | "metadata" : { }
22 | } ]
23 | },
24 | "nullable" : true,
25 | "metadata" : { }
26 | }, {
27 | "name" : "dob",
28 | "type" : "string",
29 | "nullable" : true,
30 | "metadata" : { }
31 | }, {
32 | "name" : "gender",
33 | "type" : "string",
34 | "nullable" : true,
35 | "metadata" : { }
36 | }, {
37 | "name" : "salary",
38 | "type" : "integer",
39 | "nullable" : true,
40 | "metadata" : { }
41 | } ]
42 | }
--------------------------------------------------------------------------------
/src/main/resources/simple_zipcodes.csv:
--------------------------------------------------------------------------------
1 | Id,JsonValue
2 | 1,"{\"Zipcode\":704,\"ZipCodeType\":\"STANDARD\",\"City\":\"PARC PARQUE\",\"State\":\"PR\"}"
3 | 2,"{\"Zipcode\":704,\"ZipCodeType\":\"STANDARD\",\"City\":\"PASEO COSTA DEL SUR\",\"State\":\"PR\"}"
4 | 3,"{\"Zipcode\":709,\"ZipCodeType\":\"STANDARD\",\"City\":\"BDA SAN LUIS\",\"State\":\"PR\"}"
5 | 4,"{\"Zipcode\":76166,\"ZipCodeType\":\"UNIQUE\",\"City\":\"CINGULAR WIRELESS\",\"State\":\"TX\"}"
6 | 5,"{\"Zipcode\":76177,\"ZipCodeType\":\"STANDARD\",\"City\":\"FORT WORTH\",\"State\":\"TX\"}"
7 | 6,"{\"Zipcode\":76177,\"ZipCodeType\":\"STANDARD\",\"City\":\"FT WORTH\",\"State\":\"TX\"}"
8 | 7,"{\"Zipcode\":704,\"ZipCodeType\":\"STANDARD\",\"City\":\"URB EUGENE RICE\",\"State\":\"PR\"}"
9 | 8,"{\"Zipcode\":85209,\"ZipCodeType\":\"STANDARD\",\"City\":\"MESA\",\"State\":\"AZ\"}"
10 | 9,"{\"Zipcode\":85210,\"ZipCodeType\":\"STANDARD\",\"City\":\"MESA\",\"State\":\"AZ\"}"
11 | 10,"{\"Zipcode\":32046,\"ZipCodeType\":\"STANDARD\",\"City\":\"HILLIARD\",\"State\":\"FL\"}"
12 |
--------------------------------------------------------------------------------
/src/main/resources/simple_zipcodes.json:
--------------------------------------------------------------------------------
1 | {"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR"}
2 | {"Zipcode":704,"ZipCodeType":"STANDARD","City":"PASEO COSTA DEL SUR","State":"PR"}
3 | {"Zipcode":709,"ZipCodeType":"STANDARD","City":"BDA SAN LUIS","State":"PR"}
4 | {"Zipcode":76166,"ZipCodeType":"UNIQUE","City":"CINGULAR WIRELESS","State":"TX"}
5 | {"Zipcode":76177,"ZipCodeType":"STANDARD","City":"FORT WORTH","State":"TX"}
6 | {"Zipcode":76177,"ZipCodeType":"STANDARD","City":"FT WORTH","State":"TX"}
7 | {"Zipcode":704,"ZipCodeType":"STANDARD","City":"URB EUGENE RICE","State":"PR"}
8 | {"Zipcode":85209,"ZipCodeType":"STANDARD","City":"MESA","State":"AZ"}
9 | {"Zipcode":85210,"ZipCodeType":"STANDARD","City":"MESA","State":"AZ"}
10 | {"Zipcode":32046,"ZipCodeType":"STANDARD","City":"HILLIARD","State":"FL"}
11 |
--------------------------------------------------------------------------------
/src/main/resources/simple_zipcodes.txt:
--------------------------------------------------------------------------------
1 | {"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR"}
2 | {"Zipcode":704,"ZipCodeType":"STANDARD","City":"PASEO COSTA DEL SUR","State":"PR"}
3 | {"Zipcode":709,"ZipCodeType":"STANDARD","City":"BDA SAN LUIS","State":"PR"}
4 | {"Zipcode":76166,"ZipCodeType":"UNIQUE","City":"CINGULAR WIRELESS","State":"TX"}
5 | {"Zipcode":76177,"ZipCodeType":"STANDARD","City":"FORT WORTH","State":"TX"}
6 | {"Zipcode":76177,"ZipCodeType":"STANDARD","City":"FT WORTH","State":"TX"}
7 | {"Zipcode":704,"ZipCodeType":"STANDARD","City":"URB EUGENE RICE","State":"PR"}
8 | {"Zipcode":85209,"ZipCodeType":"STANDARD","City":"MESA","State":"AZ"}
9 | {"Zipcode":85210,"ZipCodeType":"STANDARD","City":"MESA","State":"AZ"}
10 | {"Zipcode":32046,"ZipCodeType":"STANDARD","City":"HILLIARD","State":"FL"}
11 |
--------------------------------------------------------------------------------
/src/main/resources/small_zipcode.csv:
--------------------------------------------------------------------------------
1 | id,zipcode,type,city,state,population
2 | 1,704,STANDARD,,PR,30100
3 | 2,704,,PASEO COSTA DEL SUR,PR,
4 | 3,709,,BDA SAN LUIS,PR,3700
5 | 4,76166,UNIQUE,CINGULAR WIRELESS,TX,84000
6 | 5,76177,STANDARD,,TX,
--------------------------------------------------------------------------------
/src/main/resources/stream.csv:
--------------------------------------------------------------------------------
1 | TotalCost|BirthDate|Gender|TotalChildren|ProductCategoryName
2 | 1000||Male|2|Technology
3 | 2000|1957-03-06||3|Beauty
4 | 3000|1959-03-06|Male||Car
5 | 4000|1953-03-06|Male|2|
6 | 5000|1957-03-06|Female|3|Beauty
7 | 6000|1959-03-06|Male|4|Car
--------------------------------------------------------------------------------
/src/main/resources/zipcodes.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spark-examples/spark-scala-examples/e4f18c30dec398bb6ca110f98272b20b461f3310/src/main/resources/zipcodes.avro
--------------------------------------------------------------------------------
/src/main/resources/zipcodes.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spark-examples/spark-scala-examples/e4f18c30dec398bb6ca110f98272b20b461f3310/src/main/resources/zipcodes.parquet
--------------------------------------------------------------------------------
/src/main/resources/zipcodes20.csv:
--------------------------------------------------------------------------------
1 | 1,US,PARC PARQUE,704,PR
2 | 2,US,PASEO COSTA DEL SUR,704,PR
3 | 10,US,BDA SAN LUIS,709,PR
4 | 61391,US,CINGULAR WIRELESS,76166,TX
5 | 61392,US,FORT WORTH,76177,TX
6 | 61393,US,FT WORTH,76177,TX
7 | 4,US,URB EUGENE RICE,704,PR
8 | 39827,US,MESA,85209,AZ
9 | 39828,US,MESA,85210,AZ
10 | 49345,US,HILLIARD,32046,FL
11 | 49346,US,HOLDER,34445,FL
12 | 49347,US,HOLT,32564,FL
13 | 49348,US,HOMOSASSA,34487,FL
14 | 3,US,SECT LANAUSSE,704,PR
15 | 54354,US,SPRING GARDEN,36275,AL
16 | 54355,US,SPRINGVILLE,35146,AL
17 | 54356,US,SPRUCE PINE,35585,AL
18 | 76511,US,ASH HILL,27007,NC
19 | 76512,US,ASHEBORO,27203,NC
20 | 76513,US,ASHEBORO,27204,NC
21 |
--------------------------------------------------------------------------------
/src/main/resources/zipcodes_streaming/zipcode1.json:
--------------------------------------------------------------------------------
1 | {"RecordNumber":1,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Parc Parque, PR","Location":"NA-US-PR-PARC PARQUE","Decommisioned":false}
2 |
--------------------------------------------------------------------------------
/src/main/resources/zipcodes_streaming/zipcode10.json:
--------------------------------------------------------------------------------
1 | {"RecordNumber":1,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Parc Parque, PR","Location":"NA-US-PR-PARC PARQUE","Decommisioned":false}
2 | {"RecordNumber":2,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PASEO COSTA DEL SUR","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Paseo Costa Del Sur, PR","Location":"NA-US-PR-PASEO COSTA DEL SUR","Decommisioned":false}
3 | {"RecordNumber":10,"Zipcode":709,"ZipCodeType":"STANDARD","City":"BDA SAN LUIS","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":18.14,"Long":-66.26,"Xaxis":0.38,"Yaxis":-0.86,"Zaxis":0.31,"WorldRegion":"NA","Country":"US","LocationText":"Bda San Luis, PR","Location":"NA-US-PR-BDA SAN LUIS","Decommisioned":false}
4 |
--------------------------------------------------------------------------------
/src/main/resources/zipcodes_streaming/zipcode11.json:
--------------------------------------------------------------------------------
1 | {"RecordNumber":1,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Parc Parque, PR","Location":"NA-US-PR-PARC PARQUE","Decommisioned":false}
2 |
--------------------------------------------------------------------------------
/src/main/resources/zipcodes_streaming/zipcode12.json:
--------------------------------------------------------------------------------
1 | {"RecordNumber":1,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Parc Parque, PR","Location":"NA-US-PR-PARC PARQUE","Decommisioned":false}
--------------------------------------------------------------------------------
/src/main/resources/zipcodes_streaming/zipcode2.json:
--------------------------------------------------------------------------------
1 | {"RecordNumber":2,"Zipcode":704,"ZipCodeType":"STANDARD","City":"PASEO COSTA DEL SUR","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Paseo Costa Del Sur, PR","Location":"NA-US-PR-PASEO COSTA DEL SUR","Decommisioned":false}
2 | {"RecordNumber":10,"Zipcode":709,"ZipCodeType":"STANDARD","City":"BDA SAN LUIS","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":18.14,"Long":-66.26,"Xaxis":0.38,"Yaxis":-0.86,"Zaxis":0.31,"WorldRegion":"NA","Country":"US","LocationText":"Bda San Luis, PR","Location":"NA-US-PR-BDA SAN LUIS","Decommisioned":false}
3 |
--------------------------------------------------------------------------------
/src/main/resources/zipcodes_streaming/zipcode3.json:
--------------------------------------------------------------------------------
1 | {"RecordNumber":61391,"Zipcode":76166,"ZipCodeType":"UNIQUE","City":"CINGULAR WIRELESS","State":"TX","LocationType":"NOT ACCEPTABLE","Lat":32.72,"Long":-97.31,"Xaxis":-0.1,"Yaxis":-0.83,"Zaxis":0.54,"WorldRegion":"NA","Country":"US","LocationText":"Cingular Wireless, TX","Location":"NA-US-TX-CINGULAR WIRELESS","Decommisioned":false}
2 | {"RecordNumber":61392,"Zipcode":76177,"ZipCodeType":"STANDARD","City":"FORT WORTH","State":"TX","LocationType":"PRIMARY","Lat":32.75,"Long":-97.33,"Xaxis":-0.1,"Yaxis":-0.83,"Zaxis":0.54,"WorldRegion":"NA","Country":"US","LocationText":"Fort Worth, TX","Location":"NA-US-TX-FORT WORTH","Decommisioned":false,"TaxReturnsFiled":2126,"EstimatedPopulation":4053,"TotalWages":122396986}
3 | {"RecordNumber":61393,"Zipcode":76177,"ZipCodeType":"STANDARD","City":"FT WORTH","State":"TX","LocationType":"ACCEPTABLE","Lat":32.75,"Long":-97.33,"Xaxis":-0.1,"Yaxis":-0.83,"Zaxis":0.54,"WorldRegion":"NA","Country":"US","LocationText":"Ft Worth, TX","Location":"NA-US-TX-FT WORTH","Decommisioned":false,"TaxReturnsFiled":2126,"EstimatedPopulation":4053,"TotalWages":122396986}
4 |
--------------------------------------------------------------------------------
/src/main/resources/zipcodes_streaming/zipcode4.json:
--------------------------------------------------------------------------------
1 | {"RecordNumber":4,"Zipcode":704,"ZipCodeType":"STANDARD","City":"URB EUGENE RICE","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":17.96,"Long":-66.22,"Xaxis":0.38,"Yaxis":-0.87,"Zaxis":0.3,"WorldRegion":"NA","Country":"US","LocationText":"Urb Eugene Rice, PR","Location":"NA-US-PR-URB EUGENE RICE","Decommisioned":false}
2 |
--------------------------------------------------------------------------------
/src/main/resources/zipcodes_streaming/zipcode5.json:
--------------------------------------------------------------------------------
1 | {"RecordNumber":39827,"Zipcode":85209,"ZipCodeType":"STANDARD","City":"MESA","State":"AZ","LocationType":"PRIMARY","Lat":33.37,"Long":-111.64,"Xaxis":-0.3,"Yaxis":-0.77,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Mesa, AZ","Location":"NA-US-AZ-MESA","Decommisioned":false,"TaxReturnsFiled":14962,"EstimatedPopulation":26883,"TotalWages":563792730,"Notes":"no NWS data, "}
2 | {"RecordNumber":39828,"Zipcode":85210,"ZipCodeType":"STANDARD","City":"MESA","State":"AZ","LocationType":"PRIMARY","Lat":33.38,"Long":-111.84,"Xaxis":-0.31,"Yaxis":-0.77,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Mesa, AZ","Location":"NA-US-AZ-MESA","Decommisioned":false,"TaxReturnsFiled":14374,"EstimatedPopulation":25446,"TotalWages":471000465}
3 | {"RecordNumber":49345,"Zipcode":32046,"ZipCodeType":"STANDARD","City":"HILLIARD","State":"FL","LocationType":"PRIMARY","Lat":30.69,"Long":-81.92,"Xaxis":0.12,"Yaxis":-0.85,"Zaxis":0.51,"WorldRegion":"NA","Country":"US","LocationText":"Hilliard, FL","Location":"NA-US-FL-HILLIARD","Decommisioned":false,"TaxReturnsFiled":3922,"EstimatedPopulation":7443,"TotalWages":133112149}
4 | {"RecordNumber":49346,"Zipcode":34445,"ZipCodeType":"PO BOX","City":"HOLDER","State":"FL","LocationType":"PRIMARY","Lat":28.96,"Long":-82.41,"Xaxis":0.11,"Yaxis":-0.86,"Zaxis":0.48,"WorldRegion":"NA","Country":"US","LocationText":"Holder, FL","Location":"NA-US-FL-HOLDER","Decommisioned":false}
5 | {"RecordNumber":49347,"Zipcode":32564,"ZipCodeType":"STANDARD","City":"HOLT","State":"FL","LocationType":"PRIMARY","Lat":30.72,"Long":-86.67,"Xaxis":0.04,"Yaxis":-0.85,"Zaxis":0.51,"WorldRegion":"NA","Country":"US","LocationText":"Holt, FL","Location":"NA-US-FL-HOLT","Decommisioned":false,"TaxReturnsFiled":1207,"EstimatedPopulation":2190,"TotalWages":36395913}
6 | {"RecordNumber":49348,"Zipcode":34487,"ZipCodeType":"PO BOX","City":"HOMOSASSA","State":"FL","LocationType":"PRIMARY","Lat":28.78,"Long":-82.61,"Xaxis":0.11,"Yaxis":-0.86,"Zaxis":0.48,"WorldRegion":"NA","Country":"US","LocationText":"Homosassa, FL","Location":"NA-US-FL-HOMOSASSA","Decommisioned":false}
7 |
--------------------------------------------------------------------------------
/src/main/resources/zipcodes_streaming/zipcode6.json:
--------------------------------------------------------------------------------
1 | {"RecordNumber":10,"Zipcode":708,"ZipCodeType":"STANDARD","City":"BDA SAN LUIS","State":"PR","LocationType":"NOT ACCEPTABLE","Lat":18.14,"Long":-66.26,"Xaxis":0.38,"Yaxis":-0.86,"Zaxis":0.31,"WorldRegion":"NA","Country":"US","LocationText":"Bda San Luis, PR","Location":"NA-US-PR-BDA SAN LUIS","Decommisioned":false}
2 |
--------------------------------------------------------------------------------
/src/main/resources/zipcodes_streaming/zipcode7.json:
--------------------------------------------------------------------------------
1 | {"RecordNumber":54354,"Zipcode":36275,"ZipCodeType":"PO BOX","City":"SPRING GARDEN","State":"AL","LocationType":"PRIMARY","Lat":33.97,"Long":-85.55,"Xaxis":0.06,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Spring Garden, AL","Location":"NA-US-AL-SPRING GARDEN","Decommisioned":false}
2 | {"RecordNumber":54355,"Zipcode":35146,"ZipCodeType":"STANDARD","City":"SPRINGVILLE","State":"AL","LocationType":"PRIMARY","Lat":33.77,"Long":-86.47,"Xaxis":0.05,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Springville, AL","Location":"NA-US-AL-SPRINGVILLE","Decommisioned":false,"TaxReturnsFiled":4046,"EstimatedPopulation":7845,"TotalWages":172127599}
3 | {"RecordNumber":54356,"Zipcode":35585,"ZipCodeType":"STANDARD","City":"SPRUCE PINE","State":"AL","LocationType":"PRIMARY","Lat":34.37,"Long":-87.69,"Xaxis":0.03,"Yaxis":-0.82,"Zaxis":0.56,"WorldRegion":"NA","Country":"US","LocationText":"Spruce Pine, AL","Location":"NA-US-AL-SPRUCE PINE","Decommisioned":false,"TaxReturnsFiled":610,"EstimatedPopulation":1209,"TotalWages":18525517}
4 | {"RecordNumber":76511,"Zipcode":27007,"ZipCodeType":"STANDARD","City":"ASH HILL","State":"NC","LocationType":"NOT ACCEPTABLE","Lat":36.4,"Long":-80.56,"Xaxis":0.13,"Yaxis":-0.79,"Zaxis":0.59,"WorldRegion":"NA","Country":"US","LocationText":"Ash Hill, NC","Location":"NA-US-NC-ASH HILL","Decommisioned":false,"TaxReturnsFiled":842,"EstimatedPopulation":1666,"TotalWages":28876493}
5 | {"RecordNumber":76512,"Zipcode":27203,"ZipCodeType":"STANDARD","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":8355,"EstimatedPopulation":15228,"TotalWages":215474318}
6 | {"RecordNumber":76513,"Zipcode":27204,"ZipCodeType":"PO BOX","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":1035,"EstimatedPopulation":1816,"TotalWages":30322473}
7 |
--------------------------------------------------------------------------------
/src/main/resources/zipcodes_streaming/zipcode8.json:
--------------------------------------------------------------------------------
1 | {"RecordNumber":54354,"Zipcode":36275,"ZipCodeType":"PO BOX","City":"SPRING GARDEN","State":"AL","LocationType":"PRIMARY","Lat":33.97,"Long":-85.55,"Xaxis":0.06,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Spring Garden, AL","Location":"NA-US-AL-SPRING GARDEN","Decommisioned":false}
2 | {"RecordNumber":54355,"Zipcode":35146,"ZipCodeType":"STANDARD","City":"SPRINGVILLE","State":"AL","LocationType":"PRIMARY","Lat":33.77,"Long":-86.47,"Xaxis":0.05,"Yaxis":-0.82,"Zaxis":0.55,"WorldRegion":"NA","Country":"US","LocationText":"Springville, AL","Location":"NA-US-AL-SPRINGVILLE","Decommisioned":false,"TaxReturnsFiled":4046,"EstimatedPopulation":7845,"TotalWages":172127599}
3 | {"RecordNumber":54356,"Zipcode":35585,"ZipCodeType":"STANDARD","City":"SPRUCE PINE","State":"AL","LocationType":"PRIMARY","Lat":34.37,"Long":-87.69,"Xaxis":0.03,"Yaxis":-0.82,"Zaxis":0.56,"WorldRegion":"NA","Country":"US","LocationText":"Spruce Pine, AL","Location":"NA-US-AL-SPRUCE PINE","Decommisioned":false,"TaxReturnsFiled":610,"EstimatedPopulation":1209,"TotalWages":18525517}
4 | {"RecordNumber":76511,"Zipcode":27007,"ZipCodeType":"STANDARD","City":"ASH HILL","State":"NC","LocationType":"NOT ACCEPTABLE","Lat":36.4,"Long":-80.56,"Xaxis":0.13,"Yaxis":-0.79,"Zaxis":0.59,"WorldRegion":"NA","Country":"US","LocationText":"Ash Hill, NC","Location":"NA-US-NC-ASH HILL","Decommisioned":false,"TaxReturnsFiled":842,"EstimatedPopulation":1666,"TotalWages":28876493}
5 | {"RecordNumber":76512,"Zipcode":27203,"ZipCodeType":"STANDARD","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":8355,"EstimatedPopulation":15228,"TotalWages":215474318}
6 | {"RecordNumber":76513,"Zipcode":27204,"ZipCodeType":"PO BOX","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":1035,"EstimatedPopulation":1816,"TotalWages":30322473}
7 |
--------------------------------------------------------------------------------
/src/main/resources/zipcodes_streaming/zipcode9.json:
--------------------------------------------------------------------------------
1 | {"RecordNumber":76511,"Zipcode":27007,"ZipCodeType":"STANDARD","City":"ASH HILL","State":"NC","LocationType":"NOT ACCEPTABLE","Lat":36.4,"Long":-80.56,"Xaxis":0.13,"Yaxis":-0.79,"Zaxis":0.59,"WorldRegion":"NA","Country":"US","LocationText":"Ash Hill, NC","Location":"NA-US-NC-ASH HILL","Decommisioned":false,"TaxReturnsFiled":842,"EstimatedPopulation":1666,"TotalWages":28876493}
2 | {"RecordNumber":76512,"Zipcode":27203,"ZipCodeType":"STANDARD","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":8355,"EstimatedPopulation":15228,"TotalWages":215474318}
3 | {"RecordNumber":76513,"Zipcode":27204,"ZipCodeType":"PO BOX","City":"ASHEBORO","State":"NC","LocationType":"PRIMARY","Lat":35.71,"Long":-79.81,"Xaxis":0.14,"Yaxis":-0.79,"Zaxis":0.58,"WorldRegion":"NA","Country":"US","LocationText":"Asheboro, NC","Location":"NA-US-NC-ASHEBORO","Decommisioned":false,"TaxReturnsFiled":1035,"EstimatedPopulation":1816,"TotalWages":30322473}
4 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/SQLContextExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark
2 |
3 | import org.apache.spark.sql.{SQLContext, SparkSession}
4 |
5 | object SQLContextExample extends App {
6 |
7 | val spark = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate();
11 |
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 |
15 | val sqlContext:SQLContext = spark.sqlContext
16 |
17 | //read csv with options
18 | val df = sqlContext.read.options(Map("inferSchema"->"true","delimiter"->",","header"->"true"))
19 | .csv("src/main/resources/zipcodes.csv")
20 | df.show()
21 | df.printSchema()
22 |
23 | df.createOrReplaceTempView("TAB")
24 | sqlContext.sql("select * from TAB")
25 | .show(false)
26 |
27 | }
28 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/SparkContextExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark
2 |
3 | import com.sparkbyexamples.spark.dataframe.functions.SortExample.spark
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.sql.{SQLContext, SparkSession}
6 |
7 | object SparkContextExample extends App{
8 |
9 | val spark = SparkSession.builder()
10 | .master("local[1]")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate();
13 |
14 | spark.sparkContext.setLogLevel("ERROR")
15 |
16 |
17 | val sparkContext:SparkContext = spark.sparkContext
18 | val sqlCon:SQLContext = spark.sqlContext
19 |
20 | val sqlContext = new org.apache.spark.sql.SQLContext(spark.sparkContext)
21 |
22 | println("First SparkContext:")
23 | println("APP Name :"+spark.sparkContext.appName);
24 | println("Deploy Mode :"+spark.sparkContext.deployMode);
25 | println("Master :"+spark.sparkContext.master);
26 |
27 | val sparkSession2 = SparkSession.builder()
28 | .master("local[1]")
29 | .appName("SparkByExample-test")
30 | .getOrCreate();
31 |
32 | println("Second SparkContext:")
33 | println("APP Name :"+sparkSession2.sparkContext.appName);
34 | println("Deploy Mode :"+sparkSession2.sparkContext.deployMode);
35 | println("Master :"+sparkSession2.sparkContext.master);
36 |
37 |
38 | }
39 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/SparkSessionTest.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object SparkSessionTest {
6 |
7 | def main(args:Array[String]): Unit ={
8 |
9 |
10 | val spark = SparkSession.builder()
11 | .master("local[1]")
12 | .appName("SparkByExample")
13 | .getOrCreate();
14 |
15 | println("First SparkContext:")
16 | println("APP Name :"+spark.sparkContext.appName);
17 | println("Deploy Mode :"+spark.sparkContext.deployMode);
18 | println("Master :"+spark.sparkContext.master);
19 |
20 | val sparkSession2 = SparkSession.builder()
21 | .master("local[1]")
22 | .appName("SparkByExample-test")
23 | .getOrCreate();
24 |
25 | println("Second SparkContext:")
26 | println("APP Name :"+sparkSession2.sparkContext.appName);
27 | println("Deploy Mode :"+sparkSession2.sparkContext.deployMode);
28 | println("Master :"+sparkSession2.sparkContext.master);
29 |
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/SparkSessionWrapper.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | trait SparkSessionWrapper {
6 | lazy val spark: SparkSession = {
7 | SparkSession
8 | .builder()
9 | .master("local")
10 | .appName("spark session")
11 | .config("spark.sql.shuffle.partitions", "1")
12 | .getOrCreate()
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/beans/Books.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.beans
2 |
3 | case class Books(_id:String, author:String, description:String, price:Double, publish_date:String, title:String)
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/beans/BooksDiscounted.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.beans
2 |
3 | case class BooksDiscounted(_id:String, author:String, description:String, price:Double, publish_date:String, title:String, discountPrice:Double)
4 |
5 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/beans/BooksStruct.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.beans
2 |
3 | class BooksStruct {
4 |
5 | }
6 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/beans/BooksWithArray.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.beans
2 |
3 | case class BooksWithArray(_id:String, author:String, description:String, price:Double, publish_date:String, title:String,otherInfo:OtherInfo,stores:Stores)
4 | case class OtherInfo(pagesCount:String,language:String,country:String,address:Address)
5 | case class Address(addressline1:String,city:String,state:String)
6 | case class Stores(store:Array[Store])
7 | case class Store(name:String)
8 |
9 |
10 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/beans/User.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.beans
2 |
3 | class User() {
4 | private var name:String = ""
5 | private var age:Int = 0
6 |
7 | def this(name: String, age: Int) {
8 | this()
9 | this.name =name
10 | this.age = age
11 | }
12 |
13 | def getName: String = this.name
14 |
15 | def getAge: Int = this.age
16 |
17 | override def toString: String = "User(" + name + ", " + age + ")"
18 | }
19 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/beans/Zipcode.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.beans
2 |
3 | import scala.beans.BeanProperty
4 |
5 | class Zipcode {
6 |
7 | @BeanProperty
8 | var RecordNumber = -1
9 | @BeanProperty
10 | var Zipcode=""
11 | @BeanProperty
12 | var ZipCodeType=""
13 | @BeanProperty
14 | var City=""
15 | @BeanProperty
16 | var State=""
17 | @BeanProperty
18 | var LocationType=""
19 | @BeanProperty
20 | var Lat=""
21 | @BeanProperty
22 | var Long=""
23 | @BeanProperty
24 | var Xaxis=""
25 | @BeanProperty
26 | var Yaxis=""
27 | @BeanProperty
28 | var Zaxis=""
29 | @BeanProperty
30 | var WorldRegion=""
31 | @BeanProperty
32 | var Country=""
33 | @BeanProperty
34 | var LocationText=""
35 | @BeanProperty
36 | var Location=""
37 | @BeanProperty
38 | var Decommisioned=""
39 | }
40 |
41 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/ArrayToColumn.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.types.{ArrayType, StringType, StructType}
4 | import org.apache.spark.sql.{Row, SparkSession}
5 |
6 | object ArrayToColumn extends App {
7 |
8 | val spark = SparkSession.builder().appName("SparkByExamples.com")
9 | .master("local[1]")
10 | .getOrCreate()
11 |
12 | val arrayData = Seq(
13 | Row("James",List("Java","Scala","C++")),
14 | Row("Michael",List("Spark","Java","C++")),
15 | Row("Robert",List("CSharp","VB",""))
16 | )
17 |
18 | val arraySchema = new StructType().add("name",StringType)
19 | .add("subjects",ArrayType(StringType))
20 |
21 | val arrayDF = spark.createDataFrame(spark.sparkContext.parallelize(arrayData),arraySchema)
22 | arrayDF.printSchema()
23 | arrayDF.show()
24 |
25 | // val arrayDFColumn = df.select(
26 | // df("name") +: (0 until 2).map(i => df("subjects")(i).alias(s"LanguagesKnown$i")): _*
27 | // )
28 | //
29 | // arrayDFColumn.show(false)
30 |
31 | //How to convert Array of Array to column
32 | val arrayArrayData = Seq(
33 | Row("James",List(List("Java","Scala","C++"),List("Spark","Java"))),
34 | Row("Michael",List(List("Spark","Java","C++"),List("Spark","Java"))),
35 | Row("Robert",List(List("CSharp","VB"),List("Spark","Python")))
36 | )
37 |
38 | val arrayArraySchema = new StructType().add("name",StringType)
39 | .add("subjects",ArrayType(ArrayType(StringType)))
40 |
41 | val df = spark.createDataFrame(spark.sparkContext.parallelize(arrayArrayData),arrayArraySchema)
42 | df.printSchema()
43 | df.show()
44 |
45 | val df2 = df.select(
46 | df("name") +: (0 until 2).map(i => df("subjects")(i).alias(s"LanguagesKnown$i")): _*
47 | )
48 |
49 | df2.show(false)
50 | }
51 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/AvroToJson.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.{SaveMode, SparkSession}
4 |
5 | object AvroToJson extends App {
6 |
7 | val spark: SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExample")
10 | .getOrCreate()
11 |
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 | //read avro file
15 | val df = spark.read.format("avro")
16 | .load("src/main/resources/zipcodes.avro")
17 | df.show()
18 | df.printSchema()
19 |
20 | //convert to json
21 | df.write.mode(SaveMode.Overwrite)
22 | .json("/tmp/json/zipcodes.json")
23 |
24 | }
25 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/AvroToParquet.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.{SaveMode, SparkSession}
4 |
5 | object AvroToParquet extends App {
6 |
7 | val spark: SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExample")
10 | .getOrCreate()
11 |
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 | //read avro file
15 | val df = spark.read.format("avro")
16 | .load("src/main/resources/zipcodes.avro")
17 | df.show()
18 | df.printSchema()
19 |
20 | //convert to parquet
21 | df.write.mode(SaveMode.Overwrite)
22 | .parquet("/tmp/parquet/zipcodes.parquet")
23 |
24 | }
25 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/BroadcastExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object BroadcastExample extends App{
6 |
7 | val spark = SparkSession.builder()
8 | .appName("SparkByExamples.com")
9 | .master("local")
10 | .getOrCreate()
11 |
12 | val states = Map(("NY","New York"),("CA","California"),("FL","Florida"))
13 | val countries = Map(("USA","United States of America"),("IN","India"))
14 |
15 | val broadcastStates = spark.sparkContext.broadcast(states) // broadcast the read-only lookup maps once to all executors
16 | val broadcastCountries = spark.sparkContext.broadcast(countries)
17 |
18 | val data = Seq(("James","Smith","USA","CA"),
19 | ("Michael","Rose","USA","NY"),
20 | ("Robert","Williams","USA","CA"),
21 | ("Maria","Jones","USA","FL")
22 | )
23 |
24 | val columns = Seq("firstname","lastname","country","state")
25 | import spark.sqlContext.implicits._
26 | val df = data.toDF(columns:_*)
27 |
28 | val df2 = df.map(row=>{
29 | val country = row.getString(2)
30 | val state = row.getString(3)
31 |
32 | val fullCountry = broadcastCountries.value.get(country).get
33 | val fullState = broadcastStates.value.get(state).get
34 | (row.getString(0),row.getString(1),fullCountry,fullState)
35 | }).toDF(columns:_*)
36 |
37 | df2.show(false)
38 | }
39 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/CaseClassSparkSchema.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.Encoders
4 | import org.apache.spark.sql.types.StructType
5 |
6 | object CaseClassSparkSchema extends App{
7 |
8 | case class Name(first:String,last:String,middle:String)
9 | case class Employee(fullName:Name,age:Integer,gender:String)
10 |
11 | val encoderSchema = Encoders.product[Employee].schema
12 | encoderSchema.printTreeString()
13 |
14 | import org.apache.spark.sql.catalyst.ScalaReflection
15 | val schema = ScalaReflection.schemaFor[Employee].dataType.asInstanceOf[StructType]
16 |
17 | }
18 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/CastColumnType.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.{Row, SparkSession}
4 | import org.apache.spark.sql.types._
5 | import org.apache.spark.sql.functions._
6 |
7 | import org.apache.spark.sql.{Row, SparkSession}
8 | import org.apache.spark.sql.types._
9 | import org.apache.spark.sql.functions._
10 |
11 | object CastColumnType extends App {
12 | val spark: SparkSession = SparkSession.builder()
13 | .master("local[1]")
14 | .appName("SparkByExamples.com")
15 | .getOrCreate()
16 |
17 | val simpleData = Seq(Row("James", 34, "2006-01-01", "true", "M", 3000.60),
18 | Row("Michael", 33, "1980-01-10", "true", "F", 3300.80),
19 | Row("Robert", 37, "06-01-1992", "false", "M", 5000.50)
20 | )
21 |
22 | val simpleSchema = StructType(Array(
23 | StructField("firstName", StringType, true),
24 | StructField("age", IntegerType, true),
25 | StructField("jobStartDate", StringType, true),
26 | StructField("isGraduated", StringType, true),
27 | StructField("gender", StringType, true),
28 | StructField("salary", DoubleType, true)
29 | ))
30 |
31 | val df = spark.createDataFrame(
32 | spark.sparkContext.parallelize(simpleData), simpleSchema)
33 | df.printSchema()
34 | df.show(false)
35 |
36 | val df2 = df.withColumn("age", col("age").cast(StringType))
37 | .withColumn("isGraduated", col("isGraduated").cast(BooleanType))
38 | .withColumn("jobStartDate", col("jobStartDate").cast(DateType))
39 | df2.printSchema()
40 |
41 | val df3 = df2.selectExpr("cast(age as int) age",
42 | "cast(isGraduated as string) isGraduated",
43 | "cast(jobStartDate as string) jobStartDate")
44 | df3.printSchema()
45 | df3.show(false)
46 |
47 | df3.createOrReplaceTempView("CastExample")
48 | val df4 = spark.sql("SELECT STRING(age),BOOLEAN(isGraduated), " +
49 | "DATE(jobStartDate) from CastExample")
50 | df4.printSchema()
51 | df4.show(false)
52 |
53 |
54 | val cast_df = df.select(df.columns.map {
55 | case column@"age" =>
56 | col(column).cast("String").as(column)
57 | case column@"salary" =>
58 | col(column).cast("String").as(column)
59 | case column =>
60 | col(column)
61 | }: _*)
62 |
63 | cast_df.printSchema()
64 |
65 | }
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/ColumnTruncate.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import com.sparkbyexamples.spark.SQLContextExample.spark
4 | import org.apache.log4j.lf5.LogLevel
5 | import org.apache.spark.sql.SparkSession
6 |
7 | object ColumnTruncate extends App {
8 |
9 | val spark:SparkSession = SparkSession.builder()
10 | .master("local[1]")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 |
14 |
15 |
16 | import spark.implicits._
17 | val columns = Seq("Seqno","Quote")
18 | val data = Seq(("1", "Be the change that you wish to see in the world"),
19 | ("2", "Everyone thinks of changing the world, but no one thinks of changing himself."),
20 | ("3", "The purpose of our lives is to be happy."))
21 | val df = data.toDF(columns:_*)
22 | df.show(false)
23 |
24 |
25 | }
26 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/CreateDataFrame.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.types.{StringType, StructField, StructType}
4 | import org.apache.spark.sql.{Row, SparkSession}
5 |
6 | object CreateDataFrame {
7 |
8 | def main(args:Array[String]):Unit={
9 |
10 | val spark:SparkSession = SparkSession.builder()
11 | .master("local[1]").appName("SparkByExamples.com")
12 | .getOrCreate()
13 |
14 | import spark.implicits._
15 | val columns = Seq("language","users_count")
16 | val data = Seq(("Java", "20000"), ("Python", "100000"), ("Scala", "3000"))
17 | val rdd = spark.sparkContext.parallelize(data)
18 |
19 |
20 | //From RDD (USING toDF())
21 | val dfFromRDD1 = rdd.toDF("language","users")
22 | dfFromRDD1.printSchema()
23 | //From RDD (USING createDataFrame)
24 | val dfFromRDD2 = spark.createDataFrame(rdd).toDF(columns:_*)
25 | dfFromRDD2.printSchema()
26 | //From RDD (USING createDataFrame and Adding schema using StructType)
27 | //convert RDD[T] to RDD[Row]
28 | val schema = StructType( Array(StructField("language", StringType, true),
29 | StructField("language", StringType, true)))
30 |
31 | val rowRDD = rdd.map(attributes => Row(attributes._1, attributes._2))
32 | val dfFromRDD3 = spark.createDataFrame(rowRDD,schema)
33 |
34 |
35 | //From Data (USING toDF())
36 | val dfFromData1 = data.toDF()
37 |
38 | //From Data (USING createDataFrame)
39 | var dfFromData2 = spark.createDataFrame(data).toDF(columns:_*)
40 |
41 | //From Data (USING createDataFrame and Adding schema using StructType)
42 | import scala.collection.JavaConversions._
43 | val rowData = data
44 | .map(attributes => Row(attributes._1, attributes._2))
45 | var dfFromData3 = spark.createDataFrame(rowData,schema)
46 |
47 | //From Data (USING createDataFrame and Adding bean class)
48 | //To-DO
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/CreateEmptyDataFrameExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.{Row, SparkSession}
4 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
5 |
6 | object CreateEmptyDataFrameExample extends App {
7 |
8 | val spark: SparkSession = SparkSession.builder()
9 | .master("local[1]")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 | import spark.implicits._
13 |
14 |
15 | val schema = StructType(
16 | StructField("firstName", StringType, true) ::
17 | StructField("lastName", IntegerType, false) ::
18 | StructField("middleName", IntegerType, false) :: Nil)
19 |
20 | val colSeq = Seq("firstName","lastName","middleName")
21 |
22 | case class Name(firstName: String, lastName: String, middleName:String)
23 |
24 | // Create empty dataframe using StructType schema
25 | val df = spark.createDataFrame(spark.sparkContext
26 | .emptyRDD[Row], schema)
27 |
28 | // Using implicit encoder
29 | Seq.empty[(String,String,String)].toDF(colSeq:_*)
30 |
31 | //Using case class
32 |
33 | Seq.empty[Name].toDF().printSchema()
34 |
35 | //Using emptyDataFrame
36 | spark.emptyDataFrame
37 |
38 |
39 | //Using emptyDataset
40 |
41 |
42 | }
43 |
44 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/CreateEmptyDatasetExample.scala:
--------------------------------------------------------------------------------
1 |
2 | package com.sparkbyexamples.spark.dataframe
3 |
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
6 |
7 | object CreateEmptyDatasetExample extends App {
8 |
9 | val spark: SparkSession = SparkSession.builder()
10 | .master("local[1]")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 | spark.sparkContext.setLogLevel("ERROR");
14 | import spark.implicits._
15 |
16 | val schema = StructType(
17 | StructField("firstName", StringType, true) ::
18 | StructField("lastName", IntegerType, false) ::
19 | StructField("middleName", IntegerType, false) :: Nil)
20 |
21 | val colSeq = Seq("firstName","lastName","middleName")
22 |
23 | case class Name(firstName: String, lastName: String, middleName:String)
24 | case class Empty()
25 | val ds0 = spark.emptyDataset[Empty]
26 | ds0.printSchema()
27 |
28 | val ds1=spark.emptyDataset[Name]
29 | ds1.printSchema()
30 |
31 | val ds2 = spark.createDataset(Seq.empty[Name])
32 | ds2.printSchema()
33 |
34 | val ds4=spark.createDataset(spark.sparkContext.emptyRDD[Name])
35 | ds4.printSchema()
36 |
37 | val ds3=spark.createDataset(Seq.empty[(String,String,String)])
38 | ds3.printSchema()
39 | val ds5=Seq.empty[(String,String,String)].toDS()
40 | ds5.printSchema()
41 |
42 | val ds6=Seq.empty[Name].toDS()
43 | ds6.printSchema()
44 | }
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/CsvToAvroParquetJson.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.{SaveMode, SparkSession}
4 |
5 | object CsvToAvroParquetJson extends App {
6 |
7 | val spark: SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExample")
10 | .getOrCreate()
11 |
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 | //read csv with options
15 | val df = spark.read.options(Map("inferSchema"->"true","delimiter"->",","header"->"true"))
16 | .csv("src/main/resources/zipcodes.csv")
17 | df.show()
18 | df.printSchema()
19 |
20 | //convert to avro
21 | df.write.format("avro").mode(SaveMode.Overwrite)
22 | .save("/tmp/avro/zipcodes.avro")
23 |
24 | //convert to avro by partition
25 | df.write.partitionBy("State","Zipcode")
26 | .format("avro")
27 | .mode(SaveMode.Overwrite)
28 | .save("/tmp/avro/zipcodes_partition.avro")
29 |
30 | //convert to parquet
31 | df.write.mode(SaveMode.Overwrite).parquet("/tmp/parquet/zipcodes.parquet")
32 |
33 | //convert to json
34 | df.write.mode(SaveMode.Overwrite).json("/tmp/json/zipcodes.json")
35 | }
36 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/DataFrameFromCSVFile.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.{SaveMode, SparkSession}
4 |
5 | object DataFrameFromCSVFile {
6 |
7 | def main(args:Array[String]):Unit= {
8 |
9 | val spark: SparkSession = SparkSession.builder()
10 | .master("local[1]")
11 | .appName("SparkByExample")
12 | .getOrCreate()
13 |
14 | spark.sparkContext.setLogLevel("ERROR")
15 |
16 | //spark read csv file
17 | val df = spark.read.csv("src/main/resources/zipcodes.csv")
18 | df.show()
19 | df.printSchema()
20 |
21 | //read csv with options
22 | val df2 = spark.read.options(Map("inferSchema"->"true","delimiter"->",","header"->"true")).csv("src/main/resources/zipcodes.csv")
23 | df2.show()
24 | df2.printSchema()
25 |
26 | //read with custom schema
27 | import org.apache.spark.sql.types._
28 | val schema = new StructType()
29 | .add("RecordNumber",IntegerType,true)
30 | .add("Zipcode",IntegerType,true)
31 | .add("ZipCodeType",StringType,true)
32 | .add("City",StringType,true)
33 | .add("State",StringType,true)
34 | .add("LocationType",StringType,true)
35 | .add("Lat",DoubleType,true)
36 | .add("Long",DoubleType,true)
37 | .add("Xaxis",DoubleType,true)
38 | .add("Yaxis",DoubleType,true)
39 | .add("Zaxis",DoubleType,true)
40 | .add("WorldRegion",StringType,true)
41 | .add("Country",StringType,true)
42 | .add("LocationText",StringType,true)
43 | .add("Location",StringType,true)
44 | .add("Decommisioned",BooleanType,true)
45 | .add("TaxReturnsFiled",IntegerType,true)
46 | .add("EstimatedPopulation",IntegerType,true)
47 | .add("TotalWages",IntegerType,true)
48 | .add("Notes",StringType,true)
49 |
   50 | //Read csv file with a user-defined schema
51 | val df_with_schema = spark.read.format("csv")
52 | .option("header", "true")
53 | .schema(schema)
54 | .load("src/main/resources/zipcodes.csv")
55 |
56 | df_with_schema.printSchema()
57 | df_with_schema.show(false)
58 |
59 |
60 | //Write a csv file
61 | df_with_schema.write.mode(SaveMode.Overwrite)
62 | .csv("c:/tmp/spark_output/zipcodes")
63 |
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/DataFrameWithSimpleDSL.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.{DataFrame, SparkSession}
4 |
5 | object DataFrameWithSimpleDSL {
6 |
7 | def main(args:Array[String]):Unit= {
8 |
9 | val spark: SparkSession = SparkSession.builder()
10 | .master("local[1]")
11 | .appName("SparkByExample")
12 | .getOrCreate()
13 |
14 | val filePath = "C://000_Projects/opt/BigData/zipcodes.csv"
15 |
16 | var df:DataFrame = spark.read.option("header","true").csv(filePath)
17 | df.printSchema()
18 |
19 | // Where
20 | df.select("*").where(df("RecordNumber") < 10).show()
21 | //Filter
22 | df.filter(df("State")==="PR").select("State").show()
23 | //Distinct
24 | df.select(df("State")).distinct().show()
25 | //Count
26 | println("Number of records"+df.count())
27 |
28 | //When Otherwise
29 | //df.select(df("State"), case df("State") when "PR" then "PR123"
30 |
   31 |     // where with AND and OR conditions
32 | df.where(df("State") === "PR" && df("City").contains("DEL")).show()
33 |
34 | //Order or Sort by
35 | df.orderBy(df("RecordNumber").desc, df("State").asc).show()
36 |
37 |
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/FilterExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.{Row, SparkSession}
4 | import org.apache.spark.sql.types.{ArrayType, StringType, StructType}
5 | import org.apache.spark.sql.functions.array_contains
6 | object FilterExample extends App{
7 |
8 | val spark: SparkSession = SparkSession.builder()
9 | .master("local[1]")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 |
13 | spark.sparkContext.setLogLevel("ERROR")
14 |
15 | val arrayStructureData = Seq(
16 | Row(Row("James","","Smith"),List("Java","Scala","C++"),"OH","M"),
17 | Row(Row("Anna","Rose",""),List("Spark","Java","C++"),"NY","F"),
18 | Row(Row("Julia","","Williams"),List("CSharp","VB"),"OH","F"),
19 | Row(Row("Maria","Anne","Jones"),List("CSharp","VB"),"NY","M"),
20 | Row(Row("Jen","Mary","Brown"),List("CSharp","VB"),"NY","M"),
21 | Row(Row("Mike","Mary","Williams"),List("Python","VB"),"OH","M")
22 | )
23 |
24 | val arrayStructureSchema = new StructType()
25 | .add("name",new StructType()
26 | .add("firstname",StringType)
27 | .add("middlename",StringType)
28 | .add("lastname",StringType))
29 | .add("languages", ArrayType(StringType))
30 | .add("state", StringType)
31 | .add("gender", StringType)
32 |
33 | val df = spark.createDataFrame(
34 | spark.sparkContext.parallelize(arrayStructureData),arrayStructureSchema)
35 | df.printSchema()
36 | df.show()
37 |
38 | //Condition
39 | df.filter(df("state") === "OH")
40 | .show(false)
41 |
42 | //SQL Expression
43 | df.filter("gender == 'M'")
44 | .show(false)
45 |
46 | //multiple condition
47 | df.filter(df("state") === "OH" && df("gender") === "M")
48 | .show(false)
49 |
50 | //Array condition
51 | df.filter(array_contains(df("languages"),"Java"))
52 | .show(false)
53 |
54 | //Struct condition
55 | df.filter(df("name.lastname") === "Williams")
56 | .show(false)
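//A sketch: negate a condition or test membership in a list of values
//df.filter(!array_contains(df("languages"),"Java")).show(false)
//df.filter(df("state").isin("OH","NY")).show(false)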
57 |
58 | }
59 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/FilterNullRowsExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.{SparkSession}
4 | import org.apache.spark.sql.functions.col
5 |
6 | object FilterNullRowsExample extends App{
7 |
8 | val spark: SparkSession = SparkSession.builder()
9 | .master("local[1]")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 |
13 | spark.sparkContext.setLogLevel("ERROR")
14 | val data = Seq(
15 | ("James",null,"M"),
16 | ("Anna","NY","F"),
17 | ("Julia",null,null)
18 | )
19 | import spark.implicits._
20 | val columns = Seq("name","state","gender")
21 | val df = data.toDF(columns:_*)
22 |
23 | df.printSchema()
24 | df.show()
25 |
26 | df.filter("state is NULL").show(false)
27 | df.filter(df("state").isNull).show(false)
28 | df.filter(col("state").isNull).show(false)
29 |
30 | df.filter("state is not NULL").show(false)
31 | df.filter("NOT state is NULL").show(false)
32 | df.filter(df("state").isNotNull).show(false)
33 |
34 | df.filter("state is NULL AND gender is NULL").show(false)
35 | df.filter(df("state").isNull && df("gender").isNull).show(false)
36 |
37 | df.createOrReplaceTempView("DATA")
38 | spark.sql("SELECT * FROM DATA where STATE IS NULL").show(false)
39 | spark.sql("SELECT * FROM DATA where STATE IS NULL AND GENDER IS NULL").show(false)
40 | spark.sql("SELECT * FROM DATA where STATE IS NOT NULL").show(false)
41 |
42 |
43 | }
44 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/FromCSVFile2.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object FromCSVFile2 {
6 |
7 | def main(args:Array[String]):Unit= {
8 |
9 | val spark: SparkSession = SparkSession.builder()
10 | .master("local[1]")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 |
14 | val filePath="src/main/resources/stream.csv"
15 |
16 | val df3 = spark.read.option("header",true).csv("src/main/resources/zipcodes.csv")
17 | df3.show(false)
18 |
19 |
20 | val df = spark.read.options(Map("inferSchema"->"true","delimiter"->"|","header"->"true")).csv(filePath)
21 |
22 | val df2 = df.select("Gender", "BirthDate", "TotalCost", "TotalChildren", "ProductCategoryName")
23 | .filter("Gender is not null")
24 | .filter("BirthDate is not null")
25 | .filter("TotalChildren is not null")
26 | .filter("ProductCategoryName is not null")
27 | df2.show()
28 |
29 | df.select("Gender", "BirthDate", "TotalCost", "TotalChildren", "ProductCategoryName")
30 | .where(df("Gender").isNotNull && df("BirthDate").isNotNull && df("TotalChildren").isNotNull && df("ProductCategoryName").isNotNull ).show()
31 |
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/FromCSVMultiline.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object FromCSVMultiline extends App {
6 |
7 | val spark:SparkSession = SparkSession.builder()
8 | .master("local[3]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 |
12 |
13 | val df = spark.read
14 | .option("header",true)
15 | .option("delimiter",",")
16 | .option("multiLine",true)
17 | .option("quotes","\"")
18 | .csv("src/main/resources/address-multiline.csv")
19 |
20 | df.show(false)
21 | }
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/FromTextFile.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
4 |
5 | object FromTextFile {
6 |
7 | def main(args:Array[String]):Unit= {
8 |
9 | val spark: SparkSession = SparkSession.builder()
10 | .master("local[1]")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 |
14 | //returns DataFrame
15 | val df:DataFrame = spark.read.text("src/main/resources/csv/text01.txt")
16 | df.printSchema()
17 | df.show(false)
18 |
19 | //converting to columns by splitting
20 | import spark.implicits._
21 | val df2 = df.map(f=>{
22 | val elements = f.getString(0).split(",")
23 | (elements(0),elements(1))
24 | })
25 |
26 | df2.printSchema()
27 | df2.show(false)
28 |
29 | // returns Dataset[String]
30 | val ds:Dataset[String] = spark.read.textFile("src/main/resources/csv/text01.txt")
31 | ds.printSchema()
32 | ds.show(false)
33 |
34 | //converting to columns by splitting
35 | import spark.implicits._
36 | val ds2 = ds.map(f=> {
37 | val elements = f.split(",")
38 | (elements(0),elements(1))
39 | })
40 |
41 | ds2.printSchema()
42 | ds2.show(false)
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/HandleNullExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object HandleNullExample extends App{
6 |
7 | val spark: SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 |
12 | val filePath="src/main/resources/small_zipcode.csv"
13 |
14 | val df = spark.read.options(Map("inferSchema"->"true","delimiter"->",","header"->"true")).csv(filePath)
15 | df.printSchema()
16 | df.show(false)
17 |
18 | df.na.fill(0)
19 | .show(false)
20 |
21 | df.na.fill(0,Array("population"))
22 | .show(false)
23 |
24 | df.na.fill("")
25 | .show(false)
26 |
27 | df.na.fill("unknown",Array("city"))
28 | .na.fill("",Array("type"))
29 | .show(false)
30 |
31 | // Array and map columns
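//na.fill() only covers numeric, string and boolean columns; a minimal sketch for a hypothetical
//array column named "codes" (this DataFrame has no such column):
//import org.apache.spark.sql.functions.{col, typedLit, when}
//df.withColumn("codes", when(col("codes").isNull, typedLit(Seq.empty[String])).otherwise(col("codes"))).show(false)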
32 | }
33 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/JsonFromMultiline.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object JsonFromMultiline extends App {
6 |
7 | val spark: SparkSession = SparkSession.builder()
8 | .master("local[3]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 |
12 | //read multiline json file
13 | val multiline_df = spark.read.option("multiline", "true")
14 | .json("src/main/resources/multiline-zipcode.json")
15 | multiline_df.printSchema()
16 | multiline_df.show(false)
17 |
18 | }
19 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/JsonToAvroCsvParquet.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object JsonToAvroCsvParquet extends App {
6 |
7 | val spark: SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExample")
10 | .getOrCreate()
11 |
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 | //read json file into dataframe
15 | val df = spark.read.json("src/main/resources/zipcodes.json")
16 | df.printSchema()
17 | df.show(false)
18 |
19 | //convert to avro
20 | df.write.format("avro").save("/tmp/avro/zipcodes.avro")
21 |
22 | //convert to avro by partition
23 | df.write.partitionBy("State","Zipcode")
24 | .format("avro").save("/tmp/avro/zipcodes_partition.avro")
25 |
26 | //convert to parquet
27 | df.write.parquet("/tmp/parquet/zipcodes.parquet")
28 |
29 | //convert to csv
30 | df.write.option("header","true").csv("/tmp/csv/zipcodes.csv")
31 | }
32 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/ParquetExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object ParquetExample {
6 |
7 | def main(args:Array[String]):Unit= {
8 |
9 | val spark: SparkSession = SparkSession.builder()
10 | .master("local[1]")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 |
14 | val data = Seq(("James ","","Smith","36636","M",3000),
15 | ("Michael ","Rose","","40288","M",4000),
16 | ("Robert ","","Williams","42114","M",4000),
17 | ("Maria ","Anne","Jones","39192","F",4000),
18 | ("Jen","Mary","Brown","","F",-1)
19 | )
20 |
21 | val columns = Seq("firstname","middlename","lastname","dob","gender","salary")
22 | import spark.sqlContext.implicits._
23 | val df = data.toDF(columns:_*)
24 |
25 | df.show()
26 | df.printSchema()
27 |
28 | df.write
29 | .parquet("C:\\tmp\\output\\people.parquet")
30 |
31 | val parqDF = spark.read.parquet("C:\\tmp\\output\\people.parquet")
32 | parqDF.createOrReplaceTempView("ParquetTable")
33 |
34 | spark.sql("select * from ParquetTable where salary >= 4000").explain()
35 | val parkSQL = spark.sql("select * from ParquetTable where salary >= 4000 ")
36 |
37 | parkSQL.show()
38 | parkSQL.printSchema()
39 |
40 | df.write
41 | .partitionBy("gender","salary")
42 | .parquet("C:\\tmp\\output\\people2.parquet")
43 |
44 | val parqDF2 = spark.read.parquet("C:\\tmp\\output\\people2.parquet")
45 | parqDF2.createOrReplaceTempView("ParquetTable2")
46 |
47 | val df3 = spark.sql("select * from ParquetTable2 where gender='M' and salary >= 4000")
48 | df3.explain()
49 | df3.printSchema()
50 | df3.show()
51 |
52 | val parqDF3 = spark.read
53 | .parquet("C:\\tmp\\output\\people2.parquet\\gender=M")
54 | parqDF3.show()
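//A sketch: supplying basePath keeps the partition column (gender) in the schema when reading a sub-directory
//val parqDF4 = spark.read.option("basePath", "C:\\tmp\\output\\people2.parquet")
//  .parquet("C:\\tmp\\output\\people2.parquet\\gender=M")
//parqDF4.printSchema()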
55 |
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/ParquetToAvro.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.{SaveMode, SparkSession}
4 |
5 | object ParquetToAvro extends App {
6 |
7 | val spark: SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExample")
10 | .getOrCreate()
11 |
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 | //read parquet file
15 | val df = spark.read.format("parquet")
16 | .load("src/main/resources/zipcodes.parquet")
17 | df.show()
18 | df.printSchema()
19 |
20 | //convert to avro
21 | df.write.format("avro")
22 | .mode(SaveMode.Overwrite)
23 | .save("/tmp/avro/zipcodes.avro")
24 |
25 | }
26 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/ParquetToCsv.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.{SaveMode, SparkSession}
4 |
5 | object ParquetToCsv extends App {
6 |
7 | val spark: SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 |
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 | //read parquet file
15 | val df = spark.read.format("parquet")
16 | .load("src/main/resources/zipcodes.parquet")
17 | df.show()
18 | df.printSchema()
19 |
20 | //convert to csv
21 | df.write.mode(SaveMode.Overwrite)
22 | .csv("/tmp/csv/zipcodes.csv")
23 |
24 | }
25 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/ParquetToJson.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.{SaveMode, SparkSession}
4 |
5 | object ParquetToJson extends App {
6 |
7 | val spark: SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 |
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 | //read parquet file
15 | val df = spark.read.format("parquet")
16 | .load("src/main/resources/zipcodes.parquet")
17 | df.show()
18 | df.printSchema()
19 |
20 | //convert to json
21 | df.write.mode(SaveMode.Overwrite)
22 | .json("/tmp/json/zipcodes.json")
23 |
24 | }
25 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/ReadJsonFromString.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.{DataFrame, SparkSession}
4 | import org.apache.spark.sql.functions._
5 | import org.apache.spark.sql.types._
6 |
7 | object ReadJsonFromString extends App {
8 |
9 | val spark: SparkSession = SparkSession.builder()
10 | .master("local[1]")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 |
14 | spark.sparkContext.setLogLevel("ERROR")
15 |
16 | //Read JSON string from text file
17 | val dfFromText:DataFrame = spark.read.text("src/main/resources/simple_zipcodes.txt")
18 | dfFromText.printSchema()
19 |
20 | val schema = new StructType()
21 | .add("Zipcode", StringType, true)
22 | .add("ZipCodeType", StringType, true)
23 | .add("City", StringType, true)
24 | .add("State", StringType, true)
25 |
26 | val dfJSON = dfFromText.withColumn("jsonData",from_json(col("value"),schema))
27 | .select("jsonData.*")
28 | dfJSON.printSchema()
29 | dfJSON.show(false)
30 |
31 | //alternatively using select
32 | val dfJSON2 = dfFromText.select(from_json(col("value"), schema).as("jsonData"))
33 | .select("jsonData.*")
34 |
35 | //Read JSON string from CSV file
36 | val dfFromCSV:DataFrame = spark.read.option("header",true)
37 | .csv("src/main/resources/simple_zipcodes.csv")
38 | dfFromCSV.printSchema()
39 | dfFromCSV.show(false)
40 |
41 | val dfFromCSVJSON = dfFromCSV.select(col("Id"),
42 | from_json(col("JsonValue"),schema).as("jsonData"))
43 | .select("Id","jsonData.*")
44 | dfFromCSVJSON.printSchema()
45 | dfFromCSVJSON.show(false)
46 |
47 | //Read json from string
48 | import spark.implicits._
49 | val jsonStr = """{"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR"}"""
50 | val df = spark.read.json(Seq(jsonStr).toDS())
51 | df.show(false)
52 |
53 | // from RDD[String]
54 | // deprecated
55 | val rdd = spark.sparkContext.parallelize(
56 | """ {"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR"} """ :: Nil)
57 | val df2 = spark.read.json(rdd)
58 | df2.show()
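//A non-deprecated alternative (a sketch): wrap the RDD in a Dataset[String] first
//val df3 = spark.read.json(spark.createDataset(rdd))
//df3.show()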
59 |
60 | }
61 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/RemoveNullRowsExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object RemoveNullRowsExample extends App{
6 |
7 | val spark: SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 |
12 | spark.sparkContext.setLogLevel("ERROR")
13 | val filePath="src/main/resources/small_zipcode.csv"
14 |
15 | val df = spark.read.options(Map("inferSchema"->"true","delimiter"->",","header"->"true")).csv(filePath)
16 | df.printSchema()
17 | df.show(false)
18 |
19 | df.na.drop().show(false)
20 |
21 | //all/any
22 | df.na.drop("any").show(false)
23 |
24 | df.na.drop(Seq("population","type")).show(false)
25 |
26 | }
27 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/RenameColDataFrame.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.{Row, SparkSession}
4 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
5 | import org.apache.spark.sql.functions.{col, _}
6 |
7 | object RenameColDataFrame {
8 |
9 | def main(args:Array[String]):Unit= {
10 |
11 | val spark: SparkSession = SparkSession.builder()
12 | .master("local[1]")
13 | .appName("SparkByExamples.com")
14 | .getOrCreate()
15 |
16 | val data = Seq(Row(Row("James ","","Smith"),"36636","M",3000),
17 | Row(Row("Michael ","Rose",""),"40288","M",4000),
18 | Row(Row("Robert ","","Williams"),"42114","M",4000),
19 | Row(Row("Maria ","Anne","Jones"),"39192","F",4000),
20 | Row(Row("Jen","Mary","Brown"),"","F",-1)
21 | )
22 |
23 | val schema = new StructType()
24 | .add("name",new StructType()
25 | .add("firstname",StringType)
26 | .add("middlename",StringType)
27 | .add("lastname",StringType))
28 | .add("dob",StringType)
29 | .add("gender",StringType)
30 | .add("salary",IntegerType)
31 |
32 | val df = spark.createDataFrame(spark.sparkContext.parallelize(data),schema)
33 |
34 | df.printSchema()
35 |
36 | df.withColumnRenamed("dob","DateOfBirth")
37 | .printSchema()
38 |
39 | val schema2 = new StructType()
40 | .add("fname",StringType)
41 | .add("middlename",StringType)
42 | .add("lname",StringType)
43 |
44 | df.select(col("name").cast(schema2),
45 | col("dob"),
46 | col("gender"),
47 | col("salary"))
48 | .printSchema()
49 |
50 | df.select(col("name.firstname").as("fname"),
51 | col("name.middlename").as("mname"),
52 | col("name.lastname").as("lname"),
53 | col("dob"),col("gender"),col("salary"))
54 | .printSchema()
55 |
56 | df.withColumnRenamed("name.firstname","fname")
57 | .withColumnRenamed("name.middlename","mname")
58 | .withColumnRenamed("name.lastname","lname")
59 | .drop("name")
60 | .printSchema()
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/SQLExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions._
5 | object DataFrameWithSQL_ {
6 |
7 | def main(args:Array[String]):Unit= {
8 |
9 | val spark: SparkSession = SparkSession.builder()
10 | .master("local[1]")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 |
14 | val data = Seq(1,2,3)
15 |
16 | import spark.sqlContext.implicits._
17 |
18 | val df = data.toDF("field1")
19 |
20 | df.createOrReplaceTempView("table1")
21 |
22 | val df2 = spark.sql("select tb1.field1 as field1,tb2.field1 as field2 from table1 tb1, table1 tb2 where tb1.field1 <> tb2.field1")
23 | df2.printSchema()
24 | df2.show(false)
25 |
26 | df2.createOrReplaceTempView("table2")
27 |
28 | val df3 = spark.sql("select distinct tb1.field1,tb1.field2 from table2 tb1, table2 tb2 where tb1.field1 == tb2.field2 and tb1.field2 == tb2.field1")
29 |
30 | df3.show(false)
31 |
32 |
33 |
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/SaveDataFrame.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.{DataFrame, SparkSession}
4 |
5 | object SaveDataFrame {
6 |
7 | def main(args: Array[String]): Unit = {
8 | val spark: SparkSession = SparkSession.builder()
9 | .master("local[1]")
10 | .appName("SparkByExample")
11 | .getOrCreate()
12 |
13 | val filePath = "C://000_Projects/opt/BigData/zipcodes.csv"
14 |
15 | var df:DataFrame = spark.read.option("header","true").csv(filePath)
16 |
17 | df.repartition(5).write.option("header","true").csv("c:/tmp/output/df1")
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/SparkUDF.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.functions.udf
4 | import org.apache.spark.sql.functions.col
5 | import org.apache.spark.sql.{Row, SparkSession}
6 |
7 | object SparkUDF extends App{
8 |
9 | val spark: SparkSession = SparkSession.builder()
10 | .master("local[1]")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 |
14 | import spark.implicits._
15 | val columns = Seq("Seqno","Quote")
16 | val data = Seq(("1", "Be the change that you wish to see in the world"),
17 | ("2", "Everyone thinks of changing the world, but no one thinks of changing himself."),
18 | ("3", "The purpose of our lives is to be happy.")
19 |
20 | )
21 | val df = data.toDF(columns:_*)
22 | df.show(false)
23 |
24 | val convertCase = (str:String) => {
25 | val arr = str.split(" ")
26 | arr.map(f=> f.substring(0,1).toUpperCase + f.substring(1,f.length)).mkString(" ")
27 | }
28 |
29 | //Using with DataFrame
30 | val convertUDF = udf(convertCase)
31 | df.select(col("Seqno"),
32 | convertUDF(col("Quote")).as("Quote") ).show(false)
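//A sketch: this UDF throws a NullPointerException on null input; wrapping in Option makes it null-safe
//val convertCaseSafe = udf((str: String) => Option(str).map(convertCase).orNull)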
33 |
34 | // Using it on SQL
35 | spark.udf.register("convertUDF", convertCase)
36 | df.createOrReplaceTempView("QUOTE_TABLE")
37 | spark.sql("select Seqno, convertUDF(Quote) from QUOTE_TABLE").show(false)
38 |
39 | }
40 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/UDFDataFrame.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions._
5 | object UDFDataFrame {
6 | def main(args:Array[String]): Unit = {
7 |
8 | val spark:SparkSession = SparkSession.builder()
9 | .master("local[3]")
10 | .appName("SparkByExample")
11 | .getOrCreate()
12 |
13 | val data = Seq(("2018/01/23",23),("2018/01/24",24),("2018/02/20",25))
14 |
15 | import spark.sqlContext.implicits._
16 | val df = data.toDF("date1","day")
17 |
18 | val replace: String => String = _.replace("/","-")
19 | import org.apache.spark.sql.functions.udf
20 | val replaceUDF = udf(replace)
21 | val minDate = df.agg(min($"date1")).collect()(0).get(0)
22 |
23 | val df2 = df.select("*").filter( to_date(replaceUDF($"date1")) > date_add(to_date(replaceUDF(lit(minDate))),7 ))
24 | df2.show()
25 | }
26 |
27 |
28 | }
29 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/UnionExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object UnionExample extends App{
6 |
7 | val spark: SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 |
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 | import spark.implicits._
15 |
16 | val simpleData = Seq(("James","Sales","NY",90000,34,10000),
17 | ("Michael","Sales","NY",86000,56,20000),
18 | ("Robert","Sales","CA",81000,30,23000),
19 | ("Maria","Finance","CA",90000,24,23000)
20 | )
21 | val df = simpleData.toDF("employee_name","department","state","salary","age","bonus")
22 | df.printSchema()
23 | df.show()
24 |
25 | val simpleData2 = Seq(("James","Sales","NY",90000,34,10000),
26 | ("Maria","Finance","CA",90000,24,23000),
27 | ("Jen","Finance","NY",79000,53,15000),
28 | ("Jeff","Marketing","CA",80000,25,18000),
29 | ("Kumar","Marketing","NY",91000,50,21000)
30 | )
31 | val df2 = simpleData2.toDF("employee_name","department","state","salary","age","bonus")
32 | df2.show(false)
33 |
34 | val df3 = df.union(df2)
35 | df3.show(false)
36 | df3.distinct().show(false)
37 |
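//unionAll() is a deprecated alias of union() since Spark 2.0; both keep duplicate rows.
//A sketch: unionByName() (Spark 2.3+) matches columns by name instead of position
//df.unionByName(df2).show(false)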
38 | val df4 = df.unionAll(df2)
39 | df4.show(false)
40 |
41 |
42 | }
43 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/WhereExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe
2 |
3 | import org.apache.spark.sql.functions.array_contains
4 | import org.apache.spark.sql.types.{ArrayType, StringType, StructType}
5 | import org.apache.spark.sql.{Row, SparkSession}
6 |
7 | object WhereExample extends App{
8 |
9 | val spark: SparkSession = SparkSession.builder()
10 | .master("local[1]")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 |
14 | spark.sparkContext.setLogLevel("ERROR")
15 |
16 | val arrayStructureData = Seq(
17 | Row(Row("James","","Smith"),List("Java","Scala","C++"),"OH","M"),
18 | Row(Row("Anna","Rose",""),List("Spark","Java","C++"),"NY","F"),
19 | Row(Row("Julia","","Williams"),List("CSharp","VB"),"OH","F"),
20 | Row(Row("Maria","Anne","Jones"),List("CSharp","VB"),"NY","M"),
21 | Row(Row("Jen","Mary","Brown"),List("CSharp","VB"),"NY","M"),
22 | Row(Row("Mike","Mary","Williams"),List("Python","VB"),"OH","M")
23 | )
24 |
25 | val arrayStructureSchema = new StructType()
26 | .add("name",new StructType()
27 | .add("firstname",StringType)
28 | .add("middlename",StringType)
29 | .add("lastname",StringType))
30 | .add("languages", ArrayType(StringType))
31 | .add("state", StringType)
32 | .add("gender", StringType)
33 |
34 | val df = spark.createDataFrame(
35 | spark.sparkContext.parallelize(arrayStructureData),arrayStructureSchema)
36 | df.printSchema()
37 | df.show()
38 |
39 | //Condition
40 | df.where(df("state") === "OH")
41 | .show(false)
42 |
43 | //SQL Expression
44 | df.where("gender == 'M'")
45 | .show(false)
46 |
47 | //multiple condition
48 | df.where(df("state") === "OH" && df("gender") === "M")
49 | .show(false)
50 |
51 | //Array condition
52 | df.where(array_contains(df("languages"),"Java"))
53 | .show(false)
54 |
55 | //Struct condition
56 | df.where(df("name.lastname") === "Williams")
57 | .show(false)
58 |
59 | }
60 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/examples/CacheExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.examples
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions._
5 | object CacheExample extends App {
6 |
7 | val spark:SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 |
12 | //read csv with options
13 | val df = spark.read.options(Map("inferSchema"->"true","delimiter"->",","header"->"true"))
14 | .csv("src/main/resources/zipcodes.csv")
15 |
16 | val df2 = df.where(col("State") === "PR").cache()
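//cache() is lazy; the data is materialized by the first action below, and the second count() reads from the cache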
17 | df2.show(false)
18 |
19 | println(df2.count())
20 |
21 | val df3 = df2.where(col("Zipcode") === 704)
22 |
23 |
24 | println(df2.count())
25 |
26 | }
27 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/examples/CastStringToInt.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.examples
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object CastStringToInt extends App {
6 |
7 | val spark = SparkSession.builder
8 | .master("local[1]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 |
12 | val simpleData = Seq(("James",34,"true","M","3000.6089"),
13 | ("Michael",33,"true","F","3300.8067"),
14 | ("Robert",37,"false","M","5000.5034")
15 | )
16 |
17 | import spark.implicits._
18 | val df = simpleData.toDF("firstname","age","isGraduated","gender","salary")
19 | df.printSchema()
20 |
21 | import org.apache.spark.sql.functions.col
22 | import org.apache.spark.sql.types.IntegerType
23 | // Convert String to Integer Type
24 | val df2= df.withColumn("salary",col("salary").cast(IntegerType))
25 | df2.printSchema()
26 | df2.show()
27 |
28 | df.withColumn("salary",col("salary").cast("int")).printSchema()
29 | df.withColumn("salary",col("salary").cast("integer")).printSchema()
30 |
31 | // Using select
32 | df.select(col("salary").cast("int").as("salary")).printSchema()
33 |
34 | //Using selectExpr()
35 | df.selectExpr("cast(salary as int) salary","isGraduated").printSchema()
36 | df.selectExpr("INT(salary)","isGraduated").printSchema()
37 |
38 | //Using with spark.sql()
39 | df.createOrReplaceTempView("CastExample")
40 | spark.sql("SELECT INT(salary),BOOLEAN(isGraduated),gender from CastExample").printSchema()
41 | spark.sql("SELECT cast(salary as int) salary, BOOLEAN(isGraduated),gender from CastExample").printSchema()
42 | }
43 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/examples/CollectExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.examples
2 |
3 | import org.apache.spark.sql.{Row, SparkSession}
4 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
5 |
6 | object CollectExample extends App {
7 |
8 | val spark:SparkSession = SparkSession.builder()
9 | .master("local[1]")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 |
13 | val data = Seq(Row(Row("James ","","Smith"),"36636","M",3000),
14 | Row(Row("Michael ","Rose",""),"40288","M",4000),
15 | Row(Row("Robert ","","Williams"),"42114","M",4000),
16 | Row(Row("Maria ","Anne","Jones"),"39192","F",4000),
17 | Row(Row("Jen","Mary","Brown"),"","F",-1)
18 | )
19 |
20 | val schema = new StructType()
21 | .add("name",new StructType()
22 | .add("firstname",StringType)
23 | .add("middlename",StringType)
24 | .add("lastname",StringType))
25 | .add("id",StringType)
26 | .add("gender",StringType)
27 | .add("salary",IntegerType)
28 |
29 | val df = spark.createDataFrame(spark.sparkContext.parallelize(data),schema)
30 | df.printSchema()
31 | df.show(false)
32 |
33 | val colList = df.collectAsList()
34 | val colData = df.collect()
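//collect() pulls every row to the driver; a bounded sketch for larger data:
//val someRows = df.take(5)  //Array[Row] with at most 5 rows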
35 |
36 | colData.foreach(row=>
37 | {
38 | val salary = row.getInt(3)//Index starts from zero
39 | println(salary)
40 | })
41 |
42 | //Retrieving data from Struct column
43 | colData.foreach(row=>
44 | {
45 | val salary = row.getInt(3)
46 | val fullName:Row = row.getStruct(0) //Index starts from zero
47 | val firstName = fullName.getString(0)//In struct row, again index starts from zero
48 | val middleName = fullName.get(1).toString
49 | val lastName = fullName.getAs[String]("lastname")
50 | println(firstName+","+middleName+","+lastName+","+salary)
51 | })
52 |
53 | }
54 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/examples/DataFrameComplex.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.examples
2 |
3 | import org.apache.spark.sql.{Row, SparkSession}
4 | import org.apache.spark.sql.types._
5 |
6 | object DataFrameComplex extends App {
7 |
8 |
9 | val spark:SparkSession = SparkSession.builder()
10 | .master("local[5]")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 |
14 | val structureData = Seq(
15 | Row(Row("James","","Smith"),"36636","NewYork",3100, List("Java","Scala"),Map("hair"->"black","eye"->"brown")),
16 | Row(Row("Michael","Rose",""),"40288","California",4300,List("Python","PHP"),Map("hair"->"black","eye"->"brown")),
17 | Row(Row("Robert","","Williams"),"42114","Florida",1400,List("C++","C#"),Map("hair"->"black","eye"->"brown")),
18 | Row(Row("Maria","Anne","Jones"),"39192","Florida",5500,List("Python","Scala"),Map("hair"->"black","eye"->"brown")),
19 | Row(Row("Jen","Mary","Brown"),"34561","NewYork",3000,List("R","Scala"),Map("hair"->"black","eye"->"brown"))
20 | )
21 |
22 | val structureSchema = new StructType()
23 | .add("name",new StructType()
24 | .add("firstname",StringType)
25 | .add("middlename",StringType)
26 | .add("lastname",StringType))
27 | .add("id",StringType)
28 | .add("location",StringType)
29 | .add("salary",IntegerType)
30 | .add("languagesKnown",ArrayType(StringType))
31 | .add("properties",MapType(StringType,StringType))
32 |
33 |
34 | val df2 = spark.createDataFrame(spark.sparkContext.parallelize(structureData),structureSchema)
35 | df2.printSchema()
36 | df2.show(false)
37 |
38 | }
39 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/examples/DataFrameEmptyCheck.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.examples
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object DataFrameEmptyCheck extends App {
6 |
7 | val spark:SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExample")
10 | .getOrCreate()
11 |
12 | val df = spark.emptyDataFrame
13 |
14 | println(df.isEmpty)
15 | println(df.rdd.isEmpty())
   16 |   println(df.head(1).isEmpty) //head() with no arguments would throw on an empty DataFrame
17 | println()
18 | }
19 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/examples/DropColumn.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.examples
2 |
3 | import org.apache.spark.sql.{Row, SparkSession}
4 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
5 | import org.apache.spark.sql.functions.col
6 | object DropColumn extends App {
7 |
8 | val spark:SparkSession = SparkSession.builder()
9 | .master("local[5]")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 |
13 | val data = Seq(
14 | Row("James","","Smith","36636","NewYork",3100),
15 | Row("Michael","Rose","","40288","California",4300),
16 | Row("Robert","","Williams","42114","Florida",1400),
17 | Row("Maria","Anne","Jones","39192","Florida",5500),
18 | Row("Jen","Mary","Brown","34561","NewYork",3000)
19 | )
20 |
21 | val schema = new StructType()
22 | .add("firstname",StringType)
23 | .add("middlename",StringType)
24 | .add("lastname",StringType)
25 | .add("id",StringType)
26 | .add("location",StringType)
27 | .add("salary",IntegerType)
28 |
29 | val df = spark.createDataFrame(
30 | spark.sparkContext.parallelize(data),schema)
31 | df.printSchema()
32 | df.show(false)
33 |
34 | df.drop(df("firstname"))
35 | .printSchema()
36 |
37 | df.drop(col("firstname"))
38 | .printSchema()
39 |
40 | val df2 = df.drop("firstname")
41 | df2.printSchema()
42 |
43 | df.drop("firstname","middlename","lastname")
44 | .printSchema()
45 |
46 | val cols = Seq("firstname","middlename","lastname")
47 | df.drop(cols:_*)
48 | .printSchema()
49 | }
50 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/examples/ForEachExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.examples
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object ForEachExample extends App {
6 |
7 | val spark: SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 |
12 | val data = Seq(("Banana",1000,"USA"), ("Carrots",1500,"USA"), ("Beans",1600,"USA"),
13 | ("Orange",2000,"USA"),("Orange",2000,"USA"),("Banana",400,"China"),
14 | ("Carrots",1200,"China"),("Beans",1500,"China"))
15 |
16 | //DataFrame
17 | val df = spark.createDataFrame(data).toDF("Product","Amount","Country")
18 | df.foreach(f=> println(f))
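//Note: foreach() runs on the executors; in cluster mode these println calls appear in executor logs, not the driver console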
19 |
20 | val longAcc = spark.sparkContext.longAccumulator("SumAccumulator")
21 | df.foreach(f=> {
22 | longAcc.add(f.getInt(1))
23 | })
24 | println("Accumulator value:"+longAcc.value)
25 | //rdd
26 | val rdd = spark.sparkContext.parallelize(Seq(1,2,3,4,5,6,7,8,9))
27 | rdd.foreach(print)
28 |
29 | //rdd accumulator
30 | val rdd2 = spark.sparkContext.parallelize(Seq(1,2,3,4,5,6,7,8,9))
31 | val longAcc2 = spark.sparkContext.longAccumulator("SumAccumulator2")
   32 |   rdd2.foreach(f=> {
33 | longAcc2.add(f)
34 | })
35 | println("Accumulator value:"+longAcc2.value)
36 | }
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/examples/ForEachPartExample.scala:
--------------------------------------------------------------------------------
1 | //package com.sparkbyexamples.spark.dataframe.examples
2 | //
3 | //import org.apache.spark.sql.SparkSession
4 | //
5 | //object ForEachPartExample extends App {
6 | //
7 | // val spark: SparkSession = SparkSession.builder()
8 | // .master("local[1]")
9 | // .appName("SparkByExamples.com")
10 | // .getOrCreate()
11 | //
12 | // val data = Seq(("Banana",1000,"USA"), ("Carrots",1500,"USA"), ("Beans",1600,"USA"),
13 | // ("Orange",2000,"USA"),("Orange",2000,"USA"),("Banana",400,"China"),
14 | // ("Carrots",1200,"China"),("Beans",1500,"China"))
15 | //
16 | // // foreachPartition DataFrame
17 | // val df = spark.createDataFrame(data).toDF("Product","Amount","Country")
18 | // df.foreachPartition(partition => {
19 | // //Initialize any database connection
20 | // partition.foreach(fun=>{
21 | // //apply the function
22 | // })
23 | // })
24 | //
25 | // //rdd
26 | // val rdd = spark.sparkContext.parallelize(Seq(1,2,3,4,5,6,7,8,9))
27 | // rdd.foreachPartition(partition => {
28 | // //Initialize any database connection
29 | // partition.foreach(fun=>{
30 | // //apply the function
31 | // })
32 | // })
33 | //}
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/examples/MapFlatMap.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.examples
2 |
    3 |
4 | import org.apache.spark.sql.{Row, SparkSession}
5 | import org.apache.spark.sql.types.{ArrayType, StringType, StructType}
6 |
7 | object MapFlatMap extends App{
8 |
9 | val spark: SparkSession = SparkSession.builder()
10 | .master("local[1]")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 |
14 | val data = Seq("Project Gutenberg’s",
15 | "Alice’s Adventures in Wonderland",
16 | "Project Gutenberg’s",
17 | "Adventures in Wonderland",
18 | "Project Gutenberg’s")
19 |
20 | import spark.sqlContext.implicits._
21 | val df = data.toDF("data")
22 | df.show(false)
23 |
24 | //Map Transformation
25 | val mapDF=df.map(fun=> {
26 | fun.getString(0).split(" ")
27 | })
28 | mapDF.show(false)
29 |
30 | //Flat Map Transformation
31 | val flatMapDF=df.flatMap(fun=>
32 | {
33 | fun.getString(0).split(" ")
34 | })
35 | flatMapDF.show()
36 |
37 | val arrayStructureData = Seq(
38 | Row("James,,Smith",List("Java","Scala","C++"),"CA"),
39 | Row("Michael,Rose,",List("Spark","Java","C++"),"NJ"),
40 | Row("Robert,,Williams",List("CSharp","VB","R"),"NV")
41 | )
42 |
43 | val arrayStructureSchema = new StructType()
44 | .add("name",StringType)
45 | .add("languagesAtSchool", ArrayType(StringType))
46 | .add("currentState", StringType)
47 |
48 | val df1 = spark.createDataFrame(
49 | spark.sparkContext.parallelize(arrayStructureData),arrayStructureSchema)
50 |
51 |
52 | //flatMap() Usage
53 | val df2=df1.flatMap(f => {
54 | val lang=f.getSeq[String](1)
55 | lang.map((f.getString(0),_,f.getString(2)))
56 | })
57 |
58 | val df3=df2.toDF("Name","language","State")
59 | df3.show(false)
60 |
61 |
62 | }
63 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/examples/MapTransformation.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.examples
2 |
3 | import org.apache.spark.sql.{Row, SparkSession}
4 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType,ArrayType,MapType}
5 |
6 | object MapTransformation extends App{
7 |
8 | val spark:SparkSession = SparkSession.builder()
9 | .master("local[5]")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 |
13 | val structureData = Seq(
14 | Row("James","","Smith","36636","NewYork",3100),
15 | Row("Michael","Rose","","40288","California",4300),
16 | Row("Robert","","Williams","42114","Florida",1400),
17 | Row("Maria","Anne","Jones","39192","Florida",5500),
18 | Row("Jen","Mary","Brown","34561","NewYork",3000)
19 | )
20 |
21 | val structureSchema = new StructType()
22 | .add("firstname",StringType)
23 | .add("middlename",StringType)
24 | .add("lastname",StringType)
25 | .add("id",StringType)
26 | .add("location",StringType)
27 | .add("salary",IntegerType)
28 |
29 | val df2 = spark.createDataFrame(
30 | spark.sparkContext.parallelize(structureData),structureSchema)
31 | df2.printSchema()
32 | df2.show(false)
33 |
34 | import spark.implicits._
35 | val util = new Util()
36 | val df3 = df2.map(row=>{
37 |
38 | val fullName = util.combine(row.getString(0),row.getString(1),row.getString(2))
39 | (fullName, row.getString(3),row.getInt(5))
40 | })
41 | val df3Map = df3.toDF("fullName","id","salary")
42 |
43 | df3Map.printSchema()
44 | df3Map.show(false)
45 |
46 | val df4 = df2.mapPartitions(iterator => {
47 | val util = new Util()
48 | val res = iterator.map(row=>{
49 | val fullName = util.combine(row.getString(0),row.getString(1),row.getString(2))
50 | (fullName, row.getString(3),row.getInt(5))
51 | })
52 | res
53 | })
54 | val df4part = df4.toDF("fullName","id","salary")
55 | df4part.printSchema()
56 | df4part.show(false)
57 |
58 | }
59 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/examples/RangePartition.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.examples
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions.col
5 | object RangePartition extends App{
6 |
7 | val spark: SparkSession = SparkSession.builder() .master("local[1]")
8 | .appName("SparkByExamples.com")
9 | .getOrCreate()
10 |
11 | /**
12 | * Simple using columns list
13 | */
14 | val data = Seq((1,10),(2,20),(3,10),(4,20),(5,10),
15 | (6,30),(7,50),(8,50),(9,50),(10,30),
16 | (11,10),(12,10),(13,40),(14,40),(15,40),
17 | (16,40),(17,50),(18,10),(19,40),(20,40)
18 | )
19 |
20 | import spark.sqlContext.implicits._
21 | val dfRange = data.toDF("id","count")
22 | .repartitionByRange(5,col("count"))
23 |
24 | dfRange.write.option("header",true).csv("c:/tmp/range-partition")
   25 |   //dfRange.write.partitionBy("count").option("header",true).csv("c:/tmp/range-partition-by") //a sketch: partitionBy() needs column names and a terminating save
26 |
27 | }
28 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/examples/ReadORCFile.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.examples
2 |
3 | import org.apache.spark.sql.{SparkSession}
4 |
5 | object ReadORCFile extends App{
6 |
7 | val spark: SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 |
12 | val data =Seq(("James ","","Smith","36636","M",3000),
13 | ("Michael ","Rose","","40288","M",4000),
14 | ("Robert ","","Williams","42114","M",4000),
15 | ("Maria ","Anne","Jones","39192","F",4000),
16 | ("Jen","Mary","Brown","","F",-1))
17 | val columns=Seq("firstname","middlename","lastname","dob","gender","salary")
18 | val df=spark.createDataFrame(data).toDF(columns:_*)
19 |
20 | df.write.mode("overwrite")
21 | .orc("/tmp/orc/data.orc")
22 |
23 | df.write.mode("overwrite")
24 | .option("compression","none12")
25 | .orc("/tmp/orc/data-nocomp.orc")
26 |
27 | df.write.mode("overwrite")
28 | .option("compression","zlib")
29 | .orc("/tmp/orc/data-zlib.orc")
30 |
31 | val df2=spark.read.orc("/tmp/orc/data.orc")
32 | df2.show(false)
33 |
34 | df2.createOrReplaceTempView("ORCTable")
35 | val orcSQL = spark.sql("select firstname,dob from ORCTable where salary >= 4000 ")
36 | orcSQL.show(false)
37 |
38 | spark.sql("CREATE TEMPORARY VIEW PERSON USING orc OPTIONS (path \"/tmp/orc/data.orc\")")
39 | spark.sql("SELECT * FROM PERSON").show()
40 | }
41 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/examples/RenameDeleteFile.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.examples
2 |
3 | import org.apache.hadoop.conf.Configuration
4 | import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
5 | import org.apache.spark.sql.SparkSession
6 |
7 | object RenameDeleteFile extends App{
8 |
9 | val spark:SparkSession = SparkSession.builder()
10 | .master("local[3]")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 |
14 | //Create Hadoop Configuration from Spark
15 | val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
16 |
17 | val srcPath=new Path("/tmp/address_rename_merged.csv")
18 | val destPath= new Path("/tmp/address_merged.csv")
19 |
20 | //Rename a File
21 | if(fs.exists(srcPath) && fs.isFile(srcPath))
22 | fs.rename(srcPath,destPath)
23 |
24 | //Alternatively, you can also create Hadoop configuration
25 | val hadoopConfig = new Configuration()
26 | val hdfs = FileSystem.get(hadoopConfig)
27 | if(hdfs.isFile(srcPath))
28 | hdfs.rename(srcPath,destPath)
29 |
30 |
31 | //Delete a File
   32 |   if(hdfs.exists(new Path("/tmp/.address_merged2.csv.crc")))
   33 |     hdfs.delete(new Path("/tmp/.address_merged2.csv.crc"),true)
34 |
35 | import scala.sys.process._
36 | //Delete a File
37 | s"hdfs dfs -rm /tmp/.address_merged2.csv.crc" !
38 |
39 | //Delete a Directory
40 | s"hdfs dfs -rm -r /tmp/.address_merged2.csv.crc" !
41 |
42 |
43 | }
44 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/examples/RepartitionExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.examples
2 |
3 | import org.apache.spark.sql.{SaveMode, SparkSession}
4 |
5 | object RepartitionExample extends App {
6 |
7 | val spark:SparkSession = SparkSession.builder()
8 | .master("local[5]")
9 | .appName("SparkByExamples.com")
10 | // .config("spark.default.parallelism", "500")
11 | .getOrCreate()
12 |
13 | // spark.sqlContext.setConf("spark.default.parallelism", "500")
14 | //spark.conf.set("spark.default.parallelism", "500")
15 | val df = spark.range(0,20)
16 | df.printSchema()
17 | println(df.rdd.partitions.length)
18 |
   19 |   df.write.mode(SaveMode.Overwrite).csv("c:/tmp/df-partition.csv")
20 |
21 | val df2 = df.repartition(10)
22 |
23 | println(df2.rdd.partitions.length)
24 |
25 | val df3 = df.coalesce(2)
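//coalesce() only merges existing partitions (no full shuffle), unlike repartition()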
26 | println(df3.rdd.partitions.length)
27 |
28 | val df4 = df.groupBy("id").count()
29 | println(df4.rdd.getNumPartitions)
30 | }
31 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/examples/SaveSingleFile.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.examples
2 |
3 | import java.io.File
4 |
5 | import org.apache.hadoop.conf.Configuration
6 | import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
7 | import org.apache.spark.sql.{SaveMode, SparkSession}
8 |
9 | object SaveSingleFile extends App{
10 |
11 | val spark:SparkSession = SparkSession.builder()
12 | .master("local[3]")
13 | .appName("SparkByExamples.com")
14 | .getOrCreate()
15 |
16 | val df = spark.read.option("header",true)
17 | .csv("src/main/resources/address.csv")
18 | df.repartition(1)
19 | .write.mode(SaveMode.Overwrite).csv("/tmp/address")
20 |
21 |
22 | val hadoopConfig = new Configuration()
23 | val hdfs = FileSystem.get(hadoopConfig)
24 |
25 | val srcPath=new Path("/tmp/address")
26 | val destPath= new Path("/tmp/address_merged.csv")
27 | val srcFile=FileUtil.listFiles(new File("c:/tmp/address"))
   28 |     .filter(f=>f.getPath.endsWith(".csv"))(0)
29 | //Copy the CSV file outside of Directory and rename
30 | FileUtil.copy(srcFile,hdfs,destPath,true,hadoopConfig)
31 | //Remove Directory created by df.write()
32 | hdfs.delete(srcPath,true)
33 | //Removes CRC File
34 | hdfs.delete(new Path("/tmp/.address_merged.csv.crc"),true)
35 |
36 | // Merge Using Haddop API
37 | df.repartition(1).write.mode(SaveMode.Overwrite)
38 | .csv("/tmp/address-tmp")
39 | val srcFilePath=new Path("/tmp/address-tmp")
40 | val destFilePath= new Path("/tmp/address_merged2.csv")
41 | FileUtil.copyMerge(hdfs, srcFilePath, hdfs, destFilePath, true, hadoopConfig, null)
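//Note: FileUtil.copyMerge was removed in Hadoop 3.x; a manual merge sketch under that assumption:
//val out = hdfs.create(destFilePath)
//hdfs.listStatus(srcFilePath).filter(_.getPath.getName.startsWith("part-")).foreach { part =>
//  val in = hdfs.open(part.getPath)
//  org.apache.hadoop.io.IOUtils.copyBytes(in, out, hadoopConfig, false)
//  in.close()
//}
//out.close()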
42 | //Remove hidden CRC file if not needed.
43 | hdfs.delete(new Path("/tmp/.address_merged2.csv.crc"),true)
44 |
45 | }
46 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/examples/SelectSelectExpr.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.examples
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions.col
5 | object SelectSelectExpr extends App {
6 |
7 | val spark:SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 |
12 | val data = Seq(("Java", "20000"), ("Python", "100000"), ("Scala", "3000"))
13 | val df = spark.createDataFrame(data).toDF("language","users_count")
14 | df.select("language","users_count as count").show() //Example 1
15 | df.select(df("language"),df("users_count").as("count")).show() //Example 2
16 | df.select(col("language"),col("users_count")).show() ////Example 3
17 | //df.select("language",col("users_count")).show() ////Example 3
18 |
19 | df.selectExpr("language","users_count as count").show() //Example 1
20 | //df.selectExpr(df("language"),df("users_count").as("count")).show() //Example 2
   21 |   //df.selectExpr(col("language"),col("users_count")).show() //Example 3 - selectExpr() takes SQL expression strings, not Column objects
22 |
23 | }
24 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/examples/ShuffleExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.examples
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object ShuffleExample extends App {
6 |
7 | val spark:SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 |
12 | import spark.implicits._
13 |
14 | val simpleData = Seq(("James","Sales","NY",90000,34,10000),
15 | ("Michael","Sales","NY",86000,56,20000),
16 | ("Robert","Sales","CA",81000,30,23000),
17 | ("Maria","Finance","CA",90000,24,23000),
18 | ("Raman","Finance","CA",99000,40,24000),
19 | ("Scott","Finance","NY",83000,36,19000),
20 | ("Jen","Finance","NY",79000,53,15000),
21 | ("Jeff","Marketing","CA",80000,25,18000),
22 | ("Kumar","Marketing","NY",91000,50,21000)
23 | )
24 | val df = simpleData.toDF("employee_name","department","state","salary","age","bonus")
25 |
26 | val df2 = df.groupBy("state").count()
27 | df2.show(false)
28 | println(df2.rdd.getNumPartitions)
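//groupBy triggers a shuffle; the result has spark.sql.shuffle.partitions partitions (200 by default).
//A sketch: tune it before the shuffle for small data
//spark.conf.set("spark.sql.shuffle.partitions", "10")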
29 |
30 |
31 | }
32 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/examples/Util.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.examples
2 |
3 | class Util extends Serializable {
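  //Serializable so instances can be captured in map/mapPartitions closures and shipped to executors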
4 | def combine(fname:String,mname:String,lname:String):String = {
5 | fname+","+mname+","+lname
6 | }
7 | }
8 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/AddColumn.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions.{col, lit, typedLit, when}
5 | import org.apache.spark.sql.types.IntegerType
6 |
7 | object AddColumn extends App {
8 |
9 | val spark = SparkSession.builder()
10 | .appName("SparkByExamples.com")
11 | .master("local")
12 | .getOrCreate()
13 |
14 | import spark.sqlContext.implicits._
15 |
16 | val data = Seq(("111",50000),("222",60000),("333",40000))
17 | val df = data.toDF("EmpId","Salary")
18 | df.show(false)
19 |
20 | //Derive a new column from existing
21 | df.withColumn("CopiedColumn",df("salary")* -1)
22 | .show(false)
23 |
24 | //Using select
25 | df.select($"EmpId",$"Salary", ($"salary"* -1).as("CopiedColumn") )
26 | .show(false)
27 |
28 | //Adding a literal
29 | val df2 = df.select(col("EmpId"),col("Salary"),lit("1").as("lit_value1"))
30 | df2.show()
31 |
32 | val df3 = df2.withColumn("lit_value2",
33 | when(col("Salary") >=40000 && col("Salary") <= 50000, lit("100").cast(IntegerType))
34 | .otherwise(lit("200").cast(IntegerType))
35 | )
36 | df3.show(false)
37 |
38 | //Adding typed literal columns (Seq, Map, struct)
39 | val df4 = df3.withColumn("typedLit_seq",typedLit(Seq(1, 2, 3)))
40 | .withColumn("typedLit_map",typedLit(Map("a" -> 1, "b" -> 2)))
41 | .withColumn("typedLit_struct",typedLit(("a", 2, 1.0)))
42 |
43 | df4.printSchema()
44 | df4.show()
45 |
46 |
47 | }
48 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/PivotExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions._
5 | object PivotExample {
6 | def main(args:Array[String]):Unit= {
7 |
8 | val spark: SparkSession = SparkSession.builder()
9 | .master("local[1]")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 |
13 | spark.sparkContext.setLogLevel("ERROR")
14 | val data = Seq(("Banana",1000,"USA"), ("Carrots",1500,"USA"), ("Beans",1600,"USA"),
15 | ("Orange",2000,"USA"),("Orange",2000,"USA"),("Banana",400,"China"),
16 | ("Carrots",1200,"China"),("Beans",1500,"China"),("Orange",4000,"China"),
17 | ("Banana",2000,"Canada"),("Carrots",2000,"Canada"),("Beans",2000,"Mexico"))
18 |
19 |
20 |
21 | import spark.sqlContext.implicits._
22 | val df = data.toDF("Product","Amount","Country")
23 | df.show()
24 |
25 | //pivot
26 | val pivotDF = df.groupBy("Product","Country")
27 | .sum("Amount")
28 | .groupBy("Product")
29 | .pivot("Country")
30 | .sum("sum(Amount)")
31 | pivotDF.show()
32 |
33 | val countries = Seq("USA","China","Canada","Mexico")
34 | val pivotDF2 = df.groupBy("Product").pivot("Country", countries).sum("Amount")
35 | pivotDF2.show()
36 |
37 | //unpivot
38 | // val unPivotDF = pivotDF.select($"Product",expr("stack(3, 'Canada', Canada, 'China', China, 'Mexico', Mexico) " +
39 | // "as (Country,Total)")) //.where("Total is not null")
40 | // unPivotDF.show()
41 |
42 | //df.select(collect_list("")) //collect_list needs a real column name; an empty string cannot be resolved
43 |
44 | }
45 | }
46 |
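47 | // The two-phase pivot above (aggregate first, then pivot the pre-aggregated rows) and the
48 | // explicit country list passed to pivot() are both performance aids: without the list,
49 | // Spark runs an extra job just to collect the distinct pivot values. Simplest (slower) form:
50 | // df.groupBy("Product").pivot("Country").sum("Amount").show()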
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/RemoveDuplicate.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions
2 |
3 | object RemoveDuplicate extends App {
4 |
5 | }
6 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/SortExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions._
5 | object SortExample extends App {
6 |
7 | val spark: SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 |
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 | import spark.implicits._
15 |
16 | val simpleData = Seq(("James","Sales","NY",90000,34,10000),
17 | ("Michael","Sales","NY",86000,56,20000),
18 | ("Robert","Sales","CA",81000,30,23000),
19 | ("Maria","Finance","CA",90000,24,23000),
20 | ("Raman","Finance","CA",99000,40,24000),
21 | ("Scott","Finance","NY",83000,36,19000),
22 | ("Jen","Finance","NY",79000,53,15000),
23 | ("Jeff","Marketing","CA",80000,25,18000),
24 | ("Kumar","Marketing","NY",91000,50,21000)
25 | )
26 | val df = simpleData.toDF("employee_name","department","state","salary","age","bonus")
27 | df.printSchema()
28 | df.show()
29 |
30 | df.sort("department","state").show(false)
31 | df.sort(col("department"),col("state")).show(false)
32 |
33 | df.orderBy("department","state").show(false)
34 | df.orderBy(col("department"),col("state")).show(false)
35 |
36 | df.sort(col("department").asc,col("state").asc).show(false)
37 | df.orderBy(col("department").asc,col("state").asc).show(false)
38 |
39 | df.sort(col("department").asc,col("state").desc).show(false)
40 | df.orderBy(col("department").asc,col("state").desc).show(false)
41 |
42 | df.select($"employee_name",$"department",$"state",$"salary",$"age",$"bonus").sort(asc("department"),desc("state")).show(false)
43 | df.createOrReplaceTempView("EMP")
44 | spark.sql("select employee_name,department,state,salary,age,bonus from EMP order by department asc, state desc").show(false)
45 |
46 | }
47 |
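48 | // asc()/desc() build sort expressions rather than selectable columns, so they belong in
49 | // sort()/orderBy() or a SQL ORDER BY clause. Variants such as asc_nulls_first/desc_nulls_last
50 | // (also in org.apache.spark.sql.functions) control where nulls land, e.g.:
51 | // df.orderBy(asc_nulls_first("department"), desc_nulls_last("state")).show(false)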
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/aggregate/DistinctCount.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.aggregate
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions._
5 |
6 | object DistinctCount extends App {
7 |
8 | val spark: SparkSession = SparkSession.builder()
9 | .master("local[1]")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 |
13 | spark.sparkContext.setLogLevel("ERROR")
14 |
15 | import spark.implicits._
16 |
17 | val simpleData = Seq(("James", "Sales", 3000),
18 | ("Michael", "Sales", 4600),
19 | ("Robert", "Sales", 4100),
20 | ("Maria", "Finance", 3000),
21 | ("James", "Sales", 3000),
22 | ("Scott", "Finance", 3300),
23 | ("Jen", "Finance", 3900),
24 | ("Jeff", "Marketing", 3000),
25 | ("Kumar", "Marketing", 2000),
26 | ("Saif", "Sales", 4100)
27 | )
28 | val df = simpleData.toDF("employee_name", "department", "salary")
29 | df.show()
30 |
31 | println("Distinct Count: " + df.distinct().count())
32 |
33 | val df2 = df.select(countDistinct("department", "salary"))
34 | df2.show(false)
35 | println("Distinct Count of Department & Salary: "+df2.collect()(0)(0))
36 |
37 | }
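38 | // countDistinct computes an exact distinct count; on large data an approximation is often
39 | // enough and much cheaper. A sketch using the built-in HyperLogLog-based function:
40 | // df.select(approx_count_distinct("salary")).show(false)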
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/aggregate/SQLDistinct.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.aggregate
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions._
5 |
6 | object SQLDistinct extends App {
7 |
8 | val spark: SparkSession = SparkSession.builder()
9 | .master("local[1]")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 |
13 | spark.sparkContext.setLogLevel("ERROR")
14 |
15 | import spark.implicits._
16 |
17 | val simpleData = Seq(("James", "Sales", 3000),
18 | ("Michael", "Sales", 4600),
19 | ("Robert", "Sales", 4100),
20 | ("Maria", "Finance", 3000),
21 | ("James", "Sales", 3000),
22 | ("Scott", "Finance", 3300),
23 | ("Jen", "Finance", 3900),
24 | ("Jeff", "Marketing", 3000),
25 | ("Kumar", "Marketing", 2000),
26 | ("Saif", "Sales", 4100)
27 | )
28 | val df = simpleData.toDF("employee_name", "department", "salary")
29 | df.show()
30 |
31 | //Distinct all columns
32 | val distinctDF = df.distinct()
33 | println("Distinct count: "+distinctDF.count())
34 | distinctDF.show(false)
35 |
36 | val df2 = df.dropDuplicates()
37 | println("Distinct count: "+df2.count())
38 | df2.show(false)
39 |
40 | //Distinct on selected columns using dropDuplicates
41 | val dropDisDF = df.dropDuplicates("department","salary")
42 | println("Distinct count of department & salary : "+dropDisDF.count())
43 | dropDisDF.show(false)
44 |
45 | }
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/ArrayContainsExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.collection
2 |
3 | import org.apache.spark.sql.functions.{array_contains,col}
4 | import org.apache.spark.sql.types.{ArrayType, StringType, StructType}
5 | import org.apache.spark.sql.{Row, SparkSession}
6 |
7 | object ArrayContainsExample extends App {
8 |
9 | val spark = SparkSession.builder().appName("SparkByExamples.com")
10 | .master("local[1]")
11 | .getOrCreate()
12 |
13 | val data = Seq(
14 | Row("James,,Smith",List("Java","Scala","C++"),"CA"),
15 | Row("Michael,Rose,",List("Spark","Java","C++"),"NJ"),
16 | Row("Robert,,Williams",null,"NV")
17 | )
18 |
19 | val schema = new StructType()
20 | .add("name",StringType)
21 | .add("languagesAtSchool", ArrayType(StringType))
22 | .add("currentState", StringType)
23 |
24 | val df = spark.createDataFrame(
25 | spark.sparkContext.parallelize(data),schema)
26 | df.printSchema()
27 | df.show(false)
28 |
29 | val df2=df.withColumn("Java Present",
30 | array_contains(col("languagesAtSchool"),"Java"))
31 | df2.show(false)
32 |
33 | val df3=df.where(array_contains(col("languagesAtSchool"),"Java"))
34 | df3.show(false)
35 | }
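36 | // For the row whose array is null (Robert), array_contains evaluates to null rather than false,
37 | // so the where() filter above simply drops that row.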
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/ArrayOfArrayType.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.collection
2 |
3 | import org.apache.spark.sql.{Row, SparkSession}
4 | import org.apache.spark.sql.functions.{explode, flatten}
5 | import org.apache.spark.sql.types.{ArrayType, StringType, StructType}
6 |
7 | object ArrayOfArrayType extends App {
8 |
9 | val spark = SparkSession.builder().appName("SparkByExamples.com")
10 | .master("local[1]")
11 | .getOrCreate()
12 |
13 | val arrayArrayData = Seq(
14 | Row("James",List(List("Java","Scala","C++"),List("Spark","Java"))),
15 | Row("Michael",List(List("Spark","Java","C++"),List("Spark","Java"))),
16 | Row("Robert",List(List("CSharp","VB"),List("Spark","Python")))
17 | )
18 |
19 | val arrayArraySchema = new StructType().add("name",StringType)
20 | .add("subjects",ArrayType(ArrayType(StringType)))
21 |
22 | val df = spark.createDataFrame(
23 | spark.sparkContext.parallelize(arrayArrayData),arrayArraySchema)
24 | df.printSchema()
25 | df.show(false)
26 |
27 | import spark.implicits._
28 | val df2 = df.select($"name",explode($"subjects"))
29 |
30 |
31 | df2.printSchema()
32 | df2.show(false)
33 |
34 | //Convert Array of Array into Single array
35 | df.select($"name",flatten($"subjects")).show(false)
36 |
37 | }
38 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/ArrayOfMapType.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.collection
2 |
3 |
4 | import org.apache.spark.sql.{Row, SparkSession}
5 | import org.apache.spark.sql.functions.{explode}
6 | import org.apache.spark.sql.types._
7 |
8 | object ArrayOfMapType extends App {
9 | val spark = SparkSession.builder().appName("SparkByExamples.com")
10 | .master("local[1]")
11 | .getOrCreate()
12 |
13 | val arrayMapSchema = new StructType().add("name",StringType)
14 | .add("properties",
15 | ArrayType(new MapType(StringType,StringType,true)))
16 |
17 | val arrayMapData = Seq(
18 | Row("James",List(Map("hair"->"black","eye"->"brown"), Map("height"->"5.9"))),
19 | Row("Michael",List(Map("hair"->"brown","eye"->"black"),Map("height"->"6"))),
20 | Row("Robert",List(Map("hair"->"red","eye"->"gray"),Map("height"->"6.3")))
21 | )
22 |
23 | val df = spark.createDataFrame(
24 | spark.sparkContext.parallelize(arrayMapData),arrayMapSchema)
25 | df.printSchema()
26 | df.show(false)
27 |
28 | import spark.implicits._
29 |
30 | val df2 = df.select($"name",explode($"properties"))
31 | df2.printSchema()
32 | df2.show(false)
33 | }
34 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/ArrayOfString.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.collection
2 |
3 | import org.apache.spark.sql.{Row, SparkSession}
4 | import org.apache.spark.sql.types.{ArrayType, StringType, StructType}
5 | import org.apache.spark.sql.functions.{col,concat_ws}
6 |
7 | object ArrayOfString extends App{
8 |
9 | val spark: SparkSession = SparkSession.builder()
10 | .master("local[1]")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 |
14 | val arrayStructureData = Seq(
15 | Row("James,,Smith",List("Java","Scala","C++"),"CA"),
16 | Row("Michael,Rose,",List("Spark","Java","C++"),"NJ"),
17 | Row("Robert,,Williams",List("CSharp","VB"),"NV")
18 | )
19 |
20 | val arrayStructureSchema = new StructType()
21 | .add("name",StringType)
22 | .add("languagesAtSchool", ArrayType(StringType))
23 | .add("currentState", StringType)
24 |
25 |
26 | val df = spark.createDataFrame(
27 | spark.sparkContext.parallelize(arrayStructureData),arrayStructureSchema)
28 | df.printSchema()
29 | df.show()
30 |
31 | val df2 = df.withColumn("languagesAtSchool",
32 | concat_ws(",",col("languagesAtSchool")))
33 | df2.printSchema()
34 | df2.show()
35 |
36 | import spark.implicits._
37 | val df3 = df.map(f=>{
38 | val name = f.getString(0)
39 | val lang = f.getList(1).toArray.mkString(",")
40 | (name,lang,f.getString(2))
41 | })
42 |
43 | df3.toDF("Name","Languages","currentState")
44 | .show(false)
45 |
46 | df.createOrReplaceTempView("ARRAY_STRING")
47 | spark.sql("select name, concat_ws(',',languagesAtSchool) as languagesAtSchool," +
48 | " currentState from ARRAY_STRING")
49 | .show(false)
50 | }
51 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/ArrayOfStructType.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.collection
2 |
3 | import org.apache.spark.sql.functions._
4 | import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType}
5 | import org.apache.spark.sql.{Row, SparkSession}
6 |
7 | object ArrayOfStructType extends App{
8 |
9 | val spark = SparkSession.builder().appName("SparkByExamples.com")
10 | .master("local[1]")
11 | .getOrCreate()
12 |
13 | val arrayStructData = Seq(
14 | Row("James",List(Row("Java","XX",120),Row("Scala","XA",300))),
15 | Row("Michael",List(Row("Java","XY",200),Row("Scala","XB",500))),
16 | Row("Robert",List(Row("Java","XZ",400),Row("Scala","XC",250))),
17 | Row("Washington",null)
18 | )
19 |
20 | val arrayStructSchema = new StructType().add("name",StringType)
21 | .add("booksIntersted",ArrayType(new StructType()
22 | .add("name",StringType)
23 | .add("author",StringType)
24 | .add("pages",IntegerType)))
25 |
26 | val df = spark.createDataFrame(
27 | spark.sparkContext.parallelize(arrayStructData),arrayStructSchema)
28 | df.printSchema()
29 | df.show(false)
30 |
31 | import spark.implicits._
32 | val df2 = df.select($"name",explode($"booksIntersted"))
33 | df2.printSchema()
34 | df2.show(false)
35 |
36 | df2.groupBy($"name").agg(collect_list($"col").as("booksIntersted"))
37 | .show(false)
38 |
39 | }
40 |
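41 | // explode() drops rows whose array is null, so "Washington" disappears from df2;
42 | // explode_outer() would keep such rows with a null struct. A sketch:
43 | // df.select($"name", explode_outer($"booksIntersted")).show(false)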
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/CollectListExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.collection
2 |
3 | import org.apache.spark.sql.functions._
4 | import org.apache.spark.sql.types.{StringType, StructType}
5 | import org.apache.spark.sql.{Row, SparkSession}
6 |
7 | object CollectListExample extends App {
8 |
9 | val spark = SparkSession.builder().appName("SparkByExamples.com")
10 | .master("local[1]")
11 | .getOrCreate()
12 |
13 | val arrayStructData = Seq(
14 | Row("James", "Java"), Row("James", "C#"),Row("James", "Python"),
15 | Row("Michael", "Java"),Row("Michael", "PHP"),Row("Michael", "PHP"),
16 | Row("Robert", "Java"),Row("Robert", "Java"),Row("Robert", "Java"),
17 | Row("Washington", null)
18 | )
19 | val arrayStructSchema = new StructType().add("name", StringType)
20 | .add("booksIntersted", StringType)
21 |
22 | val df = spark.createDataFrame(
23 | spark.sparkContext.parallelize(arrayStructData),arrayStructSchema)
24 | df.printSchema()
25 | df.show(false)
26 |
27 | val df2 = df.groupBy("name").agg(collect_list("booksIntersted")
28 | .as("booksIntersted"))
29 | df2.printSchema()
30 | df2.show(false)
31 |
32 | df.groupBy("name").agg(collect_set("booksIntersted")
33 | .as("booksIntersted"))
34 | .show(false)
35 | }
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/MapToColumn.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.collection
2 |
3 | import org.apache.spark.sql.functions._
4 | import org.apache.spark.sql.{Row, SparkSession}
5 | import org.apache.spark.sql.types._
6 |
7 | object MapToColumn extends App {
8 |
9 | val spark: SparkSession = SparkSession.builder()
10 | .master("local[1]")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 |
14 | val arrayStructureData = Seq(
15 | Row("James",Map("hair"->"black","eye"->"brown")),
16 | Row("Michael",Map("hair"->"gray","eye"->"black")),
17 | Row("Robert",Map("hair"->"brown"))
18 | )
19 |
20 | val mapType = DataTypes.createMapType(StringType,StringType)
21 |
22 | val arrayStructureSchema = new StructType()
23 | .add("name",StringType)
24 | .add("property", MapType(StringType,StringType))
25 |
26 | val mapTypeDF = spark.createDataFrame(
27 | spark.sparkContext.parallelize(arrayStructureData),arrayStructureSchema)
28 | mapTypeDF.printSchema()
29 | mapTypeDF.show(false)
30 |
31 | mapTypeDF.select(col("name"),
32 | col("property").getItem("hair").as("hair_color"),
33 | col("property").getItem("eye").as("eye_color"))
34 | .show(false)
35 |
36 | import spark.implicits._
37 | val keysDF = mapTypeDF.select(explode(map_keys($"property"))).distinct()
38 | val keys = keysDF.collect().map(f=>f.get(0))
39 | val keyCols = keys.map(f=> col("property").getItem(f).as(f.toString))
40 | mapTypeDF.select(col("name") +: keyCols:_*).show(false)
41 | }
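42 | // Note: the dynamic variant above first collects the distinct map keys to the driver
43 | // (an extra Spark job) and then builds one getItem() column per key; keys missing from a
44 | // row, like Robert's absent "eye" entry, simply come back as null.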
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/MapTypeExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.collection
2 | import org.apache.spark.sql.functions.{col, explode, lit, map, map_concat, map_from_entries, map_keys, map_values}
3 | import org.apache.spark.sql.{Row, SparkSession}
4 | import org.apache.spark.sql.types._
5 |
6 | object MapTypeExample extends App {
7 |
8 | val spark: SparkSession = SparkSession.builder()
9 | .master("local[1]")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 |
13 | //Creating DF with MapType
14 | val arrayStructureData = Seq(
15 | Row("James",List(Row("Newark","NY"),Row("Brooklyn","NY")),
16 | Map("hair"->"black","eye"->"brown"), Map("height"->"5.9")),
17 | Row("Michael",List(Row("SanJose","CA"),Row("Sandiago","CA")),
18 | Map("hair"->"brown","eye"->"black"),Map("height"->"6")),
19 | Row("Robert",List(Row("LasVegas","NV")),
20 | Map("hair"->"red","eye"->"gray"),Map("height"->"6.3")),
21 | Row("Maria",null,Map("hair"->"blond","eye"->"red"),
22 | Map("height"->"5.6")),
23 | Row("Jen",List(Row("LAX","CA"),Row("Orange","CA")),
24 | Map("white"->"black","eye"->"black"),Map("height"->"5.2"))
25 | )
26 |
27 |
28 | val mapType = DataTypes.createMapType(StringType,StringType)
29 |
30 | val arrayStructureSchema = new StructType()
31 | .add("name",StringType)
32 | .add("addresses", ArrayType(new StructType()
33 | .add("city",StringType)
34 | .add("state",StringType)))
35 | .add("properties", mapType)
36 | .add("secondProp", MapType(StringType,StringType))
37 |
38 | val mapTypeDF = spark.createDataFrame(
39 | spark.sparkContext.parallelize(arrayStructureData),arrayStructureSchema)
40 | mapTypeDF.printSchema()
41 | mapTypeDF.show(false)
42 |
43 | mapTypeDF.select(col("name"),map_keys(col("properties"))).show(false)
44 | mapTypeDF.select(col("name"),map_values(col("properties"))).show(false)
45 | mapTypeDF.select(col("name"),map_concat(col("properties"),col("secondProp"))).show(false)
46 |
47 |
48 |
49 | }
50 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/SliceArray.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.collection
2 |
3 | import org.apache.spark.sql.{Row, SparkSession}
4 | import org.apache.spark.sql.functions.{array_join, col, slice, split}
5 | import org.apache.spark.sql.types.{ArrayType, StringType, StructType}
6 |
7 | object SliceArray extends App {
8 |
9 |
10 | val spark = SparkSession.builder()
11 | .appName("SparkByExamples.com")
12 | .master("local")
13 | .getOrCreate()
14 |
15 | val arrayStructureData = Seq(
16 | Row("James,,Smith",List("Java","Scala","C++","Pascal","Spark")),
17 | Row("Michael,Rose,",List("Spark","Java","C++","Scala","PHP")),
18 | Row("Robert,,Williams",List("CSharp","VB",".Net","C#.net",""))
19 | )
20 |
21 | val arrayStructureSchema = new StructType()
22 | .add("name",StringType)
23 | .add("languagesAtSchool", ArrayType(StringType))
24 |
25 | val df = spark.createDataFrame(
26 | spark.sparkContext.parallelize(arrayStructureData),arrayStructureSchema)
27 | df.show(false)
28 | df.printSchema()
29 |
30 |
31 | val splitDF2 = df.withColumn("languages",
32 | slice(col("languagesAtSchool"),2,3))
33 | .drop("languagesAtSchool")
34 | splitDF2.printSchema()
35 | splitDF2.show(false)
36 |
37 | df.createOrReplaceTempView("PERSON")
38 | spark.sql("select name, slice(languagesAtSchool,2,3) as NameArray from PERSON")
39 | .show(false)
40 |
41 |
42 | }
43 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/collection/StringToArray.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.collection
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions.{col, split}
5 |
6 | object StringToArray extends App {
7 |
8 | val spark = SparkSession.builder()
9 | .appName("SparkByExamples.com")
10 | .master("local")
11 | .getOrCreate()
12 |
13 | val data = Seq(("James, A, Smith","2018","M",3000),
14 | ("Michael, Rose, Jones","2010","M",4000),
15 | ("Robert,K,Williams","2010","M",4000),
16 | ("Maria,Anne,Jones","2005","F",4000),
17 | ("Jen,Mary,Brown","2010","",-1)
18 | )
19 |
20 | import spark.sqlContext.implicits._
21 | val df = data.toDF("name","dob_year","gender","salary")
22 | df.printSchema()
23 | df.show(false)
24 |
25 | val df2 = df.select(split(col("name"),",").as("NameArray"))
26 | .drop("name")
27 |
28 | df2.printSchema()
29 | df2.show(false)
30 |
31 | df.createOrReplaceTempView("PERSON")
32 | spark.sql("select SPLIT(name,',') as NameArray from PERSON")
33 | .show(false)
34 |
35 | }
36 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/AddTime.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.datetime
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions._
5 | object AddTime extends App {
6 |
7 | val spark:SparkSession = SparkSession.builder()
8 | .master("local")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 | spark.sparkContext.setLogLevel("ERROR")
12 |
13 | import spark.sqlContext.implicits._
14 |
15 | spark.sql( "select current_timestamp," +
16 | "cast(current_timestamp as TIMESTAMP) + INTERVAL 2 hours as added_hours," +
17 | "cast(current_timestamp as TIMESTAMP) + INTERVAL 5 minutes as added_minutes," +
18 | "cast(current_timestamp as TIMESTAMP) + INTERVAL 55 seconds as added_seconds"
19 | ).show(false)
20 |
21 |
22 | val df = Seq(("2019-07-01 12:01:19.101"),
23 | ("2019-06-24 12:01:19.222"),
24 | ("2019-11-16 16:44:55.406"),
25 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp")
26 |
27 |
28 | df.createOrReplaceTempView("AddTimeExample")
29 |
30 | val df2 = spark.sql("select input_timestamp, " +
31 | "cast(input_timestamp as TIMESTAMP) + INTERVAL 2 hours as added_hours," +
32 | "cast(input_timestamp as TIMESTAMP) + INTERVAL 5 minutes as added_minutes," +
33 | "cast(input_timestamp as TIMESTAMP) + INTERVAL 55 seconds as added_seconds from AddTimeExample"
34 | )
35 | df2.show(false)
36 |
37 | df.withColumn("added_hours",col("input_timestamp") + expr("INTERVAL 2 HOURS"))
38 | .withColumn("added_minutes",col("input_timestamp") + expr("INTERVAL 2 minutes"))
39 | .withColumn("added_seconds",col("input_timestamp") + expr("INTERVAL 2 seconds"))
40 | .show(false)
41 | }
42 |
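43 | // The withColumn variant works because Spark casts the string column to a timestamp before
44 | // applying the INTERVAL arithmetic; a sketch that makes that cast explicit:
45 | // df.withColumn("added_hours", col("input_timestamp").cast("timestamp") + expr("INTERVAL 2 HOURS"))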
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/CurrentDateAndTime.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.datetime
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions._
5 |
6 | object CurrentDateAndTime extends App {
7 |
8 | val spark:SparkSession = SparkSession.builder()
9 | .master("local")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 | import spark.sqlContext.implicits._
15 |
16 | //Get current Date & Time
17 | val df = Seq((1)).toDF("seq")
18 |
19 | val curDate = df.withColumn("current_date",current_date().as("current_date"))
20 | .withColumn("current_timestamp",current_timestamp().as("current_timestamp"))
21 | curDate.show(false)
22 |
23 |
24 | curDate.select(date_format(col("current_timestamp"),"MM-dd-yyyy").as("date"),
25 | date_format(col("current_timestamp"),"HH:mm:ss.SSS").as("time"),
26 | date_format(col("current_date"), "MM-dd-yyyy").as("current_date_formatted"))
27 | .show(false)
28 |
29 |
30 | }
31 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DateAddMonths.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.datetime
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions._
5 | import org.apache.spark.sql.types.IntegerType
6 |
7 | object DateAddMonths extends App {
8 |
9 | val spark:SparkSession = SparkSession.builder()
10 | .master("local")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 | spark.sparkContext.setLogLevel("ERROR")
14 |
15 | import spark.sqlContext.implicits._
16 |
17 | Seq(("2019-01-23"),("2019-06-24"),("2019-09-20")).toDF("date").select(
18 | col("date"),
19 | add_months(col("date"),3).as("add_months"),
20 | add_months(col("date"),-3).as("sub_months"),
21 | date_add(col("date"),4).as("date_add"),
22 | date_sub(col("date"),4).as("date_sub")
23 | ).show()
24 |
25 | Seq(("06-03-2009"),("07-24-2009")).toDF("date").select(
26 | col("Date"),
27 | add_months(to_date(col("Date"),"MM-dd-yyyy"),3).as("add_months"),
28 | add_months(to_date(col("Date"),"MM-dd-yyyy"),-3).as("add_months2"),
29 | date_add(to_date(col("Date"),"MM-dd-yyyy"),3).as("date_add"),
30 | date_add(to_date(col("Date"),"MM-dd-yyyy"),-3).as("date_add2"),
31 | date_sub(to_date(col("Date"),"MM-dd-yyyy"),3).as("date_sub")
32 | ).show()
33 |
34 | // Seq(("2019-01-23",1),("2019-06-24",2),("2019-09-20",3)).toDF("date","increment").select(
35 | // col("date"),
36 | // add_months(to_date(col("date"),"yyyy-MM-dd"),col("increment").cast(IntegerType).).as("date_inc")
37 | // ).show()
38 |
39 | Seq(("2019-01-23",1),("2019-06-24",2),("2019-09-20",3))
40 | .toDF("date","increment")
41 | .select(col("date"),col("increment"),
42 | expr("add_months(to_date(date,'yyyy-MM-dd'),cast(increment as int))").as("inc_date"))
43 | .show()
44 |
45 | Seq(("2019-01-23",1),("2019-06-24",2),("2019-09-20",3))
46 | .toDF("date","increment")
47 | .selectExpr("date","increment","add_months(to_date(date,'yyyy-MM-dd'),cast(increment as int)) as inc_date")
48 | .show()
49 | }
50 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DateDiff.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.datetime
2 |
3 | import org.apache.spark.sql.functions._
4 | import org.apache.spark.sql.{DataFrame, SparkSession}
5 |
6 | object DateDiff extends App {
7 |
8 | val spark:SparkSession = SparkSession.builder()
9 | .master("local")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 | import spark.sqlContext.implicits._
15 |
16 | //Difference between two dates in days
17 | Seq(("2019-07-01"),("2019-06-24"),("2019-08-24"),("2018-07-23")).toDF("date")
18 | .select(
19 | col("date"),
20 | current_date().as("current_date"),
21 | datediff(current_date(),col("date")).as("datediff")
22 | ).show()
23 |
24 | // Difference between two dates in Months and Years
25 | val df = Seq(("2019-07-01"),("2019-06-24"),("2019-08-24"),("2018-12-23"),("2018-07-20"))
26 | .toDF("startDate").select(
27 | col("startDate"),current_date().as("endDate")
28 | )
29 |
30 | calculateDiff(df)
31 |
32 | //Difference between two dates when the input strings are not in the default 'yyyy-MM-dd' format.
33 | //When dates are not in that format, Spark's date functions return null,
34 | //so first convert the input strings to DateType with the to_date function.
35 | val dfDate = Seq(("07-01-2019"),("06-24-2019"),("08-24-2019"),("12-23-2018"),("07-20-2018"))
36 | .toDF("startDate").select(
37 | to_date(col("startDate"),"MM-dd-yyyy").as("startDate"),
38 | current_date().as("endDate")
39 | )
40 |
41 | calculateDiff(dfDate)
42 |
43 | def calculateDiff(df:DataFrame): Unit ={
44 | df.withColumn("datesDiff", datediff(col("endDate"),col("startDate")))
45 | .withColumn("montsDiff", months_between(
46 | col("endDate"),col("startDate")))
47 | .withColumn("montsDiff_round",round(months_between(
48 | col("endDate"),col("startDate")),2))
49 | .withColumn("yearsDiff",months_between(
50 | col("endDate"),col("startDate"),true).divide(12))
51 | .withColumn("yearsDiff_round",round(months_between(
52 | col("endDate"),col("startDate"),true).divide(12),2))
53 | .show()
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DateInMilli.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.datetime
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions.{unix_timestamp, _}
5 | import org.apache.spark.sql.types.{DateType, LongType, TimestampType}
6 |
7 | object DateInMilli extends App{
8 |
9 | val spark:SparkSession = SparkSession.builder()
10 | .master("local")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 | spark.sparkContext.setLogLevel("ERROR")
14 |
15 | import spark.sqlContext.implicits._
16 |
17 | val df = Seq(1).toDF("seq").select(
18 | current_date().as("current_date"),
19 | unix_timestamp().as("unix_timestamp_seconds")
20 | )
21 |
22 | df.printSchema()
23 | df.show(false)
24 |
25 | //Convert unix seconds to date
26 | df.select(
27 | to_date(col("unix_timestamp_seconds").cast(TimestampType)).as("current_date")
28 | ).show(false)
29 |
30 | //convert date to unix seconds
31 | df.select(
32 | unix_timestamp(col("current_date")).as("unix_seconds"),
33 | unix_timestamp(lit("12-21-2019"),"MM-dd-yyyy").as("unix_seconds2") //MM = month, dd = day-of-month
34 | ).show(false)
35 |
36 | }
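37 | // unix_timestamp() returns whole seconds since the epoch. For milliseconds, a sketch is to
38 | // scale the seconds (sub-second precision is not preserved by unix_timestamp):
39 | // df.select((unix_timestamp() * 1000).as("unix_millis")).show(false)
40 | // Pattern letters are case-sensitive: MM = month, mm = minutes, dd = day-of-month, DD = day-of-year.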
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DateLastDay.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.datetime
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions.{col, last_day, to_date}
5 |
6 | object DateLastDay extends App {
7 |
8 | val spark:SparkSession = SparkSession.builder()
9 | .master("local")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 | import spark.sqlContext.implicits._
15 |
16 | Seq(("2019-01-01"),("2020-02-24"),("2019-02-24"),
17 | ("2019-05-01"),("2018-03-24"),("2007-12-19"))
18 | .toDF("Date").select(
19 | col("Date"),
20 | last_day(col("Date")).as("last_day")
21 | ).show()
22 |
23 |
24 | Seq(("06-03-2009"),("07-24-2009")).toDF("Date").select(
25 | col("Date"),
26 | last_day(to_date(col("Date"),"MM-dd-yyyy")).as("last_day")
27 | ).show()
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DateToString.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.datetime
2 |
3 |
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.functions.{current_date, current_timestamp, date_format}
6 |
7 | object DateToString extends App {
8 |
9 | val spark:SparkSession = SparkSession.builder()
10 | .master("local")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 | spark.sparkContext.setLogLevel("ERROR")
14 |
15 | import spark.sqlContext.implicits._
16 |
17 | Seq(1).toDF("seq").select(
18 | current_date().as("current_date"),
19 | date_format(current_timestamp(),"yyyy MM dd").as("yyyy MM dd"),
20 | date_format(current_timestamp(),"MM/dd/yyyy hh:mm").as("MM/dd/yyyy"),
21 | date_format(current_timestamp(),"yyyy MMM dd").as("yyyy MMMM dd"),
22 | date_format(current_timestamp(),"yyyy MMMM dd E").as("yyyy MMMM dd E")
23 | ).show(false)
24 |
25 | }
26 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DayAndWeekOfYear.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.datetime
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions.{col, date_format, to_timestamp}
5 |
6 |
7 | object DayAndWeekOfYear extends App {
8 |
9 | val spark: SparkSession = SparkSession.builder()
10 | .master("local")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 | spark.sparkContext.setLogLevel("ERROR")
14 |
15 | import spark.sqlContext.implicits._
16 |
17 | val df = Seq(("2019-01-03 12:01:19.000"),
18 | ("2019-02-01 12:01:19.000"),
19 | ("2019-7-16 16:44:55.406"),
20 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp")
21 |
22 | //Get Day of the Year example
23 | df.withColumn("input_timestamp",
24 | to_timestamp(col("input_timestamp")))
25 | .withColumn("day_of_year", date_format(col("input_timestamp"), "D"))
26 | .show(false)
27 |
28 | //Get Week of the Year example
29 | df.withColumn("input_timestamp",
30 | to_timestamp(col("input_timestamp")))
31 | .withColumn("week_of_year", date_format(col("input_timestamp"), "w"))
32 | .show(false)
33 | }
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/DayWeekAndWeekMonth.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.datetime
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions.{col, to_timestamp,date_format}
5 |
6 |
7 | object DayWeekAndWeekMonth extends App {
8 |
9 | val spark: SparkSession = SparkSession.builder()
10 | .master("local")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 | spark.sparkContext.setLogLevel("ERROR")
14 |
15 | import spark.sqlContext.implicits._
16 |
17 | val df = Seq(("2019-07-01 12:01:19.000"),
18 | ("2019-06-24 12:01:19.000"),
19 | ("2019-11-16 16:44:55.406"),
20 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp")
21 |
22 | df.withColumn("input_timestamp",
23 | to_timestamp(col("input_timestamp")))
24 | .withColumn("week_day_number", date_format(col("input_timestamp"), "u"))
25 | .withColumn("week_day_abb", date_format(col("input_timestamp"), "E"))
26 | .show(false)
27 |
28 | df.withColumn("input_timestamp",
29 | to_timestamp(col("input_timestamp")))
30 | .withColumn("week_day_full", date_format(col("input_timestamp"), "EEEE"))
31 | .withColumn("week_of_month", date_format(col("input_timestamp"), "W"))
32 | .show(false)
33 | }
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/GetTimeFromTimestamp.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.datetime
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions.{col,hour,minute,second}
5 |
6 | object GetTimeFromTimestamp extends App {
7 |
8 | val spark:SparkSession = SparkSession.builder()
9 | .master("local")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 | import spark.sqlContext.implicits._
15 |
16 | val df = Seq(("2019-07-01 12:01:19.000"),
17 | ("2019-06-24 12:01:19.000"),
18 | ("2019-11-16 16:44:55.406"),
19 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp")
20 |
21 |
22 | df.withColumn("hour", hour(col("input_timestamp")))
23 | .withColumn("minute", minute(col("input_timestamp")))
24 | .withColumn("second", second(col("input_timestamp")))
25 | .show(false)
26 |
27 | }
28 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/Spark3Date.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.datetime
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions.{to_timestamp, _}
5 |
6 | object Spark3Date extends App{
7 |
8 | val spark:SparkSession = SparkSession.builder()
9 | .master("local")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 | import spark.sqlContext.implicits._
15 |
16 | val df3 = Seq(1).toDF("seq").select(
17 | current_date().as("current_date"),
18 | current_timestamp().as("current_time"),
19 | unix_timestamp().as("epoch_time_seconds")
20 | )
21 | //
22 | // val data2 = df.collect()
23 | // data2.foreach(println)
24 | //
25 | // val df2 = Seq(("06-03-2009","07-01-2009 12:01:19.000")).toDF("Date","Time").select(
26 | // col("Date"),col("Time"),
27 | // to_date(col("Date"),"MM-dd-yyyy").as("to_date"),
28 | // to_timestamp(col("Time"),"MM-dd-yyyy HH:mm:ss.SSS").as("to_timestamp")
29 | // )
30 | // df2.show(false)
31 | //
32 | // val df3 = Seq(("06-03-1500","07-01-1500 12:01:19.000")).toDF("Date","Time").select(
33 | // col("Date"),col("Time"),
34 | // to_date(col("Date"),"MM-dd-yyyy").as("to_date"),
35 | // to_timestamp(col("Time"),"MM-dd-yyyy HH:mm:ss.SSS").as("to_timestamp")
36 | //
37 | // )
38 | val df=spark.range(1,10000).toDF("num")
39 | println("Before re-partition :"+df.rdd.getNumPartitions)
40 | df.createOrReplaceTempView("RANGE_TABLE")
41 | val df2=spark.sql("SELECT /*+ REPARTITION(20) */ * FROM RANGE_TABLE")
42 | println("After re-partition :"+df2.rdd.getNumPartitions)
43 |
44 | }
45 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/StringToDate.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.datetime
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions.{col, to_date}
5 |
6 | object StringToDate extends App {
7 |
8 | val spark:SparkSession = SparkSession.builder()
9 | .master("local")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 | import spark.sqlContext.implicits._
15 |
16 | Seq(("06-03-2009"),("07-24-2009")).toDF("Date").select(
17 | col("Date"),
18 | to_date(col("Date"),"MM-dd-yyyy").as("to_date")
19 | ).show()
20 | }
21 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/StringToTimestamp.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.datetime
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions._
5 | import org.apache.spark.sql.types.LongType
6 |
7 | object StringToTimestamp extends App {
8 |
9 | val spark:SparkSession = SparkSession.builder()
10 | .master("local")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 | spark.sparkContext.setLogLevel("ERROR")
14 |
15 | import spark.sqlContext.implicits._
16 |
17 | //String to timestamps
18 | val df = Seq(("2019-07-01 12:01:19.000"),
19 | ("2019-06-24 12:01:19.000"),
20 | ("2019-11-16 16:44:55.406"),
21 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp")
22 |
23 | df.withColumn("datetype_timestamp",
24 | to_timestamp(col("input_timestamp")))
25 | .printSchema()
26 |
27 |
28 | //Convert string to timestamp when input string has just time
29 | val df1 = Seq(("12:01:19.345"),
30 | ("12:01:20.567"),
31 | ("16:02:44.406"),
32 | ("16:50:59.406"))
33 | .toDF("input_timestamp")
34 |
35 | df1.withColumn("datetype_timestamp",
36 | to_timestamp(col("input_timestamp"),"HH:mm:ss.SSS"))
37 | .show(false)
38 |
39 | //When the input strings are not in the default 'yyyy-MM-dd HH:mm:ss.SSS' format,
40 | //Spark's timestamp functions return null,
41 | //so first convert the input strings to TimestampType with to_timestamp and an explicit pattern.
42 | val dfDate = Seq(("07-01-2019 12 01 19 406"),
43 | ("06-24-2019 12 01 19 406"),
44 | ("11-16-2019 16 44 55 406"),
45 | ("11-16-2019 16 50 59 406")).toDF("input_timestamp")
46 |
47 | dfDate.withColumn("datetype_timestamp",
48 | to_timestamp(col("input_timestamp"),"MM-dd-yyyy HH mm ss SSS"))
49 | .show(false)
50 |
51 |
52 | }
53 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/TimeInMilli.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.datetime
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions._
5 | import org.apache.spark.sql.types.{LongType, TimestampType}
6 |
7 | object TimeInMilli extends App{
8 |
9 | val spark:SparkSession = SparkSession.builder()
10 | .master("local")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 | spark.sparkContext.setLogLevel("ERROR")
14 |
15 | import spark.sqlContext.implicits._
16 |
17 | val df = Seq(1).toDF("seq").select(
18 | current_timestamp().as("current_time"),
19 | unix_timestamp().as("epoch_time_seconds")
20 | )
21 |
22 | df.printSchema()
23 | df.show(false)
24 |
25 | //Convert epoch_time to timestamp
26 | df.select(
27 | col("epoch_time_seconds").cast(TimestampType).as("current_time"),
28 | col("epoch_time_seconds").cast("timestamp").as("current_time2")
29 | ).show(false)
30 |
31 | //convert timestamp to Unix epoch time
32 | df.select(
33 | unix_timestamp(col("current_time")).as("unix_epoch_time"),
34 | col("current_time").cast(LongType).as("unix_epoch_time2")
35 | ).show(false)
36 |
37 | }
38 |
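39 | // Casting a timestamp to LongType (as above) also yields epoch seconds, not milliseconds.
40 | // A sketch for millisecond precision, keeping the fractional part before scaling:
41 | // df.select((col("current_time").cast("double") * 1000).cast("long").as("epoch_millis")).show(false)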
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/TimestampToDate.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.datetime
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions.{col, to_date, to_timestamp}
5 | import org.apache.spark.sql.types.DateType
6 |
7 | object TimestampToDate extends App {
8 |
9 | val spark:SparkSession = SparkSession.builder()
10 | .master("local")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 | spark.sparkContext.setLogLevel("ERROR")
14 |
15 | import spark.sqlContext.implicits._
16 |
17 | val df = Seq(("2019-07-01 12:01:19.000"),
18 | ("2019-06-24 12:01:19.000"),
19 | ("2019-11-16 16:44:55.406"),
20 | ("2019-11-16 16:50:59.406")).toDF("input_timestamp")
21 |
22 | //Timestamp String to DateType
23 | df.withColumn("datetype",
24 | to_date(col("input_timestamp"),"yyyy-MM-dd"))
25 | .show(false)
26 |
27 | //Timestamp type to DateType
28 | df.withColumn("ts",to_timestamp(col("input_timestamp")))
29 | .withColumn("datetype",to_date(col("ts")))
30 | .show(false)
31 |
32 | //Using Cast
33 | df.withColumn("ts",to_timestamp(col("input_timestamp")))
34 | .withColumn("datetype",col("ts").cast(DateType))
35 | .show(false)
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/TimestampToString.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.datetime
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions.{current_date, current_timestamp, date_format}
5 |
6 | object TimestampToString extends App {
7 |
8 | val spark:SparkSession = SparkSession.builder()
9 | .master("local")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 | import spark.sqlContext.implicits._
15 |
16 |
17 | Seq(1).toDF("seq").select(
18 | current_timestamp().as("current_timestamp"),
19 | date_format(current_timestamp(),"yyyy MM dd").as("yyyy MM dd"),
20 | date_format(current_timestamp(),"MM/dd/yyyy hh:mm").as("MM/dd/yyyy"),
21 | date_format(current_timestamp(),"yyyy MMM dd").as("yyyy MMMM dd"),
22 | date_format(current_timestamp(),"yyyy MMMM dd E").as("yyyy MMMM dd E")
23 | ).show(false)
24 |
25 | }
26 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/UnixTimestamp.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.datetime
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions.{from_unixtime, unix_timestamp, _}
5 |
6 | object UnixTimestamp extends App {
7 |
8 | val spark:SparkSession = SparkSession.builder()
9 | .master("local")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 | import spark.sqlContext.implicits._
15 |
16 | //Convert Timestamp to Unix timestamp
17 | val inputDF = Seq(("2019-07-01 12:01:19.000","07-01-2019 12:01:19.000", "07-01-2019"))
18 | .toDF("timestamp_1","timestamp_2","timestamp_3")
19 | inputDF.printSchema()
20 | inputDF.show(false)
21 |
22 | //Convert timestamp to unix timestamp
23 | val df = inputDF.select(
24 | unix_timestamp(col("timestamp_1")).as("timestamp_1"),
25 | unix_timestamp(col("timestamp_2"),"MM-dd-yyyy HH:mm:ss").as("timestamp_2"),
26 | unix_timestamp(col("timestamp_3"),"MM-dd-yyyy").as("timestamp_3"),
27 | unix_timestamp().as("timestamp_4")
28 | )
29 | df.printSchema()
30 | df.show(false)
31 |
32 | // Convert Unix timestamp to timestamp
33 | val df2 = df.select(
34 | from_unixtime(col("timestamp_1")).as("timestamp_1"),
35 | from_unixtime(col("timestamp_2"),"MM-dd-yyyy HH:mm:ss").as("timestamp_2"),
36 | from_unixtime(col("timestamp_3"),"MM-dd-yyyy").as("timestamp_3"),
37 | from_unixtime(col("timestamp_4")).as("timestamp_4")
38 | )
39 | df2.printSchema()
40 | df2.show(false)
41 |
42 | //Convert unix timestamp to timestamp
43 | val timeDF = Seq(1).toDF("seq").select(
44 | from_unixtime(unix_timestamp()).as("timestamp_1"),
45 | from_unixtime(unix_timestamp(),"MM-dd-yyyy HH:mm:ss").as("timestamp_2"),
46 | from_unixtime(unix_timestamp(),"dd-MM-yyyy HH:mm:ss").as("timestamp_3"),
47 | from_unixtime(unix_timestamp(),"HH:mm:ss").as("timestamp_4")
48 | ).show()
49 |
50 | //Convert unix timestamp to date
51 | val dateDF = Seq(1).toDF("seq").select(
52 | from_unixtime(unix_timestamp(),"MM-dd-yyyy").as("date_1"),
53 | from_unixtime(unix_timestamp(),"dd-MM-yyyy HH:mm:ss").as("date_2"),
54 | from_unixtime(unix_timestamp(),"yyyy-MM-dd").as("date_3")
55 | ).show(false)
56 | }
57 |
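58 | // from_unixtime() formats epoch seconds using the session time zone, with a default output
59 | // pattern of 'yyyy-MM-dd HH:mm:ss' (which is what timestamp_1 and timestamp_4 show above).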
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/from_json.scala:
--------------------------------------------------------------------------------
1 | //package com.sparkbyexamples.spark.dataframe.functions
2 | //
3 | //import org.apache.spark.sql.SparkSession
4 | //import org.apache.spark.sql.functions.col
5 | //import org.apache.spark.sql.types.{StringType, StructType}
6 | //
7 | //object from_json {
8 | // def main(args:Array[String]):Unit= {
9 | //
10 | // val spark: SparkSession = SparkSession.builder()
11 | // .master("local[1]")
12 | // .appName("SparkByExample")
13 | // .getOrCreate()
14 | //
15 | //
16 | // val data = Seq(("1","{\"name\":\"Anne\",\"Age\":\"12\",\"country\":\"Denmark\"}"),
17 | // ("2","{\"name\":\"Zen\",\"Age\":\"24\"}"),
18 | // ("3","{\"name\":\"Fred\",\"Age\":\"20\",\"country\":\"France\"}"),
19 | // ("4","{\"name\":\"Mona\",\"Age\":\"18\",\"country\":\"Denmark\"}")
20 | // )
21 | //
22 | // import spark.sqlContext.implicits._
23 | // val df = data.toDF("ID","details_Json")
24 | //
25 | // val schema = (new StructType()).add("name",StringType,true)
26 | // .add("Age",StringType,true)
27 | // .add("country",StringType,true)
28 | //
29 | // val df2 = df.withColumn("details_Struct", from_json($"details_Json", schema))
30 | // .withColumn("country",col("details_Struct").getField("country"))
31 | // .filter(col("country").equalTo("Denmark"))
32 | //
33 | //
34 | // df2.printSchema()
35 | // df2.show(false)
36 | // }
37 | //}
38 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/litTypeLit.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.types.IntegerType
5 |
6 | object litTypeLit extends App {
7 |
8 |
9 |
10 | val spark = SparkSession.builder()
11 | .appName("sparkbyexamples.com")
12 | .master("local")
13 | .getOrCreate()
14 |
15 | import spark.sqlContext.implicits._
16 | import org.apache.spark.sql.functions._
17 |
18 | val data = Seq(("111",50000),("222",60000),("333",40000))
19 | val df = data.toDF("EmpId","Salary")
20 | val df2 = df.select(col("EmpId"),col("Salary"),lit("1").as("lit_value1"))
21 | df2.show()
22 |
23 | val df3 = df2.withColumn("lit_value2",
24 | when(col("Salary") >=40000 && col("Salary") <= 50000, lit("100").cast(IntegerType))
25 | .otherwise(lit("200").cast(IntegerType))
26 | )
27 |
28 | df3.show()
29 |
30 | val df4 = df3.withColumn("typedLit_seq",typedLit(Seq(1, 2, 3)))
31 | .withColumn("typedLit_map",typedLit(Map("a" -> 1, "b" -> 2)))
32 | .withColumn("typedLit_struct",typedLit(("a", 2, 1.0)))
33 |
34 | df4.printSchema()
35 | df4.show()
36 |
37 | }
38 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/string/ConcatExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.string
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions.{lit, _}
5 | object ConcatExample extends App {
6 |
7 | val spark = SparkSession.builder()
8 | .appName("SparkByExamples.com")
9 | .master("local")
10 | .getOrCreate()
11 |
12 | val data = Seq(("James","A","Smith","2018","M",3000),
13 | ("Michael","Rose","Jones","2010","M",4000),
14 | ("Robert","K","Williams","2010","M",4000),
15 | ("Maria","Anne","Jones","2005","F",4000),
16 | ("Jen","Mary","Brown","2010","",-1)
17 | )
18 |
19 | val columns = Seq("fname","mname","lname","dob_year","gender","salary")
20 | import spark.sqlContext.implicits._
21 | val df = data.toDF(columns:_*)
22 | df.printSchema()
23 | df.show(false)
24 |
25 | df.select(concat(col("fname"),lit(','),
26 | col("mname"),lit(','),col("lname")).as("FullName"))
27 | .show(false)
28 |
29 | df.withColumn("FullName",concat(col("fname"),lit(','),
30 | col("mname"),lit(','),col("lname")))
31 | .drop("fname")
32 | .drop("mname")
33 | .drop("lname")
34 | .show(false)
35 |
36 | df.withColumn("FullName",concat_ws(",",col("fname"),col("mname"),col("lname")))
37 | .drop("fname")
38 | .drop("mname")
39 | .drop("lname")
40 | .show(false)
41 |
42 | df.createOrReplaceTempView("EMP")
43 |
44 | spark.sql("select CONCAT(fname,' ',lname,' ',mname) as FullName from EMP")
45 | .show(false)
46 |
47 | spark.sql("select fname ||' '|| lname ||' '|| mname as FullName from EMP")
48 | .show(false)
49 | }
50 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/string/SplitExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.string
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions._
5 | import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes}
6 |
7 | object SplitExample extends App{
8 |
9 | val spark = SparkSession.builder()
10 | .appName("SparkByExamples.com")
11 | .master("local")
12 | .getOrCreate()
13 |
14 | val data = Seq(("James, A, Smith","2018","M",3000),
15 | ("Michael, Rose, Jones","2010","M",4000),
16 | ("Robert,K,Williams","2010","M",4000),
17 | ("Maria,Anne,Jones","2005","F",4000),
18 | ("Jen,Mary,Brown","2010","",-1)
19 | )
20 |
21 | import spark.sqlContext.implicits._
22 | val df = data.toDF("name","dob_year","gender","salary")
23 | df.printSchema()
24 | df.show(false)
25 |
26 | val df2 = df.select(split(col("name"),",").getItem(0).as("FirstName"),
27 | split(col("name"),",").getItem(1).as("MiddleName"),
28 | split(col("name"),",").getItem(2).as("LastName"))
29 | .drop("name")
30 |
31 | df2.printSchema()
32 | df2.show(false)
33 |
34 |
35 | val splitDF = df.withColumn("FirstName",split(col("name"),",").getItem(0))
36 | .withColumn("MiddleName",split(col("name"),",").getItem(1))
37 | .withColumn("LastName",split(col("name"),",").getItem(2))
38 | .withColumn("NameArray",split(col("name"),","))
39 | .drop("name")
40 | splitDF.printSchema()
41 | splitDF.show(false)
42 |
43 | df.createOrReplaceTempView("PERSON")
44 | spark.sql("select SPLIT(name,',') as NameArray from PERSON")
45 | .show(false)
46 |
47 |
48 | val splitDF2 = df.withColumn("FirstName",split(col("name"),",").getItem(0))
49 | .withColumn("MiddleName",array_join(slice(split(col("name"),","),2,3),"/"))
50 |
51 | .withColumn("NameArray",split(col("name"),","))
52 | .drop("name")
53 | splitDF2.printSchema()
54 | splitDF2.show(false)
55 | }
56 |
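Note: slice(array, start, length) uses a 1-based start, so slice(split(col("name"),","), 2, 3) keeps up to three elements beginning with the second one, and array_join glues them with "/". For "Michael, Rose, Jones" the MiddleName value becomes " Rose/ Jones" (the leading spaces survive because split() does not trim them).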
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/functions/window/RowNumber.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.functions.window
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.expressions.Window
5 | import org.apache.spark.sql.functions.row_number
6 |
7 | object RowNumber extends App {
8 |
9 | val spark: SparkSession = SparkSession.builder()
10 | .master("local[1]")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 |
14 | spark.sparkContext.setLogLevel("ERROR")
15 |
16 | import spark.implicits._
17 |
18 | val simpleData = Seq(("James", "Sales", 3000),
19 | ("Michael", "Sales", 4600),
20 | ("Robert", "Sales", 4100),
21 | ("Maria", "Finance", 3000),
22 | ("James", "Sales", 3000),
23 | ("Scott", "Finance", 3300),
24 | ("Jen", "Finance", 3900),
25 | ("Jeff", "Marketing", 3000),
26 | ("Kumar", "Marketing", 2000),
27 | ("Saif", "Sales", 4100)
28 | )
29 | val df = simpleData.toDF("employee_name", "department", "salary")
30 | df.show()
31 |
32 | //row_number
33 | val windowSpec = Window.partitionBy("department").orderBy("salary")
34 | df.withColumn("row_number",row_number.over(windowSpec))
35 | .show()
36 |
37 | }
38 |
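A minimal follow-on sketch (not part of the file above, reusing its df and windowSpec): since the window orders by salary ascending, filtering on row_number 1 keeps only the lowest-paid row per department.

    import org.apache.spark.sql.functions.col
    df.withColumn("row_number", row_number.over(windowSpec))
      .where(col("row_number") === 1)
      .drop("row_number")
      .show()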
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/join/CrossJoinExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.join
2 |
3 | class CrossJoinExample {
4 |
5 | }
6 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/join/InnerJoinExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.join
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.catalyst.plans.Inner
5 |
6 | object InnerJoinExample extends App {
7 |
8 | val spark: SparkSession = SparkSession.builder()
9 | .master("local[1]")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 |
13 | spark.sparkContext.setLogLevel("ERROR")
14 |
15 | val emp = Seq((1,"Smith",-1,"2018","10","M",3000),
16 | (2,"Rose",1,"2010","20","M",4000),
17 | (3,"Williams",1,"2010","10","M",1000),
18 | (4,"Jones",2,"2005","10","F",2000),
19 | (5,"Brown",2,"2010","40","",-1),
20 | (6,"Brown",2,"2010","50","",-1)
21 | )
22 | val empColumns = Seq("emp_id","name","superior_emp_id","year_joined","emp_dept_id","gender","salary")
23 | import spark.sqlContext.implicits._
24 | val empDF = emp.toDF(empColumns:_*)
25 | empDF.show(false)
26 |
27 | val dept = Seq(("Finance",10),
28 | ("Marketing",20),
29 | ("Sales",30),
30 | ("IT",40)
31 | )
32 |
33 | val deptColumns = Seq("dept_name","dept_id")
34 | val deptDF = dept.toDF(deptColumns:_*)
35 | deptDF.show(false)
36 |
37 |
38 | println("Inner join")
39 | empDF.join(deptDF,empDF("emp_dept_id") === deptDF("dept_id"),"inner")
40 | .show(false)
41 | empDF.join(deptDF,empDF("emp_dept_id") === deptDF("dept_id"))
42 | .show(false)
43 |
44 | empDF.join(deptDF,empDF("emp_dept_id") === deptDF("dept_id"),Inner.sql)
45 | .show(false)
46 |
47 | empDF.join(deptDF).where(empDF("emp_dept_id") === deptDF("dept_id"))
48 | .show(false)
49 |
50 | empDF.join(deptDF).filter(empDF("emp_dept_id") === deptDF("dept_id"))
51 | .show(false)
52 |
53 | empDF.createOrReplaceTempView("EMP")
54 | deptDF.createOrReplaceTempView("DEPT")
55 |
56 | val joinDF2 = spark.sql("select * from EMP e INNER JOIN DEPT d ON e.emp_dept_id == d.dept_id")
57 | joinDF2.show(false)
58 | }
59 |
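Note: with the sample data above, the inner join drops the employee Brown with emp_dept_id 50 (no matching dept_id) and the Sales department (dept_id 30, no matching employee); only rows that match on both sides appear in every variant printed by this file.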
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/join/JoinMultipleColumns.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.join
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object JoinMultipleColumns extends App {
6 |
7 | val spark: SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 |
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 | val emp = Seq((1,"Smith",-1,"2018",10,"M",3000),
15 | (2,"Rose",1,"2010",20,"M",4000),
16 | (3,"Williams",1,"2010",10,"M",1000),
17 | (4,"Jones",2,"2005",10,"F",2000),
18 | (5,"Brown",2,"2010",30,"",-1),
19 | (6,"Brown",2,"2010",50,"",-1)
20 | )
21 | val empColumns = Seq("emp_id","name","superior_emp_id","branch_id","dept_id","gender","salary")
22 | import spark.sqlContext.implicits._
23 | val empDF = emp.toDF(empColumns:_*)
24 | empDF.show(false)
25 |
26 | val dept = Seq(("Finance",10,"2018"),
27 | ("Marketing",20,"2010"),
28 | ("Marketing",20,"2018"),
29 | ("Sales",30,"2005"),
30 | ("Sales",30,"2010"),
31 | ("IT",50,"2010")
32 | )
33 |
34 | val deptColumns = Seq("dept_name","dept_id","branch_id")
35 | val deptDF = dept.toDF(deptColumns:_*)
36 | deptDF.show(false)
37 |
38 | //Using multiple columns on join expression
39 | empDF.join(deptDF, empDF("dept_id") === deptDF("dept_id") &&
40 | empDF("branch_id") === deptDF("branch_id"),"inner")
41 | .show(false)
42 |
43 | //Using Join with multiple columns on where clause
44 | empDF.join(deptDF).where(empDF("dept_id") === deptDF("dept_id") &&
45 | empDF("branch_id") === deptDF("branch_id"))
46 | .show(false)
47 |
48 | //Using Join with multiple columns on filter clause
49 | empDF.join(deptDF).filter(empDF("dept_id") === deptDF("dept_id") &&
50 | empDF("branch_id") === deptDF("branch_id"))
51 | .show(false)
52 |
53 | //Using SQL & multiple columns on join expression
54 | empDF.createOrReplaceTempView("EMP")
55 | deptDF.createOrReplaceTempView("DEPT")
56 |
57 | val resultDF = spark.sql("select e.* from EMP e, DEPT d " +
58 | "where e.dept_id == d.dept_id and e.branch_id == d.branch_id")
59 | resultDF.show(false)
60 | }
61 |
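A minimal alternative sketch (assuming, as above, that both DataFrames share the dept_id and branch_id column names): passing the key names as a Seq joins on all of them and keeps a single copy of each key column in the result.

    empDF.join(deptDF, Seq("dept_id", "branch_id"), "inner").show(false)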
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/join/JoinMultipleDataFrames.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.join
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object JoinMultipleDataFrames extends App {
6 |
7 | val spark: SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 |
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 | val emp = Seq((1,"Smith","10"),
15 | (2,"Rose","20"),
16 | (3,"Williams","10"),
17 | (4,"Jones","10"),
18 | (5,"Brown","40"),
19 | (6,"Brown","50")
20 | )
21 | val empColumns = Seq("emp_id","name","emp_dept_id")
22 | import spark.sqlContext.implicits._
23 | val empDF = emp.toDF(empColumns:_*)
24 | empDF.show(false)
25 |
26 | val dept = Seq(("Finance",10),
27 | ("Marketing",20),
28 | ("Sales",30),
29 | ("IT",40)
30 | )
31 | val deptColumns = Seq("dept_name","dept_id")
32 | val deptDF = dept.toDF(deptColumns:_*)
33 | deptDF.show(false)
34 |
35 | val address = Seq((1,"1523 Main St","SFO","CA"),
36 | (2,"3453 Orange St","SFO","NY"),
37 | (3,"34 Warner St","Jersey","NJ"),
38 | (4,"221 Cavalier St","Newark","DE"),
39 | (5,"789 Walnut St","Sandiago","CA")
40 | )
41 | val addColumns = Seq("emp_id","addline1","city","state")
42 | val addDF = address.toDF(addColumns:_*)
43 | addDF.show(false)
44 |
45 | //Using Join expression
46 | empDF.join(deptDF,empDF("emp_dept_id") === deptDF("dept_id"),"inner" )
47 | .join(addDF,empDF("emp_id") === addDF("emp_id"),"inner")
48 | .show(false)
49 |
50 | //Using where
51 | empDF.join(deptDF).where(empDF("emp_dept_id") === deptDF("dept_id"))
52 | .join(addDF).where(empDF("emp_id") === addDF("emp_id"))
53 | .show(false)
54 |
55 | //Using Filter
56 | empDF.join(deptDF).filter(empDF("emp_dept_id") === deptDF("dept_id"))
57 | .join(addDF).filter(empDF("emp_id") === addDF("emp_id"))
58 | .show(false)
59 |
60 | //Using SQL expression
61 | empDF.createOrReplaceTempView("EMP")
62 | deptDF.createOrReplaceTempView("DEPT")
63 | addDF.createOrReplaceTempView("ADD")
64 |
65 | spark.sql("select * from EMP e, DEPT d, ADD a " +
66 | "where e.emp_dept_id == d.dept_id and e.emp_id == a.emp_id")
67 | .show(false)
68 |
69 |
70 | }
71 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/join/SelfJoinExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.join
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions.col
5 |
6 | object SelfJoinExample extends App {
7 |
8 | val spark: SparkSession = SparkSession.builder()
9 | .master("local[1]")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 |
13 | spark.sparkContext.setLogLevel("ERROR")
14 |
15 | val emp = Seq((1,"Smith",1,"10",3000),
16 | (2,"Rose",1,"20",4000),
17 | (3,"Williams",1,"10",1000),
18 | (4,"Jones",2,"10",2000),
19 | (5,"Brown",2,"40",-1),
20 | (6,"Brown",2,"50",-1)
21 | )
22 | val empColumns = Seq("emp_id","name","superior_emp_id","emp_dept_id","salary")
23 | import spark.sqlContext.implicits._
24 | val empDF = emp.toDF(empColumns:_*)
25 | empDF.show(false)
26 |
27 | println("self join")
28 | val selfDF = empDF.as("emp1").join(empDF.as("emp2"),
29 | col("emp1.superior_emp_id") === col("emp2.emp_id"),"inner")
30 | selfDF.show(false)
31 |
32 | selfDF.select(col("emp1.emp_id"),col("emp1.name"),
33 | col("emp2.emp_id").as("superior_emp_id"),
34 | col("emp2.name").as("superior_emp_name"))
35 | .show(false)
36 |
37 | //Spark SQL self join with where clause
38 | empDF.as("emp1").join(empDF.as("emp2")).where(
39 | col("emp1.superior_emp_id") === col("emp2.emp_id"))
40 | .select(col("emp1.emp_id"),col("emp1.name"),
41 | col("emp2.emp_id").as("superior_emp_id"),
42 | col("emp2.name").as("superior_emp_name"))
43 | .show(false)
44 |
45 | //Spark SQL self join with filter clause
46 | empDF.as("emp1").join(empDF.as("emp2")).filter(
47 | col("emp1.superior_emp_id") === col("emp2.emp_id"))
48 | .select(col("emp1.emp_id"),col("emp1.name"),
49 | col("emp2.emp_id").as("superior_emp_id"),
50 | col("emp2.name").as("superior_emp_name"))
51 | .show(false)
52 |
53 |
54 | empDF.createOrReplaceTempView("EMP")
55 | spark.sql("select emp1.emp_id,emp1.name," +
56 | "emp2.emp_id as superior_emp_id, emp2.name as superior_emp_name " +
57 | "from EMP emp1 INNER JOIN EMP emp2 on emp1.superior_emp_id == emp2.emp_id")
58 | .show(false)
59 |
60 | }
61 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/xml/PersonsComplexXML.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.xml
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType}
5 |
6 | object PersonsComplexXML {
7 |
8 | def main(args: Array[String]): Unit = {
9 | val spark = SparkSession.builder().master("local[1]")
10 | .appName("SparkByExample")
11 | .getOrCreate()
12 |
13 | /*
14 | Read XML File
15 | */
16 | val df = spark.read
17 | .format("xml")
18 | .option("rowTag", "person")
19 | .load("src/main/resources/persons_complex.xml")
20 |
21 | df.printSchema()
22 |
23 | df.show()
24 | val schema = new StructType()
25 | .add("_id",StringType)
26 | .add("firstname",StringType)
27 | .add("middlename",StringType)
28 | .add("lastname",StringType)
29 | .add("dob_year",StringType)
30 | .add("dob_month",StringType)
31 | .add("gender",StringType)
32 | .add("salary",StringType)
33 | .add("addresses", new StructType()
34 | .add("address",ArrayType(
35 | new StructType()
36 | .add("_type",StringType)
37 | .add("addressLine",StringType)
38 | .add("city",StringType)
39 | .add("state",StringType)
40 | )
41 | )
42 | )
43 |
44 | val df2 = spark.read
45 | .format("xml")
46 | .option("rowTag", "person")
47 | .schema(schema)
48 |       .load("src/main/resources/persons_complex.xml")
49 |
50 | // df.foreach(row=>{
51 | // println("ID:"+row.getAs("_id") )
52 | // println("ID:"+row(0))
53 | // println("ID:"+row.get(0))
54 | // println(row.getAs("addresses"))
55 | // // println("ID:"+row.getString(0))
56 | // })
57 | //
58 | df2.write
59 | .format("com.databricks.spark.xml")
60 | .option("rootTag", "persons")
61 | .option("rowTag", "person")
62 | .save("src/main/resources/persons_new.xml")
63 |
64 | }
65 | }
66 |
67 |
68 |
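Note: the "xml" and "com.databricks.spark.xml" formats used here come from the spark-xml connector, which has to be on the classpath (presumably declared in this project's pom.xml); they are not part of core Spark.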
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/xml/PersonsXML.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.xml
2 |
3 | import org.apache.spark.sql.{SparkSession, types}
4 | import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType}
5 |
6 | object PersonsXML {
7 |
8 | def main(args: Array[String]): Unit = {
9 | val spark = SparkSession.builder().master("local[1]")
10 | .appName("SparkByExample")
11 | .getOrCreate()
12 |
13 | /*
14 | Read XML File
15 | */
16 | val df = spark.read
17 | .format("xml")
18 | .option("rowTag", "person")
19 | .load("src/main/resources/persons.xml")
20 |
21 | df.printSchema()
22 | df.show()
23 |
24 | val schema = new StructType()
25 | .add("_id",StringType)
26 | .add("firstname",StringType)
27 | .add("middlename",StringType)
28 | .add("lastname",StringType)
29 | .add("dob_year",StringType)
30 | .add("dob_month",StringType)
31 | .add("gender",StringType)
32 | .add("salary",StringType)
33 |
34 | val df2 = spark.read
35 | .format("xml")
36 | .option("rowTag", "person")
37 | .schema(schema)
38 | .load("src/main/resources/persons.xml")
39 |
40 | df2.write
41 | .format("com.databricks.spark.xml")
42 | .option("rootTag", "persons")
43 | .option("rowTag", "person")
44 | .save("src/main/resources/persons_new.xml")
45 |
46 | }
47 | }
48 |
49 |
50 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataframe/xml/ReadBooksXMLWithNestedArray.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataframe.xml
2 |
3 | import com.sparkbyexamples.spark.beans.BooksWithArray
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
6 | import org.apache.spark.sql.types.StructType
7 |
8 | object ReadBooksXMLWithNestedArray {
9 |
10 | def main(args: Array[String]): Unit = {
11 | val spark = SparkSession.builder().master("local[1]")
12 | .appName("SparkByExample")
13 | .getOrCreate()
14 |
15 | val df = spark.sqlContext.read
16 | .format("com.databricks.spark.xml")
17 | .option("rowTag", "book")
18 | .load("src/main/resources/books_withnested_array.xml")
19 |
20 | df.printSchema()
21 | df.show()
22 |
23 | df.foreach(row=>{
24 | println(""+row.getAs("author")+","+row.getAs("_id"))
25 | println(row.getStruct(4).getAs("country"))
26 | println(row.getStruct(4).getClass)
27 | val arr = row.getStruct(7).getList(0)
28 | for (i<-0 to arr.size-1){
29 | val b = arr.get(i).asInstanceOf[GenericRowWithSchema]
30 | println(""+b.getAs("name") +","+b.getAs("location"))
31 | }
32 | })
33 |
34 | }
35 | }
36 |
37 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataset/DataSetFromData.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataset
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object DataSetFromData {
6 |
7 | def main(args:Array[String]):Unit= {
8 |
9 | val spark: SparkSession = SparkSession.builder()
10 | .master("local[1]")
11 | .appName("SparkByExample")
12 | .getOrCreate()
13 |
14 |     val data = Seq((1,2),(3,4),(5,6))
15 |
16 |     // Create a Dataset from the local Seq and display it
17 |     import spark.implicits._
18 |     val ds = data.toDS()
19 |     ds.show()
20 |   }
21 | }
17 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataset/DataSetWithCustomClass.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataset
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | class Test(field1:String,field2:String,field3:String) extends Serializable{
6 |
7 |
8 | }
9 |
10 | object TestEncoders {
11 | implicit def testEncoder: org.apache.spark.sql.Encoder[Test] =
12 | org.apache.spark.sql.Encoders.kryo[Test]
13 | }
14 | object DataSetWithCustomClass {
15 |
16 | def main(args:Array[String]):Unit= {
17 |
18 | val spark: SparkSession = SparkSession.builder()
19 | .master("local[1]")
20 | .appName("SparkByExample")
21 | .getOrCreate()
22 |
23 | val test:Test = new Test("Field1","Field2","Field3")
24 |
25 | import spark.sqlContext.implicits._
26 | import org.apache.spark.sql.Encoders
27 | import TestEncoders._
28 | // implicit val encoder = Encoders.bean[Test](classOf[Test])
29 |
30 | val data = Seq(test)
31 | val rdd = spark.sparkContext.parallelize(data)
32 | val ds = spark.createDataset(rdd)
33 |
34 | val ds2 = ds.selectExpr("CAST(value AS String)")
35 | .as[(String)]
36 |
37 |
38 | ds.printSchema()
39 | ds2.show(false)
40 | }
41 | }
42 |
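Note: Encoders.kryo serializes each Test instance into a single binary column named value, so ds.printSchema() shows one binary field rather than field1/field2/field3, and the CAST(value AS String) in ds2 yields raw serialized bytes rather than readable text.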
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataset/xml/ReadBooksXML.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataset.xml
2 |
3 | import com.sparkbyexamples.spark.beans.{Books, BooksDiscounted}
4 | import org.apache.spark.sql.{Encoders, SparkSession}
5 |
6 | object ReadBooksXML {
7 |
8 | def main(args: Array[String]): Unit = {
9 | val spark = SparkSession.builder().master("local[1]")
10 | .appName("SparkByExample")
11 | .getOrCreate()
12 |
13 | import spark.implicits._
14 |
15 | val ds = spark.sqlContext.read
16 | .format("com.databricks.spark.xml")
17 | .option("rowTag", "book")
18 | .load("src/main/resources/books.xml").as[Books]
19 |
20 |
21 | val newds = ds.map(f=>{
22 | BooksDiscounted(f._id,f.author,f.description,f.price,f.publish_date,f.title, f.price - f.price*20/100)
23 | })
24 |
25 | newds.printSchema()
26 | newds.show()
27 |
28 | newds.foreach(f=>{
29 | println("Price :"+f.price + ", Discounted Price :"+f.discountPrice)
30 | })
31 |
32 | //First element
33 | println("First Element" +newds.first()._id)
34 |
35 | }
36 | }
37 |
38 |
39 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataset/xml/ReadBooksXMLWithNestedArray.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataset.xml
2 |
3 | import com.sparkbyexamples.spark.beans.{Books, BooksWithArray}
4 | import org.apache.spark.sql.{SparkSession, functions}
5 |
6 | object ReadBooksXMLWithNestedArray {
7 |
8 | def main(args: Array[String]): Unit = {
9 | val spark = SparkSession.builder().master("local[1]")
10 | .appName("SparkByExample")
11 | .getOrCreate()
12 |
13 | import spark.implicits._
14 | val ds = spark.sqlContext.read
15 | .format("com.databricks.spark.xml")
16 | .option("rowTag", "book")
17 | .load("src/main/resources/books_withnested_array.xml").as[BooksWithArray]
18 |
19 | ds.printSchema()
20 | ds.show()
21 |
22 | ds.foreach(f=>{
23 | println(f.author+","+f.otherInfo.country+","+f.otherInfo.address.addressline1)
24 | for(s<-f.stores.store){
25 | println(s.name)
26 | }
27 |
28 | })
29 |
30 | }
31 | }
32 |
33 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataset/xml/ReadBooksXMLWithNestedArrayDSL.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataset.xml
2 |
3 |
4 |
5 | import com.sparkbyexamples.spark.beans.Books
6 | import org.apache.spark.sql.{Encoders, SparkSession, functions}
7 |
8 | object ReadBooksXMLWithNestedArrayDSL {
9 |
10 | def main(args: Array[String]): Unit = {
11 | val spark = SparkSession.builder().master("local[1]")
12 | .appName("SparkByExample")
13 | .getOrCreate()
14 |
15 | import spark.implicits._
16 | val xmlDF = spark.sqlContext.read
17 | .format("com.databricks.spark.xml")
18 | .option("rowTag", "book")
19 | .load("src/main/resources/books_withnested_array.xml")
20 |
21 | xmlDF.printSchema()
22 | println(xmlDF.count())
23 |
24 | xmlDF.show()
25 |
26 | xmlDF.select(xmlDF("title"),xmlDF("price")*100).show()
27 |
28 | xmlDF.select("author").show()
29 |
30 |
31 | xmlDF.select("stores").show()
32 |
33 | xmlDF.withColumn("store", functions.explode(xmlDF("stores.store"))).show()
34 |
35 | val df = xmlDF.withColumn("store", functions.explode(xmlDF("stores.store")))
36 | .select("_id","author","stores.country","store.name")
37 |
38 | val storeDF = xmlDF.select("stores.store")
39 | storeDF.printSchema()
40 |
41 | df.foreach(f=>{
42 | println(f.getAs("_id"))
43 | })
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 | }
52 | }
53 |
54 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataset/xml/SparkXMLUsingXstream.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataset.xml
2 |
3 | import com.thoughtworks.xstream.XStream
4 | import com.thoughtworks.xstream.io.xml.DomDriver
5 | import org.apache.spark.sql.SparkSession
6 |
7 | case class Animal(cri:String,taille:Int)
8 |
9 | object SparkXMLUsingXStream{
10 | def main(args: Array[String]): Unit = {
11 | val spark = SparkSession.
12 | builder.master ("local[*]")
13 | .appName ("sparkbyexamples.com")
14 | .getOrCreate ()
15 |
16 | var animal:Animal = Animal("Rugissement",150)
17 | val xstream1 = new XStream(new DomDriver())
18 | xstream1.alias("testAni",classOf[Animal])
19 | xstream1.aliasField("cricri",classOf[Animal],"cri")
20 | val xmlString = Seq(xstream1.toXML(animal))
21 |
22 | import spark.implicits._
23 | val newDf = xmlString.toDF()
24 | newDf.show(false)
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/dataset/xml/sparkXml.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.dataset.xml
2 |
3 | import org.apache.spark.sql.functions.{col, explode}
4 | import org.apache.spark.sql.{SQLContext, SparkSession}
5 |
6 | object sparkXml {
7 | def main(args: Array[String]): Unit = {
8 |
9 | val spark = SparkSession.
10 | builder.master("local[*]")
11 | //.config("spark.debug.maxToStringFields", "100")
12 | .appName("Insight Application Big Data")
13 | .getOrCreate()
14 |
15 | val df = spark.read
16 | .format("com.databricks.spark.xml")
17 | .option("rowTag", "row")
18 | .load("src/main/resources/input.xml")
19 | df.createOrReplaceTempView("categ_entry")
20 |
21 | df.printSchema()
22 | spark.sql("Select c26['_VALUE'] as value, c26['_m'] as option from categ_entry").show(false)
23 |
24 | val df2 = df.withColumn("c26Struct",explode(df("c26")))
25 | df2.select(col("c26Struct._VALUE").alias("value"),col("c26Struct._m").alias("option") ).show(false)
26 |
27 |
28 |
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/CreateEmptyRDD.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object CreateEmptyRDD extends App{
6 |
7 | val spark:SparkSession = SparkSession.builder()
8 | .master("local[3]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 |
12 | val rdd = spark.sparkContext.emptyRDD
13 | val rddString = spark.sparkContext.emptyRDD[String]
14 |
15 | println(rdd)
16 | println(rddString)
17 | println("Num of Partitions: "+rdd.getNumPartitions)
18 |
19 | rddString.saveAsTextFile("c:/tmp/test5.txt")
20 |
21 | val rdd2 = spark.sparkContext.parallelize(Seq.empty[String])
22 | println(rdd2)
23 | println("Num of Partitions: "+rdd2.getNumPartitions)
24 |
25 | rdd2.saveAsTextFile("c:/tmp/test3.txt")
26 |
27 | // Pair RDD
28 |
29 | type dataType = (String,Int)
30 | var pairRDD = spark.sparkContext.emptyRDD[dataType]
31 | println(pairRDD)
32 |
33 | }
34 |
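Note: emptyRDD has zero partitions, so its saveAsTextFile call typically produces only an output directory with a _SUCCESS marker, whereas parallelize(Seq.empty[String]) still gets the default number of partitions and writes that many empty part files.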
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/CreateRDD.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object CreateRDD {
6 |
7 | def main(args:Array[String]): Unit ={
8 |
9 | val spark:SparkSession = SparkSession.builder()
10 | .master("local[3]")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 |
14 | val rdd=spark.sparkContext.parallelize(Seq(("Java", 20000), ("Python", 100000), ("Scala", 3000)))
15 | rdd.foreach(println)
16 |
17 | val rdd1 = spark.sparkContext.textFile("/path/textFile.txt")
18 |
19 | val rdd2 = spark.sparkContext.wholeTextFiles("/path/textFile.txt")
20 | rdd2.foreach(record=>println("FileName : "+record._1+", FileContents :"+record._2))
21 |
22 | val rdd3 = rdd.map(row=>{(row._1,row._2+100)})
23 | rdd3.foreach(println)
24 |
25 | val myRdd2 = spark.range(20).toDF().rdd
26 | myRdd2.foreach(println)
27 |
28 |
29 |
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/OperationsOnPairRDD.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | import scala.collection.mutable
6 |
7 | object OperationsOnPairRDD {
8 |
9 | def main(args: Array[String]): Unit = {
10 |
11 | val spark = SparkSession.builder()
12 | .appName("SparkByExample")
13 | .master("local")
14 | .getOrCreate()
15 |
16 | spark.sparkContext.setLogLevel("ERROR")
17 |
18 | val rdd = spark.sparkContext.parallelize(
19 | List("Germany India USA","USA India Russia","India Brazil Canada China")
20 | )
21 |
22 | val wordsRdd = rdd.flatMap(_.split(" "))
23 | val pairRDD = wordsRdd.map(f=>(f,1))
24 | pairRDD.foreach(println)
25 |
26 | println("Distinct ==>")
27 | pairRDD.distinct().foreach(println)
28 |
29 |
30 | //SortByKey
31 | println("Sort by Key ==>")
32 | val sortRDD = pairRDD.sortByKey()
33 | sortRDD.foreach(println)
34 |
35 | //reduceByKey
36 | println("Reduce by Key ==>")
37 | val wordCount = pairRDD.reduceByKey((a,b)=>a+b)
38 | wordCount.foreach(println)
39 |
40 | def param1= (accu:Int,v:Int) => accu + v
41 | def param2= (accu1:Int,accu2:Int) => accu1 + accu2
42 | println("Aggregate by Key ==> wordcount")
43 | val wordCount2 = pairRDD.aggregateByKey(0)(param1,param2)
44 | wordCount2.foreach(println)
45 |
46 | //keys
47 | println("Keys ==>")
48 | wordCount2.keys.foreach(println)
49 |
50 | //values
51 | println("values ==>")
52 | wordCount2.values.foreach(println)
53 |
54 | println("Count :"+wordCount2.count())
55 |
56 | println("collectAsMap ==>")
57 | pairRDD.collectAsMap().foreach(println)
58 |
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/OperationsOnRDD.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object OperationsOnRDD {
6 |
7 | def main(args: Array[String]): Unit = {
8 |
9 | val spark = SparkSession.builder()
10 | .appName("SparkByExample")
11 | .master("local")
12 | .getOrCreate()
13 |
14 | spark.sparkContext.setLogLevel("ERROR")
15 |
16 | val rdd = spark.sparkContext.parallelize(
17 | List("Germany India USA","USA London Russia","Mexico Brazil Canada China")
18 | )
19 |
20 | val listRdd = spark.sparkContext.parallelize(List(9,2,3,4,5,6,7,8))
21 |
22 | //reduce
23 | println("Minimum :"+listRdd.reduce((a,b)=> a min b))
24 | println("Maximum :"+listRdd.reduce((a,b)=> a max b))
25 | println("Sum :"+listRdd.reduce((a,b)=> a + b))
26 |
27 | //flatMap
28 | val wordsRdd = rdd.flatMap(_.split(" "))
29 | wordsRdd.foreach(println)
30 |
31 | //sortBy
32 | println("Sort by word name")
33 |     val sortRdd = wordsRdd.sortBy(f=>f) // sort words in ascending order
34 |     sortRdd.foreach(println)
35 | //GroupBy
36 | val groupRdd = wordsRdd.groupBy(word=>word.length)
37 | groupRdd.foreach(println)
38 |
39 | //map
40 | val tupp2Rdd = wordsRdd.map(f=>(f,1))
41 | tupp2Rdd.foreach(println)
42 |
43 |
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/PartitionBy.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | import org.apache.spark.HashPartitioner
4 | import org.apache.spark.rdd.RDD
5 | import org.apache.spark.sql.SparkSession
6 |
7 | object PartitionBy {
8 |
9 |
10 | def main(args:Array[String]): Unit = {
11 |
12 | val spark:SparkSession = SparkSession.builder()
13 | .master("local[3]")
14 | .appName("SparkByExample")
15 | .getOrCreate()
16 |
17 | val sc = spark.sparkContext
18 |
19 | val rdd = sc.textFile("C://000_Projects/opt/BigData/zipcodes.csv")
20 |
21 | val rdd2:RDD[Array[String]] = rdd.map(m=>m.split(","))
22 |
23 |
24 | val rdd3 = rdd2.map(a=>(a(1),a.mkString(",")))
25 |
26 | val rdd4 = rdd3.partitionBy(new HashPartitioner(3))
27 |
28 | rdd4.saveAsTextFile("c:/tmp/output/partition")
29 |
30 |
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/RDDAccumulator.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql.SparkSession
5 |
6 | object RDDAccumulator extends App {
7 |
8 | val spark = SparkSession.builder()
9 | .appName("SparkByExample")
10 | .master("local")
11 | .getOrCreate()
12 |
13 | val longAcc = spark.sparkContext.longAccumulator("SumAccumulator")
14 |
15 | val rdd = spark.sparkContext.parallelize(Array(1, 2, 3))
16 |
17 | rdd.foreach(x => longAcc.add(x))
18 | println(longAcc.value)
19 | }
20 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/RDDBroadcast.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object RDDBroadcast extends App {
6 |
7 | val spark = SparkSession.builder()
8 | .appName("SparkByExamples.com")
9 | .master("local")
10 | .getOrCreate()
11 |
12 | val states = Map(("NY","New York"),("CA","California"),("FL","Florida"))
13 | val countries = Map(("USA","United States of America"),("IN","India"))
14 |
15 | val broadcastStates = spark.sparkContext.broadcast(states)
16 | val broadcastCountries = spark.sparkContext.broadcast(countries)
17 |
18 | val data = Seq(("James","Smith","USA","CA"),
19 | ("Michael","Rose","USA","NY"),
20 | ("Robert","Williams","USA","CA"),
21 | ("Maria","Jones","USA","FL")
22 | )
23 |
24 | val rdd = spark.sparkContext.parallelize(data)
25 |
26 | val rdd2 = rdd.map(f=>{
27 | val country = f._3
28 | val state = f._4
29 | val fullCountry = broadcastCountries.value.get(country).get
30 | val fullState = broadcastStates.value.get(state).get
31 | (f._1,f._2,fullCountry,fullState)
32 | })
33 |
34 | println(rdd2.collect().mkString("\n"))
35 |
36 | }
37 |
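A hypothetical hardening of the map step above (not in the original): Map.get(...).get throws NoSuchElementException when a code is missing from the broadcast map, so getOrElse with the raw code as a fallback is safer.

    val fullCountry = broadcastCountries.value.getOrElse(country, country)
    val fullState   = broadcastStates.value.getOrElse(state, state)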
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/RDDCache.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | import org.apache.spark.rdd.RDD
4 | import org.apache.spark.sql.SparkSession
5 |
6 | object RDDCache extends App {
7 |
8 | val spark:SparkSession = SparkSession.builder()
9 | .master("local[1]")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 | val sc = spark.sparkContext
13 |
14 | val rdd = sc.textFile("src/main/resources/zipcodes-noheader.csv")
15 |
16 | val rdd2:RDD[ZipCode] = rdd.map(row=>{
17 | val strArray = row.split(",")
18 | ZipCode(strArray(0).toInt,strArray(1),strArray(3),strArray(4))
19 | })
20 |
21 | rdd2.cache()
22 |
23 |
24 | println(rdd2.count())
25 | }
26 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/RDDFromCSVFile.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | import org.apache.spark.rdd.RDD
4 | import org.apache.spark.sql.SparkSession
5 |
6 | object RDDFromCSVFile {
7 |
8 | def main(args:Array[String]): Unit ={
9 |
10 | def splitString(row:String):Array[String]={
11 | row.split(",")
12 | }
13 |
14 | val spark:SparkSession = SparkSession.builder()
15 | .master("local[3]")
16 | .appName("SparkByExample")
17 | .getOrCreate()
18 | val sc = spark.sparkContext
19 |
20 | val rdd = sc.textFile("src/main/resources/zipcodes-noheader.csv")
21 |
22 | val rdd2:RDD[ZipCode] = rdd.map(row=>{
23 | val strArray = splitString(row)
24 | ZipCode(strArray(0).toInt,strArray(1),strArray(3),strArray(4))
25 | })
26 |
27 | rdd2.foreach(a=>println(a.city))
28 | }
29 |
30 | }
31 |
32 |
33 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/RDDFromDataUsingParallelize.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | import org.apache.spark.rdd.RDD
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.SQLContext
6 |
7 | object RDDFromDataUsingParallelize {
8 |
9 | def main(args: Array[String]): Unit = {
10 | val spark:SparkSession = SparkSession.builder()
11 | .master("local[3]")
12 | .appName("SparkByExample")
13 | .getOrCreate()
14 | val rdd:RDD[Int] = spark.sparkContext.parallelize(List(1,2,3,4,5))
15 | val rddCollect:Array[Int] = rdd.collect()
16 | println("Number of Partitions: "+rdd.getNumPartitions)
17 | println("Action: First element: "+rdd.first())
18 | println("Action: RDD converted to Array[Int] : ")
19 | rddCollect.foreach(println)
20 |
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/RDDFromParallelizeRange.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | import org.apache.spark.rdd.RDD
4 | import org.apache.spark.sql.SparkSession
5 |
6 | object RDDFromParallelizeRange {
7 | def main(args: Array[String]): Unit = {
8 |
9 | val spark:SparkSession = SparkSession.builder()
10 | .master("local[3]")
11 | .appName("SparkByExample")
12 | .getOrCreate()
13 |
14 | val sc = spark.sparkContext
15 |
16 | val rdd4:RDD[Range] = sc.parallelize(List(1 to 1000))
17 | println("Number of Partitions : "+rdd4.getNumPartitions)
18 |
19 | val rdd5 = rdd4.repartition(5)
20 | println("Number of Partitions : "+rdd5.getNumPartitions)
21 |
22 | val rdd6:Array[Range] = rdd5.collect()
23 | println(rdd6.mkString(","))
24 |
25 | val rdd7:Array[Array[Range]] = rdd5.glom().collect()
26 | println("After glom");
27 | rdd7.foreach(f=>{
28 | println("For each partition")
29 | f.foreach(f1=>println(f1))
30 | })
31 |
32 |
33 | }
34 |
35 | }
36 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/RDDFromWholeTextFile.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | import org.apache.spark.rdd.RDD
4 | import org.apache.spark.sql.SparkSession
5 |
6 | object RDDFromWholeTextFile {
7 |
8 | def main(args:Array[String]): Unit = {
9 |
10 | val spark:SparkSession = SparkSession.builder()
11 | .master("local[3]")
12 | .appName("SparkByExamples.com")
13 | .getOrCreate()
14 | val sc = spark.sparkContext
15 |
16 | val rdd = sc.wholeTextFiles("C://000_Projects/opt/BigData/alice.txt")
17 | rdd.foreach(a=>println(a._1+"---->"+a._2))
18 |
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/RDDHadoopInputFormat.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | object RDDHadoopInputFormat_ {
4 |
5 | }
6 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/RDDPrint.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object RDDPrint extends App{
6 |
7 | val spark:SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExample")
10 | .getOrCreate()
11 | val dept = List(("Finance",10),("Marketing",20),
12 | ("Sales",30), ("IT",40))
13 | val rdd=spark.sparkContext.parallelize(dept)
14 | println(rdd)
15 | val dataColl=rdd.collect()
16 | println(dataColl)
17 | dataColl.foreach(println)
18 |
19 | dataColl.foreach(f=>println(f._1 +","+f._2))
20 | val dataCollLis=rdd.collectAsMap()
21 | dataCollLis.foreach(f=>println(f._1 +","+f._2))
22 |
23 | }
24 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/RDDReadFilesFromDirectory.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | object RDDReadFilesFromDirectory_ {
4 |
5 | }
6 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/RDDRepartitionExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object RDDRepartitionExample extends App {
6 |
7 | val spark:SparkSession = SparkSession.builder()
8 | .master("local[5]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 |
12 | val rdd = spark.sparkContext.parallelize(Range(0,20))
13 | println("From local[5]"+rdd.partitions.size)
14 |
15 | val rdd1 = spark.sparkContext.parallelize(Range(0,20), 6)
16 | println("parallelize : "+rdd1.partitions.size)
17 |
18 | rdd1.partitions.foreach(f=> f.toString)
19 | val rddFromFile = spark.sparkContext.textFile("src/main/resources/test.txt",9)
20 |
21 | println("TextFile : "+rddFromFile.partitions.size)
22 |
23 | rdd1.saveAsTextFile("c:/tmp/partition")
24 | val rdd2 = rdd1.repartition(4)
25 | println("Repartition size : "+rdd2.partitions.size)
26 |
27 | rdd2.saveAsTextFile("c:/tmp/re-partition")
28 |
29 | val rdd3 = rdd1.coalesce(4)
30 | println("Repartition size : "+rdd3.partitions.size)
31 |
32 | rdd3.saveAsTextFile("c:/tmp/coalesce")
33 | }
34 |
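Note: repartition(4) always performs a full shuffle and can either increase or decrease the number of partitions, while coalesce(4) only merges existing partitions without a full shuffle and can only reduce the count, which is why it is the cheaper choice when going from 6 partitions down to 4.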
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/RDDSaveAsObjectFile.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | object RDDSaveAsObjectFile_ {
4 |
5 | }
6 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/RDDSequenceFiles.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | object RDDSequenceFiles_ {
4 |
5 | }
6 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/RDDShuffleExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | import org.apache.spark.rdd.RDD
4 | import org.apache.spark.sql.SparkSession
5 |
6 | object RDDShuffleExample extends App {
7 |
8 | val spark:SparkSession = SparkSession.builder()
9 | .master("local[5]")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 |
13 | val sc = spark.sparkContext
14 |
15 | val rdd:RDD[String] = sc.textFile("src/main/resources/test.txt")
16 |
17 | println(rdd.getNumPartitions)
18 | val rdd2 = rdd.flatMap(f=>f.split(" "))
19 | .map(m=>(m,1))
20 |
21 | //ReduceBy transformation
22 | val rdd5 = rdd2.reduceByKey(_ + _)
23 |
24 | println(rdd5.getNumPartitions)
25 |
26 |
27 | }
28 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/ReadMultipleCSVFiles.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | import org.apache.spark.rdd.RDD
4 | import org.apache.spark.sql.SparkSession
5 |
6 | object ReadMultipleCSVFiles extends App {
7 |
8 | val spark:SparkSession = SparkSession.builder()
9 | .master("local[1]")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 |
13 | spark.sparkContext.setLogLevel("ERROR")
14 |
15 | println("spark read csv files from a directory into RDD")
16 | val rddFromFile = spark.sparkContext.textFile("C:/tmp/files/text01.csv")
17 | println(rddFromFile.getClass)
18 |
19 | val rdd = rddFromFile.map(f=>{
20 | f.split(",")
21 | })
22 |
23 | println("Iterate RDD")
24 | rdd.foreach(f=>{
25 | println("Col1:"+f(0)+",Col2:"+f(1))
26 | })
27 | println(rdd)
28 |
29 | println("Get data Using collect")
30 | rdd.collect().foreach(f=>{
31 | println("Col1:"+f(0)+",Col2:"+f(1))
32 | })
33 |
34 | println("read all csv files from a directory to single RDD")
35 | val rdd2 = spark.sparkContext.textFile("C:/tmp/files/*")
36 | rdd2.foreach(f=>{
37 | println(f)
38 | })
39 |
40 | println("read csv files base on wildcard character")
41 | val rdd3 = spark.sparkContext.textFile("C:/tmp/files/text*.csv")
42 | rdd3.foreach(f=>{
43 | println(f)
44 | })
45 |
46 | println("read multiple csv files into a RDD")
47 | val rdd4 = spark.sparkContext.textFile("C:/tmp/files/text01.csv,C:/tmp/files/text02.csv")
48 | rdd4.foreach(f=>{
49 | println(f)
50 | })
51 |
52 | }
53 |
54 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/ReadMultipleFiles.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | // Reads text files from a directory, by wildcard, and as whole files into RDDs
6 |
7 | object ReadMultipleFiles extends App {
8 |
9 | val spark:SparkSession = SparkSession.builder()
10 | .master("local[1]")
11 | .appName("SparkByExamples.com")
12 | .getOrCreate()
13 |
14 | spark.sparkContext.setLogLevel("ERROR")
15 |
16 | println("read all text files from a directory to single RDD")
17 | val rdd = spark.sparkContext.textFile("C:/tmp/files/*")
18 | rdd.foreach(f=>{
19 | println(f)
20 | })
21 |
22 | println("read text files base on wildcard character")
23 | val rdd2 = spark.sparkContext.textFile("C:/tmp/files/text*.txt")
24 | rdd2.foreach(f=>{
25 | println(f)
26 | })
27 |
28 | println("read multiple text files into a RDD")
29 | val rdd3 = spark.sparkContext.textFile("C:/tmp/files/text01.txt,C:/tmp/files/text02.txt")
30 | rdd3.foreach(f=>{
31 | println(f)
32 | })
33 |
34 | println("Read files and directory together")
35 | val rdd4 = spark.sparkContext.textFile("C:/tmp/files/text01.txt,C:/tmp/files/text02.txt,C:/tmp/files/*")
36 | rdd4.foreach(f=>{
37 | println(f)
38 | })
39 |
40 |
41 | val rddWhole = spark.sparkContext.wholeTextFiles("C:/tmp/files/*")
42 | rddWhole.foreach(f=>{
43 | println(f._1+"=>"+f._2)
44 | })
45 |
46 | val rdd5 = spark.sparkContext.textFile("C:/tmp/files/*")
47 | val rdd6 = rdd5.map(f=>{
48 | f.split(",")
49 | })
50 |
51 | rdd6.foreach(f => {
52 | println("Col1:"+f(0)+",Col2:"+f(1))
53 | })
54 |
55 | }
56 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/ReadTextFiles.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | import org.apache.spark.rdd.RDD
4 | import org.apache.spark.sql.SparkSession
5 |
6 | object ReadTextFiles extends App {
7 |
8 | val spark:SparkSession = SparkSession.builder()
9 | .master("local[1]")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 |
13 | spark.sparkContext.setLogLevel("ERROR")
14 |
15 | println("##spark read text files from a directory into RDD")
16 | val rddFromFile = spark.sparkContext.textFile("src/main/resources/csv/text01.txt")
17 | println(rddFromFile.getClass)
18 |
19 | println("##Get data Using collect")
20 | rddFromFile.collect().foreach(f=>{
21 | println(f)
22 | })
23 |
24 | println("##read multiple text files into a RDD")
25 | val rdd4 = spark.sparkContext.textFile("src/main/resources/csv/text01.txt," +
26 | "src/main/resources/csv/text02.txt")
27 | rdd4.foreach(f=>{
28 | println(f)
29 | })
30 |
31 | println("##read text files base on wildcard character")
32 | val rdd3 = spark.sparkContext.textFile("src/main/resources/csv/text*.txt")
33 | rdd3.foreach(f=>{
34 | println(f)
35 | })
36 |
37 | println("##read all text files from a directory to single RDD")
38 | val rdd2 = spark.sparkContext.textFile("src/main/resources/csv/*")
39 | rdd2.foreach(f=>{
40 | println(f)
41 | })
42 |
43 | println("##read whole text files")
44 | val rddWhole:RDD[(String,String)] = spark.sparkContext.wholeTextFiles("src/main/resources/csv/text01.txt")
45 | println(rddWhole.getClass)
46 | rddWhole.foreach(f=>{
47 | println(f._1+"=>"+f._2)
48 | })
49 | }
50 |
51 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/SortBy.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | import org.apache.spark.rdd.RDD
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
6 |
7 | object SortBy {
8 |
9 | def main(args: Array[String]): Unit = {
10 |
11 | val spark:SparkSession = SparkSession.builder()
12 | .master("local[3]")
13 | .appName("SparkByExample")
14 | .getOrCreate()
15 |
16 | val sc = spark.sparkContext
17 |
18 | val rdd:RDD[String] = sc.textFile("C://000_Projects/opt/BigData/zipcodes-noheader.csv")
19 |
20 | val rddZip:RDD[ZipCode] = rdd.map(f=>{
21 | val arr = split(f)
22 | ZipCode(arr(0).toInt,arr(1),arr(3),arr(4))
23 | })
24 |
25 | //SortBy
26 | val rddSort = rddZip.sortBy(f=>f.recordNumber)
27 | rddSort.collect().foreach(f=>println(f.toString))
28 |
29 | //SorybyKey
30 | //First create pairRDD
31 | val rddTuple=rddZip.map(f=>{
32 | Tuple2(f.recordNumber,f.toString)
33 | })
34 | rddTuple.sortByKey().collect().foreach(f=>println(f._2))
35 | }
36 |
37 | def split(str:String): Array[String] ={
38 | str.split(",")
39 | }
40 |
41 | }
42 |
43 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/ZipCode.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd
2 |
3 | case class ZipCode(recordNumber:Int,zipCode:String,city:String,state:String)
4 |
5 |
6 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/functions/FlatMapExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd.functions
2 |
3 | import org.apache.spark.sql.{Row, SparkSession}
4 | import org.apache.spark.sql.types.{ArrayType, StringType, StructType}
5 |
6 | object FlatMapExample extends App{
7 |
8 | val spark: SparkSession = SparkSession.builder()
9 | .master("local[1]")
10 | .appName("SparkByExamples.com")
11 | .getOrCreate()
12 |
13 | val data = Seq("Project Gutenberg’s",
14 | "Alice’s Adventures in Wonderland",
15 | "Project Gutenberg’s",
16 | "Adventures in Wonderland",
17 | "Project Gutenberg’s")
18 | val rdd=spark.sparkContext.parallelize(data)
19 | rdd.foreach(println)
20 |
21 | val rdd1 = rdd.flatMap(f=>f.split(" "))
22 | rdd1.foreach(println)
23 |
24 | val arrayStructureData = Seq(
25 | Row("James,,Smith",List("Java","Scala","C++"),"CA"),
26 | Row("Michael,Rose,",List("Spark","Java","C++"),"NJ"),
27 | Row("Robert,,Williams",List("CSharp","VB","R"),"NV")
28 | )
29 |
30 | val arrayStructureSchema = new StructType()
31 | .add("name",StringType)
32 | .add("languagesAtSchool", ArrayType(StringType))
33 | .add("currentState", StringType)
34 |
35 |
36 | val df = spark.createDataFrame(
37 | spark.sparkContext.parallelize(arrayStructureData),arrayStructureSchema)
38 | import spark.implicits._
39 | val df2=df.flatMap(f=> f.getSeq[String](1).map((f.getString(0),_,f.getString(2))))
40 | .toDF("Name","Language","State")
41 | df2.show(false)
42 |
43 | }
44 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/functions/MapExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd.functions
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object MapExample extends App{
6 |
7 | val spark: SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 |
12 | val data = Seq("Project",
13 | "Gutenberg’s",
14 | "Alice’s",
15 | "Adventures",
16 | "in",
17 | "Wonderland",
18 | "Project",
19 | "Gutenberg’s",
20 | "Adventures",
21 | "in",
22 | "Wonderland",
23 | "Project",
24 | "Gutenberg’s")
25 |
26 | val rdd=spark.sparkContext.parallelize(data)
27 |
28 | val rdd2=rdd.map(f=> (f,1))
29 | rdd2.foreach(println)
30 |
31 | }
32 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/functions/ReduceByKeyExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd.functions
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object ReduceByKeyExample extends App{
6 |
7 | val spark: SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 |
12 | val data = Seq(("Project", 1),
13 | ("Gutenberg’s", 1),
14 | ("Alice’s", 1),
15 | ("Adventures", 1),
16 | ("in", 1),
17 | ("Wonderland", 1),
18 | ("Project", 1),
19 | ("Gutenberg’s", 1),
20 | ("Adventures", 1),
21 | ("in", 1),
22 | ("Wonderland", 1),
23 | ("Project", 1),
24 | ("Gutenberg’s", 1))
25 |
26 | val rdd=spark.sparkContext.parallelize(data)
27 |
28 | val rdd2=rdd.reduceByKey(_ + _)
29 |
30 | rdd2.foreach(println)
31 |
32 | }
33 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/functions/SortByKeyExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd.functions
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object SortByKeyExample extends App{
6 |
7 | val spark: SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 |
12 | val data = Seq(("Project","A", 1),
13 | ("Gutenberg’s", "X",3),
14 | ("Alice’s", "C",5),
15 | ("Adventures","B", 1)
16 | )
17 |
18 | val rdd=spark.sparkContext.parallelize(data)
19 | rdd.foreach(println)
20 | val rdd2=rdd.map(f=>{(f._2, (f._1,f._2,f._3))})
21 | rdd2.foreach(println)
22 | val rdd3= rdd2.sortByKey()
23 | val rdd4= rdd2.sortByKey(false)
24 | rdd4.foreach(println)
25 |
26 | val rdd5 = rdd.sortBy(f=>(f._3,f._2),false)
27 | rdd5.foreach(println)
28 | }
29 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/functions/aggregateExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd.functions
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object aggregateExample extends App {
6 |
7 | val spark = SparkSession.builder()
8 | .appName("SparkByExamples.com")
9 | .master("local[3]")
10 | .getOrCreate()
11 |
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 | //aggregate example
15 | val listRdd = spark.sparkContext.parallelize(List(1,2,3,4,5,3,2))
16 | def param0= (accu:Int, v:Int) => accu + v
17 | def param1= (accu1:Int,accu2:Int) => accu1 + accu2
18 | println("output 1 : "+listRdd.aggregate(0)(param0,param1))
19 |
20 |
21 | val inputRDD = spark.sparkContext.parallelize(List(("Z", 1),("A", 20),("B", 30),("C", 40),("B", 30),("B", 60)))
22 | def param3= (accu:Int, v:(String,Int)) => accu + v._2
23 | def param4= (accu1:Int,accu2:Int) => accu1 + accu2
24 | println("output 2 : "+inputRDD.aggregate(0)(param3,param4))
25 |
26 | println("Number fo Partitions :"+listRdd.getNumPartitions)
27 | //aggregate example
28 | println("output 1 : "+listRdd.aggregate(1)(param0,param1))
29 |
30 | }
31 |
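Note: the initial value passed to aggregate is applied once per partition by param0 and once more by param1 when the partial results are merged. Assuming the list is spread across 3 partitions under local[3], the element sum is 20, so aggregate(0) prints 20 while aggregate(1) prints 20 + 3*1 + 1 = 24.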
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/functions/foldExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd.functions
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object foldExample extends App {
6 |
7 | val spark = SparkSession.builder()
8 | .appName("SparkByExamples.com")
9 | .master("local[3]")
10 | .getOrCreate()
11 |
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 | //fold example
15 | val listRdd = spark.sparkContext.parallelize(List(1,2,3,4,5,3,2))
16 | println("Partitions : "+listRdd.getNumPartitions)
17 | println("Total : "+listRdd.fold(0)((acc,ele) => {acc + ele}))
18 | println("Total with init value 2 : "+listRdd.fold(2)((acc,ele) => {acc + ele}))
19 | println("Min : "+listRdd.fold(0)((acc,ele) => {acc min ele}))
20 | println("Max : "+listRdd.fold(0)((acc,ele) => {acc max ele}))
21 |
22 | val inputRDD = spark.sparkContext.parallelize(List(("Z", 1),("A", 20),("B", 30),("C", 40),("B", 30),("B", 60)))
23 |
24 | println("Total : "+inputRDD.fold(("",0))( (acc,ele)=>{ ("Total", acc._2 + ele._2) }))
25 | println("Min : "+inputRDD.fold(("",0))( (acc,ele)=>{ ("Min", acc._2 min ele._2) }))
26 | println("Max : "+inputRDD.fold(("",0))( (acc,ele)=>{ ("Max", acc._2 max ele._2) }))
27 |
28 | }
29 |
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/functions/reduceExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd.functions
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object reduceExample extends App {
6 |
7 | val spark = SparkSession.builder()
8 | .appName("SparkByExamples.com")
9 | .master("local[3]")
10 | .getOrCreate()
11 |
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 | val listRdd = spark.sparkContext.parallelize(List(1,2,3,4,5,3,2))
15 |
16 | println("output min using binary : "+listRdd.reduce(_ min _))
17 | println("output max using binary : "+listRdd.reduce(_ max _))
18 | println("output sum using binary : "+listRdd.reduce(_ + _))
19 |
20 |
21 | // Alternatively you can write
22 | println("output min : "+listRdd.reduce( (a,b) => a min b))
23 | println("output max : "+listRdd.reduce( (a,b) => a max b))
24 | println("output sum : "+listRdd.reduce( (a,b) => a + b))
25 |
26 |
27 | val inputRDD = spark.sparkContext.parallelize(List(("Z", 1),("A", 20),("B", 30),
28 | ("C", 40),("B", 30),("B", 60)))
29 |
30 | println("output min : "+inputRDD.reduce( (a,b)=> ("max",a._2 min b._2))._2)
31 | println("output max : "+inputRDD.reduce( (a,b)=> ("max",a._2 max b._2))._2)
32 | println("output sum : "+inputRDD.reduce( (a,b)=> ("Sum",a._2 + b._2))._2)
33 | }
34 |
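35 | // Note: reduce() has no zeroValue, so the supplied function must be associative and
36 | // commutative; the expected outputs here are min 1, max 5, sum 20 for listRdd and
37 | // min 1, max 60, sum 181 for inputRDD.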
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/rdd/xml/XmlRecordReader.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.rdd.xml
2 |
3 | import com.databricks.spark.xml.XmlInputFormat
4 | import org.apache.hadoop.conf.Configuration
5 | import org.apache.hadoop.io.{LongWritable, Text}
6 | import org.apache.spark.api.java.JavaSparkContext
7 | import org.apache.spark.api.java.function.VoidFunction
8 | import org.apache.spark.sql.SparkSession
9 |
10 | import scala.xml.XML
11 |
12 |
13 | object XmlRecordReader {
14 | def main(args: Array[String]): Unit = {
15 | val sparkSession = SparkSession.builder.appName("XmlRecordReader").master("local").getOrCreate
16 | val javaSparkContext = new JavaSparkContext(sparkSession.sparkContext)
17 | val configuration = new Configuration
18 | configuration.set("xmlinput.start", "")
19 | configuration.set("xmlinput.end", "")
20 | configuration.set("mapreduce.input.fileinputformat.inputdir", "src/main/resources/records.xml")
21 | val javaPairRDD = javaSparkContext.newAPIHadoopRDD(configuration, classOf[XmlInputFormat], classOf[LongWritable], classOf[Text])
22 | javaPairRDD.foreach(new VoidFunction[Tuple2[LongWritable, Text]]() {
23 | @throws[Exception]
24 | override def call(tuple: Tuple2[LongWritable, Text]): Unit = { // TODO Auto-generated method stub
25 |
26 | val xml = XML.loadString(tuple._2.toString)
27 | val name = (xml \ "Name").text
28 |
29 | println("Name : " + name)
30 |
31 | }
32 | })
33 | }
34 | }
35 |
36 |
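37 | // Note: XmlInputFormat splits the input file on the xmlinput.start / xmlinput.end tags and
38 | // emits one Text value per record, so those values must match the element that wraps each
39 | // record in records.xml; <record> is an assumption here and should be adjusted if needed.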
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/spark30/ADQExample.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.spark30
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object ADQExample extends App{
6 |
7 | val spark: SparkSession = SparkSession.builder()
8 | .master("local[5]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 |
12 | spark.sparkContext.setLogLevel("ERROR")
13 |
14 | import spark.implicits._
15 | val simpleData = Seq(("James","Sales","NY",90000,34,10000),
16 | ("Michael","Sales","NY",86000,56,20000),
17 | ("Robert","Sales","CA",81000,30,23000),
18 | ("Maria","Finance","CA",90000,24,23000),
19 | ("Raman","Finance","CA",99000,40,24000),
20 | ("Scott","Finance","NY",83000,36,19000),
21 | ("Jen","Finance","NY",79000,53,15000),
22 | ("Jeff","Marketing","CA",80000,25,18000),
23 | ("Kumar","Marketing","NY",91000,50,21000)
24 | )
25 | val df = simpleData.toDF("employee_name","department","state","salary","age","bonus")
26 |
27 | val df1=df.groupBy("department").count()
28 | println(df1.rdd.getNumPartitions)
29 |
30 | spark.conf.set("spark.sql.adaptive.enabled",200)
31 | val df2=df.groupBy("department").count()
32 | println(df2.rdd.getNumPartitions)
33 |
34 |
35 |
36 | }
37 |
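38 | // Note: with AQE off, a groupBy shuffle uses spark.sql.shuffle.partitions (200 by default),
39 | // so df1 typically reports 200 partitions; once spark.sql.adaptive.enabled is true, adaptive
40 | // partition coalescing shrinks this tiny shuffle down to just a few partitions for df2.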
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/spark30/ReadBinary.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.spark30
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object ReadBinary extends App{
6 |
7 | val spark: SparkSession = SparkSession.builder()
8 | .master("local[1]")
9 | .appName("SparkByExamples.com")
10 | .getOrCreate()
11 |
12 | //spark.sparkContext.setLogLevel("ERROR")
13 |
14 | val df = spark.read.format("binaryFile").load("C:\\tmp\\binary\\spark.png")
15 | df.printSchema()
16 | df.show()
17 |
18 | val df2 = spark.read.format("binaryFile").load("C:\\tmp\\binary\\")
19 | df2.printSchema()
20 | //df2.show(false)
21 |
22 | val df3 = spark.read.format("binaryFile").load("C:\\tmp\\binary\\*.png")
23 | df3.printSchema()
24 | df3.show(false)
25 |
26 | // To load files with paths matching a given glob pattern while keeping the behavior of partition discovery
27 | val df4 = spark.read.format("binaryFile")
28 | .option("pathGlobFilter", "*.png")
29 | .load("C:\\tmp\\binary\\")
30 | df4.printSchema()
31 | //df4.show(false)
32 | }
33 |
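34 | // Note: the binaryFile source (Spark 3.0+) returns one row per file with the columns
35 | // path, modificationTime, length and content; the C:\tmp\binary paths above are just
36 | // local sample locations and must exist before running this example.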
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/stackoverflow/AddingLiterral.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.stackoverflow
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
5 | case class Employee(EmpId: String, Experience: Double, Salary: Double)
6 |
7 | case class Employee2(EmpId: EmpData, Experience: EmpData, Salary: EmpData)
8 | case class EmpData(key: String,value:String)
9 | object AddingLiterral {
10 | def main(args: Array[String]): Unit = {
11 |
12 | val spark = SparkSession.builder()
13 | .master("local[1]")
14 | .appName("SparkByExample")
15 | .getOrCreate();
16 | import spark.sqlContext.implicits._
17 | import org.apache.spark.sql.functions._
18 | val data = Seq(("111",5,50000),("222",6,60000),("333",7,60000))
19 | val df = data.toDF("EmpId","Experience","Salary")
20 |
21 | val newdf = df.withColumn("EmpId", struct(lit("1").as("key"),col("EmpId").as("value")))
22 | .withColumn("Experience", struct(lit("2").as("key"),col("Experience").as("value")))
23 | .withColumn("Salary", struct(lit("3").as("key"),col("Salary").as("value")))
24 | newdf.show(false)
25 |
26 | val ds = df.as[Employee]
27 | val newDS = ds.map(rec=>{
28 | (EmpData("1",rec.EmpId), EmpData("2",rec.Experience.toString),EmpData("3",rec.Salary.toString))
29 | })
30 | val finalDS = newDS.toDF("EmpId","Experience","Salary").as[Employee2]
31 | finalDS.show(false)
32 | // newDS.withColumnRenamed("_1","EmpId")
33 | // .withColumnRenamed("_2","Experience")
34 | // .withColumnRenamed("_3","Salary")
35 | // .show(false)
36 |
37 |
38 |
39 | }
40 | }
41 |
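42 | // Note: both approaches wrap every flat column into a key/value pair - the DataFrame
43 | // version via struct(lit(...).as("key"), col(...).as("value")), and the typed version by
44 | // mapping each Employee into three EmpData(key, value) instances before casting to Employee2.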
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/stackoverflow/SparkContextOld.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.stackoverflow
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 | object SparkContextOld extends App{
6 |
7 | val conf = new SparkConf().setAppName("sparkbyexamples.com").setMaster("local[1]")
8 | val sparkContext = new SparkContext(conf)
9 | val rdd = sparkContext.textFile("src/main/resources/txt/alice.txt")
10 |
11 | sparkContext.setLogLevel("ERROR")
12 |
13 | println("First SparkContext:")
14 | println("APP Name :"+sparkContext.appName)
15 | println("Deploy Mode :"+sparkContext.deployMode)
16 | println("Master :"+sparkContext.master)
17 | println("Master :"+sparkContext.applicationId)
18 | // sparkContext.stop()
19 |
20 | val conf2 = new SparkConf().setAppName("sparkbyexamples.com-2").setMaster("local[1]")
21 | val sparkContext2 = new SparkContext(conf2)
22 |
23 | println("Second SparkContext:")
24 | println("APP Name :"+sparkContext2.appName)
25 | println("Deploy Mode :"+sparkContext2.deployMode)
26 | println("Master :"+sparkContext2.master)
27 |
28 |
29 | }
30 |
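31 | // Note: only one active SparkContext is allowed per JVM, so constructing sparkContext2
32 | // fails unless the first context is stopped (uncomment sparkContext.stop() above); the
33 | // old spark.driver.allowMultipleContexts workaround was removed in Spark 3.0.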
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/stackoverflow/Test.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.stackoverflow
2 |
3 | import org.apache.spark.sql.{DataFrame, SparkSession}
4 | import org.apache.spark.sql.functions._
5 | object Test {
6 |
7 | def main(args: Array[String]): Unit = {
8 |
9 | val spark = SparkSession.builder()
10 | .master("local[1]")
11 | .appName("SparkByExample")
12 | .getOrCreate();
13 | import spark.sqlContext.implicits._
14 |
15 | val df1:DataFrame = Seq(
16 | ("Mark", "2018-02-20 00:00:00"),
17 | ("Alex", "2018-03-01 00:00:00"),
18 | ("Bob", "2018-03-01 00:00:00"),
19 | ("Mark", "2018-07-01 00:00:00"),
20 | ("Kate", "2018-07-01 00:00:00")
21 | ).toDF("USER_NAME", "REQUEST_DATE")
22 |
23 | df1.show()
24 |
25 | val df2: DataFrame = Seq(
26 | ("Alex", "2018-01-01 00:00:00", "2018-02-01 00:00:00", "OUT"),
27 | ("Bob", "2018-02-01 00:00:00", "2018-02-05 00:00:00", "IN"),
28 | ("Mark", "2018-02-01 00:00:00", "2018-03-01 00:00:00", "IN"),
29 | ("Mark", "2018-05-01 00:00:00", "2018-08-01 00:00:00", "OUT"),
30 | ("Meggy", "2018-02-01 00:00:00", "2018-02-01 00:00:00", "OUT")
31 | ).toDF("NAME", "START_DATE", "END_DATE", "STATUS")
32 |
33 | df2.show()
34 |
35 | val df3 = df1.join(df2, col("USER_NAME") === col("NAME"), "left_outer")
36 |
37 |
38 | // df3.groupBy("USER_NAME","REQUEST_DATE")  // this grouping was never used; kept commented for reference
39 |
40 | val df4 = df3.withColumn("USER_STATUS", when($"REQUEST_DATE" > $"START_DATE" and $"REQUEST_DATE" < $"END_DATE", "Our user").otherwise("Not our user"))
41 |
42 | df4.select("USER_NAME","REQUEST_DATE","USER_STATUS").distinct().show(false)
43 | }
44 | }
45 |
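46 | // Note: the left_outer join can produce several rows per USER_NAME (one per matching df2
47 | // row), and distinct() only removes exact duplicates, so a user whose REQUEST_DATE falls
48 | // inside one interval but outside another still appears with both statuses.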
--------------------------------------------------------------------------------
/src/main/scala/com/sparkbyexamples/spark/stackoverflow/Test2.scala:
--------------------------------------------------------------------------------
1 | package com.sparkbyexamples.spark.stackoverflow
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object Test2 {
6 |
7 | // def main(args: Array[String]): Unit = {
8 | //
9 | // val spark = SparkSession.builder()
10 | // .master("local[1]")
11 | // .appName("SparkByExample")
12 | // .getOrCreate();
13 | //
14 | // val peopleDFCsv = spark.read.format("csv")
15 | // .load("src/main/resources/stack.csv")
16 | //
17 | // val d = peopleDFCsv.map(row=>{
18 | // val col1=row.get(1)
19 | // val col2=row.get(1)
20 | // (col1,col2)
21 | // }).toDF()
22 | //
23 | // }
24 | }
25 |
--------------------------------------------------------------------------------