├── README.md
├── Spark-1.5
├── README.md
├── pom.xml
└── src
│ └── main
│ ├── resources
│ ├── TravelData.txt
│ ├── baby_names.txt
│ ├── books.xml
│ ├── campaign.csv
│ ├── cars.txt
│ ├── dialy_show_guests.txt
│ ├── ebay.csv
│ ├── emp.txt
│ ├── flights.csv
│ ├── log.txt
│ ├── log4j_conf.xml
│ ├── olaCabData.txt
│ ├── olympics_data.txt
│ ├── partitioner.txt
│ ├── person.txt
│ ├── sales.json
│ ├── sensoranalytics.txt
│ ├── users.txt
│ ├── wordcount.txt
│ ├── yahoo_stocks.csv
│ └── youtubedata.txt
│ └── scala
│ └── com
│ └── spark
│ ├── cassandra
│ ├── CassandraCQL.scala
│ ├── Cassandra_SparkStreaming.scala
│ ├── KafkaConsumerToCassandra.scala
│ ├── Spark_Cassandra .scala
│ └── Spark_Cassandra_Delete.scala
│ ├── customudf
│ ├── CustomMean.scala
│ ├── CustomUDAF.scala
│ ├── CustomUDF.scala
│ └── SparkUDF.scala
│ ├── examples
│ ├── CustomPartitioner.scala
│ ├── CustomReceiver.scala
│ ├── DataFrame.scala
│ ├── DataFrame_Joins.scala
│ ├── DataFramesRollup.scala
│ ├── KafkaConsumer.scala
│ ├── KafkaConsumerToCassandra.scala
│ ├── KafkaConsumerToHDFS.scala
│ ├── KafkaProducer.scala
│ ├── MovingAverage.scala
│ ├── ReadHDFSFolders.scala
│ ├── ReadMultipleFiles.scala
│ ├── SparkFileStreaming.scala
│ ├── SparkJDBC.scala
│ ├── SparkStructType.scala
│ ├── Spark_Avro.scala
│ ├── Spark_CSV_Reader.scala
│ ├── Spark_CustomReceiver.scala
│ ├── Spark_Hive.scala
│ ├── Spark_Hive_ORC.scala
│ ├── Spark_Joins.scala
│ ├── Spark_Json_Reader.scala
│ ├── Spark_SequenceFiles.scala
│ ├── Spark_StructType.scala
│ ├── Spark_XML.scala
│ ├── Stateful_WordCount.scala
│ ├── WindowBasedStreaming.scala
│ ├── Window_Sliding_Interval.scala
│ └── WordCount.scala
│ ├── mangodb
│ └── Spark_MangoDB.scala
│ ├── transformations
│ ├── AggregateByKey.scala
│ ├── Cars.scala
│ ├── Cogroup.scala
│ ├── Filter.scala
│ ├── FoldByKey.scala
│ ├── GroupBY_ReduceBY.scala
│ ├── MapvsFlatMap.scala
│ └── Reduce.scala
│ ├── usecases
│ ├── FlightDataAnalysis.scala
│ ├── NamesAnalysis.scala
│ ├── OlaDataAnalysis.scala
│ ├── OlympicsDataAnalysis.scala
│ ├── TVShowDataAnalysis.scala
│ ├── TravelDataAnalysis.scala
│ ├── YoutubeDataAnalysis.scala
│ ├── loganalysis
│ │ ├── ApacheAccessLog.scala
│ │ ├── LogAnalyzer.scala
│ │ └── LogAnalyzerSQL.scala
│ ├── sensoranalytics
│ │ ├── Models.scala
│ │ ├── SchemaParser.scala
│ │ └── SensorAnalytics.scala
│ └── twitteranalytics
│ │ ├── TwitterAnalytics.scala
│ │ └── twitterclient.scala
│ └── util
│ ├── LogHelper.scala
│ └── Utills .scala
├── Spark-2.1
├── README.md
├── input
│ ├── Fire_Department_Calls.csv
│ ├── Fire_Incidents.csv
│ ├── README.md
│ ├── Real_Estate_Data.csv
│ ├── badrecords
│ ├── books.xml
│ ├── conf.properties
│ ├── emp.txt
│ ├── empData.csv
│ ├── iap_sw_cpu_mem_stats_rollup
│ │ ├── part-00000-03d43b04-1dda-472c-8601-e7a8914e6097.snappy.parquet
│ │ └── part-00001-03d43b04-1dda-472c-8601-e7a8914e6097.snappy.parquet
│ ├── lines
│ ├── nested.json
│ ├── one.xml
│ ├── pbs.csv
│ ├── product
│ ├── purchases.log
│ ├── schools.json
│ ├── sw_hp_system_cpu_stats_records
│ │ └── date_month=2020-01
│ │ │ ├── date_hour=2020-01-13-04
│ │ │ └── data.parquet
│ │ │ └── date_hour=2020-01-13-05
│ │ │ └── data.parquet
│ ├── sw_hp_system_info_stats_records
│ │ └── date_month=2020-01
│ │ │ ├── date_hour=2020-01-13-04
│ │ │ └── data.parquet
│ │ │ └── date_hour=2020-01-13-05
│ │ │ └── data.parquet
│ └── sw_hp_system_memory_stats_records
│ │ └── date_month=2020-01
│ │ ├── date_hour=2020-01-13-04
│ │ └── data.parquet
│ │ └── date_hour=2020-01-13-05
│ │ └── data.parquet
├── pom.xml
└── src
│ ├── main
│ └── scala
│ │ └── com
│ │ ├── datadog
│ │ └── metrics
│ │ │ ├── AbstractCaseClass.scala
│ │ │ ├── CaseClasses.scala
│ │ │ ├── DataDogHttpTransport.scala
│ │ │ ├── DatadogCollector.scala
│ │ │ ├── MetricsCollector.scala
│ │ │ ├── MetricsCollectorFactory.scala
│ │ │ ├── Spark_Accumulator.scala
│ │ │ └── Transport.scala
│ │ └── spark2
│ │ ├── aws
│ │ └── Spark_AWS_S3.scala
│ │ ├── cassandra
│ │ ├── ChangeDFTypes.scala
│ │ ├── ConvetTimestampToLong.scala
│ │ ├── ExportCassandraData.scala
│ │ ├── FilterCassandraData.scala
│ │ ├── SparkStreaming_Cassandra.scala
│ │ ├── Spark_To_Caasandra.scala
│ │ ├── Writting_DF_To_Cassandra.scala
│ │ └── export
│ │ │ ├── CassandraYaml.scala
│ │ │ ├── ExportCassandraData.scala
│ │ │ ├── Export_Cassandra_Data.scala
│ │ │ ├── Export_Cassandra_Table_Data.scala
│ │ │ ├── Utils.scala
│ │ │ ├── YamlProps.scala
│ │ │ └── cassandra-table-export.yml
│ │ ├── custom
│ │ ├── CustomPartitioner.scala
│ │ ├── HashJoin.scala
│ │ ├── PairRDD.scala
│ │ ├── SemiStructuredUtilUDF.scala
│ │ ├── UDAF.scala
│ │ └── UDF.scala
│ │ ├── dataframes
│ │ ├── ComplexSchema.scala
│ │ ├── DataFrame_DropDuplicates.scala
│ │ ├── DatasetConversion.scala
│ │ ├── DateTime.scala
│ │ ├── DropColumns.scala
│ │ ├── GenerateUniqueId.scala
│ │ ├── GroupingAndAggregation.scala
│ │ ├── HDFSFilesList.scala
│ │ ├── HandlingNulls.scala
│ │ ├── PartitionBy.scala
│ │ ├── PartitionByColumn.scala
│ │ ├── PartitionBy_WithUDF.scala
│ │ ├── ProblemStatement.scala
│ │ └── RecordsCount.scala
│ │ ├── dataset
│ │ ├── ComplexType.scala
│ │ ├── DatasetBasic.scala
│ │ ├── SemiStructuredData.scala
│ │ └── WordCountDS.scala
│ │ ├── elasticsearch
│ │ ├── CsvToESLoad.scala
│ │ ├── ESDeleteByQuery.scala
│ │ ├── ESQuerying.scala
│ │ ├── Read_And_Delete_From_ES.scala
│ │ ├── Read_From_ES.scala
│ │ └── Write_To_ES.scala
│ │ ├── examples
│ │ ├── ExplodeDemo.scala
│ │ ├── Filter.scala
│ │ ├── FilterEmpty.scala
│ │ ├── LoadPropsFile.scala
│ │ ├── ParquetCompactor.scala
│ │ ├── Spark_Accumulator.scala
│ │ ├── Spark_CatalogAPI.scala
│ │ ├── Spark_To_Caasandra.scala
│ │ └── Test.scala
│ │ ├── fileformats
│ │ ├── AvroToJson.scala
│ │ ├── NestedJsonParser.scala
│ │ ├── Simple_XMLParser.scala
│ │ ├── Simple_XMLParser1.scala
│ │ ├── Spark_To_ObjectFile.scala
│ │ ├── Spark_To_SequenceFiles.scala
│ │ ├── ToParquet.scala
│ │ └── XMLParsing.scala
│ │ ├── hive
│ │ ├── AddHivePartitions.scala
│ │ ├── Save_As_Hive_Parquet.scala
│ │ ├── Save_To_Hive.scala
│ │ ├── Save_To_Hive_Partitioned_External_Table.scala
│ │ └── Spark_CatalogAPI.scala
│ │ ├── jdbc
│ │ └── Spark_To_Jdbc.scala
│ │ ├── mangodb
│ │ └── Spark_To_MangoDB.scala
│ │ ├── parquet
│ │ ├── FileCompression.scala
│ │ ├── ParquetCompactor.scala
│ │ └── TestDataFrame.scala
│ │ ├── problemstatement
│ │ ├── FireDepartmentCalls.scala
│ │ └── ProblemStatement.scala
│ │ ├── streaming
│ │ └── Spark_Kafka_Streaming.scala
│ │ └── window
│ │ └── functions
│ │ ├── ApStats.scala
│ │ ├── CPUTidSiteRollup.scala
│ │ └── SwitchCPUMemStats.scala
│ └── test
│ ├── resources
│ └── log4j.properties
│ └── scala
│ └── test
│ └── MetricsTest.scala
└── Spark-Zeppelin
├── FirstSparkCassandraApp.git.iml
├── README.md
├── Setup.md
├── StandAloneApp.md
├── Zeppelin.md
├── images
├── Interpreter.png
├── SetupImplicits.png
├── SparkOptions.png
└── makenote.png
└── notebooks
├── Spark Cassandra Challenges.json
└── Spark Cassandra Note.json
/Spark-1.5/src/main/resources/campaign.csv:
--------------------------------------------------------------------------------
1 | Week,Campaign Type,Campaign,Account,Branded vs. Unbranded,Category,Impressions,Clicks,Cost,Engagements,Patient Journey,Device,Indication,Country,Region,Metro Area
2 | 5/9/2016,SDTC,Sylvant,Google,Branded,Branded,1,0,0,,Adherence,"Computers,revanth",Multicentric Castleman's Disease (MCD),United States,Nevada,Las Vegas NV
3 | 5/9/2016,SDTC,Sylvant,Google,Branded,Branded,1,0,0,,Adherence,Computers,Multicentric Castleman's Disease (MCD),United States,Texas,El Paso TX
4 | 5/23/2016,SDTC,Sylvant,Google,Branded,Branded,1,0,0,,Adherence,Computers,Multicentric Castleman's Disease (MCD),United States,Pennsylvania,Wilkes Barre-Scranton PA
5 | 5/16/2016,SDTC,Sylvant,Google,Branded,Branded,3,0,0,,Adherence,Computers,Multicentric Castleman's Disease (MCD),United States,Idaho,Boise ID
6 | 5/23/2016,SDTC,Sylvant,Google,Branded,Branded,1,0,0,,Adherence,Computers,Multicentric Castleman's Disease (MCD),United States,Delaware,Philadelphia PA
7 | 5/9/2016,SDTC,Sylvant,Google,Branded,Branded,1,0,0,,Adherence,Computers,Multicentric Castleman's Disease (MCD),United States,Nevada,Las Vegas NV
8 | 5/9/2016,SDTC,Sylvant,Google,Branded,Branded,1,0,0,,Adherence,Computers,Multicentric Castleman's Disease (MCD),United States,Texas,El Paso TX
9 | 5/23/2016,SDTC,Sylvant,Google,Branded,Branded,1,0,0,,Adherence,Computers,Multicentric Castleman's Disease (MCD),United States,Pennsylvania,Wilkes Barre-Scranton PA
10 | 5/16/2016,SDTC,Sylvant,Google,Branded,Branded,3,0,0,,Adherence,Computers,Multicentric Castleman's Disease (MCD),United States,Idaho,Boise ID
11 | 5/23/2016,SDTC,Sylvant,Google,Branded,Branded,1,0,0,,Adherence,Computers,Multicentric Castleman's Disease (MCD),United States,Delaware,Philadelphia PA
12 | 5/30/2016,SDTC,Lymph Nodes,Bing,Unbranded,Condition,99,0,0,,Diagnosis,Smartphone,Multicentric Castleman's Disease (MCD),United States,Indiana,"Chicago, IL"
13 | 5/30/2016,SDTC,Lymph Nodes,Bing,Unbranded,Condition,99,0,0,,Diagnosis,Smartphone,Multicentric Castleman's Disease (MCD),United States,Indiana,"Chicago, IL"
14 |
--------------------------------------------------------------------------------
/Spark-1.5/src/main/resources/dialy_show_guests.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-1.5/src/main/resources/dialy_show_guests.txt
--------------------------------------------------------------------------------
/Spark-1.5/src/main/resources/ebay.csv:
--------------------------------------------------------------------------------
1 | auctionid,bid,bidtime,bidder,bidderrate,openbid,price,item
2 | 1,175,2.230949,schadenfreud,0,99,100,xbox
3 | 2,100,2.600116,chuik,0,99,101,iphone
4 | 3,120,2.60081,kiwisstuff,2,99,102,oneplus
5 | 4,150,2.601076,kiwisstuff,2,99,103,xiomi
6 | 5,177.5,2.909826,eli.flint,4,99,104,sanbox
7 | 6,1,0.355856,bfalconb,2,1,105,mobile
8 | 7,1.25,0.484757,sbord,1,1,106,mouse
9 | 8,1.5,0.492639,bfalconb,2,1,107,keyboard
10 | 9,25,0.49463,sbord,1,1,108,laptop
11 | 1,175,2.230949,schadenfreud,0,99,200,xbox
12 | 2,100,2.600116,chuik,0,99,201,iphone
13 | 3,120,2.60081,kiwisstuff,2,99,202,oneplus
14 | 4,150,2.601076,kiwisstuff,2,99,203,xiomi
15 | 5,177.5,2.909826,eli.flint,4,99,204,sanbox
16 | 6,1,0.355856,bfalconb,2,1,205,mobile
17 | 7,1.25,0.484757,sbord,1,1,206,mouse
18 | 8,1.5,0.492639,bfalconb,2,1,207,keyboard
19 | 9,25,0.49463,sbord,1,1,208,laptop
20 |
--------------------------------------------------------------------------------
/Spark-1.5/src/main/resources/emp.txt:
--------------------------------------------------------------------------------
1 | empid|name|Dept|salary|No of projects worked
2 | 328561|Revanth1|DPE|1000|5|
3 | 328562|Revanth2|DPE|2000|6|
4 | 328563|Revanth3|DPE|3000|3|
5 | 328564|Revanth4|DPE|4000|4|
6 | 328565|Revanth5|DPE|5000|6|
7 | 328566|Revanth6|DPE|6000|5|
8 |
9 | 328561|Revanth1|DPE|7000|1|
10 | 328562|Revanth2|DPE|18000|2|
11 | 328563|Revanth3|DPE|5000|4|
12 | 328564|Revanth4|DPE|3000|3|
13 | 328565|Revanth5|DPE|4000|5|
14 | 328566|Revanth6|DPE|7000|7|
15 |
--------------------------------------------------------------------------------
/Spark-1.5/src/main/resources/log4j_conf.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Spark-1.5/src/main/resources/olaCabData.txt:
--------------------------------------------------------------------------------
1 | dispatching_base_number,date,active_vehicles,trips
2 | B02512,3/21/2015,1,1
3 | B02513,3/21/2016,2,3
4 | B02764,3/22/2016,3,2
5 | B02512,3/21/2015,1,2
6 | B02513,3/21/2014,2,1
7 | B02764,3/22/2014,3,1
8 | B02512,3/21/2013,1,4
9 | B02513,3/21/2014,2,3
10 | B02764,3/22/2014,3,4
11 | B02764,3/22/2014,3,1
12 | B02764,3/22/2014,3,1
13 | B02764,3/22/2014,3,1
14 | B02682,2/26/2015,1465,13814
15 | B02512,2/26/2015,243,1797
16 | B02765,2/26/2015,745,6744
17 | B02764,2/26/2015,4101,36091
18 | B02765,2/27/2015,786,7563
19 | B02617,2/27/2015,1551,14677
20 | B02598,2/27/2015,1114,10755
21 | B02512,2/27/2015,272,2056
22 | B02764,2/27/2015,4253,38780
23 | B02682,2/27/2015,1510,14975
24 | B02598,2/28/2015,994,10319
25 | B02764,2/28/2015,3952,39812
26 | B02617,2/28/2015,1372,14022
27 | B02682,2/28/2015,1386,14472
28 | B02512,2/28/2015,230,1803
29 | B02765,2/28/2015,747,7753
--------------------------------------------------------------------------------
/Spark-1.5/src/main/resources/partitioner.txt:
--------------------------------------------------------------------------------
1 | venkat
2 | 1000
3 | suman
4 | 2000
5 | sachin
6 | 3000
7 | senthil
8 | 4000
9 | revanth
10 | hari
11 | ganesh
--------------------------------------------------------------------------------
/Spark-1.5/src/main/resources/person.txt:
--------------------------------------------------------------------------------
1 | revanth,reddy,26
2 | shyam,sunder,22
3 | kiran,kumar,24
4 | revanth,reddy,26
5 | shyam,sunder,22
6 | kiran,kumar,24
--------------------------------------------------------------------------------
/Spark-1.5/src/main/resources/sales.json:
--------------------------------------------------------------------------------
1 | {"transactionId":111,"customerId":1,"itemId": 1,"itemName": "xbox","amountPaid": 1400.0}
2 | {"transactionId":112,"customerId":2,"itemId": 2,"itemName": "Iphonr","amountPaid": 5035.0}
3 | {"transactionId":113,"customerId":3,"itemId": 3,"itemName": "OnePlus","amountPaid": 5310.0}
4 | {"transactionId":114,"customerId":4,"itemId": 4,"itemName": "HP","amountPaid": 6000.0}
5 | {"transactionId":115,"customerId":1,"itemId": 2,"itemName": "Dell","amountPaid": 5010.0}
6 | {"transactionId":116,"customerId":1,"itemId": 2,"itemName": "Letv","amountPaid": 5020.0}
7 | {"transactionId":117,"customerId":1,"itemId": 2,"itemName": "Reuters","amountPaid": 500.0}
8 | {"transactionId":118,"customerId":1,"itemId": 2,"itemName": "Wipro","amountPaid": 5400.0}
9 | {"transactionId":119,"customerId":2,"itemId": 3,"itemName": "Thomson","amountPaid": 5010.0}
10 | {"transactionId":120,"customerId":1,"itemId": 2,"itemName": "HTC","amountPaid": 5020.0}
11 | {"transactionId":121,"customerId":1,"itemId": 4,"itemName": "Nokia","amountPaid": 5034.0}
12 | {"transactionId":122,"customerId":1,"itemId": 2,"itemName": "Ericson","amountPaid": 5300.0}
13 | {"transactionId":123,"customerId":1,"itemId": 4,"itemName": "Samsung","amountPaid": 5050.0}
14 | {"transactionId":124,"customerId":1,"itemId": 2,"itemName": "Panasonic","amountPaid": 5060.0}
--------------------------------------------------------------------------------
/Spark-1.5/src/main/resources/sensoranalytics.txt:
--------------------------------------------------------------------------------
1 | 2015/10/09 12:00:00.188 ,India,TN,Chennai,Success
2 | 2015/10/09 13:00:00.189 ,India,TN,Chennai,Failure
3 | 2015/10/09 15:00:00.233 ,India,TN,Chennai,Success
4 | 2015/10/09 16:00:00.268 ,India,TN,Chennai,Failure
5 | 2015/10/09 07:00:00.449 ,US,Washington,Seattle,Failure
6 | 2015/10/09 01:00:00.449 ,US,Washington,Seattle,Success
7 | 2015/10/09 04:00:00.449 ,US,Washington,Seattle,Failure
8 | 2015/10/09 05:00:00.449 ,US,Washington,Seattle,Success
9 | 2016/03/07 02:00:00.010 ,India,Karnataka,Banglore,Success
10 | 2016/03/07 07:00:00.053 ,India,Karnataka,Banglore,Failure
11 | 2016/03/07 02:00:00.010 ,India,Karnataka,Banglore,Success
12 | 2016/03/07 07:00:00.053 ,India,Karnataka,Banglore,Failure
13 | 2016/03/25 02:00:00.010 ,India,Karnataka,Banglore,Success
14 | 2016/03/25 07:00:00.053 ,India,Karnataka,Banglore,Failure
15 | 2016/03/25 02:00:00.010 ,India,Karnataka,Banglore,Success
16 | 2016/03/25 07:00:00.053 ,India,Karnataka,Banglore,Failure
17 | 2016/03/26 02:00:00.010 ,India,Karnataka,Banglore,Success
18 | 2016/03/26 07:00:00.053 ,India,Karnataka,Banglore,Failure
19 | 2016/03/26 02:00:00.010 ,India,Karnataka,Banglore,Success
20 | 2016/03/26 07:00:00.053 ,India,Karnataka,Banglore,Failure
--------------------------------------------------------------------------------
/Spark-1.5/src/main/resources/users.txt:
--------------------------------------------------------------------------------
1 | 1201,satish,25
2 | 1202,krishna,28
3 | 1203,amith,39
4 | 1204,javed,23
5 | 1205,prudvi,23
--------------------------------------------------------------------------------
/Spark-1.5/src/main/resources/wordcount.txt:
--------------------------------------------------------------------------------
1 | Please note that here we have just defined RDD, data is not loaded still.
2 | This means that if you go to access the data in this RDD it could fail.
3 | The computation to create the data in an RDD is only done when the data is referenced;
4 | for example, it is created by caching or writing out the RDD
--------------------------------------------------------------------------------
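The note in wordcount.txt above describes RDD lazy evaluation: transformations such as flatMap and map only record a lineage, and the input file is not read until an action runs. A minimal sketch of that behaviour (the local master and resource path are assumptions for illustration, not part of the repository):

import org.apache.spark.{SparkConf, SparkContext}

object LazyWordCountSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("Lazy-WordCount-Sketch").setMaster("local[1]"))
    // Only the lineage is defined here; wordcount.txt is not loaded yet.
    val counts = sc.textFile("src/main/resources/wordcount.txt") // hypothetical path
      .flatMap(_.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
    // The action below triggers reading the file and computing the counts.
    counts.collect().foreach(println)
    sc.stop()
  }
}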
/Spark-1.5/src/main/scala/com/spark/cassandra/CassandraCQL.scala:
--------------------------------------------------------------------------------
1 | package com.spark.cassandra
2 | import org.apache.spark.{ SparkContext, SparkConf }
3 | import com.datastax.spark.connector._
4 | import org.apache.spark._
5 | import java.util.UUID
6 | import org.apache.spark.sql.cassandra.CassandraSQLContext
7 | import org.apache.spark.sql._
8 | object CassandraCQL extends App {
9 |
10 | case class Emp(Id: Int, name: String, salary: String)
11 |
12 | val conf = new SparkConf(true).set("spark.cassandra.connection.host", "127.0.0.1").setAppName("CassandraCQL").setMaster("local[2]")
13 | val sc = new SparkContext(conf)
14 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
15 |
16 | //implementation using cassandra sql context
17 | val cc = new CassandraSQLContext(sc)
18 | val rdd = cc.sql("SELECT id,name,salary FROM spark_kafka_cassandra.employee where name like 'HARI%'")
19 |
20 | rdd.collect().foreach(println)
21 |
22 | /* //implementation using cassandra table converting to df
23 | val user_table = sc.cassandraTable("tutorial", "user")
24 |
25 | val df = sqlContext
26 | .read
27 | .format("org.apache.spark.sql.cassandra")
28 | .options(Map("table" -> "user", "keyspace" -> "tutorial"))
29 | .load()
30 |
31 | df.registerTempTable("user")
32 | val results = sqlContext.sql("SELECT empname,sum(empsal),sum(projno) FROM tutorial.user GROUP BY empid,empname,deptno")
33 | //results.collect().foreach(println)
34 | */ sc.stop
35 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/cassandra/Cassandra_SparkStreaming.scala:
--------------------------------------------------------------------------------
1 | package com.spark.cassandra
2 | import org.apache.spark._
3 | import org.apache.spark.streaming._
4 | import com.datastax.spark.connector.streaming._
5 | import org.apache.spark.rdd._
6 | import org.apache.spark.streaming.dstream.ConstantInputDStream
7 |
8 | //Reading from Cassandra using Spark Streaming
9 | object Cassandra_SparkStreaming {
10 | case class Employee(Id: Int, name: String, salary: Int)
11 |
12 | def main(args: Array[String]) {
13 | val KEY_SPACE_NAME = "spark_kafka_cassandra"
14 | val TABLE_NAME = "employee"
15 | val conf = new SparkConf().setAppName("Cassandra_SparkStreaming").set("spark.cassandra.connection.host", "127.0.0.1")
16 |
17 | val ssc = new StreamingContext(conf, Seconds(10))
18 |
19 | val cassandraRDD = ssc.cassandraTable[Employee](KEY_SPACE_NAME, TABLE_NAME).select("id", "name", "salary")
20 |
21 | val dstream = new ConstantInputDStream(ssc, cassandraRDD)
22 |
23 | dstream.foreachRDD { rdd =>
24 | println("Total Records cont in DB : " + rdd.count)
25 | println(rdd.collect.mkString("\n"))
26 | }
27 |
28 | ssc.start()
29 | ssc.awaitTermination()
30 | }
31 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/cassandra/KafkaConsumerToCassandra.scala:
--------------------------------------------------------------------------------
1 | package com.spark.cassandra
2 |
3 | import scala.reflect.runtime.universe
4 |
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.SparkContext
7 | import org.apache.spark.streaming.Minutes
8 | import org.apache.spark.streaming.StreamingContext
9 | import org.apache.spark.streaming.kafka.KafkaUtils
10 | import com.datastax.driver.core.Cluster
11 | import com.datastax.spark.connector.SomeColumns
12 | import com.datastax.spark.connector.toNamedColumnRef
13 | import com.datastax.spark.connector.toRDDFunctions
14 |
15 | import kafka.serializer.StringDecoder
16 | object KafkaConsumerToCassandra {
17 | val SLIDE_INTERVAL = 1
18 | def startStreaming(args: Array[String]): Unit = {
19 | try {
20 | val Array(zkQuorum, topics) = args
21 | val sc = new SparkContext(new SparkConf().setAppName("Spark-Kafka-Streaming").setMaster("local[2]").set("spark.cassandra.connection.host", "127.0.0.1"))
22 | val ssc = new StreamingContext(sc, Minutes(SLIDE_INTERVAL))
23 | val topicsSet = topics.split(",").toSet
24 | val kafkaParams = Map[String, String]("metadata.broker.list" -> zkQuorum)
25 | val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
26 | ssc, kafkaParams, topicsSet).map(_._2).map(line => line.split('|'))
27 |
28 | val cluster = Cluster.builder().addContactPoint("127.0.0.1").build()
29 | //Creating Session object
30 | val session = cluster.connect()
31 | session.execute("CREATE KEYSPACE IF NOT EXISTS spark_kafka_cassandra WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 };");
32 | val query = "CREATE TABLE IF NOT EXISTS spark_kafka_cassandra.employee (id int PRIMARY KEY,name VARCHAR, salary int);"
33 | //Executing the query
34 | session.execute(query)
35 |
36 | messages.foreachRDD(
37 | rdd => {
38 | if (!rdd.isEmpty()) {
39 | println(rdd.first())
40 | println("rdd count " + rdd.count())
41 | val resRDD = rdd.map(line => (line(0), line(1), line(2)))
42 | .saveToCassandra("spark_kafka_cassandra", "employee", SomeColumns("id", "name", "salary"))
43 | } else {
44 | println("Data is not yet recevied from the producer....")
45 | }
46 | })
47 | ssc.start()
48 | ssc.awaitTermination()
49 | } catch {
50 | case ex: Exception => {
51 | println(ex.getMessage)
52 | }
53 | }
54 | }
55 |
56 | def main(args: Array[String]) {
57 | /*if (args.length < 2) {
58 | System.err.println("Usage: KafkaConsumer <zkQuorum> <topics>")
59 | System.exit(1)
60 | }*/
61 | startStreaming(args)
62 | }
63 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/cassandra/Spark_Cassandra_Delete.scala:
--------------------------------------------------------------------------------
1 | package com.spark.cassandra
2 |
3 | import scala.reflect.runtime.universe
4 | import org.apache.spark.SparkConf
5 | import org.apache.spark.SparkContext
6 | import com.datastax.driver.core.BoundStatement
7 | import com.datastax.driver.core.Cluster
8 | import com.datastax.spark.connector.toSparkContextFunctions
9 | import org.apache.log4j.Logger
10 | import org.apache.log4j.Level
11 | object Spark_Cassandra_Delete {
12 | case class Employee(Id: Int, name: String, salary: Int)
13 | def main(args: Array[String]) {
14 | Logger.getLogger("org").setLevel(Level.WARN)
15 | val CASSANDRA_HOST = "127.0.0.1"
16 | val conf = new SparkConf(true).set("spark.cassandra.connection.host", CASSANDRA_HOST).setAppName("Spark-Cassandra-Delete").setMaster("local[2]")
17 | val sc = new SparkContext(conf)
18 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
19 | val KEY_SPACE_NAME = "spark_kafka_cassandra"
20 | val TABLE_NAME = "employee"
21 |
22 | //Creating Cluster object
23 | val cluster = Cluster.builder().addContactPoint(CASSANDRA_HOST).build()
24 | //Creating Session object
25 | val session = cluster.connect()
26 |
27 | try {
28 | val deleteQuery = " delete from " + KEY_SPACE_NAME + "." + TABLE_NAME + " WHERE id = 102 "
29 | val deletequeryprepared = session.prepare(deleteQuery)
30 | val deleteBoundStatement = new BoundStatement(deletequeryprepared)
31 | session.execute(deleteBoundStatement)
32 |
33 | //Displaying the records
34 | val rows = sc.cassandraTable[Employee](KEY_SPACE_NAME, TABLE_NAME)
35 | rows.toArray.foreach(println)
36 |
37 | } catch {
38 | case e: Exception =>
39 | println(e)
40 | } finally {
41 | session.close()
42 | cluster.close()
43 | sc.stop()
44 | }
45 |
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/customudf/CustomMean.scala:
--------------------------------------------------------------------------------
1 | package com.spark.customudf
2 | import org.apache.spark.sql.Row
3 | import org.apache.spark.sql.expressions.{ MutableAggregationBuffer, UserDefinedAggregateFunction }
4 | import org.apache.spark.sql.types._
5 |
6 | //Extend UserDefinedAggregateFunction to write custom aggregate function
7 | //You can also specify any constructor arguments. For instance you
8 | //can have CustomMean(arg1: Int, arg2: String)
9 | class CustomMean() extends UserDefinedAggregateFunction {
10 |
11 | // Input Data Type Schema
12 | def inputSchema: StructType = StructType(Array(StructField("item", DoubleType)))
13 |
14 | // Intermediate Schema
15 | def bufferSchema = StructType(Array(
16 | StructField("sum", DoubleType),
17 | StructField("cnt", LongType)))
18 |
19 | // Returned Data Type .
20 | def dataType: DataType = DoubleType
21 |
22 | // Self-explaining
23 | def deterministic = true
24 |
25 | // This function is called whenever key changes
26 | def initialize(buffer: MutableAggregationBuffer) = {
27 | buffer(0) = 0.toDouble // set sum to zero
28 | buffer(1) = 0L // set number of items to 0
29 | }
30 |
31 | // Iterate over each entry of a group
32 | def update(buffer: MutableAggregationBuffer, input: Row) = {
33 | buffer(0) = buffer.getDouble(0) + input.getDouble(0)
34 | buffer(1) = buffer.getLong(1) + 1
35 | }
36 |
37 | // Merge two partial aggregates
38 | def merge(buffer1: MutableAggregationBuffer, buffer2: Row) = {
39 | buffer1(0) = buffer1.getDouble(0) + buffer2.getDouble(0)
40 | buffer1(1) = buffer1.getLong(1) + buffer2.getLong(1)
41 | }
42 |
43 | // Called after all the entries are exhausted.
44 | def evaluate(buffer: Row) = {
45 | buffer.getDouble(0) / buffer.getLong(1).toDouble
46 | }
47 |
48 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/customudf/CustomUDAF.scala:
--------------------------------------------------------------------------------
1 | package com.spark.customudf
2 |
3 | import org.apache.spark.sql.Row
4 | import org.apache.spark.sql.types.{ DoubleType, StringType, StructField, StructType }
5 | import org.apache.spark.{ SparkConf, SparkContext }
6 | import org.apache.spark.sql.functions._
7 |
8 | object CustomUDAF {
9 | val sparkConf = new SparkConf().setAppName("Spark-CustomUDAF").setMaster("local[1]")//.set("spark.sql.warehouse.dir", "file:///D:/Spark-WorkSpace/Spark-Windows/spark-warehouse")
10 | val sc = new SparkContext(sparkConf)
11 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
12 | def main(args: Array[String]) {
13 | // define UDAF
14 | val customMean = new CustomMean()
15 |
16 | // create test dataset
17 | val data = (1 to 100).map { x: Int =>
18 | x match {
19 | case t if t <= 50 => Row("A", t.toDouble)
20 | case t => Row("B", t.toDouble)
21 | }
22 | }
23 |
24 | // create schema of the test dataset
25 | val schema = StructType(Array(
26 | StructField("key", StringType),
27 | StructField("value", DoubleType)))
28 |
29 | // construct data frame
30 | val rdd = sc.parallelize(data)
31 | val df = sqlContext.createDataFrame(rdd, schema)
32 |
33 | // Calculate average value for each group
34 | df.groupBy("key").agg(
35 | customMean(df.col("value")).as("custom_mean"),
36 | avg("value").as("avg")).show()
37 | }
38 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/customudf/CustomUDF.scala:
--------------------------------------------------------------------------------
1 | package com.spark.customudf
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.SparkContext
5 |
6 | object CustomUDF {
7 | val sparkConf = new SparkConf().setAppName("Spark-CustomUDF").setMaster("local[1]")//.set("spark.sql.warehouse.dir", "file:///D:/Spark-WorkSpace/Spark-Windows/spark-warehouse")
8 | val sc = new SparkContext(sparkConf)
9 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
10 | def main(args: Array[String]) {
11 |
12 | // Construct Dummy Data
13 | import util.Random
14 | import org.apache.spark.sql.Row
15 | implicit class Crossable[X](xs: Traversable[X]) {
16 | def cross[Y](ys: Traversable[Y]) = for { x <- xs; y <- ys } yield (x, y)
17 | }
18 |
19 | val students = Seq("John", "Mike", "Matt")
20 | val subjects = Seq("Math", "Sci", "Geography", "History")
21 | val random = new Random(1)
22 | val data = (students cross subjects).map { x => Row(x._1, x._2, random.nextInt(100)) }.toSeq
23 |
24 | data.foreach { x => println(x)}
25 |
26 | // Create Schema Object
27 | import org.apache.spark.sql.types.{ StructType, StructField, IntegerType, StringType }
28 | val schema = StructType(Array(
29 | StructField("student", StringType, nullable = false),
30 | StructField("subject", StringType, nullable = false),
31 | StructField("score", IntegerType, nullable = false)))
32 |
33 | // Create DataFrame
34 | val rdd = sc.parallelize(data)
35 | val df = sqlContext.createDataFrame(rdd, schema)
36 | // Define udf
37 | import org.apache.spark.sql.functions.udf
38 | def udfScoreToCategory = udf((score: Int) => {
39 | score match {
40 | case t if t >= 80 => "A"
41 | case t if t >= 60 => "B"
42 | case t if t >= 35 => "C"
43 | case _ => "D"
44 | }
45 | })
46 | df.withColumn("category", udfScoreToCategory(df("score"))).show(10)
47 | }
48 |
49 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/customudf/SparkUDF.scala:
--------------------------------------------------------------------------------
1 | package com.spark.customudf
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.SparkContext
5 |
6 | object SparkUDF {
7 | case class Purchase(customer_id: Int, purchase_id: Int, date: String, time: String, tz: String, amount: Double)
8 |
9 | def main(args: Array[String]) {
10 |
11 | val sc = new SparkContext(new SparkConf().setAppName("Spark-custom-UDF").setMaster("local[1]"))
12 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
13 | import sqlContext.implicits._
14 |
15 | val x = sc.parallelize(Array(
16 | Purchase(123, 234, "2007-12-12", "20:50", "UTC", 500.99),
17 | Purchase(123, 247, "2007-12-12", "15:30", "PST", 300.22),
18 | Purchase(189, 254, "2007-12-13", "00:50", "EST", 122.19),
19 | Purchase(187, 299, "2007-12-12", "07:30", "UTC", 524.37)))
20 |
21 | val df = sqlContext.createDataFrame(x)
22 | df.registerTempTable("df")
23 |
24 | def makeDT(date: String, time: String, tz: String) = s"$date $time $tz"
25 | sqlContext.udf.register("makeDt", makeDT(_: String, _: String, _: String))
26 |
27 | // Now we can use our function directly in SparkSQL.
28 | val res = sqlContext.sql("SELECT amount, makeDt(date, time, tz) from df").take(2)
29 | res.foreach { x => print(x) }
30 |
31 | // but outside SQL it fails
32 | // df.select($"customer_id", makeDt($"date", $"time", $"tz"), $"amount").take(2)
33 |
34 | //You can see above that we can use it within SQL but not outside of it.
35 | //To do that we're going to have to create a different UDF using org.apache.spark.sql.functions.udf
36 |
37 | import org.apache.spark.sql.functions.udf
38 | val makeDt = udf(makeDT(_: String, _: String, _: String))
39 | // now this works
40 | df.select($"customer_id", makeDt($"date", $"time", $"tz"), $"amount").take(2).foreach { x => print(x) }
41 |
42 | // In Spark version 1.5, functions to create date times were introduced.
43 | // We can leave our function the same; we just define a format and wrap our makeDT
44 | // function in the unix_timestamp function call, and we can do this both in and out of SparkSQL!
45 |
46 | import org.apache.spark.sql.functions.unix_timestamp
47 |
48 | val fmt = "yyyy-MM-dd hh:mm z"
49 | df.select($"customer_id", unix_timestamp(makeDt($"date", $"time", $"tz"), fmt), $"amount").take(2).foreach { x => print(x) }
50 |
51 | sqlContext.sql(s"SELECT customer_id, unix_timestamp(makeDt(date, time, tz), '$fmt'), amount FROM df").take(2).foreach { x => print(x) }
52 | }
53 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/CustomPartitioner.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.Partitioner
5 |
6 | object CustomPartitioner {
7 |
8 | def main(args: Array[String]) {
9 |
10 | val sparkConf = new SparkConf().setAppName("Spark-Custom-Partitioner").setMaster("local[1]")
11 | val sc = new SparkContext(sparkConf)
12 | val inputFile = sc.textFile("E:/Software/Spark/data/partitioner.txt")
13 |
14 | //create paired RDD
15 | val pairedData = inputFile.flatMap(x => x.split(" ")).map(x => (x, 1))
16 |
17 | //Define custom partitioner for paired RDD
18 | val partitionedData = pairedData.partitionBy(new MyCustomerPartitioner(2)).map(f => f._1)
19 |
20 | //verify result using mapPartitionWithIndex
21 | val finalOut = partitionedData.mapPartitionsWithIndex {
22 | (partitionIndex, dataIterator) => dataIterator.map(dataInfo => (dataInfo + " is located in " + partitionIndex + " partition."))
23 | }
24 | //Save Output in HDFS
25 | finalOut.saveAsTextFile("E:/Software/Spark/data/partitionOutput")
26 |
27 | }
28 | }
29 | class MyCustomerPartitioner(numParts: Int) extends Partitioner {
30 | override def numPartitions: Int = numParts
31 |
32 | override def getPartition(key: Any): Int =
33 | {
34 | val out = toInt(key.toString)
35 | out
36 | }
37 |
38 | override def equals(other: Any): Boolean = other match {
39 | case dnp: MyCustomerPartitioner =>
40 | dnp.numPartitions == numPartitions
41 | case _ =>
42 | false
43 | }
44 |
45 | def toInt(s: String): Int =
46 | {
47 | try {
48 | s.toInt // key parses as a number
49 | 0 // numeric keys (e.g. salaries) go to partition 0
50 | } catch {
51 | case e: Exception => 1 // non-numeric keys (e.g. names) go to partition 1
52 |
53 | }
54 | }
55 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/CustomReceiver.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 |
3 | import java.io.{ BufferedReader, InputStreamReader }
4 | import java.net.Socket
5 | import java.nio.charset.StandardCharsets
6 |
7 | import org.apache.spark.SparkConf
8 | import org.apache.spark.storage.StorageLevel
9 | import org.apache.spark.streaming.{ Seconds, StreamingContext }
10 | import org.apache.spark.streaming.receiver.Receiver
11 |
12 | object CustomReceiver {
13 | def main(args: Array[String]) {
14 | if (args.length < 2) {
15 | System.err.println("Usage: CustomReceiver <hostname> <port>")
16 | System.exit(1)
17 | }
18 |
19 | // Create the context with a 10 second batch size
20 | val sparkConf = new SparkConf().setAppName("Spark-CustomReceiver")
21 | val ssc = new StreamingContext(sparkConf, Seconds(10))
22 |
23 | // Create an input stream with the custom receiver on target ip:port and count the
24 | // words in input stream of \n delimited text (eg. generated by 'nc')
25 | val lines = ssc.receiverStream(new CustomReceiver(args(0), args(1).toInt))
26 |
27 |
28 | val words = lines.flatMap(_.split(" "))
29 | val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
30 | wordCounts.print()
31 | ssc.start()
32 | ssc.awaitTermination()
33 | }
34 | }
35 |
36 | class CustomReceiver(host: String, port: Int)
37 | extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) {
38 |
39 | def onStart() {
40 | // Start the thread that receives data over a connection
41 | new Thread("Socket Receiver") {
42 | override def run() { receive() }
43 | }.start()
44 | }
45 |
46 | def onStop() {
47 | // There is nothing much to do as the thread calling receive()
48 | // is designed to stop by itself once isStopped() returns true
49 | }
50 |
51 | /** Create a socket connection and receive data until receiver is stopped */
52 | private def receive() {
53 | var socket: Socket = null
54 | var userInput: String = null
55 | try {
56 | socket = new Socket(host, port)
57 | val reader = new BufferedReader(
58 | new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8))
59 | userInput = reader.readLine()
60 | while (!isStopped && userInput != null) {
61 | store(userInput)
62 | userInput = reader.readLine()
63 | }
64 | reader.close()
65 | socket.close()
66 | println("Stopped receiving")
67 | restart("Trying to connect again")
68 | } catch {
69 | case e: java.net.ConnectException =>
70 | restart("Error connecting to " + host + ":" + port, e)
71 | case t: Throwable =>
72 | restart("Error receiving data", t)
73 | }
74 | }
75 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/DataFrame.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql.functions._
5 | import com.spark.util._
6 | object DataFrame {
7 | case class Employee(empid: Int, name: String, dept: String, salary: Int, nop: Int)
8 | case class AggregatedEmpData(empid: Int, name: String, dept: String, sumsalary: Long, sumnop: Long, maxsalary: Int, date: String)
9 | def main(args: Array[String]) {
10 | val conf = new SparkConf().setAppName("Spark-DataFrame").setMaster("local[1]")
11 | val sc = new SparkContext(conf)
12 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
13 | import sqlContext.implicits._
14 |
15 | val empDataRDD = sc.textFile(Utills.DATA_PATH +"emp.txt") //path to emp.txt
16 | val dropHeaderRDD = empDataRDD.mapPartitions(_.drop(1)) //remove the header information from the file
17 |
18 | /*println(dropHeaderRDD.first())
19 |
20 | val df = empDataRDD.toDF("line")
21 | val errors = df.filter(col("line").like("%Revanth1%"))
22 | println(errors.count())*/
23 |
24 | val empDF = dropHeaderRDD.filter { lines => lines.length() > 0 }.
25 | map(_.split("\\|")).
26 | map(p => Employee(p(0).trim.toInt, p(1), p(2), p(3).trim.toInt, p(4).trim.toInt)).toDF()
27 |
28 | empDF.show()
29 | /*val whereCond = "dept = 'DPE' and salary > 1000 or dept = 'MGF' and salary > 5000"
30 | val res = empDF.select("empid", "name", "salary", "dept").where(whereCond)
31 | res.show()*/
32 |
33 | //Spark Aggregations
34 | val aggDF = empDF.groupBy("empid", "name", "dept").
35 | agg(sum(empDF.col("salary")), sum(empDF.col("nop")), max(empDF.col("salary")))
36 | aggDF.printSchema()
37 |
38 | //Adding extra column at the end ..
39 | val finalDF = aggDF.map(row => AggregatedEmpData(row.getInt(0), row.getString(1), row.getString(2), row.getLong(3), row.getLong(4), row.getInt(5), Utills.getTime()))
40 | println(finalDF.first())
41 |
42 | //Saving data as text file
43 | aggDF.rdd.coalesce(1, false).saveAsTextFile("F:/Software/Spark/data/aggData/" + Utills.getTime())
44 |
45 | empDF.groupBy("empid").agg(max(empDF.col("salary"))).show()
46 | empDF.select(max($"salary")).show()
47 |
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/KafkaConsumer.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.streaming.Seconds
5 | import org.apache.spark.streaming.StreamingContext
6 | import org.apache.spark.streaming.dstream.InputDStream
7 | import org.apache.spark.streaming.kafka.KafkaUtils
8 | import java.util.Properties
9 | import kafka.producer.KeyedMessage
10 | import kafka.producer.Producer
11 | import kafka.producer.ProducerConfig
12 | import kafka.serializer.StringDecoder
13 | object KafkaConsumer {
14 |
15 | def main(args: Array[String]) {
16 | try {
17 | val Array(brokerList, topics) = args
18 | val sc = new SparkContext(new SparkConf().setAppName("KafkaConsumer-Streaming").setMaster("local[2]"))
19 | val ssc = new StreamingContext(sc, Seconds(5))
20 | val topicsSet = topics.split(",").toSet
21 | val kafkaParams = Map[String, String]("metadata.broker.list" -> brokerList)
22 |
23 | val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
24 | ssc, kafkaParams, topicsSet).map(_._2)
25 |
26 | messages.foreachRDD(x => {
27 | if (!x.isEmpty()) {
28 | x.foreach { x => println(x) }
29 | println("--------------------------------------------------------")
30 | println(x.first())
31 | } else {
32 | println("Data is not received from the producer")
33 | }
34 | })
35 | ssc.start()
36 | ssc.awaitTermination()
37 |
38 | } catch {
39 | case ex: Exception => {
40 | ex.printStackTrace()
41 | }
42 | }
43 | }
44 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/KafkaConsumerToCassandra.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 |
3 | import scala.reflect.runtime.universe
4 |
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.SparkContext
7 | import org.apache.spark.streaming.Minutes
8 | import org.apache.spark.streaming.StreamingContext
9 | import org.apache.spark.streaming.kafka.KafkaUtils
10 | import com.datastax.driver.core.Cluster
11 | import com.datastax.spark.connector.SomeColumns
12 | import com.datastax.spark.connector.toNamedColumnRef
13 | import com.datastax.spark.connector.toRDDFunctions
14 |
15 | import kafka.serializer.StringDecoder
16 | object KafkaConsumerToCassandra {
17 | val SLIDE_INTERVAL = 1
18 | def startStreaming(args: Array[String]): Unit = {
19 | try {
20 | val Array(zkQuorum, topics) = args
21 | val sc = new SparkContext(new SparkConf().setAppName("Spark-Kafka-Streaming").setMaster("local[2]").set("spark.cassandra.connection.host", "127.0.0.1"))
22 | val ssc = new StreamingContext(sc, Minutes(SLIDE_INTERVAL))
23 | val topicsSet = topics.split(",").toSet
24 | val kafkaParams = Map[String, String]("metadata.broker.list" -> zkQuorum)
25 | val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
26 | ssc, kafkaParams, topicsSet).map(_._2).map(line => line.split('|'))
27 |
28 | val cluster = Cluster.builder().addContactPoint("127.0.0.1").build()
29 | //Creating Session object
30 | val session = cluster.connect()
31 | session.execute("CREATE KEYSPACE IF NOT EXISTS spark_kafka_cassandra WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 };");
32 | val query = "CREATE TABLE IF NOT EXISTS spark_kafka_cassandra.employee (id int PRIMARY KEY,name VARCHAR, salary int);"
33 | //Executing the query
34 | session.execute(query)
35 |
36 | messages.foreachRDD(
37 | rdd => {
38 | if (!rdd.isEmpty()) {
39 | println(rdd.first())
40 | println("rdd count " + rdd.count())
41 | val resRDD = rdd.map(line => (line(0), line(1), line(2)))
42 | .saveToCassandra("spark_kafka_cassandra", "employee", SomeColumns("id", "name", "salary"))
43 | } else {
44 | println("Data is not yet recevied from the producer....")
45 | }
46 | })
47 | ssc.start()
48 | ssc.awaitTermination()
49 | } catch {
50 | case ex: Exception => {
51 | println(ex.getMessage)
52 | }
53 | }
54 | }
55 |
56 | def main(args: Array[String]) {
57 | /*if (args.length < 2) {
58 | System.err.println("Usage: KafkaConsumer <zkQuorum> <topics>")
59 | System.exit(1)
60 | }*/
61 | startStreaming(args)
62 | }
63 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/KafkaConsumerToHDFS.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 |
3 | import org.apache.hadoop.conf.Configuration
4 | import org.apache.hadoop.fs.FileSystem
5 | import org.apache.hadoop.fs.Path
6 | import org.apache.spark.SparkConf
7 | import org.apache.spark.SparkContext
8 | import org.apache.spark.streaming.Minutes
9 | import org.apache.spark.streaming.StreamingContext
10 | import org.apache.spark.streaming.kafka.KafkaUtils
11 | import com.spark.util._
12 | import kafka.serializer.StringDecoder
13 | object KafkaConsumerToHDFS {
14 | private val conf = new Configuration()
15 | val fs = FileSystem.get(conf)
16 | private val hdfsCoreSitePath = new Path("/home/centos/hadoop-2.6.0/etc/hadoop/core-site.xml")
17 | conf.addResource(hdfsCoreSitePath)
18 | val uri = conf.get("fs.default.name")
19 | val SLIDE_INTERVAL = 1
20 | def startStreaming(args: Array[String]): Unit = {
21 | try {
22 | val Array(zkQuorum, topics) = args
23 | val sc = new SparkContext(new SparkConf().setAppName("Spark-Kafka-Streaming").setMaster("local[2]"))
24 | val ssc = new StreamingContext(sc, Minutes(SLIDE_INTERVAL))
25 | val topicsSet = topics.split(",").toSet
26 | val kafkaParams = Map[String, String]("metadata.broker.list" -> zkQuorum)
27 | val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
28 | ssc, kafkaParams, topicsSet).map(_._2)
29 | println("Messages.count()" + messages.count())
30 | messages.foreachRDD(
31 | rdd => {
32 | if (!rdd.isEmpty()) {
33 | println(rdd.first())
34 | println("rdd count " + rdd.count())
35 | println("URI = " + uri)
36 | val hdfsPath = uri + "/user/data/" + Utills.getTime()
37 | println("HDFS Path = " + hdfsPath)
38 | rdd.saveAsTextFile(hdfsPath)
39 | } else {
40 | println("Data is not yet recevied from the producer....")
41 | }
42 | })
43 | ssc.start()
44 | ssc.awaitTermination()
45 | } catch {
46 | case ex: Exception => {
47 | println(ex.getMessage)
48 | }
49 | }
50 | }
51 |
52 | def main(args: Array[String]) {
53 | /*if (args.length < 2) {
54 | System.err.println("Usage: KafkaConsumer <zkQuorum> <topics>")
55 | System.exit(1)
56 | }*/
57 | startStreaming(args)
58 | }
59 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/KafkaProducer.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 | import java.util.Properties
3 |
4 | import org.apache.spark.SparkConf
5 | import org.apache.spark.SparkContext
6 |
7 | import kafka.producer.KeyedMessage
8 | import kafka.producer.Producer
9 | import kafka.producer.ProducerConfig
10 | import scala.util.Random
11 | object KafkaProducer {
12 | def main(args: Array[String]) {
13 | val conf = new SparkConf().setAppName("Spark-Kafka-Producer").setMaster("local[1]")
14 | val sc = new SparkContext(conf)
15 | val Array(zkQuorum, topic) = args
16 | val props: Properties = new Properties()
17 | // props.put("metadata.broker.list", "10.220.11.171:9092")
18 | props.put("metadata.broker.list", zkQuorum)
19 | props.put("serializer.class", "kafka.serializer.StringEncoder")
20 |
21 | val config = new ProducerConfig(props)
22 | val producer = new Producer[String, String](config)
23 | var events = 0;
24 | var totalEvents = 10;
25 | // for loop execution with a range
26 | for (index <- 1 to totalEvents) {
27 | val salary = Random.nextInt(500000);
28 | val empId = Random.nextInt(1000);
29 | val empName = "Revanth-" + empId
30 | val msg = empId + "|" + empName + "|" + salary;
31 | producer.send(new KeyedMessage[String, String](topic, msg))
32 | }
33 | }
34 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/MovingAverage.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql.expressions.Window
5 | import org.apache.spark.sql.functions._
6 |
7 | object MovingAverage {
8 |
9 | def main(args: Array[String]) {
10 |
11 | val sc = new SparkContext(new SparkConf().setAppName("Spark-MovingAverage").setMaster("local[1]")
12 | //.set("spark.sql.warehouse.dir", "file:///E:/MyStuff/HadoopProj/Scala/WorkSpace/Spark/spark-warehouse")
13 | )
14 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
15 | import sqlContext.implicits._
16 | val customers = sc.parallelize(List(
17 | ("Alice", "2016-05-01", 50.00),
18 | ("Alice", "2016-05-03", 45.00),
19 | ("Alice", "2016-05-04", 55.00),
20 | ("Bob", "2016-05-01", 25.00),
21 | ("Bob", "2016-05-04", 29.00),
22 | ("Bob", "2016-05-06", 27.00))).
23 | toDF("name", "date", "amountSpent")
24 |
25 | // Create a window spec.
26 | val wSpec1 = Window.partitionBy("name").orderBy("date").rowsBetween(-1, 1)
27 |
28 | // Calculate the moving average
29 | customers.withColumn("movingAvg",
30 | avg(customers("amountSpent")).over(wSpec1)).show()
31 |
32 | val wSpec2 = Window.partitionBy("name").orderBy("date").rowsBetween(Long.MinValue, 0)
33 |
34 | // Create a new column which calculates the sum over the defined window frame.
35 | customers.withColumn("cumSum",
36 | sum(customers("amountSpent")).over(wSpec2)).show()
37 |
38 | // Window spec. No need to specify a frame in this case.
39 | val wSpec3 = Window.partitionBy("name").orderBy("date")
40 |
41 | // Use the lag function to look backwards by one row.
42 | customers.withColumn("prevAmountSpent",
43 | lag(customers("amountSpent"), 1).over(wSpec3)).show()
44 |
45 | customers.withColumn("rank", rank().over(wSpec3)).show()
46 | }
47 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/ReadHDFSFolders.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 |
3 | import org.apache.hadoop.conf.Configuration
4 | import org.apache.hadoop.fs.FileSystem
5 | import org.apache.hadoop.fs.Path
6 | import org.apache.hadoop.fs.FileStatus
7 | import org.apache.hadoop.fs.FileUtil
8 |
9 | import org.apache.spark.SparkConf
10 | import org.apache.spark.SparkContext
11 |
12 | object ReadHDFSFolders {
13 | private val conf = new Configuration()
14 | val fs = FileSystem.get(conf)
15 | val uri = conf.get("fs.default.name")
16 |
17 | def main(args: Array[String]) {
18 | val sc = new SparkContext(new SparkConf().setAppName("Spark-ReadHDFSFolders").setMaster("local[1]"))
19 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
20 |
21 | //Hdfs folder path
22 | var DATA_PATH = "/user/data/stocks"
23 |
24 | //No of Hdfs folders to read
25 | val intervalCount = 3
26 |
27 | var fileStatus: Array[FileStatus] = fs.listStatus(new Path(uri + DATA_PATH))
28 | var paths: Array[Path] = FileUtil.stat2Paths(fileStatus)
29 |
30 | var filesWithInterval = getHDFSFoldersBasedOnModtime(intervalCount, fileStatus)
31 |
32 | if (fileStatus != null && filesWithInterval.length >= intervalCount) {
33 | val dataframeArray = filesWithInterval.map(folder => {
34 | sqlContext.read.parquet(folder.getPath.toString)
35 | })
36 |
37 | //Union all the folders and form a single data frame.
38 | val combinedDataFrame = dataframeArray.reduce((x, y) => x.unionAll(y))
39 |
40 | combinedDataFrame.printSchema()
41 |
42 | println("First Record --> " + combinedDataFrame.first())
43 | }
44 |
45 | }
46 |
47 | //get the folders from HDFS based on the count provided.
48 | def getHDFSFoldersBasedOnModtime(intervalCount: Int, fileStatus: Array[FileStatus]): Array[FileStatus] = {
49 | var sortedList: List[FileStatus] = fileStatus.toList.sortWith(_.getModificationTime > _.getModificationTime)
50 | var returnList: List[FileStatus] = List()
51 | var itr: Int = 0
52 | var iterator = sortedList.iterator
53 | while (iterator.hasNext) {
54 | var value = iterator.next()
55 | if (itr < intervalCount) {
56 | returnList = returnList.::(value)
57 | itr = itr + 1
58 | }
59 | }
60 | returnList.toArray
61 | }
62 |
63 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/ReadMultipleFiles.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql.functions._
5 | object ReadMultipleFiles {
6 | case class Employee(empid: Int, name: String, dept: String, salary: Int, nop: Int)
7 |
8 | def main(args: Array[String]) {
9 | val conf = new SparkConf().setAppName("Spark-ReadMultipleFiles").setMaster("local[1]")
10 | val sc = new SparkContext(conf)
11 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
12 | import sqlContext.implicits._
13 |
14 | //Directory structure
15 | // employee/folder1/emp.txt (or) employee/emp.txt,emp1.txt
16 | // employee/folder2/emp1.txt (or) employee/folder2/emp1.txt,emp2.txt
17 |
18 | val empDataRDD = sc.textFile("E:/employee/*").coalesce(1, false)
19 |
20 | val filteredRDD = empDataRDD.filter(line => !line.contains("empid")) //removing the header section
21 |
22 | val empDF = filteredRDD.filter { lines => lines.length() > 0 }.
23 | map(_.split("\\|")).
24 | map(p => Employee(p(0).trim.toInt, p(1), p(2), p(3).trim.toInt, p(4).trim.toInt)).toDF()
25 |
26 | empDF.show()
27 |
28 | //val empDataRDD1 = sc.wholeTextFiles("E:/test/*")
29 | //empDataRDD1.collect().foreach { x => println(x._2) }
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/SparkFileStreaming.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.streaming.{ Seconds, StreamingContext }
4 | import StreamingContext._
5 | import org.apache.hadoop.conf._
6 | import org.apache.hadoop.fs._
7 | import org.apache.hadoop.io.LongWritable
8 | import org.apache.hadoop.io.Text
9 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
10 | object SparkFileStreaming {
11 | def main(args: Array[String]) {
12 | val sparkConf = new SparkConf().setAppName("Spark-FileStreaming").setMaster("local[2]")
13 | // Create the context
14 | val ssc = new StreamingContext(sparkConf, Seconds(3))
15 |
16 | // Create the FileInputDStream on the directory and use the
17 | val lines = ssc.textFileStream("hdfs://sandbox.hortonworks.com:8020/user/data/")
18 | val words = lines.flatMap(_.split(" "))
19 | val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
20 | wordCounts.print()
21 | ssc.start()
22 | ssc.awaitTermination()
23 | }
24 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/SparkJDBC.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.rdd.RDD
6 |
7 | object SparkJDBC {
8 |
9 | def getDetails(sc: SparkContext): Unit = {
10 |
11 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
12 | import sqlContext.implicits._
13 | val url = "jdbc:oracle:thin:@localhost:1521:XE"
14 | val prop = new java.util.Properties
15 | prop.setProperty("driver", "oracle.jdbc.driver.OracleDriver");
16 | prop.setProperty("user", "root")
17 | prop.setProperty("password", "root")
18 | val employee = sqlContext.read.jdbc(url, "emp", prop)
19 | employee.cache()
20 | employee.registerTempTable("emp")
21 |
22 | sqlContext.sql("select * from emp where NAME like 'HARI%' ").show()
23 |
24 | employee.select("EMPID", "NAME", "SALARY").show()
25 |
26 | employee.filter(employee("SALARY") > 7000).show()
27 |
28 | employee.groupBy("NAME").count().show()
29 |
30 | sc.stop()
31 |
32 | }
33 |
34 | def main(args: Array[String]) {
35 | val conf = new SparkConf().setAppName("Spark-JDBC").setMaster("local[1]")
36 | val sc = new SparkContext(conf);
37 |
38 | getDetails(sc)
39 |
40 | }
41 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/SparkStructType.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql.functions._
5 | import org.apache.spark.sql.types.{ StructType, StructField, StringType, IntegerType }
6 | import com.spark.util._
7 | object SparkStructType extends LogHelper {
8 | def main(args: Array[String]) {
9 | logger.info("SparkStructType.main()")
10 | val conf = new SparkConf().setAppName("Spark-StructType-Example").setMaster("local[1]")
11 | val sc = new SparkContext(conf)
12 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
13 | val person = sc.textFile(Utills.DATA_PATH +"person.txt")
14 |
15 | val schema = StructType(Array(StructField("firstName", StringType, true), StructField("lastName", StringType, true), StructField("age", IntegerType, true)))
16 | val rowRDD = person.map(_.split(",")).map(p => org.apache.spark.sql.Row(p(0), p(1), p(2).toInt))
17 | val personDF = sqlContext.createDataFrame(rowRDD, schema)
18 | personDF.registerTempTable("person")
19 | sqlContext.sql("select * from person").foreach(println)
20 |
21 | //saving as parquet file
22 | val path = Utills.DATA_PATH +"person-" + Utills.getTime()
23 | personDF.coalesce(1).write.parquet(path)
24 |
25 | //saving DataFrame as Text file
26 | //personDF.rdd.coalesce(1, false).saveAsTextFile(path)
27 |
28 | //reading a parquet file
29 | val parqfileDF = sqlContext.read.parquet(path)
30 | parqfileDF.filter("age > 25").show()
31 | val df = parqfileDF.groupBy("firstName", "lastName").agg(sum(parqfileDF.col("age")))
32 | df.show()
33 |
34 | }
35 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/Spark_Avro.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql.functions._
5 | import com.databricks.spark.avro._
6 | import com.spark.util._
7 | object Spark_Avro {
8 | case class Employee(empid: Int, name: String, dept: String, salary: Int, nop: Int)
9 | def main(args: Array[String]) {
10 | val conf = new SparkConf().setAppName("Spark-Avro").setMaster("local[1]")
11 | val sc = new SparkContext(conf)
12 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
13 | // configuration to use deflate compression
14 | sqlContext.setConf("spark.sql.avro.compression.codec", "deflate")
15 | sqlContext.setConf("spark.sql.avro.deflate.level", "5")
16 | import sqlContext.implicits._
17 |
18 | val empDataRDD = sc.textFile(Utills.DATA_PATH + "emp.txt") //path to emp.txt
19 | val dropHeaderRDD = empDataRDD.mapPartitions(_.drop(1)) //remove the header information from the file
20 |
21 | val empDF = dropHeaderRDD.filter { lines => lines.length() > 0 }.
22 | map(_.split("\\|")).
23 | map(p => Employee(p(0).trim.toInt, p(1), p(2), p(3).trim.toInt, p(4).trim.toInt)).toDF()
24 |
25 | empDF.show()
26 |
27 | //write as avro file.
28 | empDF.write.avro("/user/data/Emp_avro")
29 |
30 | //reading from avro file.
31 | val df = sqlContext.read.avro("/user/data/Emp_avro")
32 | df.filter("salary > 1000").show()
33 |
34 | //Writing Partitioned Data
35 | val moviesDF = Seq(
36 | (2012, 8, "Batman", 9.8),
37 | (2012, 8, "Hero", 8.7),
38 | (2012, 7, "Robot", 5.5),
39 | (2011, 7, "Git", 2.0)).toDF("year", "month", "title", "rating")
40 |
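// partitionBy writes one sub-directory per partition value, e.g. .../movies/year=2012/month=8/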
41 | moviesDF.write.partitionBy("year", "month").avro("/user/data/movies")
42 |
43 | //Reading Partitioned Data
44 | val resultDF = sqlContext.read.avro("/user/data/movies")
45 | resultDF.printSchema()
46 | resultDF.filter("year = 2011").collect().foreach(println)
47 | }
48 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/Spark_CSV_Reader.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql.functions._
5 | import com.spark.util._
6 | object Spark_CSV_Reader {
7 | def main(args: Array[String]) {
8 | val conf = new SparkConf().setAppName("Spark-CSV-Example").setMaster("local[1]")
9 | val sc = new SparkContext(conf)
10 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
11 | val auctionDF = sqlContext.read
12 | .format("com.databricks.spark.csv")
13 | .option("header", "true") // Use first line of all files as header
14 | .option("inferSchema", "true") // Automatically infer data types
15 | .load(Utills.DATA_PATH +"ebay.csv")
16 | //auctionDF.printSchema()
17 | //auctionDF.select("auctionid", "bidder").show
18 |
19 | // How many auctions were held?
20 | val count = auctionDF.select("auctionid").distinct.count
21 | println("Distinct items : " + count)
22 | // How many bids per item?
23 | auctionDF.groupBy("auctionid", "item").count.sort("auctionid").show
24 |
25 | // What's the min number of bids per item? what's the average? what's the max?
26 | auctionDF.groupBy("item", "auctionid").count.agg(min("count"), avg("count"), max("count")).show
27 | // Get the auctions with closing price > 100
28 | auctionDF.filter("price > 100").sort("auctionid").show
29 |
30 | // register the DataFrame as a temp table
31 | auctionDF.registerTempTable("auction")
32 | // SQL statements can be run
33 | // How many bids per auction?
34 | val results = sqlContext.sql("SELECT auctionid, item, count(bid) as BidCount FROM auction GROUP BY auctionid, item")
35 | // display dataframe in a tabular format
36 | results.sort("auctionid").show()
37 |
38 | sqlContext.sql("SELECT auctionid,item, MAX(price) as MaxPrice FROM auction GROUP BY item,auctionid").sort("auctionid").show()
39 | }
40 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/Spark_Hive.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | object Spark_Hive {
6 | def main(args: Array[String]) {
7 | val conf = new SparkConf().setAppName("Spark_Hive").setMaster("local[1]")
8 | val sc = new SparkContext(conf)
9 |
10 | //create hive context
11 | val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)
12 |
13 | //Create Table and load data
14 | hiveContext.sql("CREATE EXTERNAL TABLE IF NOT EXISTS users(id INT, name STRING, age INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n'")
15 | hiveContext.sql("LOAD DATA LOCAL INPATH '/hdp/dev/hive/users.txt' INTO TABLE users") //specify path to file accordingly
16 |
17 | hiveContext.sql("FROM users SELECT id, name, age").show()
18 |
19 | val rddFromSql = hiveContext.sql("SELECT id,name,age FROM users WHERE age > 25 ORDER BY age")
20 | rddFromSql.show()
21 |
22 | // The results of SQL queries are themselves RDDs and support all normal RDD functions.The
23 | // items in the RDD are of type Row, which allows you to access each column by ordinal.
24 | println("Result of RDD.map:")
25 | val rddAsStrings = rddFromSql.rdd.map {
26 | case Row(id: Int, name: String, age: Int) => s"Id: $id, Name: $name, Age: $age"
27 | }
28 | rddAsStrings.foreach { x => println(x) }
29 |
30 | // Aggregation queries are also supported.
31 | val count = hiveContext.sql("SELECT COUNT(*) FROM users").collect().head.getLong(0)
32 | println(s"count is : $count")
33 |
34 | // Queries are expressed in HiveQL
35 | println("Result of 'SELECT *': ")
36 | hiveContext.sql("SELECT * FROM users").collect().foreach(println)
37 |
38 | }
39 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/Spark_Hive_ORC.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql.hive.orc._
5 | import org.apache.spark.sql._
6 | object Spark_Hive_ORC {
7 | case class YahooStockPrice(date: String, open: Float, high: Float, low: Float, close: Float, volume: Integer, adjClose: Float)
8 | def main(args: Array[String]) {
9 | val conf = new SparkConf().setAppName("Spark_Hive_ORC").setMaster("local[1]")
10 | val sc = new SparkContext(conf)
11 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
12 | import sqlContext.implicits._
13 |
14 | //create hive context
15 | val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)
16 |
17 | //Create ORC Table and load data
18 | hiveContext.sql("create EXTERNAL table yahoo_orc_table (date STRING, open_price FLOAT, high_price FLOAT, low_price FLOAT, close_price FLOAT, volume INT, adj_price FLOAT) stored as orc")
19 |
20 | val yahoo_stocks = sc.textFile("hdfs://sandbox.hortonworks.com:8020/user/data/yahoo_stocks.csv")
21 |
22 | //filter out the header information
23 | val header = yahoo_stocks.first
24 | val data = yahoo_stocks.filter(_ != header)
25 |
26 | //Map the data to a case class and register it as a temp table.
27 | val stockprice = data.map(_.split(",")).map(row => YahooStockPrice(row(0), row(1).trim.toFloat, row(2).trim.toFloat, row(3).trim.toFloat, row(4).trim.toFloat, row(5).trim.toInt, row(6).trim.toFloat)).toDF()
28 | stockprice.registerTempTable("yahoo_stocks_temp")
29 | val results = sqlContext.sql("SELECT * FROM yahoo_stocks_temp")
30 |
31 | results.map(t => "Stock Entry: " + t.toString).collect().foreach(println)
32 |
33 | //save the data to HDFS in ORC file format.
34 | results.coalesce(1).write.format("orc").save("/user/data/yahoo_stocks_orc")
35 |
36 | //load the data back in ORC format to inspect it.
37 | val yahoo_stocks_orc = hiveContext.read.format("orc").load("/user/data/yahoo_stocks_orc")
38 | yahoo_stocks_orc.registerTempTable("orcTest")
39 | hiveContext.sql("SELECT * from orcTest").collect.foreach(println)
40 |
41 | //load the ORC data into the ORC Hive table created above.
42 | hiveContext.sql("LOAD DATA INPATH '/user/data/yahoo_stocks_orc' INTO TABLE yahoo_orc_table")
43 | val orcResults = hiveContext.sql("FROM yahoo_orc_table SELECT date, open_price,high_price")
44 | orcResults.show
45 |
46 | }
47 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/Spark_Joins.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.rdd.RDD
5 |
6 | object Spark_Joins {
7 | def main(args: Array[String]) {
8 | val conf = new SparkConf().setAppName("Spark-Joins").setMaster("local[2]")
9 | val sc = new SparkContext(conf)
10 | // Create emp RDD
11 | val emp = sc.parallelize(Seq((1, "revanth", 10), (2, "dravid", 20), (3, "kiran", 30), (4, "nanda", 35), (5, "kishore", 30)))
12 |
13 | // Create dept RDD
14 | val dept = sc.parallelize(Seq(("hadoop", 10), ("spark", 20), ("hive", 30), ("sqoop", 40)))
15 |
16 | // Establishing that the third field is to be considered as the Key for the emp RDD
17 | val manipulated_emp = emp.keyBy(t => t._3)
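// manipulated_emp is an RDD of (deptId, (empId, name, deptId)) pairs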
18 |
19 | // Establishing that the second field needs to be considered as the Key for the dept RDD
20 | val manipulated_dept = dept.keyBy(t => t._2)
21 |
22 | // Inner Join
23 | val join_data = manipulated_emp.join(manipulated_dept)
24 |
25 | // Left Outer Join
26 | val left_outer_join_data = manipulated_emp.leftOuterJoin(manipulated_dept)
27 | left_outer_join_data.collect().foreach(f => println(f))
28 |
29 | // Right Outer Join
30 | val right_outer_join_data = manipulated_emp.rightOuterJoin(manipulated_dept)
31 | right_outer_join_data.collect().foreach(f => println(f))
32 |
33 | // Full Outer Join
34 | val full_outer_join_data = manipulated_emp.fullOuterJoin(manipulated_dept)
35 | full_outer_join_data.collect().foreach(f => println(f))
36 |
37 | // Formatting the joined data into a more readable shape (using map)
38 | val cleaned_joined_data = join_data.map(t => (t._2._1._1, t._2._1._2, t._1, t._2._2._1))
39 |
40 | cleaned_joined_data.collect().foreach(f => println(f))
41 | }
42 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/Spark_Json_Reader.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql.functions._
5 | import com.spark.util._
6 | object Spark_Json_Reader {
7 | def main(args: Array[String]) {
8 | val conf = new SparkConf().setAppName("Spark_Json_Reader").setMaster("local[1]")
9 | val sc = new SparkContext(conf)
10 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
11 |
12 | val path = Utills.DATA_PATH +"sales.json"
13 | val salesDF = sqlContext.read.json(path)
14 | salesDF.registerTempTable("sales")
15 | val aggDF = sqlContext.sql("select sum(amountPaid) from sales")
16 | println(aggDF.collectAsList())
17 |
18 | val results = sqlContext.sql("SELECT customerId,itemName FROM sales ORDER BY itemName")
19 | // display dataframe in a tabular format
20 | results.show()
21 | }
22 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/Spark_SequenceFiles.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.rdd.RDD
6 |
7 | object Spark_SequenceFiles {
8 |
9 | def main(args: Array[String]) {
10 |
11 | val sc = new SparkContext(new SparkConf().setAppName("Spark-Sequence-Files").setMaster("local[1]"))
12 |
13 | val data = sc.textFile("file:////data/Spark/spark-scala/src/main/resources/olympics_data.txt")
14 |
15 | data.map(x => x.split(",")).map(x => (x(1).toString(), x(2).toString())).foreach(f => print(f))
16 |
17 | val pairs: RDD[(String, String)] = data.map(x => x.split(",")).map(x => (x(1).toString(), x(2).toString()))
18 |
19 | pairs.saveAsSequenceFile("/data/spark/rdd_to_seq")
20 |
21 | //Loading sequenceFiles into an RDD in Spark
22 |
23 | val data1: RDD[(String, String)] = sc.sequenceFile("/data/spark/rdd_to_seq")
24 |
25 | data1.take(5).foreach(f => print(f))
26 | }
27 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/Spark_StructType.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.sql._
6 | import org.apache.spark.sql.types._
7 | import com.spark.util._
8 | object Spark_StructType {
9 | def main(args: Array[String]) {
10 | val conf = new SparkConf().setAppName("Spark_StructType_Example").setMaster("local[1]")
11 | val sc = new SparkContext(conf)
12 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
13 | import sqlContext.implicits._
14 |
15 | val people = sc.textFile(Utills.DATA_PATH +"person.txt")
16 | val schemaString = "firstName lastName age"
17 |
18 | val schema = StructType(schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true)))
19 | val rowRDD = people.map(_.split(",")).map(p => Row(p(0), p(1), p(2).trim))
20 | val peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema)
21 |
22 | peopleDataFrame.registerTempTable("people")
23 |
24 | val results = sqlContext.sql("SELECT firstName,age FROM people")
25 |
26 | results.map(t => "Name: " + t(0) + "," + "Age: " + t(1)).collect().foreach(println)
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/Spark_XML.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql.functions._
5 | object Spark_XML {
6 |
7 | //Reference ---> https://github.com/databricks/spark-xml
8 |
9 | def main(args: Array[String]) {
10 |
11 | val conf = new SparkConf().setAppName("Spark_XML_Parsing").setMaster("local[1]")
12 | val sc = new SparkContext(conf)
13 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
14 | import sqlContext.implicits._
15 |
16 | val df = sqlContext.read.format("com.databricks.spark.xml")
17 | .option("rowTag", "book").load("/user/data/books.xml")
18 |
19 | val selectedData = df.select("author", "title", "_id")
20 | selectedData.show()
21 |
22 | //You can manually specify the schema when reading data:
23 |
24 | import org.apache.spark.sql.SQLContext
25 | import org.apache.spark.sql.types.{ StructType, StructField, StringType, DoubleType };
26 |
27 | val customSchema = StructType(Array(
28 | StructField("_id", StringType, nullable = true),
29 | StructField("author", StringType, nullable = true),
30 | StructField("description", StringType, nullable = true),
31 | StructField("genre", StringType, nullable = true),
32 | StructField("price", DoubleType, nullable = true),
33 | StructField("publish_date", StringType, nullable = true),
34 | StructField("title", StringType, nullable = true)))
35 |
36 | val df1 = sqlContext.read
37 | .format("com.databricks.spark.xml")
38 | .option("rowTag", "book")
39 | .schema(customSchema)
40 | .load("/user/data/books.xml")
41 |
42 | val selectedData1 = df1.select("author", "_id")
43 | selectedData1.show()
44 |
45 | }
46 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/Stateful_WordCount.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 | import org.apache.spark._
3 | import org.apache.spark.streaming._
4 | import org.apache.spark.streaming.StreamingContext._
5 |
6 | object Stateful_WordCount extends App {
7 |
8 | val updateFunc = (values: Seq[Int], state: Option[Int]) => {
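// 'values' holds the new counts seen for a key in the current batch; 'state' is the running total so far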
9 | val currentCount = values.foldLeft(0)(_ + _)
10 |
11 | val previousCount = state.getOrElse(0)
12 |
13 | Some(currentCount + previousCount)
14 | }
15 |
16 | val conf = new SparkConf().setAppName("Stateful_WordCount").setMaster("local[*]")
17 | val sc = new SparkContext(conf)
18 | val ssc = new StreamingContext(sc, Seconds(10))
19 |
20 | ssc.checkpoint("/user/data/checkpoints/")
21 |
22 | val lines = ssc.socketTextStream("localhost", 9999)
23 | val words = lines.flatMap(_.split(" "))
24 | val pairs = words.map(word => (word, 1))
25 |
26 | val windowedWordCounts = pairs.updateStateByKey(updateFunc)
27 | windowedWordCounts.saveAsTextFiles("/user/data/result")
28 |
29 | ssc.start()
30 | ssc.awaitTermination()
31 |
32 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/WindowBasedStreaming.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.streaming.StreamingContext
6 | import org.apache.spark.streaming.Seconds
7 | import com.spark.util._
8 | object WindowBasedStreaming {
9 |
10 | //nc -lk 9999
11 |
12 | def main(args: Array[String]) {
13 | val conf = new SparkConf().setAppName("Window-Based-Streaming").setMaster("local[*]")
14 | val sc = new SparkContext(conf)
15 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
16 | import sqlContext.implicits._
17 | sc.setLogLevel("WARN")
18 | val ssc = new StreamingContext(sc, Seconds(10))
19 |
20 | //ssc.checkpoint("/user/data/checkpoints/")
21 |
22 | val lines = ssc.socketTextStream("localhost", 9999)
23 |
24 | val messages = lines.window(Seconds(30), Seconds(10))
25 |
26 | messages.foreachRDD(
27 | rdd => {
28 | if (!rdd.isEmpty()) {
29 | println("rdd count " + rdd.count())
30 | val path = "file:///opt/home/data/" + Utills.getTime()
31 | rdd.coalesce(1, false).saveAsTextFile(path)
32 | } else {
33 | println("Data is not yet recevied from the producer....")
34 | }
35 | })
36 |
37 | ssc.start()
38 | ssc.awaitTermination()
39 | }
40 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/Window_Sliding_Interval.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.streaming.StreamingContext
5 | import org.apache.spark.streaming.Seconds
6 |
7 | object Window_Sliding_Interval {
8 |
9 | //nc -lk 9999
10 |
11 | def main(args: Array[String]) {
12 | val conf = new SparkConf().setAppName("Window_Sliding_Interval").setMaster("local[*]")
13 | val sc = new SparkContext(conf)
14 | val ssc = new StreamingContext(sc, Seconds(10))
15 |
16 | ssc.checkpoint("/user/data/checkpoints/")
17 |
18 | val lines = ssc.socketTextStream("localhost", 9999)
19 | val words = lines.flatMap(_.split(" "))
20 | val pairs = words.map(word => (word, 1))
21 |
22 | // Reduce last 30 seconds of data, every 10 seconds
23 | val windowedWordCounts = pairs.reduceByKeyAndWindow((a: Int, b: Int) => (a + b), Seconds(30), Seconds(10))
24 | windowedWordCounts.print()
25 |
26 | ssc.start()
27 | ssc.awaitTermination()
28 | }
29 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/examples/WordCount.scala:
--------------------------------------------------------------------------------
1 | package com.spark.examples
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import java.util.Calendar
5 | import org.apache.spark.sql.SQLContext
6 |
7 | object WordCount {
8 | def main(args: Array[String]) {
9 |
10 | val sc = new SparkContext(new SparkConf().setAppName("Spark Count").setMaster("local[1]"))
11 |
12 | val today = Calendar.getInstance().getTime()
13 |
14 | val threshold = 2
15 |
16 | // split each document into words
17 | val tokenized = sc.textFile(args(0)).flatMap(_.split(" "))
18 |
19 | // count the occurrence of each word
20 | val wordCounts = tokenized.map((_, 1)).reduceByKey(_ + _)
21 |
22 | // filter out words with less than threshold occurrences
23 | val filtered = wordCounts.filter(_._2 >= threshold)
24 |
25 | // count characters
26 | val charCounts = filtered.flatMap(_._1.toCharArray).map((_, 1)).reduceByKey(_ + _)
27 |
28 | //wordCounts.saveAsTextFile(args(1))
29 | println("---------------------------------------------------")
30 | println(charCounts.collect().mkString(", "))
31 |
32 | }
33 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/transformations/AggregateByKey.scala:
--------------------------------------------------------------------------------
1 | package com.spark.transformations
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
5 |
6 | object AggregateByKey {
7 |
8 | def myfunc(index: Int, iter: Iterator[(String, Int)]): Iterator[String] = {
9 | iter.toList.map(x => "[partID:" + index + ", val: " + x + "]").iterator
10 | }
11 |
12 | def main(args: Array[String]) {
13 | val conf = new SparkConf().setAppName("AggregateByKey-Example").setMaster("local[1]")
14 | val sc = new SparkContext(conf)
15 |
16 | val pairRDD = sc.parallelize(List(("cat", 2), ("cat", 5), ("mouse", 4), ("cat", 12), ("dog", 12), ("mouse", 2)), 2)
17 |
18 | //let's have a look at what is in each partition
19 | pairRDD.mapPartitionsWithIndex(myfunc).collect.foreach(f => println(f))
20 | println("***********************************************")
21 |
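// aggregateByKey(zeroValue)(seqOp, combOp): seqOp (math.max here) runs within each partition,
// combOp (+) then merges the per-partition results; with zeroValue 0 this yields (cat,17), (mouse,6) and (dog,12)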
22 | pairRDD.aggregateByKey(0)(math.max(_, _), _ + _).collect.foreach(f => println(f))
23 | println("-----------------------------------------------")
24 |
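// with zeroValue 100 every per-partition maximum is at least 100, giving (cat,200), (mouse,200) and (dog,100)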
25 | pairRDD.aggregateByKey(100)(math.max(_, _), _ + _).collect.foreach(f => println(f))
26 | }
27 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/transformations/Cars.scala:
--------------------------------------------------------------------------------
1 | package com.spark.transformations
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import com.spark.util._
5 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
6 | object Cars {
7 | def main(args: Array[String]) {
8 | case class cars(make: String, model: String, mpg: String, cylinders: Integer, engine_disp: Integer, horsepower: Integer, weight: Integer, accelerate: Double, year: Integer, origin: String)
9 | val conf = new SparkConf().setAppName("Transformations").setMaster("local[1]")
10 | val sc = new SparkContext(conf)
11 | val rawData = sc.textFile(Utills.DATA_PATH +"cars.txt") //"path to/cars.txt"
12 |
13 | rawData.take(5).foreach { x => println(x) }
14 |
15 | val carsData = rawData.map(x => x.split("\t"))
16 | .map(x => cars(x(0).toString, x(1).toString, x(2).toString, x(3).toInt, x(4).toInt, x(5).toInt, x(6).toInt, x(7).toDouble, x(8).toInt, x(9).toString))
17 |
18 | carsData.take(2).foreach { x => println(x) }
19 | //persist to memory
20 | carsData.cache()
21 |
22 | //count cars origin wise
23 | val originWiseCount = carsData.map(x => (x.origin, 1)).reduceByKey((x, y) => x + y)
24 | println("originWiseCount :" + originWiseCount.collect().mkString(", "))
25 | //filter out american cars
26 | val americanCars = carsData.filter(x => (x.origin == "American"))
27 |
28 | //count total american cars
29 | println("americanCars count : " + americanCars.count())
30 |
31 | // accumulate (total weight, car count) per make, so the average can be computed below
32 | val makeWeightSum = americanCars.map(x => (x.make, x.weight.toInt)).combineByKey((x: Int) => (x, 1),
33 | (acc: (Int, Int), x) => (acc._1 + x, acc._2 + 1),
34 | (acc1: (Int, Int), acc2: (Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2))
35 |
36 | println("americanCars makeWeightSum : " + makeWeightSum.collect().mkString(", "))
37 |
38 | // take average
39 | val makeWeightAvg = makeWeightSum.map(x => (x._1, (x._2._1 / x._2._2)))
40 |
41 |
42 | println("americanCars makeWeightAvg : " +makeWeightAvg.collect().mkString(", "))
43 | }
44 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/transformations/Cogroup.scala:
--------------------------------------------------------------------------------
1 | package com.spark.transformations
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
5 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
6 | object Cogroup {
7 |
8 | def main(args: Array[String]) {
9 | val conf = new SparkConf().setAppName("Transformations-Example").setMaster("local[1]")
10 | val sc = new SparkContext(conf)
11 |
12 | // cartesian
13 |
14 | /*Computes the cartesian product between two RDDs (i.e. each item of the first RDD is joined with each item of the second RDD)
15 | and returns them as a new RDD. (Warning: be careful when using this function! Memory consumption can quickly become an issue!)
16 | */
17 | val x = sc.parallelize(List(1, 2, 3, 4, 5))
18 | val y = sc.parallelize(List(6, 7, 8, 9, 10))
19 | x.cartesian(y).collect.foreach(f => println(f))
20 |
21 | //cogroup
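// cogroup groups the values for each key from both RDDs into a pair of Iterables, keeping keys present in either RDD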
22 | println("cogroup ---cogroup----cogroup")
23 | val a = sc.parallelize(List((1, "apple"), (2, "banana"), (3, "orange"), (4, "kiwi")), 2)
24 | val b = sc.parallelize(List((1, "apple"), (5, "computer"), (1, "laptop"), (1, "desktop"), (4, "iPad")), 2)
25 |
26 | a.cogroup(b).collect.foreach(f => println(f))
27 |
28 | //subtract two RDDs (pairs of a that are not present in b)
29 | val diff = a.subtract(b)
30 | diff.collect().foreach(f => println(f._2))
31 |
32 | //collectAsMap
33 | println("collectAsMap ---collectAsMap----collectAsMap")
34 | val c = sc.parallelize(List(1, 2, 1, 3), 1)
35 | val c2 = sc.parallelize(List(5, 6, 5, 7), 1)
36 | val d = c.zip(c2)
37 | d.collectAsMap.foreach(f => println(f))
38 |
39 | //combineByKey
40 | println("combineByKey ---combineByKey----combineByKey")
41 | val a1 = sc.parallelize(List("dog", "cat", "gnu", "salmon", "rabbit", "turkey", "wolf", "bear", "bee"), 3)
42 | val b1 = sc.parallelize(List(1, 1, 2, 2, 2, 1, 2, 2, 2), 3)
43 | val c1 = b1.zip(a1)
44 | val d1 = c1.combineByKey(List(_), (x: List[String], y: String) => y :: x, (x: List[String], y: List[String]) => x ::: y)
45 | d1.collect.foreach(f => println(f))
46 |
47 | //filterByRange [Ordered]
48 | println("filterByRange ---filterByRange----filterByRange")
49 | val randRDD = sc.parallelize(List((2, "cat"), (6, "mouse"), (7, "cup"), (3, "book"), (4, "tv"), (1, "screen"), (5, "heater")), 3)
50 | val sortedRDD = randRDD.sortByKey()
51 |
52 | sortedRDD.filterByRange(1, 3).collect.foreach(f => println(f))
53 | }
54 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/transformations/Filter.scala:
--------------------------------------------------------------------------------
1 | package com.spark.transformations
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | object Filter {
5 | def main(args: Array[String]) {
6 | val conf = new SparkConf().setAppName("Spark Filter Example").setMaster("local[1]")
7 | val sc = new SparkContext(conf)
8 | val x = sc.parallelize(List("Transformation demo", "Test demo", "Filter demo", "Spark is powerfull", "Spark is faster", "Spark is in memory"))
9 |
10 | val lines1 = x.filter(line => line.contains("Spark") || line.contains("Transformation"))
11 | lines1.collect().foreach { line => println(line) }
12 |
13 | val lines = x.filter(line => !line.contains("Filter"))
14 | println("---------------------------------------------")
15 | lines.collect().foreach { line => println(line) }
16 | println("---------------------------------------------")
17 | val count = x.filter(line => line.contains("Spark")).count()
18 | println("count is : " + count)
19 | }
20 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/transformations/FoldByKey.scala:
--------------------------------------------------------------------------------
1 | package com.spark.transformations
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
6 | object FoldByKey {
7 | def main(args: Array[String]) {
8 | val conf = new SparkConf().setAppName("FoldByKey-Example").setMaster("local[1]")
9 | val sc = new SparkContext(conf)
10 |
11 | //Fold in spark
12 | val employeeData = List(("Jack", 1000.0), ("Bob", 2000.0), ("Carl", 7000.0))
13 | val employeeRDD = sc.makeRDD(employeeData)
14 |
15 | val dummyEmployee = ("dummy", 0.0)
16 |
17 | val maxSalaryEmployee = employeeRDD.fold(dummyEmployee)((acc, employee) => {
18 | if (acc._2 < employee._2) employee else acc
19 | })
20 | println("employee with maximum salary is" + maxSalaryEmployee)
21 |
22 | //Fold by key
23 | val deptEmployees = List(
24 | ("cs", ("jack", 1000.0)),
25 | ("cs", ("bron", 1200.0)),
26 | ("phy", ("sam", 2200.0)),
27 | ("phy", ("ronaldo", 500.0)))
28 | val empRDD = sc.makeRDD(deptEmployees)
29 | val dummyEmp = ("dummy", 0.0)
30 | val maxByDept = empRDD.foldByKey(dummyEmp)((acc, employee) => {
31 | if (acc._2 < employee._2) employee else acc
32 | })
33 | println("maximum salaries in each dept" + maxByDept.collect().toList)
34 |
35 | //Fold by key
36 | var rdd1 = sc.makeRDD(Array(("A", 0), ("A", 2), ("B", 1), ("B", 2), ("C", 1)))
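// zeroValue 0 is the identity for +, so this is a plain per-key sum: (A,2), (B,3), (C,1)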
37 | rdd1.foldByKey(0)(_ + _).collect.foreach(f => println(f))
38 | println("-----------------------------------------------")
39 | rdd1.foldByKey(1)(_ * _).collect.foreach(f => println(f))
40 |
41 |
42 | }
43 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/transformations/GroupBY_ReduceBY.scala:
--------------------------------------------------------------------------------
1 | package com.spark.transformations
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
5 | object GroupBY_ReduceBY {
6 | def main(args: Array[String]) {
7 | val conf = new SparkConf().setAppName("Spark-GroupBY-ReduceBY-Example").setMaster("local[1]")
8 | val sc = new SparkContext(conf)
9 | val words = Array("a", "b", "b", "c", "d", "e", "a", "b", "b", "c", "d", "e", "b", "b", "c", "d", "e")
10 | val wordPairsRDD = sc.parallelize(words).map(word => (word, 1))
11 |
12 | val wordCountsWithReduce = wordPairsRDD
13 | .reduceByKey(_ + _)
14 | .collect()
15 | wordCountsWithReduce.foreach(f => println(f))
16 |
17 | //Avoid GroupByKey
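// reduceByKey combines values locally within each partition before the shuffle; groupByKey ships every (word, 1) pair across the network first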
18 | println("Avoid GroupByKey")
19 | val wordCountsWithGroup = wordPairsRDD
20 | .groupByKey()
21 | .map(t => (t._1, t._2.sum))
22 | .collect()
23 | wordCountsWithGroup.foreach(f => println(f))
24 | }
25 |
26 | //https://databricks.gitbooks.io/databricks-spark-knowledge-base/content/best_practices/prefer_reducebykey_over_groupbykey.html
27 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/transformations/MapvsFlatMap.scala:
--------------------------------------------------------------------------------
1 | package com.spark.transformations
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | object MapvsFlatMap {
5 | def main(args: Array[String]) {
6 |
7 | val conf = new SparkConf().setAppName("MapvsFlatMap").setMaster("local[1]")
8 | val sc = new SparkContext(conf)
9 | val x = sc.parallelize(List("spark rdd example", "sample example"))
10 |
11 | // map operation will return Array of Arrays in following case : check type of result
12 | val y = x.map(x => x.split(" ")) // split(" ") returns an array of words
13 | //result -> Array[Array[String]] = Array(Array(spark, rdd, example), Array(sample, example))
14 |
15 | /*Similar to map, but each input item can be mapped to 0 or more output items
16 | (so func should return a Seq rather than a single item).*/
17 |
18 | // flatMap operation will return Array of words in following case : Check type of result
19 | val z = x.flatMap(x => x.split(" "))
20 | z.collect().foreach { x => println(x) }
21 | //result -> Array[String] = Array(spark, rdd, example, sample, example)
22 |
23 | }
24 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/transformations/Reduce.scala:
--------------------------------------------------------------------------------
1 | package com.spark.transformations
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | object Reduce {
5 | def main(args: Array[String]) {
6 | val conf = new SparkConf().setAppName("Reduce-Example").setMaster("local[1]")
7 | val sc = new SparkContext(conf)
8 |
9 | /*val data = Array(1, 2, 3, 4, 5)
10 | val distData = sc.parallelize(data)
11 | distData.collect().foreach { x => println(x) }
12 | val red = distData.reduce((a, b) => a + b)
13 | println(red)*/
14 |
15 | val distFile = sc.textFile("F:\\Software\\Spark\\input.txt")
16 |
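// NOTE: reduce on an RDD[String] concatenates the lines; 'fil' (words per line) is computed but never used below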
17 | val fil = distFile.map { x => x.split(" ").size }
18 | val rdd = distFile.reduce((a, b) => a + b)
19 | println(rdd)
20 |
21 | val res = distFile.map(s => s.length).reduce((a, b) => a + b)
22 | val res1 = distFile.reduce((a, b) => a + b)
23 | println(res)
24 | println(res1)
25 |
26 | }
27 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/usecases/NamesAnalysis.scala:
--------------------------------------------------------------------------------
1 | package com.spark.usecases
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import com.spark.util.Utills
5 | object NamesAnalysis {
6 | def main(args: Array[String]) {
7 | val conf = new SparkConf().setAppName("Names-Analysis").setMaster("local[1]")
8 | val sc = new SparkContext(conf)
9 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
10 | import sqlContext.implicits._
11 |
12 | val babyNamesRDD = sc.textFile(Utills.DATA_PATH + "/baby_names.txt")
13 |
14 | //remove the header information from the file
15 | val dropHeaderRDD = babyNamesRDD.mapPartitions(_.drop(1))
16 | val rows = dropHeaderRDD.map(line => line.split(","))
17 |
18 | //unique counties over the years of data collected
19 | val count = rows.map(row => row(2)).distinct.count
20 | println("unique counties count --> " + count)
21 | //rows.collect().foreach { x => println(x(0) + " : " +x(1) + " : " + x(2) + " : " + x(3) + " : " + x(4)) }
22 |
23 | //rows containing the name "SACHIN"
24 | val sachinRows = rows.filter(row => row(1).contains("SACHIN"))
25 |
26 | //Number of rows where NAME "SACHIN" has a "Count" greater than 10
27 | println("Rows where the name SACHIN has a Count > 10 --> " + sachinRows.filter(row => row(4).toInt > 10).count())
28 |
29 | val uniqueCounties = sachinRows.filter(row => row(4).toInt > 10).map(r => r(2)).distinct
30 | println("-------- unique country names which have had the name 'SACHIN' ---------")
31 | uniqueCounties.foreach { x => println(x) }
32 | // unique counties which have had the name SACHIN over 10 times in a given year
33 | val uniCountryCount = sachinRows.filter(row => row(4).toInt > 10).map(r => r(2)).distinct.count
34 | println("unique counties which have had the name SACHIN --> " + uniCountryCount)
35 |
36 | val names = rows.map(name => (name(1), 1))
37 | // shows number of times each name appears in file
38 | names.reduceByKey((a, b) => a + b).sortBy(_._2).foreach(println _)
39 |
40 | //Another way to filter the header information
41 | val filteredRows = babyNamesRDD.filter(line => !line.contains("Count")).map(line => line.split(","))
42 | filteredRows.map(n => (n(1), n(4).toInt)).reduceByKey((a, b) => a + b).sortBy(_._2).foreach(println _)
43 | }
44 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/usecases/OlaDataAnalysis.scala:
--------------------------------------------------------------------------------
1 | package com.spark.usecases
2 | import org.apache.spark.SparkConf
3 | import com.spark.util.Utills
4 | import org.apache.spark.SparkContext
5 | object OlaDataAnalysis {
6 | def main(args: Array[String]) {
7 | val conf = new SparkConf().setAppName("Ola-Cab-Data-Analysis").setMaster("local[1]")
8 | val sc = new SparkContext(conf)
9 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
10 | val dataset = sc.textFile(Utills.DATA_PATH + "/olaCabData.txt")
11 | val header = dataset.first()
12 | val format = new java.text.SimpleDateFormat("MM/dd/yyyy")
13 | var days = Array("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")
14 | val eliminate = dataset.filter(line => line != header)
15 |
16 | //Find the days on which each basement has the most trips.
17 |
18 | val split = eliminate.map(line => line.split(",")).map { x => (x(0), format.parse(x(1)), x(3)) }
19 | split.foreach(f => println(f))
20 |
21 | val combine = split.map(x => (x._1 + " " + days(x._2.getDay), x._3.toInt))
22 | combine.foreach(f => println(f))
23 |
24 | combine.reduceByKey(_ + _).map(item => item.swap).sortByKey(false).collect.foreach(println)
25 | }
26 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/usecases/OlympicsDataAnalysis.scala:
--------------------------------------------------------------------------------
1 | package com.spark.usecases
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import com.spark.util.Utills
5 | object OlympicsDataAnalysis {
6 | def main(args: Array[String]) {
7 | val conf = new SparkConf().setAppName("Travel-Data-Analysis").setMaster("local[1]")
8 | val sc = new SparkContext(conf)
9 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
10 | val textFile = sc.textFile(Utills.DATA_PATH + "/olympics_data.txt")
11 | val olympicsDataRDD = textFile.mapPartitions(_.drop(1)) //remove the header information from the file
12 | val lines = olympicsDataRDD.filter { x => { if (x.toString().split(",").length >= 10) true else false } }
13 | .map(line => { line.toString().split(",") })
14 |
15 | //Find the total number of medals won by each country in swimming.
16 | val filteredRDD = lines.filter(x => { if (x(5).equalsIgnoreCase("swimming") && (x(9).matches(("\\d+")))) true else false })
17 | val results = filteredRDD.map(x => (x(2), x(9).toInt))
18 | val totalNoMedals = results.reduceByKey(_ + _).collect()
19 | println("---Total number of medals won by each country in swimming---")
20 | totalNoMedals.foreach(f => println(f))
21 |
22 | //Find the number of medals won by India, year-wise.
23 | val filteredIndiaRDD = lines.filter(x => { if (x(2).equalsIgnoreCase("india") && (x(9).matches(("\\d+")))) true else false })
24 | val indiaResults = filteredIndiaRDD.map(x => (x(3), x(9).toInt))
25 | val indiaMedals = indiaResults.reduceByKey(_ + _).collect()
26 | println("---Number of medals that won by India year wise---")
27 | indiaMedals.foreach(f => println(f))
28 |
29 | //Find the total number of medals won by each country.
30 | val filteredLines = lines.filter(x => { if ((x(9).matches(("\\d+")))) true else false })
31 | val filteredResults = filteredLines.map(x => (x(2), x(9).toInt))
32 | val medalsCountryWise = filteredResults.reduceByKey(_ + _).collect()
33 | println("---Total number of medals won by each country---")
34 | medalsCountryWise.foreach(f => println(f))
35 |
36 | }
37 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/usecases/TVShowDataAnalysis.scala:
--------------------------------------------------------------------------------
1 | package com.spark.usecases
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import com.spark.util.Utills
5 | object TVShowDataAnalysis {
6 | def main(args: Array[String]) {
7 | val conf = new SparkConf().setAppName("TV-Show-Data-Analysis").setMaster("local[1]")
8 | val sc = new SparkContext(conf)
9 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
10 | val format = new java.text.SimpleDateFormat("MM/dd/yy")
11 | val textFile = sc.textFile(Utills.DATA_PATH + "/dialy_show_guests.txt")
12 | val tvDataRDD = textFile.mapPartitions(_.drop(1)) //remove the header information from the file
13 |
14 | //Find the top 5 GoogleKnowlege_Occupation categories of guests who appeared on the show in a particular time period.
15 |
16 | val splitedRDD = tvDataRDD.map(line => line.split(","))
17 | val pair = splitedRDD.map(line => (line(1), format.parse(line(2))))
18 | val fil = pair.filter(x => { if (x._2.after(format.parse("1/11/99")) && x._2.before(format.parse("6/11/99"))) true else false })
19 | val top5GuestsRDD = fil.map(x => (x._1, 1)).reduceByKey(_ + _).map(item => item.swap).sortByKey(false).take(5)
20 |
21 | top5GuestsRDD.foreach(f => println(f))
22 | }
23 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/usecases/TravelDataAnalysis.scala:
--------------------------------------------------------------------------------
1 | package com.spark.usecases
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql.functions._
5 | import com.spark.util.Utills
6 | object TravelDataAnalysis {
7 | def main(args: Array[String]) {
8 | val conf = new SparkConf().setAppName("Travel-Data-Analysis").setMaster("local[1]")
9 | val sc = new SparkContext(conf)
10 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
11 | val textFile = sc.textFile(Utills.DATA_PATH + "/TravelData.txt")
12 | val travelDataRDD = textFile.mapPartitions(_.drop(1)) //remove the header information from the file
13 |
14 | //Top 20 destinations people travel to the most
15 | val top20DestinationRDD = travelDataRDD.map(lines => lines.split('\t')).
16 | map(x => (x(2), 1)).reduceByKey(_ + _).
17 | map(item => item.swap).sortByKey(false).take(20)
18 | top20DestinationRDD.foreach(f => println(f))
19 |
20 | //Top 20 locations from where people travel the most
21 | val top20LocationsRDD = travelDataRDD.map(lines => lines.split('\t')).map(x => (x(1), 1)).
22 | reduceByKey(_ + _).map(item => item.swap).sortByKey(false).take(20)
23 |
24 | top20LocationsRDD.foreach(f => println(f))
25 |
26 | //Top 20 cities that generate high airline revenues for travel
27 | val fil = travelDataRDD.map(x => x.split('\t')).filter(x => { if ((x(3).matches(("1")))) true else false })
28 | // fil.collect().foreach { x => println(x(2)) }
29 | val Top20Cities = fil.map(x => (x(2), 1)).reduceByKey(_ + _).map(item => item.swap).sortByKey(false).take(20)
30 | Top20Cities.foreach(f => println(f))
31 | }
32 |
33 | //https://acadgild.com/blog/spark-use-case-travel-data-analysis/
34 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/usecases/YoutubeDataAnalysis.scala:
--------------------------------------------------------------------------------
1 | package com.spark.usecases
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql.functions._
5 | import com.spark.util.Utills
6 | object YoutubeDataAnalysis {
7 |
8 | def main(args: Array[String]) {
9 | val conf = new SparkConf().setAppName("Youtube-Data-Analysis").setMaster("local[1]")
10 | val sc = new SparkContext(conf)
11 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
12 | val textFile = sc.textFile(Utills.DATA_PATH + "/youtubedata.txt")
13 |
14 | //Here, we find the top five categories with the maximum number of videos uploaded.
15 | // val counts = textFile.map(line => { var YoutubeRecord = ""; val temp = line.split("\t"); ; if (temp.length >= 3) { YoutubeRecord = temp(3) }; YoutubeRecord })
16 | val counts = textFile.map(_.split("\t")).filter(_.length > 3).map(_(3)) // need at least 4 columns to read the category at index 3
17 | val test = counts.map(x => (x, 1))
18 | val res = test.reduceByKey(_ + _).map(item => item.swap).sortByKey(false).take(5)
19 | res.foreach(f => println(f))
20 |
21 | //In this problem statement, we will find the top 10 rated videos in YouTube.
22 | val counts1 = textFile.filter { x => { if (x.toString().split("\t").length > 6) true else false } }.map(line => { line.toString().split("\t") }) // the rating is at index 6, so at least 7 columns are required
23 | val pairs = counts1.map(x => { (x(0), x(6).toDouble) })
24 | val res1 = pairs.reduceByKey(_ + _).map(item => item.swap).sortByKey(false).take(10)
25 | res1.foreach(f => println(f))
26 |
27 | }
28 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/usecases/loganalysis/ApacheAccessLog.scala:
--------------------------------------------------------------------------------
1 | package com.spark.usecases.loganalysis
2 |
3 | case class ApacheAccessLog(ipAddress: String, clientIdentd: String,
4 | userId: String, dateTime: String, method: String,
5 | endpoint: String, protocol: String,
6 | responseCode: Int, contentSize: Long) {
7 |
8 | }
9 |
10 | object ApacheAccessLog {
11 | val PATTERN = """^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+)""".r
12 |
13 | def parseLogLine(log: String): ApacheAccessLog = {
14 | val res = PATTERN.findFirstMatchIn(log)
15 | if (res.isEmpty) {
16 | throw new RuntimeException("Cannot parse log line: " + log)
17 | }
18 | val m = res.get
19 | ApacheAccessLog(m.group(1), m.group(2), m.group(3), m.group(4),
20 | m.group(5), m.group(6), m.group(7), m.group(8).toInt, m.group(9).toLong)
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/usecases/loganalysis/LogAnalyzer.scala:
--------------------------------------------------------------------------------
1 | package com.spark.usecases.loganalysis
2 | import org.apache.spark.{ SparkContext, SparkConf }
3 | import org.apache.spark.SparkContext._
4 | import com.spark.util._
5 | object LogAnalyzer {
6 |
7 | object SecondValueOrdering extends Ordering[(String, Int)] {
8 | def compare(a: (String, Int), b: (String, Int)) = {
9 | a._2 compare b._2
10 | }
11 | }
12 |
13 | def main(args: Array[String]) {
14 | val sparkConf = new SparkConf().setAppName("Log Analysis").setMaster("local[1]")
15 | val sc = new SparkContext(sparkConf)
16 |
17 | val accessLogs = sc.textFile(Utills.DATA_PATH + "log.txt")
18 | .map(ApacheAccessLog.parseLogLine).cache()
19 |
20 | // Any IPAddress that has accessed the server more than 4 times.
21 | val ipAddresses = accessLogs
22 | .map(log => (log.ipAddress, 1))
23 | .reduceByKey(_ + _)
24 | .filter(_._2 > 4)
25 | .map(_._1)
26 | .take(10)
27 | println(s"""IPAddresses > 2 times: ${ipAddresses.mkString("[", ",", "]")}""")
28 |
29 | // Finding top 5 hits.
30 | val ipAddressesTop5 = accessLogs
31 | .map(log => (log.ipAddress, 1))
32 | .reduceByKey(_ + _)
33 | .top(5)(SecondValueOrdering)
34 |
35 | println(s"""Top 5 hits : ${ipAddressesTop5.mkString("[", ",", "]")}""")
36 |
37 | // Top Endpoints.
38 | val topEndpoints = accessLogs
39 | .map(log => (log.endpoint, 1))
40 | .reduceByKey(_ + _)
41 | .top(10)(SecondValueOrdering)
42 | println(s"""Top Endpoints: ${topEndpoints.mkString("[", ",", "]")}""")
43 |
44 | // Calculate statistics based on the content size.
45 | val contentSizes = accessLogs.map(log => log.contentSize).cache()
46 | println("Content Size Avg: %s, Min: %s, Max: %s".format(
47 | contentSizes.reduce(_ + _) / contentSizes.count,
48 | contentSizes.min,
49 | contentSizes.max))
50 |
51 | // Compute Response Code to Count.
52 | val responseCodeToCount = accessLogs
53 | .map(log => (log.responseCode, 1))
54 | .reduceByKey(_ + _)
55 | .take(100)
56 | println(s"""Response code counts: ${responseCodeToCount.mkString("[", ",", "]")}""")
57 | }
58 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/usecases/loganalysis/LogAnalyzerSQL.scala:
--------------------------------------------------------------------------------
1 | package com.spark.usecases.loganalysis
2 | import org.apache.spark.sql.SQLContext
3 | import org.apache.spark.{ SparkContext, SparkConf }
4 | import com.spark.util._
5 | object LogAnalyzerSQL {
6 | def main(args: Array[String]) {
7 | val sparkConf = new SparkConf().setAppName("Log Analyzer SQL").setMaster("local[1]")
8 | val sc = new SparkContext(sparkConf)
9 | val sqlContext = new SQLContext(sc)
10 | import sqlContext.implicits._
11 |
12 | val accessLogs = sc.textFile(Utills.DATA_PATH + "log.txt").map(ApacheAccessLog.parseLogLine).toDF()
13 | accessLogs.registerTempTable("Logs")
14 | sqlContext.cacheTable("Logs");
15 |
16 | // Calculate statistics based on the content size.
17 | val contentSizeStats = sqlContext
18 | .sql("SELECT SUM(contentSize), COUNT(*), MIN(contentSize), MAX(contentSize) FROM Logs")
19 | .first()
20 | println("Content Size Avg: %s, Min: %s, Max: %s".format(
21 | contentSizeStats.getLong(0) / contentSizeStats.getLong(1),
22 | contentSizeStats(2),
23 | contentSizeStats(3)))
24 |
25 | // Compute Response Code to Count.
26 | val responseCodeToCount = sqlContext
27 | .sql("SELECT responseCode, COUNT(*) FROM Logs GROUP BY responseCode LIMIT 1000")
28 | .map(row => (row.getInt(0), row.getLong(1)))
29 | .collect()
30 | println(s"""Response code counts: ${responseCodeToCount.mkString("[", ",", "]")}""")
31 |
32 | // Any IPAddress that has accessed the server more than 10 times.
33 | val ipAddresses = sqlContext
34 | .sql("SELECT ipAddress, COUNT(*) AS total FROM Logs GROUP BY ipAddress HAVING total > 10 LIMIT 1000")
35 | .map(row => row.getString(0))
36 | .collect()
37 | println(s"""IPAddresses > 10 times: ${ipAddresses.mkString("[", ",", "]")}""")
38 |
39 | val topEndpoints = sqlContext
40 | .sql("SELECT endpoint, COUNT(*) AS total FROM Logs GROUP BY endpoint ORDER BY total DESC LIMIT 10")
41 | .map(row => (row.getString(0), row.getLong(1)))
42 | .collect()
43 | println(s"""Top Endpoints: ${topEndpoints.mkString("[", ",", "]")}""")
44 |
45 | sc.stop()
46 | }
47 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/usecases/sensoranalytics/Models.scala:
--------------------------------------------------------------------------------
1 | package com.spark.usecases.sensoranalytics
2 |
3 | import java.sql.{Date, Timestamp}
4 | import java.text.SimpleDateFormat
5 | import java.util.Calendar
6 |
7 | import org.joda.time.DateTime
8 |
9 |
10 | case class SensorRecord(dateTime: DateTime,
11 | country:String,
12 | state:String,
13 | city:String,
14 | sensorStatus:String)
15 |
16 | case class CountryWiseStats(date: DateTime,country:String, count: BigInt)
17 |
18 | case class StateWiseStats(date: DateTime,country:String,state:String, count: BigInt)
19 |
20 | case class CityWiseStats(date: DateTime,city:String,sensorStatus:String, count: BigInt)
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/usecases/sensoranalytics/SchemaParser.scala:
--------------------------------------------------------------------------------
1 | package com.spark.usecases.sensoranalytics
2 |
3 | import org.joda.time.DateTimeZone
4 | import org.joda.time.format.DateTimeFormat
5 |
6 | import scala.util.Try
7 |
8 | object SchemaParser {
9 |
10 | def parse(eachRow: String): Option[SensorRecord] = {
11 | val columns = eachRow.split(",")
12 | //println("columns --->" + columns(0) +","+ columns(1) +","+ columns(2) +","+ columns(3))
13 | Try {
14 | if (columns.length == 5) {
15 | Option(SensorRecord(createDate(columns(0)), columns(1), columns(2), columns(3), columns(4)))
16 | } else {
17 | None
18 | }
19 | }.getOrElse(None)
20 | }
21 |
22 | def createDate(input: String) = {
23 | val columns = input.split(" ")
24 | val pattern = "YYYY/MM/dd HH"
25 | DateTimeFormat.forPattern(pattern)
26 | .parseDateTime(columns(0) + " " + columns(1).split(":")(0))
27 | .withZone(DateTimeZone.getDefault())
28 | }
29 |
30 | def createDelay(input: String): Double = {
31 | val delay_regex = """[^\d|.]*([0-9\\.]+)\s*(ms|.*)""".r
32 |
33 | input match {
34 | case delay_regex(value, unit) => {
35 | if (unit.equalsIgnoreCase("ms")) {
36 | value.toDouble
37 | } else {
38 | 0
39 | }
40 | }
41 | }
42 | }
43 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/usecases/twitteranalytics/TwitterAnalytics.scala:
--------------------------------------------------------------------------------
1 | package com.spark.usecases.twitteranalytics
2 |
3 | import org.apache.spark._
4 | import org.apache.spark.SparkContext._
5 | import org.apache.spark.streaming._
6 | import org.apache.spark.streaming.StreamingContext._
7 | import org.apache.spark.streaming.twitter._
8 |
9 | object TwitterAnalytics extends App {
10 | val conf = new SparkConf().setAppName("myStream").setMaster("local[2]")
11 | val sc = new SparkContext(conf)
12 | sc.setLogLevel("WARN")
13 | val ssc = new StreamingContext(sc, Seconds(2))
14 | val client = new twitterclient()
15 | val tweetauth = client.start()
16 | val inputDstream = TwitterUtils.createStream(ssc, Option(tweetauth.getAuthorization))
17 |
18 | // Split the stream on space and extract hashtags
19 | val hashTags = inputDstream.flatMap(status => status.getText.split(" ").filter(_.startsWith("#")))
20 |
21 | // Get the top hashtags over the previous 60 sec window
22 | val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60))
23 | .map { case (topic, count) => (count, topic) }
24 | .transform(_.sortByKey(false))
25 |
26 | // Get the top hashtags over the previous 10 sec window
27 | val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10))
28 | .map { case (topic, count) => (count, topic) }
29 | .transform(_.sortByKey(false))
30 |
31 | // print tweets in the current DStream
32 | inputDstream.print()
33 |
34 | // Print popular hashtags
35 | topCounts60.foreachRDD(rdd => {
36 | val topList = rdd.take(10)
37 | println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count()))
38 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
39 | })
40 | topCounts10.foreachRDD(rdd => {
41 | val topList = rdd.take(10)
42 | println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count()))
43 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
44 | })
45 |
46 |
47 | ssc.start()
48 | ssc.awaitTermination()
49 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/usecases/twitteranalytics/twitterclient.scala:
--------------------------------------------------------------------------------
1 | package com.spark.usecases.twitteranalytics
2 |
3 | import twitter4j.Twitter
4 | import twitter4j.TwitterException;
5 | import twitter4j.TwitterFactory;
6 | import twitter4j.auth.AccessToken;
7 | import twitter4j.auth.RequestToken;
8 | class twitterclient {
9 | val CONSUMER_KEY: String = "Tn6mCikBNxLviA6znN4FgIXfY"
10 | val CONSUMER_KEY_SECRET: String = "JoRN26wNoPUuUYsgR4zKwre82zTY53r8rDzy6nLSrS4cMqiRzg"
11 | val ACCESS_TOKEN = "199435611-ancQT2HKivvIrlrKg2FYLTBoQyA0zsISGhDbO7ug"
12 | val ACCESS_TOKEN_SECRET = "wHaw4X7ok2uWXVGvOAOzaSgZvRovK4xFY4CAMLoNuMOy8"
13 | def start(): Twitter = {
14 | val twitter: Twitter = new TwitterFactory().getInstance();
15 | twitter.setOAuthConsumer(CONSUMER_KEY, CONSUMER_KEY_SECRET);
16 | twitter.setOAuthAccessToken(new AccessToken(ACCESS_TOKEN, ACCESS_TOKEN_SECRET))
17 | twitter
18 | }
19 | }
--------------------------------------------------------------------------------
/Spark-1.5/src/main/scala/com/spark/util/LogHelper.scala:
--------------------------------------------------------------------------------
1 | package com.spark.util
2 |
3 | import org.apache.log4j.Logger
4 | import org.apache.log4j.xml.DOMConfigurator
5 |
6 | trait LogHelper {
7 | DOMConfigurator.configure(Utills.DATA_PATH +"log4j_conf.xml")
8 | val loggerName = this.getClass.getName
9 | final val logger = Logger.getLogger(loggerName)
10 | }
--------------------------------------------------------------------------------
/Spark-2.1/README.md:
--------------------------------------------------------------------------------
1 | # Spark-2.1
2 | Apache Spark is a fast and general-purpose cluster computing system. It provides high-level APIs in Java, Scala, Python and R, and an optimized engine that supports general execution graphs. It also supports a rich set of higher-level tools including Spark SQL for SQL and structured data processing, MLlib for machine learning, GraphX for graph processing, and Spark Streaming.
3 |
4 | Topics Covered :
5 | ----------------
6 | Implementing custom UDFs, UDAFs and Partitioners using Spark-2.1
7 | Working with DataFrames (ComplexSchema, DropDuplicates, DatasetConversion, GroupingAndAggregation)
8 | Working with Datasets
9 | Working with Parquet files
10 | Working with the Spark Catalog API to access Hive tables
11 | Loading data from Cassandra tables using Spark (see the sketch below)
12 | CRUD operations on Cassandra using Spark
13 | Reading/writing S3 buckets using Spark
14 | Spark MongoDB integration
15 | 
16 | Pushing Spark accumulator values as metrics to the DataDog API
17 |
18 |
19 |
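One of the topics above in miniature: a minimal sketch of loading a Cassandra table into a DataFrame with the spark-cassandra-connector. The host, keyspace and table names below are placeholders; the complete examples live under `src/main/scala/com/spark2/cassandra`.

```scala
import org.apache.spark.sql.SparkSession

object CassandraReadSketch extends App {
  // placeholder connection settings; point these at a real cluster
  val spark = SparkSession.builder()
    .appName("CassandraReadSketch")
    .master("local[1]")
    .config("spark.cassandra.connection.host", "127.0.0.1")
    .getOrCreate()

  // load a Cassandra table as a DataFrame via the spark-cassandra-connector
  val df = spark.read
    .format("org.apache.spark.sql.cassandra")
    .options(Map("table" -> "employee", "keyspace" -> "dev"))
    .load()

  df.printSchema()
  df.show()
}
```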
20 | ------------------------------------------------------------------------------------------------------------------------------------
21 |
22 | You can reach me with any suggestions or clarifications at revanthkumar95@gmail.com
23 | Feel free to share any insights or constructive criticism. Cheers!!
24 | # Happy Sparking!!!
25 |
--------------------------------------------------------------------------------
/Spark-2.1/input/README.md:
--------------------------------------------------------------------------------
1 | Developers have always loved Apache Spark for providing APIs that are simple yet powerful, a combination of traits that makes complex analysis possible with minimal programmer effort. At Databricks, we have continued to push Spark’s usability and performance envelope through the introduction of DataFrames and Spark SQL. These are high-level APIs for working with structured data (e.g. database tables, JSON files), which let Spark automatically optimize both storage and computation. Behind these APIs, the Catalyst optimizer and Tungsten execution engine optimize applications in ways that were not possible with Spark’s object-oriented (RDD) API, such as operating on data in a raw binary form.
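As a small illustration of the point above (a sketch, not part of this repository's sources), here is the same aggregation written against the RDD API and against the DataFrame API; the column names `dept` and `salary` are invented for the example. Only the DataFrame version hands Catalyst/Tungsten a declarative plan it can optimize.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.avg

object RddVsDataFrame extends App {
  val spark = SparkSession.builder().appName("RddVsDataFrame").master("local[1]").getOrCreate()
  import spark.implicits._

  val rows = Seq(("cloud", 1000.0), ("cloud", 2000.0), ("sales", 1500.0))

  // RDD (object-oriented) API: Spark runs the lambdas as opaque functions
  val rddAvg = spark.sparkContext.parallelize(rows)
    .mapValues(s => (s, 1))
    .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2))
    .mapValues { case (sum, n) => sum / n }
  rddAvg.collect().foreach(println)

  // DataFrame API: the same logic expressed declaratively, so the Catalyst
  // optimizer and Tungsten engine can rewrite the plan and work on binary rows
  val df = rows.toDF("dept", "salary")
  df.groupBy("dept").agg(avg("salary")).show()

  spark.stop()
}
```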
--------------------------------------------------------------------------------
/Spark-2.1/input/badrecords:
--------------------------------------------------------------------------------
1 | Barack,Obama,53
2 | George,Bush,68
3 | Hillary,Clinton,F
4 | Bill,Clinton,68
5 | Tom,Cruise,M
6 | Barack,Obama,53
7 | George,Bush,68
8 | Hillary,Clinton,F
9 | Bill,Clinton,68
10 | Tom,Cruise,M
--------------------------------------------------------------------------------
/Spark-2.1/input/conf.properties:
--------------------------------------------------------------------------------
1 | tidList = 102:1,104:1,106:9
2 | topics = topic1,topic2,topic3
--------------------------------------------------------------------------------
/Spark-2.1/input/emp.txt:
--------------------------------------------------------------------------------
1 | 100,Steven,King,SKING,515.123.4567,1987-06-17,AD_PRES,24000.00,null,null,90
2 | 101,Neena,Kochhar,NKOCHHAR,515.123.4568,1989-09-21,AD_VP,17000.00,null,100,90
--------------------------------------------------------------------------------
/Spark-2.1/input/iap_sw_cpu_mem_stats_rollup/part-00000-03d43b04-1dda-472c-8601-e7a8914e6097.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-2.1/input/iap_sw_cpu_mem_stats_rollup/part-00000-03d43b04-1dda-472c-8601-e7a8914e6097.snappy.parquet
--------------------------------------------------------------------------------
/Spark-2.1/input/iap_sw_cpu_mem_stats_rollup/part-00001-03d43b04-1dda-472c-8601-e7a8914e6097.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-2.1/input/iap_sw_cpu_mem_stats_rollup/part-00001-03d43b04-1dda-472c-8601-e7a8914e6097.snappy.parquet
--------------------------------------------------------------------------------
/Spark-2.1/input/lines:
--------------------------------------------------------------------------------
1 | Note that this information is only available for the duration of the application by default. To view the web UI after the fact, set spark.eventLog.enabled to true before starting the application. This configures Spark to log Spark events that encode the information displayed in the UI to persisted storage.
--------------------------------------------------------------------------------
/Spark-2.1/input/nested.json:
--------------------------------------------------------------------------------
1 | {"queryResults":{"searchResponse":{"response":{"docs":[{"transactions":[{"recordDate":"2010-02-02 00:00:00","code":"PGM/","description":"Recordation of Patent Grant Mailed"},{"recordDate":"2010-01-13 00:00:00","code":"WPIR","description":"Issue Notification Mailed"},{"recordDate":"2009-12-17 00:00:00","code":"R1021","description":"Receipt into Pubs"}]}]}}}}
--------------------------------------------------------------------------------
/Spark-2.1/input/pbs.csv:
--------------------------------------------------------------------------------
1 | Name,Position Title,Department,Employee Annual Salary
2 | "AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
3 | "AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90745.00
4 | "AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
5 | "AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
6 | "AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
7 | "ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00
8 | "ABARCA, ANABEL",ASST TO THE ALDERMAN,CITY COUNCIL,$70764.00
9 | "ABARCA, EMMANUEL",GENERAL LABORER - DSS,STREETS & SAN,$41849.60
10 | "ABASCAL, REECE E",TRAFFIC CONTROL AIDE-HOURLY,OEMC,$20051.20
--------------------------------------------------------------------------------
/Spark-2.1/input/product:
--------------------------------------------------------------------------------
1 | 217,11,Fitness Gear 300 lb Olympic Weight Set,desc,209.99,http://images.acmesports.sports/Fitness+Gear+300+lb+Olympic+Weight+Set
2 | 218,11,Elevation Training Mask 2.0,,,http://images.acmesports.sports/Elevation+Training+Mask+2.0
3 | 219,11,Fitness Gear Pro Utility Bench,,179.99,http://images.acmesports.sports/Fitness+Gear+Pro+Utility+Bench
4 | 220,11,Teeter Hang Ups NXT-S Inversion Table,,299.99,http://images.acmesports.sports/Teeter+Hang+Ups+NXT-S+Inversion+Table
--------------------------------------------------------------------------------
/Spark-2.1/input/purchases.log:
--------------------------------------------------------------------------------
1 | Kolkata Central Avenue Groceries 233.65
2 |
3 | Kolkata Bowbazar Hair Care 198.99
4 | Bad data packet
5 | Kolkata Amherst Street Beverages 92.75
6 | Kolkata Amherst Street Beverages 0
7 | Kolkata Amherst Street Groceries 92.75
8 | Kolkata Amherst Street Beverages 92.75
9 | Kolkata Central Avenue 233.65
10 | Kolkata Amherst Street Hair Care 92.75
11 | Bad data packet
12 | Kolkata Bowbazar Groceries 198.99
13 | Kolkata Bowbazar Groceries 198.99
14 | Kolkata Bowbazar Hair Care 198.99
15 | Bad data packet
16 | Kolkata Bowbazar Groceries 198.99
17 | Kolkata Bowbazar Groceries 198.99
18 | Kolkata Bowbazar Beverages 198.99
19 | Kolkata Central Avenue Hair Care 0
20 | Bad data packet
21 | Kolkata Central Avenue Beverages 0
22 | Bad data packet
23 | Kolkata Central Avenue Groceries 233.65
24 | Kolkata Central Avenue Groceries 233.65
25 | Kolkata Central Avenue Beverages 233.65
--------------------------------------------------------------------------------
/Spark-2.1/input/schools.json:
--------------------------------------------------------------------------------
1 | {"name": "UC Berkeley", "yearFounded": 1868,"numStudents": 37581}
2 | {"name": "MIT", "yearFounded": 1860, "numStudents": 11318}
3 | {"name": "JNTU-A", "yearFounded": 1950,"numStudents": 37581}
4 | {"name": "BITIT", "yearFounded": 1999, "numStudents": 11318}
5 | {"name": "VIT", "yearFounded": 1900,"numStudents": 37581}
6 | {"name": "VTU", "yearFounded": 1900, "numStudents": 11318}
7 | {"name": "SRM", "yearFounded": 1968,"numStudents": 37581}
8 | {"name": "SASTRA", "yearFounded": 1990, "numStudents": 11318}
--------------------------------------------------------------------------------
/Spark-2.1/input/sw_hp_system_cpu_stats_records/date_month=2020-01/date_hour=2020-01-13-04/data.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-2.1/input/sw_hp_system_cpu_stats_records/date_month=2020-01/date_hour=2020-01-13-04/data.parquet
--------------------------------------------------------------------------------
/Spark-2.1/input/sw_hp_system_cpu_stats_records/date_month=2020-01/date_hour=2020-01-13-05/data.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-2.1/input/sw_hp_system_cpu_stats_records/date_month=2020-01/date_hour=2020-01-13-05/data.parquet
--------------------------------------------------------------------------------
/Spark-2.1/input/sw_hp_system_info_stats_records/date_month=2020-01/date_hour=2020-01-13-04/data.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-2.1/input/sw_hp_system_info_stats_records/date_month=2020-01/date_hour=2020-01-13-04/data.parquet
--------------------------------------------------------------------------------
/Spark-2.1/input/sw_hp_system_info_stats_records/date_month=2020-01/date_hour=2020-01-13-05/data.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-2.1/input/sw_hp_system_info_stats_records/date_month=2020-01/date_hour=2020-01-13-05/data.parquet
--------------------------------------------------------------------------------
/Spark-2.1/input/sw_hp_system_memory_stats_records/date_month=2020-01/date_hour=2020-01-13-04/data.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-2.1/input/sw_hp_system_memory_stats_records/date_month=2020-01/date_hour=2020-01-13-04/data.parquet
--------------------------------------------------------------------------------
/Spark-2.1/input/sw_hp_system_memory_stats_records/date_month=2020-01/date_hour=2020-01-13-05/data.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-2.1/input/sw_hp_system_memory_stats_records/date_month=2020-01/date_hour=2020-01-13-05/data.parquet
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/datadog/metrics/AbstractCaseClass.scala:
--------------------------------------------------------------------------------
1 | package com.datadog.metrics
2 |
3 | abstract class AbstractCaseClass
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/datadog/metrics/CaseClasses.scala:
--------------------------------------------------------------------------------
1 | package com.datadog.metrics
2 |
3 | object CaseClasses {
4 |
5 | //case classes for events and metrics construction
6 | case class SeriesList(series: List[Series]) extends AbstractCaseClass
7 | case class Series(metric: String, `type`: String, points: List[List[Long]], tags: List[String]) extends AbstractCaseClass
8 | case class Event(title: String, text: String, priority: String, alert_type: String, date_happened: Long, tags: List[String]) extends AbstractCaseClass
9 |
10 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/datadog/metrics/DataDogHttpTransport.scala:
--------------------------------------------------------------------------------
1 | package com.datadog.metrics
2 |
3 | import org.apache.http.HttpHost
4 | import org.apache.http.HttpResponse
5 | import org.apache.http.client.fluent.Request.Post
6 | import org.apache.http.client.fluent.Response
7 | import org.apache.http.entity.ContentType
8 | import org.apache.log4j.Logger
9 |
10 | /**
11 | * Uses the datadog http webservice to push metrics.
12 | *
13 | * @see API docs
14 | */
15 |
16 | class DataDogHttpTransport(apiKey: String,
17 | private val connectTimeout: Int,
18 | private val socketTimeout: Int,
19 | private val proxy: HttpHost,
20 | isMetrics: java.lang.Boolean)
21 | extends Transport {
22 |
23 | val logger: Logger = Logger.getLogger(classOf[DataDogHttpTransport])
24 |
25 | private val BASE_URL: String = "https://app.datadoghq.com/api/v1"
26 |
27 | /**
28 | * seriesUrl gets constructed based on the 'isMetrics' value
29 | */
30 | private val seriesUrl: String =
31 | if (isMetrics) String.format("%s/series?api_key=%s", BASE_URL, apiKey)
32 | else String.format("%s/events?api_key=%s", BASE_URL, apiKey)
33 |
34 | /**
35 | * This method is used to send Metrics/Events to DataDog.
36 | * @return httpResponseCode
37 | */
38 | def sendToDataDog(transport: DataDogHttpTransport, jsonData: String): Int = {
39 | val request: org.apache.http.client.fluent.Request =
40 | Post(transport.seriesUrl)
41 | .useExpectContinue()
42 | .connectTimeout(transport.connectTimeout)
43 | .socketTimeout(transport.socketTimeout)
44 | .bodyString(jsonData, ContentType.APPLICATION_JSON)
45 | if (transport.proxy != null) {
46 | request.viaProxy(transport.proxy)
47 | }
48 | val response: Response = request.execute()
49 | val httpResponse: HttpResponse = response.returnResponse()
50 | httpResponse.getStatusLine.getStatusCode
51 | }
52 |
53 | /**
54 | * This method is used to send the Json request.
55 | * @return httpResponseCode
56 | */
57 | def send(jsonData: String) = sendToDataDog(this, jsonData)
58 |
59 | def close(): Unit = {}
60 | }
61 |
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/datadog/metrics/MetricsCollector.scala:
--------------------------------------------------------------------------------
1 | package com.datadog.metrics
2 |
3 | /**
4 | * @author revanthreddy
5 | */
6 | trait MetricsCollector {
7 |
8 |
9 | /**
10 | * This method is used to send metrics to DataDog .
11 | */
12 | def sendMetrics(metricName: String, metricValue: Long, tags: collection.mutable.Map[String, Any])
13 |
14 | /**
15 | * This method is used to send events to DataDog.
16 | */
17 | def sendEvents(title: String, text: String, priority: String, alert_type: String, tags: collection.mutable.Map[String, Any])
18 |
19 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/datadog/metrics/MetricsCollectorFactory.scala:
--------------------------------------------------------------------------------
1 | package com.datadog.metrics
2 |
3 | /**
4 | * @author revanthreddy
5 | */
6 | object MetricsCollectorFactory {
7 |
8 | def getDatadogCollector(apikey: String, env: String): MetricsCollector = new DataDogCollector(apikey, env)
9 |
10 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/datadog/metrics/Spark_Accumulator.scala:
--------------------------------------------------------------------------------
1 | package com.datadog.metrics
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object Spark_Accumulator {
6 | def main(args: Array[String]) {
7 |
8 | val sparkSession = SparkSession.builder.
9 | master("local[1]")
10 | .appName("Spark_Accumulator_Metrics_To_DataDog")
11 | .getOrCreate()
12 |
13 | val sc = sparkSession.sparkContext
14 |
15 |     val accum = sc.longAccumulator("total.characters.counter")
16 | val words = sc.textFile("input/lines").flatMap(_.split(" "))
17 | words.foreach(w => accum.add(w.length))
18 |
19 | //setting the metrics tags
20 | var tags = collection.mutable.Map[String, Any]()
21 | tags.put("counter", accum.name.get)
22 | tags += ("class" -> getClass)
23 |     tags += ("count" -> accum.value, "accum name" -> "total.characters.counter")
24 |
25 |     //A DataDog API key needs to be generated by creating an account in DataDog
26 | val apiKey="947d12f46dead405bf019033434f0xxx"
27 | //initializing the metrics collector
28 | val metricsCollector = MetricsCollectorFactory.getDatadogCollector(apiKey, "dev")
29 |
30 | //sending accumulator values as metrics to DataDog
31 | metricsCollector.sendMetrics(accum.name.get, accum.value, null)
32 |
33 | val badRecords = sc.longAccumulator("bad.records.counter")
34 | val baddata = sc.textFile("input/badrecords").map(v => v.split(","))
35 | baddata.foreach(r => { try { r(2).toInt } catch { case e: NumberFormatException => badRecords.add(1) } })
36 |
37 | //sending accumulator values as metrics to DataDog
38 | metricsCollector.sendMetrics(badRecords.name.get, badRecords.value, tags)
39 |
40 | val acc = sc.longAccumulator("counter.test")
41 | val baddata1 = sc.textFile("input/badrecords").map(x => acc.add(1))
42 | baddata1.collect()
43 |
44 | //sending events to DataDog
45 | metricsCollector.sendEvents("Spark-Events", "Test Run", "normal", "info", tags)
46 |
47 | sc.stop()
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/datadog/metrics/Transport.scala:
--------------------------------------------------------------------------------
1 | package com.datadog.metrics
2 | import java.io.Closeable
3 |
4 |
5 | /**
6 | * The transport layer for pushing metrics to datadog
7 | */
8 | trait Transport extends Closeable {
9 |
10 | /**
11 | * Build a request context.
12 | */
13 | def send(jsonData: String): Int
14 |
15 | /**
16 | * Send the request to datadog
17 | */
18 | def sendToDataDog(transport: DataDogHttpTransport,jsonData: String): Int
19 |
20 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/aws/Spark_AWS_S3.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.aws
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.SaveMode
5 |
6 | object Spark_AWS_S3 extends App {
7 | case class Employee(empid: Int, name: String, dept: String, salary: Double, nop: Int, dttime: String)
8 |
9 | val spark = SparkSession.builder().appName("Spark_AWS_S3").master("local[1]").getOrCreate()
10 | val sc = spark.sparkContext
11 |
12 | sc.hadoopConfiguration.set("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
13 | sc.hadoopConfiguration.set("fs.s3n.awsAccessKeyId", "xxxxxxxxxx")
14 | sc.hadoopConfiguration.set("fs.s3n.awsSecretAccessKey", "xxxxxxxxxxxx")
15 |
16 | import spark.implicits._
17 |
18 | val empDF = Seq(
19 | Employee(123, "revanth", "cloud", 1000, 2, "07-06-2016-06-08-27"),
20 | Employee(124, "shyam", "finance", 3000, 2, "07-06-2016-06-08-27"),
21 | Employee(125, "hari", "TAG", 6000, 2, "07-06-2016-06-08-27"),
22 | Employee(126, "kiran", "cloud", 2000, 2, "08-06-2016-07-08-27"),
23 | Employee(127, "nandha", "sales", 1000, 2, "08-06-2016-07-08-27"),
24 | Employee(128, "pawan", "cloud", 1000, 2, "08-06-2016-07-08-27"),
25 | Employee(129, "kalyan", "conectivity", 1000, 2, "09-06-2016-08-08-27"),
26 | Employee(121, "satish", "finance", 1000, 2, "09-06-2016-08-08-27"),
27 | Employee(131, "arun", "cloud", 1000, 2, "09-06-2016-08-08-27"),
28 | Employee(132, "ram", "cloud", 1000, 2, "10-06-2016-08-08-27"),
29 | Employee(133, "suda", "conectivity", 1000, 2, "10-06-2016-08-08-27"),
30 | Employee(134, "sunder", "sales", 1000, 2, "10-06-2016-08-08-27"),
31 | Employee(135, "charan", "TAG", 1000, 2, "12-06-2016-08-08-27"),
32 | Employee(136, "ravi", "TAG", 1000, 2, "11-06-2016-08-08-27"),
33 | Employee(137, "arjun", "cloud", 1000, 2, "11-06-2016-08-08-27")).toDF()
34 |
35 | empDF.coalesce(1).write.format("org.apache.spark.sql.json").mode(SaveMode.Append).save("s3n://snanpsat/emp")
36 |
37 | val empS3DF = spark.read.json("s3n://snanpsat/emp")
38 | empS3DF.printSchema()
39 |
40 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/cassandra/ChangeDFTypes.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.cassandra
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.types.LongType
5 |
6 | object ChangeDFTypes extends App {
7 |
8 | val spark = SparkSession.builder().appName("ChangeDFTypes-Job").master("local[1]")
9 | .config("spark.cassandra.connection.host", "127.0.0.1").getOrCreate()
10 |
11 | var testDF = (spark.read.format("org.apache.spark.sql.cassandra")
12 | .options(Map("table" -> "test", "keyspace" -> "dev"))
13 | .load)
14 |
15 | println("schema and data before conversion....")
16 | testDF.printSchema()
17 | testDF.show(3, false)
18 |
19 | val newTestDF = testDF.dtypes
20 |
21 | //converting all the timestamp columns in the dataframe to long type
22 | newTestDF.foreach { f =>
23 | val columnName = f._1
24 | val columnType = f._2
25 |
26 | if (columnType.equals("TimestampType")) {
27 | testDF = testDF.withColumn(columnName, testDF(columnName).cast(LongType))
28 |
29 | }
30 | }
31 | println("schema and data after conversion....")
32 | testDF.printSchema()
33 | testDF.show(3, false)
34 | }
35 | //CREATE TABLE TEST (ID TEXT, NAME TEXT, VALUE TEXT, LAST_MODIFIED_DATE TIMESTAMP,CREATED_DATE timestamp, PRIMARY KEY (ID));
36 | //INSERT INTO TEST (ID, NAME, VALUE, LAST_MODIFIED_DATE,CREATED_DATE) VALUES ('1', 'orange', 'fruit', toTimestamp(now()),toTimestamp(now()));
37 | //INSERT INTO TEST (ID, NAME, VALUE, LAST_MODIFIED_DATE,CREATED_DATE) VALUES ('2', 'elephant', 'animal', toTimestamp(now()),toTimestamp(now()));
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/cassandra/ConvetTimestampToLong.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.cassandra
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.types.LongType
5 | import org.apache.spark.sql.functions.{ col, udf }
6 | import java.sql.Timestamp
7 | object ConvetTimestampToLong extends App {
8 |
9 | val spark = SparkSession.builder().appName("ConvetTimestampToLong-Job").master("local[1]")
10 | .config("spark.cassandra.connection.host", "127.0.0.1").getOrCreate()
11 |
12 | var testDF = (spark.read.format("org.apache.spark.sql.cassandra")
13 | .options(Map("table" -> "test", "keyspace" -> "dev"))
14 | .load)
15 |
16 | println("schema and data before conversion....")
17 | testDF.printSchema()
18 | testDF.show(3, false)
19 |
20 | /* convert Spark Timestamp column type to Long in epoch-msecs */
21 | protected val udfTimestampToEpochMsecLong = udf { (ts: Timestamp) =>
22 | Option(ts) match {
23 | case Some(ts) => Some(ts.getTime())
24 | case _ => None
25 | }
26 | }
27 |
28 | val newTestDF = testDF.dtypes
29 |
30 | //converting all the timestamp columns in the dataframe to long type
31 | newTestDF.foreach { f =>
32 | val columnName = f._1
33 | val columnType = f._2
34 | /* for consistency with Parquet schema, convert it to Long (in epoch-millisecs).
35 | * -> Note: DO NOT directly cast to long, that returns epoch-seconds, which is 3 digits shorter! */
36 | if (columnType.equals("TimestampType")) {
37 | testDF = testDF.withColumn(columnName, udfTimestampToEpochMsecLong(col(columnName)))
38 | }
39 | }
40 | println("schema and data after conversion....")
41 | testDF.printSchema()
42 | testDF.show(3, false)
43 | }
44 | //CREATE TABLE TEST (ID TEXT, NAME TEXT, VALUE TEXT, LAST_MODIFIED_DATE TIMESTAMP,CREATED_DATE timestamp, PRIMARY KEY (ID));
45 | //INSERT INTO TEST (ID, NAME, VALUE, LAST_MODIFIED_DATE,CREATED_DATE) VALUES ('1', 'orange', 'fruit', toTimestamp(now()),toTimestamp(now()));
46 | //INSERT INTO TEST (ID, NAME, VALUE, LAST_MODIFIED_DATE,CREATED_DATE) VALUES ('2', 'elephant', 'animal', toTimestamp(now()),toTimestamp(now()));
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/cassandra/FilterCassandraData.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.cassandra
2 | import org.apache.spark.sql.SparkSession
3 |
4 | object FilterCassandraData extends App {
5 |
6 | case class Employee(id: Int, name: String, salary: Int)
7 |
8 |   val spark = SparkSession.builder().appName("Spark_To_Cassandra").master("local[1]").getOrCreate()
9 | import spark.implicits._
10 |
11 | spark.conf.set("spark.sql.shuffle.partitions", "2")
12 | spark.conf.set("spark.cassandra.connection.host", "127.0.0.1")
13 |
14 | val KEY_SPACE_NAME = "dev"
15 | val TABLE_NAME = "employee"
16 | val TABLE_NAME1 = "master_collection1"
17 |
18 | //loading data from cassandra table
19 | val df = spark.read.format("org.apache.spark.sql.cassandra").option("table", TABLE_NAME)
20 | .option("keyspace", KEY_SPACE_NAME)
21 | .load()
22 | // df.printSchema()
23 | // df.show()
24 |
25 | val masterdf = spark.read.format("org.apache.spark.sql.cassandra").option("table", TABLE_NAME1)
26 | .option("keyspace", KEY_SPACE_NAME)
27 | .load()
28 | val tidfiltDF = masterdf.select("id").where(masterdf("disable") === "0")
29 | tidfiltDF.show()
30 | val tidList = tidfiltDF.select("id").map(r => r.getInt(0)).collect.toList
31 | val filt = tidList.mkString("id in (", ",", ")")
32 | println(filt)
33 |
34 | val finalfildf = df.filter(filt)
35 | finalfildf.show()
36 | finalfildf.select("id").distinct.show()
37 |
38 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/cassandra/SparkStreaming_Cassandra.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.cassandra
2 |
3 | import scala.reflect.runtime.universe
4 |
5 | import org.apache.spark.sql.SparkSession
6 | import org.apache.spark.streaming.Seconds
7 | import org.apache.spark.streaming.StreamingContext
8 | import org.apache.spark.streaming.dstream.ConstantInputDStream
9 | import com.datastax.spark.connector.streaming.toStreamingContextFunctions
10 | import com.datastax.spark.connector.toNamedColumnRef
11 |
12 | /**
13 | * Reading from Cassandra using Spark Streaming
14 | */
15 | object SparkStreaming_Cassandra extends App {
16 | case class Employee(id: Int, name: String, salary: Int)
17 |
18 | val spark = SparkSession.builder().appName("Spark_Streaming_Cassandra").master("local[*]").getOrCreate()
19 |
20 | spark.conf.set("spark.sql.shuffle.partitions", "2")
21 | spark.conf.set("spark.cassandra.connection.host", "127.0.0.1")
22 |
23 | val KEY_SPACE_NAME = "dev"
24 | val TABLE_NAME = "employee"
25 |
26 | val ssc = new StreamingContext(spark.sparkContext, Seconds(10))
27 | val cassandraRDD = ssc.cassandraTable[Employee](KEY_SPACE_NAME, TABLE_NAME).select("id", "name", "salary")
28 |
29 | val dstream = new ConstantInputDStream(ssc, cassandraRDD)
30 |
31 | dstream.foreachRDD { rdd =>
32 | println("Total Records cont in DB : " + rdd.count)
33 |
34 | println(rdd.collect.mkString("\n"))
35 | }
36 |
37 | ssc.start()
38 | ssc.awaitTermination()
39 |
40 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/cassandra/Writting_DF_To_Cassandra.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.cassandra
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.SaveMode
5 |
6 | object Writting_DF_To_Cassandra extends App {
7 |
8 | case class Emp(id: Int, name: String, salary: Int)
9 |   val spark = SparkSession.builder().appName("Spark_To_Cassandra").master("local[1]").getOrCreate()
10 |
11 | spark.conf.set("spark.sql.shuffle.partitions", "2")
12 | spark.conf.set("spark.cassandra.connection.host", "127.0.0.1")
13 |
14 | val KEY_SPACE_NAME = "dev"
15 | val TABLE_NAME = "employee"
16 |
17 | val emps = List(
18 | Emp(1, "Mike", 1032230),
19 | Emp(2, "Shyam", 1322200),
20 | Emp(3, "Revanth", 2223300),
21 | Emp(4, "Raghu", 2773666),
22 | Emp(5, "naga", 2002233),
23 | Emp(6, "siva", 2773666))
24 |
25 | val empDF = spark.createDataFrame(emps)
26 |
27 |
28 | empDF.write.format("org.apache.spark.sql.cassandra").option("table", TABLE_NAME)
29 | .option("keyspace", KEY_SPACE_NAME).mode(SaveMode.Append).save()
30 |
31 | println("done .......")
32 | }
33 |
34 | //CREATE TABLE dev.employee (
35 | // id int PRIMARY KEY,
36 | // name text,
37 | // salary int
38 | //);
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/cassandra/export/CassandraYaml.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.cassandra.export
2 |
3 | import scala.beans.BeanProperty
4 | class CassandraYaml {
5 | @BeanProperty var cassandra_table_export = new java.util.ArrayList[YamlProps]()
6 | }
7 |
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/cassandra/export/Utils.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.cassandra.export
2 |
3 | object Utils {
4 |
5 | /**
6 | * Method to parse the input arguments
7 | * @param args: Array[String]
8 | * @return java.util.HashMap[String, String]
9 | */
10 | def argsParser(args: Array[String]): java.util.HashMap[String, String] = {
11 | val result = new java.util.HashMap[String, String]()
12 | var index = 0
13 | for (arg <- args) {
14 | index += 1
15 | val trimmedArg = arg.trim()
16 | if (trimmedArg.startsWith("--")) {
17 | val key = trimmedArg.replaceAll("--", "")
18 | if (index < args.length) {
19 | val value = args(index).trim()
20 | result.put(key, value)
21 | }
22 | }
23 | }
24 | result
25 | }
26 |
27 | /**
28 | * This method is used to parse the timeStamp(2017-09-26 05:00:00.0)
29 | * @param String: timeStamp
30 | * @return String: 2017-09
31 | */
32 | val setDateMonth: (String) => String = (timeStamp: String) => {
33 | var date_hour_list = timeStamp.split(" ")
34 | var date = date_hour_list(0)
35 | var month = date.split("-")
36 | month(0) + "-" + month(1)
37 | }
38 |
39 | /**
40 | * This method is used to parse the timeStamp(2017-09-26 05:00:00.0)
41 | * @param String: timeStamp
42 | * @return String: 2017-09-26-05
43 | */
44 | val setDateHour: (String) => String = (timeStamp: String) => {
45 | var date_hour_list = timeStamp.split(" ")
46 | var date = date_hour_list(0)
47 |         // unlike setDateMonth, only the date and the hour are needed here,
48 |         // so the time portion is split below to pick out the hour
49 | var hour_min_sec = date_hour_list(1).split(":")
50 | date + "-" + hour_min_sec(0)
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/cassandra/export/YamlProps.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.cassandra.export
2 |
3 | import scala.beans.BeanProperty
4 |
5 | class YamlProps {
6 | @BeanProperty var table_name = ""
7 | @BeanProperty var keyspace = ""
8 | @BeanProperty var output_location = ""
9 | @BeanProperty var duration_in_hour = ""
10 | }
11 |
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/cassandra/export/cassandra-table-export.yml:
--------------------------------------------------------------------------------
1 | # Configurations to backup cassandra tables
2 | cassandra_table_export:
3 |
4 | - table_name : insight
5 | keyspace: demo_database
6 | output_location: backup/cassandra/parquet_insight
7 | duration_in_hour: 24
8 |
9 | - table_name : insight_rc_data
10 | keyspace: demo_database
11 | output_location: backup/cassandra/parquet_insight_rc_data
12 | duration_in_hour: 24
13 |
14 | - table_name : insight_rc_data_count
15 | keyspace: demo_database
16 | output_location: backup/cassandra/parquet_insight_rc_data_count
17 | duration_in_hour: 24
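For reference, a minimal sketch of how a config like the one above can be bound to the `CassandraYaml`/`YamlProps` beans. It assumes SnakeYAML is on the classpath and a path relative to the module root; the repository's actual loader is not shown in this excerpt.

```scala
package com.spark2.cassandra.export

import java.io.FileInputStream
import org.yaml.snakeyaml.Yaml
import scala.collection.JavaConverters._

object ExportConfigLoaderSketch extends App {
  // path is relative to the Spark-2.1 module root (an assumption for this sketch)
  val in = new FileInputStream("src/main/scala/com/spark2/cassandra/export/cassandra-table-export.yml")
  try {
    // bind the YAML document to the CassandraYaml / YamlProps beans
    val config = new Yaml().loadAs(in, classOf[CassandraYaml])

    // each entry describes one table to back up
    config.cassandra_table_export.asScala.foreach { p =>
      println(s"${p.keyspace}.${p.table_name} -> ${p.output_location} (every ${p.duration_in_hour}h)")
    }
  } finally {
    in.close()
  }
}
```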
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/custom/CustomPartitioner.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.custom
2 |
3 | import org.apache.spark.rdd.RDD
4 | import org.apache.spark.{ Partitioner, SparkContext, SparkConf }
5 | import org.apache.spark.SparkContext._
6 |
7 | class SpecialPartitioner extends Partitioner {
8 | def numPartitions = 10
9 |
10 | def getPartition(key: Any): Int = {
11 | key match {
12 | case (x, y: Int, z) => y % numPartitions
13 | case _ => throw new ClassCastException
14 | }
15 | }
16 | }
17 |
18 | object CustomPartitioner {
19 | def analyze[T](r: RDD[T]): Unit = {
20 | val partitions = r.glom()
21 |     println(partitions.count() + " partitions")
22 |
23 | // use zipWithIndex() to see the index of each partition
24 | // we need to loop sequentially so we can see them in order: use collect()
25 | partitions.zipWithIndex().collect().foreach {
26 | case (a, i) => {
27 | println("Partition " + i + " contents (count " + a.count(_ => true) + "):" +
28 | a.foldLeft("")((e, s) => e + " " + s))
29 | }
30 | }
31 | }
32 |
33 | def main(args: Array[String]) {
34 | val conf = new SparkConf().setAppName("Streaming").setMaster("local[4]")
35 | val sc = new SparkContext(conf)
36 |
37 | val triplets =
38 | for (x <- 1 to 3; y <- 1 to 20; z <- 'a' to 'd')
39 | yield ((x, y, z), x * y)
40 |
41 | // Spark has the good sense to use the first tuple element
42 | // for range partitioning, but for this data-set it makes a mess
43 | val defaultRDD = sc.parallelize(triplets, 10)
44 | println("with default partitioning")
45 | analyze(defaultRDD)
46 |
47 |     // our custom partitioner uses the second tuple element
48 | val deliberateRDD = defaultRDD.partitionBy(new SpecialPartitioner())
49 | println("with deliberate partitioning")
50 | analyze(deliberateRDD)
51 |
52 | }
53 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/custom/HashJoin.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.custom
2 |
3 | import org.apache.spark.rdd.RDD
4 | import org.apache.spark.{ SparkContext, SparkConf }
5 |
6 | import scala.collection.mutable
7 |
8 | // This gives us access to the PairRDDFunctions
9 | import org.apache.spark.SparkContext._
10 |
11 | // encapsulate a small sequence of pairs to be joined with pair RDDs --
12 | // making this serializable effectively allows the hash table to be
13 | // broadcast to each worker
14 | // Reference: http://en.wikipedia.org/wiki/Hash_join
15 | // (this is specifically an inner equi-join on pairs)
16 | class HashJoiner[K, V](small: Seq[(K, V)]) extends java.io.Serializable {
17 |
18 | // stash it as a hash table, remembering that the keys may not be unique,
19 | // so we need to collect values for each key in a list
20 | val m = new mutable.HashMap[K, mutable.ListBuffer[V]]()
21 | small.foreach {
22 | case (k, v) => if (m.contains(k)) m(k) += v else m(k) = mutable.ListBuffer(v)
23 | }
24 |
25 | // when joining the RDD, remember that each key in it may or may not have
26 | // a matching key in the array, and we need a result tuple for each value
27 | // in the list contained in the corresponding hash table entry
28 | def joinOnLeft[U](large: RDD[(K, U)]): RDD[(K, (U, V))] = {
29 | large.flatMap {
30 | case (k, u) =>
31 | m.get(k).flatMap(ll => Some(ll.map(v => (k, (u, v))))).getOrElse(mutable.ListBuffer())
32 | }
33 | }
34 | }
35 |
36 | object HashJoin {
37 | def main(args: Array[String]) {
38 | val conf = new SparkConf().setAppName("HashJoin").setMaster("local[4]")
39 | val sc = new SparkContext(conf)
40 |
41 | val smallRDD = sc.parallelize(
42 | Seq((1, 'a'), (1, 'c'), (2, 'a'), (3, 'x'), (3, 'y'), (4, 'a')),
43 | 4)
44 |
45 | val largeRDD = sc.parallelize(
46 | for (x <- 1 to 10000) yield (x % 4, x),
47 | 4)
48 |
49 | // simply joining the two RDDs will be slow as it requires
50 | // lots of communication
51 | val joined = largeRDD.join(smallRDD)
52 | joined.collect().foreach(println)
53 |
54 |     // If the smaller RDD is small enough we're better off with it not
55 | // being an RDD -- and we can implement a hash join by hand, effectively
56 | // broadcasting the hash table to each worker
57 | println("hash join result")
58 | // NOTE: it may be tempting to use "collectAsMap" below instead of "collect",
59 | // and simplify the joiner accordingly, but that only works if the keys
60 | // are unique
61 | val joiner = new HashJoiner(smallRDD.collect())
62 | val hashJoined = joiner.joinOnLeft(largeRDD)
63 | hashJoined.collect().foreach(println)
64 |
65 | }
66 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/custom/SemiStructuredUtilUDF.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.custom
2 |
3 | import org.apache.spark.sql.{ Row, SQLContext, SparkSession }
4 | import org.apache.spark.{ SparkConf, SparkContext }
5 |
6 | import scala.collection.mutable.ArrayBuffer
7 |
8 | //
9 | // Show various ways to query in SQL using user-defined functions UDFs.
10 | //
11 |
12 | object SemiStructuredUtilUDF {
13 |
14 | def isAtomic(o: AnyRef): Boolean = {
15 | o match {
16 | case l: ArrayBuffer[_] => false
17 | case _ => true
18 | }
19 | }
20 |
21 | def isString(o: AnyRef): Boolean = {
22 | o match {
23 | case s: String => true
24 | case _ => false
25 | }
26 | }
27 |
28 | //def isInt(o:AnyRef) : Boolean = {
29 | // o match {
30 | // case i:Int => true
31 | // case _ => false
32 | // }
33 | //}
34 |
35 | def isArray(o: AnyRef): Boolean = {
36 | o match {
37 | case l: ArrayBuffer[_] => true
38 | case _ => false
39 | }
40 | }
41 |
42 | def arrayLength(o: AnyRef): Int = {
43 | o match {
44 | case l: ArrayBuffer[_] => l.size
45 | case null => 0
46 | case _ => 1
47 | }
48 | }
49 |
50 | def isStruct(o: AnyRef): Boolean = {
51 | o match {
52 | case r: Row => true
53 | case _ => false
54 | }
55 | }
56 |
57 | def arrayContains(a: AnyRef, v: AnyRef): Boolean = {
58 | a match {
59 | case l: ArrayBuffer[_] => l.contains(v)
60 | case _ => false
61 | }
62 | }
63 |
64 | def struct(a: AnyRef): Boolean = {
65 | println("hello")
66 | true
67 | }
68 |
69 | def main(args: Array[String]) {
70 | val spark =
71 | SparkSession.builder()
72 | .appName("Custom")
73 | .master("local[4]")
74 | .getOrCreate()
75 |
76 | val transactions = spark.read.json("src/main/resources/data/mixed.json")
77 | transactions.printSchema()
78 | transactions.createOrReplaceTempView("transactions")
79 |
80 | spark.udf.register("struct", struct _)
81 |
82 | val all =
83 | spark.sql("SELECT a, id, struct(address) FROM transactions")
84 | all.foreach(r => println(r))
85 |
86 | spark.udf.register("isAtomic", isAtomic _)
87 | spark.udf.register("arrayLength", arrayLength _)
88 |
89 | val lotsOfOrders =
90 | spark.sql("SELECT id FROM transactions WHERE arrayLength(orders) > 2")
91 | //lotsOfOrders.foreach(println)
92 | }
93 |
94 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/dataframes/DataFrame_DropDuplicates.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.dataframes
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object DataFrame_DropDuplicates {
6 | def main(args: Array[String]) {
7 | val spark =
8 | SparkSession.builder()
9 | .appName("DataFrame-DropDuplicates")
10 | .master("local[4]")
11 | .getOrCreate()
12 |
13 | import spark.implicits._
14 |
15 | // create an RDD of tuples with some data
16 | val custs = Seq(
17 | (1, "Widget Co", 120000.00, 0.00, "AZ"),
18 | (2, "Acme Widgets", 410500.00, 500.00, "CA"),
19 | (3, "Widgetry", 410500.00, 200.00, "CA"),
20 | (4, "Widgets R Us", 410500.00, 0.0, "CA"),
21 | (3, "Widgetry", 410500.00, 200.00, "CA"),
22 | (5, "Ye Olde Widgete", 500.00, 0.0, "MA"),
23 | (6, "Widget Co", 12000.00, 10.00, "AZ"))
24 | val customerRows = spark.sparkContext.parallelize(custs, 4)
25 |
26 | // convert RDD of tuples to DataFrame by supplying column names
27 | val customerDF = customerRows.toDF("id", "name", "sales", "discount", "state")
28 |
29 | println("*** Here's the whole DataFrame with duplicates")
30 |
31 | customerDF.printSchema()
32 |
33 | customerDF.show()
34 |
35 | // drop fully identical rows
36 | val withoutDuplicates = customerDF.dropDuplicates()
37 |
38 | println("*** Now without duplicates")
39 |
40 | withoutDuplicates.show()
41 |
42 |     // drop rows that duplicate the combination of name and state
43 | val withoutPartials = customerDF.dropDuplicates(Seq("name", "state"))
44 |
45 | println("*** Now without partial duplicates too")
46 |
47 | withoutPartials.show()
48 |
49 | }
50 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/dataframes/DateTime.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.dataframes
2 |
3 | import java.sql.{ Date, Timestamp }
4 |
5 | import org.apache.spark.sql.{ Row, SparkSession }
6 | import org.apache.spark.sql.types._
7 | import org.apache.spark.sql.functions._
8 |
9 | //
10 | // Functions for querying against columns of DateType and TimestampType in
11 | // a DataFrame.
12 | //
13 | object DateTime {
14 | def main(args: Array[String]) {
15 | val spark =
16 | SparkSession.builder()
17 | .appName("DataFrame-DateTime")
18 | .master("local[4]")
19 | .getOrCreate()
20 |
21 | import spark.implicits._
22 |
23 | val schema = StructType(
24 | Seq(
25 | StructField("id", IntegerType, true),
26 | StructField("dt", DateType, true),
27 | StructField("ts", TimestampType, true)))
28 | val rows = spark.sparkContext.parallelize(
29 | Seq(
30 | Row(
31 | 1,
32 | Date.valueOf("1999-01-11"),
33 | Timestamp.valueOf("2011-10-02 09:48:05.123456")),
34 | Row(
35 | 1,
36 | Date.valueOf("2004-04-14"),
37 | Timestamp.valueOf("2011-10-02 12:30:00.123456")),
38 | Row(
39 | 1,
40 | Date.valueOf("2008-12-31"),
41 | Timestamp.valueOf("2011-10-02 15:00:00.123456"))), 4)
42 | val tdf = spark.createDataFrame(rows, schema)
43 |
44 | println("DataFrame with both DateType and TimestampType")
45 | tdf.show()
46 |
47 | println("Pull a DateType apart when querying")
48 | tdf.select($"dt", year($"dt"), quarter($"dt"), month($"dt"),
49 | weekofyear($"dt"), dayofyear($"dt"), dayofmonth($"dt")).show()
50 |
51 | println("Date arithmetic")
52 | tdf.select($"dt", datediff(current_date(), $"dt"),
53 | date_sub($"dt", 20),
54 | date_add($"dt", 10),
55 | add_months($"dt", 6)).show()
56 |
57 | println("Date truncation")
58 | tdf.select($"dt", trunc($"dt", "YYYY"), trunc($"dt", "YY"),
59 | trunc($"dt", "MM")).show()
60 |
61 | println("Date formatting")
62 |     tdf.select($"dt", date_format($"dt", "MMM dd, yyyy")).show() // yyyy, not YYYY (week-based year)
63 |
64 | println("Pull a Timestamp type apart when querying")
65 | tdf.select($"ts", year($"ts"), hour($"ts"), minute($"ts"), second($"ts")).show()
66 | }
67 |
68 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/dataframes/DropColumns.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.dataframes
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object DropColumns {
6 | def main(args: Array[String]) {
7 | val spark =
8 | SparkSession.builder()
9 | .appName("DataFrame-DropColumns")
10 | .master("local[4]")
11 | .getOrCreate()
12 |
13 | import spark.implicits._
14 |
15 | // create an RDD of tuples with some data
16 | val custs = Seq(
17 | (1, "Widget Co", 120000.00, 0.00, "AZ"),
18 | (2, "Acme Widgets", 410500.00, 500.00, "CA"),
19 | (3, "Widgetry", 410500.00, 200.00, "CA"),
20 | (4, "Widgets R Us", 410500.00, 0.0, "CA"),
21 | (5, "Ye Olde Widgete", 500.00, 0.0, "MA"))
22 | val customerRows = spark.sparkContext.parallelize(custs, 4)
23 |
24 | // convert RDD of tuples to DataFrame by supplying column names
25 | val customerDF = customerRows.toDF("id", "name", "sales", "discount", "state")
26 |
27 | println("*** Here's the whole DataFrame")
28 |
29 | customerDF.printSchema()
30 |
31 | customerDF.show()
32 |
33 | // remove a couple of columns
34 | val fewerCols = customerDF.drop("sales").drop("discount")
35 |
36 | println("*** Now with fewer columns")
37 |
38 | fewerCols.printSchema()
39 |
40 | fewerCols.show()
41 |
42 | }
43 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/dataframes/GenerateUniqueId.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.dataframes
2 | import org.apache.spark.sql.functions._
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.Row
5 |
6 | object GenerateUniqueId {
7 | def main(args: Array[String]): Unit = {
8 |
9 | val session = SparkSession.builder().appName("GenerateUniqueId").master("local[1]").getOrCreate()
10 | val sc = session.sparkContext
11 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
12 | import session.implicits._
13 | val df = sc.parallelize(Seq(("Databricks", 20000), ("Spark", 100000), ("Hadoop", 3000))).toDF("word", "count")
14 |
15 |     //Option 1 => Using monotonicallyIncreasingId or zipWithUniqueId
16 | df.withColumn("uniqueID", monotonicallyIncreasingId).show()
17 |
18 | import org.apache.spark.sql.types.{ StructType, StructField, LongType }
19 | val df1 = sc.parallelize(Seq(("Databricks", 20000), ("Spark", 100000), ("Hadoop", 3000))).toDF("word", "count")
20 | val wcschema = df1.schema
21 | val inputRows = df1.rdd.zipWithUniqueId.map {
22 | case (r: Row, id: Long) => Row.fromSeq(id +: r.toSeq)
23 | }
24 | val wcID = sqlContext.createDataFrame(inputRows, StructType(StructField("id", LongType, false) +: wcschema.fields))
25 |
26 | wcID.show()
27 |
28 | //Option 2 => Use Row_Number Function
29 |
30 | //With PartitionBy Column:
31 |
32 | val df2 = sc.parallelize(Seq(("Databricks", 20000), ("Spark", 100000), ("Hadoop", 3000))).toDF("word", "count")
33 | df2.createOrReplaceTempView("wordcount")
34 | val tmpTable = sqlContext.sql("select row_number() over (partition by word order by count) as rnk,word,count from wordcount")
35 | tmpTable.show()
36 |
37 | //Without PartitionBy Column:
38 | val tmpTable1 = sqlContext.sql("select row_number() over (order by count) as rnk,word,count from wordcount")
39 | tmpTable1.show()
40 | }
41 |
42 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/dataframes/HDFSFilesList.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.dataframes
2 |
3 | import org.apache.hadoop.conf.Configuration
4 | import org.apache.hadoop.fs.FileStatus
5 | import org.apache.hadoop.fs.FileSystem
6 | import org.apache.hadoop.fs.FileUtil
7 | import org.apache.hadoop.fs.Path
8 | import org.apache.spark.sql.SparkSession
9 |
10 | object HDFSFilesList {
11 | private val conf = new Configuration()
12 | val fs = FileSystem.get(conf)
13 | val uri = conf.get("fs.default.name")
14 |
15 | def main(args: Array[String]) {
16 |
17 | val spark = SparkSession.builder().appName("Spark-Read-HDFS-Folders").master("local[*]")
18 | .getOrCreate()
19 |
20 | //Hdfs folder path
21 | var DATA_PATH = args(0)
22 |
23 | //No of Hdfs folders to read
24 | val intervalCount = 1
25 |
26 | var fileStatus: Array[FileStatus] = fs.listStatus(new Path(uri + DATA_PATH))
27 | var paths: Array[Path] = FileUtil.stat2Paths(fileStatus)
28 |
29 | var filesWithInterval = getHDFSFoldersBasedOnModtime(intervalCount, fileStatus)
30 |
31 | if (fileStatus != null && filesWithInterval.length >= intervalCount) {
32 | val dataframeArray = filesWithInterval.map(folder => {
33 | println(folder.getPath.toString)
34 | val path = folder.getPath.toString
35 | //spark.read.parquet(folder.getPath.toString)
36 | })
37 | }
38 |
39 | }
40 |
41 | //get the folders from HDFS based on the count provided.
42 | def getHDFSFoldersBasedOnModtime(intervalCount: Int, fileStatus: Array[FileStatus]): Array[FileStatus] = {
43 | var sortedList: List[FileStatus] = fileStatus.toList.sortWith(_.getModificationTime > _.getModificationTime)
44 | var returnList: List[FileStatus] = List()
45 | var itr: Int = 0
46 | var iterator = sortedList.iterator
47 | while (iterator.hasNext) {
48 | var value = iterator.next()
49 | if (itr < intervalCount) {
50 | returnList = returnList.::(value)
51 | itr = itr + 1
52 | }
53 | }
54 | returnList.toArray
55 | }
56 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/dataframes/HandlingNulls.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.dataframes
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object HandlingNulls {
6 |
7 | val spark = SparkSession.builder().appName("Handling-Nulls").master("local[*]")
8 | .getOrCreate()
9 |
10 | case class employee(
11 | employee_id: Int, first_name: String, last_name: String,
12 | email: String, phone_number: String, hire_date: String,
13 | job_id: String, salary: Float, commission_pct: Float,
14 | manager_id: Int, department_id: Int)
15 |
16 |   private def checkNullForFloat(value: String): Float = {
17 |     // treat "null" and empty values as 0, otherwise parse the field
18 |     if ("null".equals(value) || "".equals(value)) {
19 |       return 0f;
20 |     }
21 |     return value.toFloat;
22 |   }
23 | 
24 |   private def checkNullForInt(value: String): Int = {
25 |     // treat "null" and empty values as 0, otherwise parse the field
26 |     if ("null".equals(value) || "".equals(value)) {
27 |       return 0;
28 |     }
29 |     return value.toInt;
30 |   }
31 | 
32 | def main(args: Array[String]): Unit = {
33 |
34 | val employeeData = spark.sparkContext.textFile("input/emp.txt")
35 | import spark.implicits._
36 |
37 | val employeeDF = employeeData.map(
38 | rec => {
39 | var d = rec.split(",")
40 | employee(d(0).toInt,
41 | d(1), d(2), d(3), d(4), d(5), d(6), d(7).toFloat,
42 | checkNullForFloat(d(8)),
43 | checkNullForInt(d(9)),
44 | checkNullForInt(d(10)))
45 |
46 | }).toDF()
47 |
48 |
49 | //or another way of filtering null columns
50 | val employeeDF1 = employeeData.map(
51 | rec => {
52 | var d = rec.split(",")
53 | employee(d(0).toInt,
54 | d(1), d(2), d(3), d(4), d(5), d(6), d(7).toFloat,
55 | if (d(8).asInstanceOf[Any] != "null") d(8).toFloat else 0F,
56 | if (d(9).asInstanceOf[Any] != "null") d(9).toInt else 0,
57 | if (d(10).asInstanceOf[Any] != "null") d(10).toInt else 0)
58 | }).toDF()
59 |
60 | employeeDF.show()
61 | }
62 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/dataframes/PartitionBy.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.dataframes
2 |
3 | import org.apache.spark.sql.SparkSession
4 | object PartitionBy {
5 |
6 | case class Purchase(customer_id: Int, purchase_id: Int, date: String, time: String, tz: String, amount: Double)
7 |
8 | def main(args: Array[String]): Unit = {
9 |
10 | val session = SparkSession.builder().appName("Spark-PartitionBy").master("local[1]").getOrCreate()
11 |
12 | import session.implicits._
13 | val purchaseDF = List(
14 | Purchase(121, 234, "2017-04-19", "20:50", "UTC", 500.99),
15 | Purchase(122, 247, "2017-04-19", "15:30", "PST", 300.22),
16 | Purchase(123, 254, "2017-04-19", "00:50", "EST", 122.19),
17 | Purchase(124, 234, "2017-04-19", "20:50", "UTC", 500.99),
18 | Purchase(125, 247, "2017-04-19", "15:30", "PST", 300.22),
19 | Purchase(126, 254, "2017-04-19", "00:50", "EST", 122.19),
20 | Purchase(125, 250, "2017-04-19", "15:30", "PST", 300.22),
21 | Purchase(126, 251, "2017-04-19", "00:50", "EST", 122.19),
22 | Purchase(127, 299, "2017-04-19", "07:30", "UTC", 524.37)).toDF()
23 |
24 | purchaseDF.coalesce(1).write.parquet("input/parqOut1")
25 |
26 | val df = session.read.parquet("input/parqOut1")
27 |
28 | val duplicated = df.withColumn("_cust_id", $"customer_id")
29 |
30 | duplicated.coalesce(1).write.partitionBy("_cust_id").csv("input/csv/")
31 | }
32 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/dataframes/PartitionByColumn.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.dataframes
2 |
3 | import org.apache.spark.sql.SparkSession
4 | /**
5 | * Partition the data by a specific column and store it partition wise.
6 | */
7 | object PartitionByColumn {
8 |
9 | case class Emp(empId: Int, emp_name: String, deptId: Int, deptName: String, location: String)
10 |
11 | def main(args: Array[String]): Unit = {
12 |
13 | val session = SparkSession.builder().appName("PartitionByColumn").master("local[1]").getOrCreate()
14 |
15 | val emps = List(
16 | Emp(1, "Mike", 1, "Cloud", "BNGL"),
17 | Emp(2, "Shyam", 1, "Cloud", "HYD"),
18 | Emp(3, "Revanth", 2, "Bigdata", "BNGL"),
19 | Emp(4, "Raghu", 2, "Bigdata", "HYD"),
20 | Emp(6, "Apporva", 3, "Apac", "BNGL"),
21 | Emp(5, "Naga", 3, "Apac", "HYD"))
22 |
23 | val empDF = session.createDataFrame(emps)
24 |
25 | //Partitioning the data by deptName
26 | empDF.write.partitionBy("deptName").csv("output/test")
27 |
28 | //Partitioning the data by deptName,location
29 | empDF.write.partitionBy("deptName", "location").csv("output/test1")
30 |
31 | println("Done ....")
32 | }
33 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/dataframes/PartitionBy_WithUDF.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.dataframes
2 | import org.apache.spark.sql.functions._
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.SaveMode
5 |
6 | object PartitionBy_withUDF {
7 |
8 | case class Purchase(customer_id: Int, purchase_id: Int, date: String, time: String, tz: String, amount: Double)
9 |
10 | val setDateMonth: (String) => String = (timeStamp: String) => {
11 | var date_hour_list = timeStamp.split(" ")
12 | var date = date_hour_list(0)
13 | var month = date.split("-")
14 | month(0) + "-" + month(1)
15 | }
16 | val setDateHour: (String) => String = (timeStamp: String) => {
17 | var date_hour_list = timeStamp.split(" ")
18 | var date = date_hour_list(0)
19 |     // (the month-only split used in setDateMonth is not needed here)
20 |     // the hour partition key below is the full date plus the hour component
21 | var hour_min_sec = date_hour_list(1).split(":")
22 | date + "-" + hour_min_sec(0)
23 | }
24 | val getDateMonth = udf(setDateMonth)
25 | val getDateHour = udf(setDateHour)
26 |
27 | def main(args: Array[String]): Unit = {
28 |
29 | val session = SparkSession.builder().appName("Spark-PartitionBy").master("local[1]").getOrCreate()
30 |
31 | import session.implicits._
32 | val purchaseDF = List(
33 | Purchase(121, 234, "2017-09-26 05:00:00.0", "20:50", "UTC", 500.99),
34 | Purchase(122, 247, "2017-07-26 05:00:00.0", "15:30", "PST", 300.22),
35 | Purchase(123, 254, "2017-09-26 05:00:00.0", "00:50", "EST", 122.19),
36 | Purchase(124, 234, "2017-09-26 04:00:00.0", "20:50", "UTC", 500.99),
37 | Purchase(125, 247, "2017-08-26 05:00:00.0", "15:30", "PST", 300.22),
38 | Purchase(126, 254, "2017-08-26 05:00:00.0", "00:50", "EST", 122.19),
39 | Purchase(125, 250, "2017-08-26 05:00:00.0", "15:30", "PST", 300.22),
40 | Purchase(121, 251, "2017-07-26 07:00:00.0", "00:50", "EST", 122.19),
41 | Purchase(127, 299, "2017-07-26 05:00:00.0", "07:30", "UTC", 524.37)).toDF()
42 |
43 | purchaseDF.coalesce(1).write.parquet("input/parqOut1")
44 |
45 | val df = session.read.parquet("input/parqOut1")
46 |
47 | df.printSchema()
48 |
49 | val finalDF = df.withColumn("date_month", getDateMonth(df.col("date"))).withColumn("date_hour", getDateHour(df.col("date")))
50 |
51 | finalDF.coalesce(1).write.mode(SaveMode.Overwrite).partitionBy("date_month", "date_hour", "customer_id").csv("input/csv/")
52 | }
53 | }
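
The two UDFs above only reformat the timestamp string, which Spark's built-in date functions can also do while staying visible to the optimizer. A hedged alternative sketch, assuming the `df` read back from parquet above with a `date` column shaped like `2017-09-26 05:00:00.0`:

    import org.apache.spark.sql.functions.{col, date_format}

    // Derive the same partition keys with built-in functions instead of UDFs.
    val withParts = df
      .withColumn("date_month", date_format(col("date").cast("timestamp"), "yyyy-MM"))
      .withColumn("date_hour", date_format(col("date").cast("timestamp"), "yyyy-MM-dd-HH"))
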
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/dataframes/ProblemStatement.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.dataframes
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions._
5 | object ProblemStatement {
6 |
7 | def main(args: Array[String]) {
8 | val spark =
9 | SparkSession.builder()
10 | .appName("DataFrame-ProblemStatement")
11 | .master("local[2]")
12 | .getOrCreate()
13 | import spark.implicits._
14 |
15 | //What are the best-salary and the second best-salary of employees in every department?
16 |
17 | val dataRDD = spark.read.format("csv").option("header", "true").load("/Users/revanthreddy/Desktop/Docs/empData.csv").rdd
18 | val filteredDF = dataRDD.map(x => (x(2).toString(), x(3).toString().replace("$", "").toDouble)).toDF("dept", "salary").dropDuplicates().toDF()
19 |
20 | val maxSalDF = filteredDF.groupBy("dept").agg(max(filteredDF.col("salary")).as("MaxSal")).sort("dept")
21 | maxSalDF.show
22 |
23 | val subDF = filteredDF.except(maxSalDF)
24 |
25 | val ScndMaxSalDF = subDF.groupBy("dept").agg(max(subDF.col("salary")).as("SecMaxSal")).sort("dept")
26 | ScndMaxSalDF.show
27 |
28 |     val problem1ResDF = maxSalDF.join(ScndMaxSalDF, Seq("dept")).sort("dept").toDF()
29 |     problem1ResDF.show
30 |     problem1ResDF.coalesce(1).write.option("header", "true").csv("/Users/revanthreddy/Desktop/Docs/file1.csv")
31 | 
32 |     //What is the difference between the salary of each employee and the highest salary of an employee in the same department?
33 | 
34 |     val problem2DF = dataRDD.map(x => (x(0).toString(), x(2).toString(), x(3).toString().replace("$", "").toDouble)).toDF("name", "dept", "salary").dropDuplicates().toDF()
35 | 
36 |     val resDF = problem2DF.join(maxSalDF, Seq("dept")).sort("dept").toDF()
37 | 
38 |     val problem2ResDF = resDF.withColumn("diffSal", (resDF.col("MaxSal") - resDF.col("salary")))
39 |     problem2ResDF.coalesce(1).write.option("header", "true").csv("/Users/revanthreddy/Desktop/Docs/file2.csv")
40 |
41 | }
42 | }
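
The top-two-salaries question can also be answered in one pass with a window function instead of the `except`/join approach above. A sketch under the same assumptions (the deduplicated `filteredDF` with `dept` and `salary` columns built in the listing):

    import org.apache.spark.sql.expressions.Window
    import org.apache.spark.sql.functions.{col, dense_rank, desc}

    // Rank salaries within each department and keep the best and second-best.
    val byDept = Window.partitionBy("dept").orderBy(desc("salary"))
    filteredDF
      .withColumn("rank", dense_rank().over(byDept))
      .filter(col("rank") <= 2)
      .sort("dept", "rank")
      .show()
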
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/dataset/DatasetBasic.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.dataset
2 | import org.apache.spark.sql.SparkSession
3 |
4 | //
5 | // Create Datasets of primitive type and tuple type ands show simple operations.
6 | //
7 | object DatasetBasic {
8 | // define a case class for the elements of the Dataset
9 | // NOTE: this needs to be outside the scope of the method where the
10 | // Dataset is created
11 | case class Number(i: Int, english: String, french: String)
12 |
13 | def main(args: Array[String]) {
14 | val spark =
15 | SparkSession.builder()
16 | .appName("Dataset-CaseClass")
17 | .master("local[4]")
18 | .getOrCreate()
19 |
20 | import spark.implicits._
21 |
22 | val numbers = Seq(
23 | Number(1, "one", "un"),
24 | Number(2, "two", "deux"),
25 | Number(3, "three", "trois"))
26 | val numberDS = numbers.toDS()
27 |
28 | println("*** case class Dataset types")
29 | numberDS.dtypes.foreach(println(_))
30 |
31 | // Since we used a case class we can query using the field names
32 | // as column names
33 | println("*** filter by one column and fetch another")
34 | numberDS.where($"i" > 2).select($"english", $"french").show()
35 |
36 | println("*** could have used SparkSession.createDataset() instead")
37 | val anotherDS = spark.createDataset(numbers)
38 |
39 | println("*** case class Dataset types")
40 | anotherDS.dtypes.foreach(println(_))
41 | }
42 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/dataset/SemiStructuredData.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.dataset
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.SQLContext
5 |
6 | object SemiStructuredData {
7 |
8 | case class University(name: String, numStudents: Long, yearFounded: Long)
9 | def main(args: Array[String]): Unit = {
10 |
11 | val session = SparkSession.builder().appName("SemiStructuredData").master("local[1]").getOrCreate()
12 |
13 | import session.implicits._
14 | val sc = session.sparkContext
15 | val sqlContext = new SQLContext(sc)
16 |
17 | val schools = sqlContext.read.json("input/schools.json").as[University]
18 | schools.printSchema()
19 | val res = schools.map(s => s"${s.name} is ${2017 - s.yearFounded} years old")
20 | res.foreach { x => println(x) }
21 | }
22 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/dataset/WordCountDS.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.dataset
2 | import org.apache.spark.sql.SQLContext
3 | import org.apache.spark.{ SparkConf, SparkContext }
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.functions._
6 |
7 | object WordCountDS {
8 | def main(args: Array[String]): Unit = {
9 |
10 | val session = SparkSession.builder().appName("WordCountDS Application").master("local[1]").getOrCreate()
11 |
12 | import session.implicits._
13 | val sc = session.sparkContext
14 | val sqlContext = new SQLContext(sc)
15 |
16 | //With Spark DataSets API
17 | //Since the Dataset version of word count can take advantage of the built-in aggregate count,
18 | // this computation can not only be expressed with less code, but it will also execute significantly faster.
19 |
20 | val ds = sqlContext.read.text("input/README.md").as[String]
21 | val result = ds
22 | .flatMap(_.split(" ")) // Split on whitespace
23 | .filter(_ != "") // Filter empty words
24 | .toDF() // Convert to DataFrame to perform aggregation / sorting
25 | .groupBy($"value") // Count number of occurences of each word
26 | .agg(count("*") as "numOccurances")
27 | .orderBy($"numOccurances" desc) // Show most common words first
28 |
29 | result.foreach { x => println(x) }
30 |
31 | }
32 | }
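
For comparison with the Dataset version the comment refers to, this is the classic RDD word count; it needs an explicit pair-RDD shuffle rather than the built-in aggregate. A sketch assuming the `sc` and the same input file as the listing above:

    // RDD word count: map each word to (word, 1) and reduce by key.
    val counts = sc.textFile("input/README.md")
      .flatMap(_.split(" "))
      .filter(_.nonEmpty)
      .map(word => (word, 1))
      .reduceByKey(_ + _)
      .sortBy(_._2, ascending = false)

    counts.take(10).foreach(println)
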
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/elasticsearch/CsvToESLoad.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.elasticsearch
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.elasticsearch.spark.sql._
5 |
6 | object CsvToESLoad {
7 |
8 | def main(args: Array[String]) {
9 |
10 | val spark = SparkSession.builder().appName("CsvToESLoad").master("local[*]").getOrCreate()
11 |
12 |     val esConfig = Map("es.nodes" -> "localhost", "es.port" -> "9200",
13 |       "es.index.auto.create" -> "true", "es.http.timeout" -> "5m",
14 |       "es.nodes.wan.only" -> "true")
15 |
16 | val index = "realestatedata/data"
17 |
18 | import spark.implicits._
19 |
20 | val esdf = spark.read.format("com.databricks.spark.csv")
21 | .option("header", "true")
22 | .option("inferSchema", "true")
23 | .load("input/Real_Estate_Data.csv")
24 |
25 | esdf.show(2, false)
26 |
27 | //writing to ElasticSearch index
28 | esdf.saveToEs(index, cfg = esConfig)
29 |
30 | //reading from ElasticSearch index
31 | val df = spark.read.format("org.elasticsearch.spark.sql").load(index)
32 | df.show(10, false)
33 | }
34 | //https://www.elastic.co/guide/en/elasticsearch/reference/current/getting-started.html
35 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/elasticsearch/Read_From_ES.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.elasticsearch
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql.SparkSession
5 | object Read_From_ES {
6 |
7 | def main(args: Array[String]) {
8 |
9 | val spark = SparkSession.builder().appName("Spark-Read-ElasticSearch").master("local[*]").getOrCreate()
10 |
11 |     val esConfig = Map("es.nodes" -> "localhost", "es.port" -> "9200",
12 |       "es.index.auto.create" -> "true", "es.http.timeout" -> "5m",
13 |       "es.nodes.wan.only" -> "true")
14 |
15 | // load the elasticsearch index into spark dataframe
16 | val df = spark.read.format("org.elasticsearch.spark.sql").load("blabla/joke")
17 |
18 | df.show(10, false)
19 |
20 | }
21 |
22 | //https://www.elastic.co/guide/en/elasticsearch/reference/current/getting-started.html
23 | }
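
If only part of the index is needed, elasticsearch-hadoop can push a query down to Elasticsearch via its `es.query` option instead of loading the whole index. A hedged sketch reusing the `spark` session and `esConfig` above (the index name is the same placeholder used in the listing):

    // Push a query down to Elasticsearch; only matching documents are shipped to Spark.
    val filtered = spark.read
      .format("org.elasticsearch.spark.sql")
      .options(esConfig)
      .option("es.query", """{"query": {"match_all": {}}}""")
      .load("blabla/joke")

    filtered.show(10, false)
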
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/elasticsearch/Write_To_ES.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.elasticsearch
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.elasticsearch.spark.sql._
5 | object Write_To_ES {
6 | case class SimpsonCharacter(name: String, actor: String, episodeDebut: String)
7 |
8 | def main(args: Array[String]) {
9 |
10 | val spark = SparkSession.builder().appName("Spark-Write-ElasticSearch").master("local[*]").getOrCreate()
11 |
12 | val index = "shows/data"
13 |
14 |     val esConfig = Map("es.nodes" -> "localhost", "es.port" -> "9200",
15 |       "es.index.auto.create" -> "true", "es.http.timeout" -> "5m",
16 |       "es.nodes.wan.only" -> "true")
17 |
18 | import spark.implicits._
19 |
20 | val simpsonsDF = spark.sparkContext.parallelize(
21 | SimpsonCharacter("Homer", "Dan Castellaneta", "Good Night") ::
22 | SimpsonCharacter("Marge", "Julie Kavner", "Good Night") ::
23 | SimpsonCharacter("Bart", "Nancy Cartwright", "Good Night") ::
24 | SimpsonCharacter("Lisa", "Yeardley Smith", "Good Night") ::
25 | SimpsonCharacter("Maggie", "Liz Georges and more", "Good Night") ::
26 | SimpsonCharacter("Sideshow Bob", "Kelsey Grammer", "The Telltale Head") ::
27 | Nil).toDF().repartition(1)
28 |
29 | //writing to ElasticSearch index
30 | simpsonsDF.saveToEs(index, cfg = esConfig)
31 |
32 | //reading from ElasticSearch index
33 | val df = spark.read.format("org.elasticsearch.spark.sql").load(index)
34 | df.show(10, false)
35 | }
36 | //https://www.elastic.co/guide/en/elasticsearch/reference/current/getting-started.html
37 | //https://marekj.github.io/2016/03/22/elasticsearch-mac-osx
38 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/examples/ExplodeDemo.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.examples
2 | import org.apache.spark.sql.{ Row, SQLContext }
3 | import org.apache.spark.{ SparkConf, SparkContext }
4 | import org.apache.spark.sql.functions._
5 | object ExplodeDemo {
6 |
7 | def main(args: Array[String]): Unit = {
8 | val conf = new SparkConf()
9 | .setAppName("ExplodeDemo")
10 | .setMaster("local")
11 | val sc = new SparkContext(conf)
12 | val sqlContext = new SQLContext(sc)
13 | import sqlContext.implicits._
14 | val df = sc.parallelize(Seq((1, Seq(2, 3, 4), Seq(5, 6, 7)), (2, Seq(3, 4, 5), Seq(6, 7, 8)), (3, Seq(4, 5, 6), Seq(7, 8, 9)))).toDF("a", "b", "c")
15 |
16 | val df1 = df.select(df("a"), explode(df("b")).alias("b_columns"), df("c"))
17 |
18 |     val df2 = df1.select(df1("a"), df1("b_columns"), explode(df1("c")).alias("c_columns"))
19 |
20 | df.show()
21 | df1.show()
22 | df2.show()
23 | }
24 |
25 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/examples/Filter.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.examples
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | object Filter {
5 | def main(args: Array[String]) {
6 | val conf = new SparkConf().setAppName("Spark Filter Example")
7 |
8 |
9 | val sc = new SparkContext(conf)
10 | println("------------ Printing Spark configs ------------")
11 |
12 | sc.getConf.getAll.foreach(f => println(f))
13 | val x = sc.parallelize(List("Transformation demo", "Test demo", "Filter demo", "Spark is powerfull", "Spark is faster", "Spark is in memory"))
14 | val lines1 = x.filter(line => line.contains("Spark") || line.contains("Transformation"))
15 | println("\n")
16 | println("---------------------------------------------")
17 | lines1.collect().foreach { line => println(line) }
18 |
19 | val lines = x.filter(line => !line.contains("Filter"))
20 | println("---------------------------------------------")
21 | lines.collect().foreach { line => println(line) }
22 | println("---------------------------------------------")
23 | val count = x.filter(line => line.contains("Spark")).count()
24 | println("count is : " + count)
25 | }
26 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/examples/FilterEmpty.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.examples
2 | import org.apache.spark.sql.SparkSession
3 |
4 | //The problem statement is to remove the rows having product_price as null or empty
5 | object FilterEmpty extends App {
6 |
7 | private def checkNullForFloat(value: String): Float = {
8 | if (!"".equals(value)) {
9 | return value.toFloat;
10 | }
11 | return 0;
12 | }
13 | case class Product(product_id: Int, product_category_id: Int, product_name: String, product_description: String, product_price: Float, product_image: String)
14 | val session = SparkSession.builder().appName("Spark-FilterEmpty")
15 | .master("local[1]").getOrCreate()
16 |
17 | import session.implicits._
18 | val rawRDD = session.sparkContext.textFile("input/product")
19 |
20 |   val dummyRDD = rawRDD.map(_.split("\\,")).map(p => (p(0).toInt, p(1).toInt, p(2), p(3), p(4), p(5)))
21 | val filteredRDD = dummyRDD.filter(x => (x._5 != null) && (x._5.length > 0))
22 |
23 | filteredRDD.map(f => Product(f._1, f._2, f._3, f._4, f._5.toFloat, f._6)).toDF()
24 | .sort($"product_price".desc).show()
25 |
26 | //OR
27 |   val prodRDD = rawRDD.map(_.split("\\,")).map(p => Product(p(0).toInt, p(1).toInt, p(2), p(3), checkNullForFloat(p(4)), p(5)))
28 |
29 | //removing the products that have product_price = 0.0
30 | val resDF = prodRDD.filter(x => x.product_price != 0.0).toDF()
31 |
32 | resDF.sort($"product_price".desc).show()
33 |
34 | }
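
The same cleanup can be done without dropping to RDDs by giving the CSV reader an explicit schema, so empty prices come back as nulls that can be filtered directly. A sketch assuming the `session` and the comma-separated `input/product` file used above:

    import org.apache.spark.sql.functions.col
    import org.apache.spark.sql.types._

    val productSchema = StructType(Seq(
      StructField("product_id", IntegerType),
      StructField("product_category_id", IntegerType),
      StructField("product_name", StringType),
      StructField("product_description", StringType),
      StructField("product_price", FloatType),
      StructField("product_image", StringType)))

    // Empty price fields are read as null under the FloatType column.
    val products = session.read.schema(productSchema).csv("input/product")

    products
      .filter(col("product_price").isNotNull && col("product_price") =!= 0.0f)
      .sort(col("product_price").desc)
      .show()
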
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/examples/LoadPropsFile.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.examples
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import java.util.Properties;
5 | import org.apache.hadoop.fs.FSDataInputStream;
6 | import org.apache.hadoop.fs.FileSystem;
7 | import org.apache.hadoop.fs.Path;
8 | import org.apache.spark.SparkFiles;
9 | import org.apache.hadoop.conf.Configuration
10 |
11 | object LoadPropsFile {
12 | def main(args: Array[String]): Unit = {
13 |
14 | val spark = SparkSession.builder().appName("Loading-PropsFile-Spark").master("local[*]")
15 | .getOrCreate()
16 |
17 | val hdfsConf = new Configuration()
18 | val fs = FileSystem.get(hdfsConf)
19 |
20 | //file should be in HDFS directory
21 | val is = fs.open(new Path("/user/centos/input/conf.properties"))
22 | val prop = new Properties()
23 |
24 | //load properties
25 | prop.load(is)
26 |
27 | //retrieve properties
28 | val tidList = prop.getProperty("tidList")
29 | println("tidList--> " + tidList)
30 |
31 | val topicsList = prop.getProperty("topics")
32 | println("topicsList--> " + topicsList)
33 | }
34 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/examples/ParquetCompactor.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.examples
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object ParquetCompactor {
6 | case class Purchase(customer_id: Int, purchase_id: Int, date: String, time: String, tz: String, amount: Double)
7 |
8 | def main(args: Array[String]): Unit = {
9 |
10 | val session = SparkSession.builder().appName("ParquetCompactor").master("local[1]").getOrCreate()
11 |
12 | import session.implicits._
13 | val purchaseDF = List(
14 | Purchase(121, 234, "2017-04-19", "20:50", "UTC", 500.99),
15 | Purchase(122, 247, "2017-04-19", "15:30", "PST", 300.22),
16 | Purchase(185, 254, "2017-04-19", "00:50", "EST", 122.19),
17 | Purchase(186, 299, "2017-04-19", "07:30", "UTC", 524.37)).toDF()
18 |
19 | //purchaseDF.write.parquet("input/parqOut1")
20 |
21 | // val df = session.read.parquet("input/parqOut")
22 | //
23 | // df.show()
24 | // print("count before :" + df.count())
25 | // val dropedDF = df.dropDuplicates("customer_id")
26 | // println("count after :" + dropedDF.count())
27 | // dropedDF.show()
28 |
29 | val df = session.read.parquet("/Users/revanthreddy/Desktop/date_month=2017-04")
30 | df.printSchema()
31 | println("count before :" + df.count())
32 | //df.write.parquet("input/parqOut1")
33 |
34 | val resdf = session.read.parquet("input/parqOut1")
35 | println("count after :" + resdf.count())
36 | }
37 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/examples/Spark_Accumulator.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.examples
2 |
3 | import org.apache.spark.{ SparkContext, SparkConf }
4 | import org.apache.spark.rdd.RDD
5 | import org.apache.spark.sql.SparkSession
6 |
7 | object Spark_Accumulator {
8 | def main(args: Array[String]): Unit = {
9 |
10 | val sparkSession = SparkSession.builder.
11 | master("local")
12 | .appName("Spark_Accumulator")
13 | .getOrCreate()
14 |
15 | val sc = sparkSession.sparkContext
16 |
17 | val badPkts = sc.longAccumulator("bad.packets")
18 | val zeroValueSales = sc.longAccumulator("Zero.Value.Sales")
19 | val missingFields = sc.longAccumulator("Missing.Fields")
20 | val blankLines = sc.longAccumulator("Blank.Lines")
21 |
22 | sc.textFile("input/purchases.log", 4)
23 | .foreach { line =>
24 |
25 | if (line.length() == 0) blankLines.add(1)
26 | else if (line.contains("Bad data packet")) badPkts.add(1)
27 | else {
28 | val fields = line.split("\t")
29 | if (fields.length != 4) missingFields.add(1)
30 | else if (fields(3).toFloat == 0) zeroValueSales.add(1)
31 | }
32 | }
33 |
34 | println("Purchase Log Analysis Counters:")
35 | println(s"\tBad Data Packets=${badPkts.value}")
36 | println(s"\tZero Value Sales=${zeroValueSales.value}")
37 | println(s"\tMissing Fields=${missingFields.value}")
38 | println(s"\tBlank Lines=${blankLines.value}")
39 |
40 | sc.stop
41 | }
42 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/examples/Spark_CatalogAPI.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.examples
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object Spark_CatalogAPI {
6 |
7 | def main(args: Array[String]) {
8 |
9 | val sparkSession = SparkSession.builder.
10 | master("local[2]")
11 | .appName("Spark-Catalog-Example")
12 | .enableHiveSupport()
13 | .config("spark.sql.warehouse.dir", "/Users/revanthreddy/Project/Spark-2.1/spark-warehouse")
14 | .getOrCreate()
15 |
16 | //interacting with catalogue
17 |
18 | val catalog = sparkSession.catalog
19 |
20 | //print the databases
21 |
22 | catalog.listDatabases().foreach { x => println(x) }
23 | catalog.setCurrentDatabase("airline_db")
24 | catalog.listTables.show
25 | catalog.listColumns("airline").foreach { x => println(x) }
26 |
27 | import sparkSession.implicits._
28 | import sparkSession.sql
29 |
30 | sql("SELECT * FROM airline limit 3").show()
31 | }
32 |
33 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/examples/Spark_To_Caasandra.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.examples
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.sql.SQLContext
6 | import org.apache.spark.sql.SparkSession
7 | object Spark_To_Caasandra extends App {
8 |
9 | val spark = SparkSession.builder().appName("Spark_To_Caasandra").master("local[1]").getOrCreate()
10 |
11 | spark.conf.set("spark.sql.shuffle.partitions", "2")
12 | spark.conf.set("spark.cassandra.connection.host", "127.0.0.1")
13 |
14 | val df = spark.read.format("org.apache.spark.sql.cassandra").option("table", "emp")
15 | .option("keyspace", "dev")
16 | .load()
17 |
18 | df.printSchema()
19 |
20 | df.show()
21 |
22 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/examples/Test.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.examples
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.sql.DataFrame
6 | import org.apache.spark.sql.Row
7 | import org.apache.spark.sql.catalyst.InternalRow
8 | import org.apache.spark.unsafe.types.UTF8String
9 | import org.apache.spark.sql.catalyst.expressions.UnsafeRow
10 | import org.apache.spark.sql.types.StructType
11 | import org.apache.spark.sql.types.StructField
12 | import org.apache.spark.sql.types.IntegerType
13 | import org.apache.spark.sql.types.StringType
14 | import org.apache.spark.sql.SQLContext
15 | import org.apache.spark.sql.SparkSession
16 | import org.apache.spark.sql.Column
17 | import org.apache.spark.sql.Dataset
18 |
19 | case class Person(name: String, empId: Int)
20 | case class Employee(empId: Int,emp_name:String)
21 |
22 | case class Select(cols: Column*) {
23 | def transform(ds: DataFrame) = ds.select(cols: _*)
24 | }
25 |
26 | object Test {
27 |
28 | def main(args: Array[String]): Unit = {
29 |
30 | val session = SparkSession.builder().appName("test").master("local[1]").getOrCreate()
31 |
32 | val person = Array(Person("John", 1), Person("Mike", 2))
33 |
34 | val employee = Array(Employee(1,"Aruba"))
35 |
36 | val personDf = session.createDataFrame(person)
37 |
38 | val employeeDf = session.createDataFrame(employee)
39 |
40 | val joinDf = personDf.join(employeeDf, Seq("empId"), "left")
41 |
42 | joinDf.write.partitionBy("name").parquet("output/test")
43 |
44 |
45 | joinDf.show()
46 |
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/fileformats/AvroToJson.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.fileformats
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import com.databricks.spark.avro._
5 | object AvroToJson {
6 | case class Emp(empId: Int, emp_name: String, deptId: Int, deptName: String, location: String)
7 |
8 | def main(args: Array[String]): Unit = {
9 |
10 | val spark = SparkSession.builder().appName("Spark_ToAvro").master("local[1]").getOrCreate()
11 | spark.conf.set("spark.sql.avro.compression.codec", "snappy")
12 | import spark.implicits._
13 |
14 | val empDF = List(
15 | Emp(1, "Mike", 1, "Cloud", "BNGL"),
16 | Emp(2, "Shyam", 1, "Cloud", "HYD"),
17 | Emp(3, "Revanth", 2, "Bigdata", "BNGL"),
18 | Emp(4, "Raghu", 2, "Bigdata", "HYD"),
19 | Emp(6, "Apporva", 3, "Apac", "BNGL"),
20 | Emp(5, "Naga", 3, "Apac", "HYD")).toDF()
21 |
22 | empDF.write.avro("output/to_avro")
23 |
24 | val avroDF = spark.read.avro("output/to_avro")
25 | avroDF.show
26 |
27 | avroDF.coalesce(1).write.option("compression", "gzip").json("output/avro_to_json")
28 |
29 | }
30 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/fileformats/NestedJsonParser.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.fileformats
2 | import org.apache.spark.sql.SparkSession
3 | import org.apache.spark.sql.types.ArrayType
4 | import org.apache.spark.sql.types.StringType
5 | import org.apache.spark.sql.types.StructField
6 | import org.apache.spark.sql.types.StructType
7 | import org.apache.spark.sql.functions._
8 | object NestedJsonParser extends App {
9 |
10 | val session = SparkSession.builder().appName("Spark-JsonParser")
11 | .master("local[1]").getOrCreate()
12 |
13 | val schema = StructType(List(
14 | StructField("queryResults", StructType(
15 | List(StructField("searchResponse", StructType(
16 | List(StructField("response", StructType(
17 | List(StructField("docs", ArrayType(StructType(
18 | List(
19 | StructField("appCustNumber", StringType, nullable = true),
20 | StructField("transactions", ArrayType(StructType(
21 | List(
22 | StructField("code", StringType, nullable = true),
23 | StructField("description", StringType, nullable = true),
24 | StructField("recordDate", StringType, nullable = true))))))))))))))))))))
25 |
26 | val dff = session.read.schema(schema).json("input/nested.json")
27 |   dff.printSchema()
28 |
29 | val dfContent = dff.select(explode(dff("queryResults.searchResponse.response.docs.transactions"))).toDF("transaction")
30 | val code = dfContent.select("transaction.code")
31 | code.show(false)
32 |
33 | val dfFinal = dfContent.select("transaction.code", "transaction.description", "transaction.recordDate")
34 | dfFinal.show(false)
35 |
36 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/fileformats/Simple_XMLParser.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.fileformats
2 | import org.apache.spark.sql.SparkSession
3 |
4 | object Simple_XMLParser {
5 |
6 | def main(args: Array[String]): Unit = {
7 |
8 | val xmlFilePath = args(0)
9 | val spark = SparkSession.builder().appName("Spark-XMLParsing").master("local[*]").getOrCreate()
10 | spark.conf.set("spark.debug.maxToStringFields", "10000000")
11 |
12 | val rawDataDF = spark.read.format("com.databricks.spark.xml")
13 | .option("rowTag", "book")
14 | .option("treatEmptyValuesAsNulls", true)
15 | .load(xmlFilePath)
16 |
17 | println("Total books count : " + rawDataDF.count())
18 |
19 | val selectedData = rawDataDF.select("author", "_id")
20 |
21 | selectedData.show(10, false)
22 |
23 | }
24 |
25 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/fileformats/Simple_XMLParser1.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.fileformats
2 | import org.apache.spark.sql.SparkSession
3 | import org.apache.spark.sql.types.{ StructType, StructField, StringType, DoubleType };
4 |
5 | object Simple_XMLParser1 {
6 |
7 | val customSchema = StructType(Array(
8 | StructField("_id", StringType, nullable = true),
9 | StructField("author", StringType, nullable = true),
10 | StructField("description", StringType, nullable = true),
11 | StructField("genre", StringType, nullable = true),
12 | StructField("price", DoubleType, nullable = true),
13 | StructField("publish_date", StringType, nullable = true),
14 | StructField("title", StringType, nullable = true)))
15 |
16 | def main(args: Array[String]): Unit = {
17 |
18 | val xmlFilePath = "input/books.xml"
19 | val spark = SparkSession.builder().appName("Spark-XMLParsing").master("local[*]").getOrCreate()
20 | spark.conf.set("spark.debug.maxToStringFields", "10000000")
21 |
22 | val rawDataDF = spark.read.format("com.databricks.spark.xml")
23 | .option("rowTag", "book")
24 | .option("treatEmptyValuesAsNulls", true)
25 | .schema(customSchema)
26 | .load(xmlFilePath)
27 |
28 | val selectedData = rawDataDF.select("author", "_id")
29 |
30 | selectedData.write
31 | .format("com.databricks.spark.xml")
32 | .option("rootTag", "books")
33 | .option("rowTag", "book")
34 | .save("output/newbooks.xml")
35 |
36 | }
37 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/fileformats/Spark_To_ObjectFile.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.fileformats
2 | import org.apache.spark.sql.SparkSession
3 |
4 | object Spark_To_ObjectFile {
5 |
6 | case class Emp(empId: Int, emp_name: String, deptId: Int, deptName: String, location: String)
7 |
8 | def main(args: Array[String]): Unit = {
9 |
10 | val session = SparkSession.builder().appName("Spark_To_ObjectFile").master("local[1]").getOrCreate()
11 | val sc = session.sparkContext
12 |
13 | val nums = sc.makeRDD(1 to 3).map(x => (x, "a" * x))
14 | nums.saveAsObjectFile("output/test")
15 |
16 | // Try reading the output back as an object file
17 | val output = sc.objectFile[(Int, String)]("output/test")
18 | output.foreach(f => println(f))
19 |
20 | val emps = List(
21 | Emp(1, "Mike", 1, "Cloud", "BNGL"),
22 | Emp(2, "Shyam", 1, "Cloud", "HYD"),
23 | Emp(3, "Revanth", 2, "Bigdata", "BNGL"),
24 | Emp(4, "Raghu", 2, "Bigdata", "HYD"),
25 | Emp(6, "Apporva", 3, "Apac", "BNGL"),
26 | Emp(5, "Naga", 3, "Apac", "HYD"))
27 |
28 | //Saving rdd as ObjectFile and reading back
29 | val empRDD = sc.parallelize(emps)
30 | empRDD.saveAsObjectFile("output/rdd_to_obj")
31 |
32 | val resRDD = sc.objectFile[Any]("output/rdd_to_obj")
33 | resRDD.foreach(f => println(f))
34 |
35 | //Saving DataFrame as ObjectFile and reading back
36 | import session.implicits._
37 | val empDF = emps.toDF()
38 | empDF.rdd.saveAsObjectFile("output/df_to_obj")
39 |
40 | val resDF = sc.objectFile[Any]("output/df_to_obj")
41 | resDF.foreach(f => println(f))
42 |
43 | }
44 | }
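
Reading the object files back as `Any`, as above, loses the element type; the same calls work with the concrete types. A small sketch assuming the `sc`, the `Emp` case class, and the paths written in the listing:

    // Recover the original types instead of Any when reading the object files back.
    val empsBack = sc.objectFile[Emp]("output/rdd_to_obj")
    empsBack.foreach(println)

    val rowsBack = sc.objectFile[org.apache.spark.sql.Row]("output/df_to_obj")
    rowsBack.foreach(println)
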
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/fileformats/Spark_To_SequenceFiles.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.fileformats
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.hadoop.io.compress.GzipCodec
5 |
6 | object Spark_To_SequenceFiles {
7 | case class Purchase(customer_id: Int, purchase_id: Int, date: String, time: String, tz: String, amount: Double)
8 |
9 | def main(args: Array[String]): Unit = {
10 |
11 | val session = SparkSession.builder().appName("Spark_To_SequenceFiles").master("local[1]").getOrCreate()
12 |
13 | import session.implicits._
14 | val purchaseDF = List(
15 | Purchase(121, 234, "2017-04-19", "20:50", "UTC", 500.99),
16 | Purchase(122, 247, "2017-05-20", "15:30", "PST", 300.22),
17 | Purchase(123, 254, "2016-03-09", "00:50", "EST", 122.19),
18 | Purchase(124, 234, "2016-02-14", "20:50", "UTC", 500.99),
19 | Purchase(125, 247, "2015-01-13", "15:30", "PST", 300.22),
20 | Purchase(126, 254, "2015-05-16", "00:50", "EST", 122.19),
21 | Purchase(127, 250, "2016-09-17", "15:30", "PST", 300.22),
22 | Purchase(128, 251, "2018-08-15", "00:50", "EST", 122.19),
23 | Purchase(129, 299, "2019-02-19", "07:30", "UTC", 524.37)).toDF()
24 |
25 | import org.apache.spark.rdd.RDD
26 | import org.apache.spark.sql.Row
27 |
28 | val purchaseRDD: RDD[(Int, String)] = purchaseDF.rdd.map {
29 | case r: Row => (r.getAs[Int](0), r.getAs[String](2))
30 | }
31 | purchaseRDD.saveAsSequenceFile("output/rdd_to_seq")
32 |
33 | //Loading sequenceFiles into an RDD in Spark
34 |
35 | val data: RDD[(Int, String)] = session.sparkContext.sequenceFile("output/rdd_to_seq")
36 |
37 | data.foreach(f => println(f))
38 |
39 | }
40 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/fileformats/ToParquet.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.fileformats
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object ToParquet {
6 | case class Emp(empId: Int, emp_name: String, deptId: Int, deptName: String, location: String)
7 |
8 | def main(args: Array[String]): Unit = {
9 |
10 | val spark = SparkSession.builder().appName("Spark_ToAvro").master("local[1]").getOrCreate()
11 | spark.conf.set("spark.sql.parquet.compression.codec", "gzip")
12 | import spark.implicits._
13 |
14 | val empDF = List(
15 | Emp(1, "Mike", 1, "Cloud", "BNGL"),
16 | Emp(2, "Shyam", 1, "Cloud", "HYD"),
17 | Emp(3, "Revanth", 2, "Bigdata", "BNGL"),
18 | Emp(4, "Raghu", 2, "Bigdata", "HYD"),
19 | Emp(6, "Apporva", 3, "Apac", "BNGL"),
20 | Emp(5, "Naga", 3, "Apac", "HYD")).toDF()
21 |
22 | empDF.coalesce(1).write.parquet("output/to_parquet")
23 |
24 | val parquetDF = spark.read.parquet("output/to_parquet")
25 | parquetDF.show
26 |
27 | }
28 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/fileformats/XMLParsing.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.fileformats
2 | import org.apache.spark.sql.SQLContext
3 | import com.databricks.spark.xml._
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql
8 | object XMLParsing {
9 | //spark-submit --class com.spark2.fileformats.XMLParsing --master local[*] Spark-2.1-1.0.jar file:////home/centos/revanth/one.xml
10 | def main(args: Array[String]): Unit = {
11 |
12 | val xmlFilePath = args(0)
13 | val spark = SparkSession.builder().appName("XMLParsing").getOrCreate()
14 | spark.conf.set("spark.debug.maxToStringFields", "10000000")
15 |
16 | import spark.implicits._
17 |
18 | val df = spark.read.format("com.databricks.spark.xml")
19 | .option("rowTag", "us-bibliographic-data-grant")
20 | .option("treatEmptyValuesAsNulls", true)
21 | .load(xmlFilePath)
22 |
23 | val q1 = df.withColumn("country", $"publication-reference.document-id.country".cast(sql.types.StringType))
24 | .withColumn("document_number", $"publication-reference.document-id.doc-number".cast(sql.types.StringType)).select("country", "document_number")
25 | for (l <- q1) {
26 | val m1 = l.get(0)
27 | val m2 = l.get(1)
28 | println(m1, m2)
29 | }
30 | }
31 |
32 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/hive/Save_To_Hive.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.hive
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.SaveMode
5 |
6 | object Save_To_Hive {
7 |
8 | case class Purchase(customer_id: Int, purchase_id: Int, day: String, time: String, tz: String, amount: Double)
9 |
10 | def main(args: Array[String]) {
11 |
12 | // warehouseLocation points to the default location for managed databases and tables
13 | val warehouseLocation = "file:${system:user.dir}/spark-warehouse"
14 |
15 | val spark = SparkSession.builder.
16 | master("local[2]")
17 | .appName("Save_Saprk_To_Hive-Example")
18 | .enableHiveSupport()
19 | .config("spark.sql.warehouse.dir", warehouseLocation)
20 | .config(" hive.metastore.warehouse.dir", "/user/hive/warehouse")
21 | .getOrCreate()
22 |
23 | import spark.implicits._
24 | import spark.sql
25 |
26 | val purchaseDF = List(
27 | Purchase(121, 234, "2017-04-19", "20:50", "UTC", 500.99),
28 | Purchase(122, 247, "2017-04-19", "15:30", "PST", 300.22),
29 | Purchase(123, 254, "2017-04-19", "00:50", "EST", 122.19),
30 | Purchase(124, 234, "2017-04-19", "20:50", "UTC", 500.99),
31 | Purchase(125, 247, "2017-04-19", "15:30", "PST", 300.22),
32 | Purchase(126, 254, "2017-04-19", "00:50", "EST", 122.19),
33 | Purchase(125, 250, "2017-04-19", "15:30", "PST", 300.22),
34 | Purchase(126, 251, "2017-04-19", "00:50", "EST", 122.19),
35 | Purchase(127, 299, "2017-04-19", "07:30", "UTC", 524.37)).toDF()
36 |
37 | //Storing in to hive internal/managed tables
38 | purchaseDF.coalesce(1).write.mode(SaveMode.Append).insertInto("sales")
39 |
40 | //loading the data from the table
41 | val salesDf = spark.read.table("sales")
42 | salesDf.show
43 | //or
44 | sql("SELECT * FROM sales").show()
45 |
46 | //Storing in to hive external tables
47 | purchaseDF.coalesce(1).write.mode(SaveMode.Append).insertInto("sales_ext")
48 |
49 | sql("SELECT * FROM sales_ext").show()
50 |
51 | }
52 |
53 | /* CREATE TABLE IF NOT EXISTS sales ( customer_id int, purchase_id int,day String, time String, tz String, amount double)
54 | COMMENT 'Sales Data'
55 | ROW FORMAT DELIMITED
56 | FIELDS TERMINATED BY ','
57 | LINES TERMINATED BY '\n'
58 | STORED AS TEXTFILE LOCATION '/user/centos/hive/sale';
59 | */
60 |
61 | /*
62 | CREATE EXTERNAL TABLE IF NOT EXISTS sales_ext ( customer_id int, purchase_id int,day String, time String, tz String, amount double)
63 | COMMENT 'Sales Data'
64 | ROW FORMAT DELIMITED
65 | FIELDS TERMINATED BY ','
66 | LINES TERMINATED BY '\n'
67 | STORED AS TEXTFILE LOCATION '/user/centos/hive/sale_ext';
68 | */
69 |
70 | }
71 |
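
The listing above inserts into tables that were created beforehand with the HiveQL shown in the comments; when no pre-defined layout is required, Spark can create and manage the table itself with `saveAsTable`. A hedged sketch reusing `purchaseDF` and `spark` from above (the table name is illustrative):

    // Let Spark create a managed table in the configured warehouse directory.
    purchaseDF.coalesce(1).write.mode(SaveMode.Overwrite).saveAsTable("sales_managed")

    spark.read.table("sales_managed").show()
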
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/hive/Spark_CatalogAPI.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.hive
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object Spark_CatalogAPI {
6 |
7 | def main(args: Array[String]) {
8 |
9 | // warehouseLocation points to the default location for managed databases and tables
10 | val warehouseLocation = "file:${system:user.dir}/spark-warehouse"
11 |
12 | val spark = SparkSession.builder.
13 | master("local[2]")
14 | .appName("Spark-Catalog-Example")
15 | .enableHiveSupport()
16 | .config("spark.sql.warehouse.dir", warehouseLocation)
17 | .getOrCreate()
18 |
19 | //interacting with catalogue
20 |
21 | val catalog = spark.catalog
22 |
23 | //print the databases
24 |
25 | catalog.listDatabases().foreach { x => println(x) }
26 | catalog.setCurrentDatabase("default")
27 | catalog.listTables.show
28 | catalog.listColumns("employee").foreach { x => println(x) }
29 |
30 | import spark.implicits._
31 | import spark.sql
32 |
33 | sql("SELECT * FROM employee").show()
34 | }
35 |
36 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/mangodb/Spark_To_MangoDB.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.mangodb
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql._
5 | import com.mongodb.casbah.{ WriteConcern => MongodbWriteConcern }
6 | import com.stratio.datasource.mongodb._
7 | import com.stratio.datasource.mongodb.config._
8 | import com.stratio.datasource.mongodb.config.MongodbConfig._
9 |
10 | object Spark_To_MangoDB {
11 |
12 | case class Student(name: String, age: Int, gender: String)
13 |
14 | def main(args: Array[String]) {
15 |
16 | val spark = SparkSession.builder().appName("Spark_To_MangoDB").master("local[1]").getOrCreate()
17 |
18 | //To save a DataFrame in MongoDB
19 | val saveConfig = MongodbConfigBuilder(Map(Host -> List("localhost:27017"), Database -> "dev", Collection -> "students", SamplingRatio -> 1.0, WriteConcern -> "normal", SplitSize -> 8, SplitKey -> "_id"))
20 |
21 | val df: DataFrame = spark.createDataFrame(spark.sparkContext.parallelize(
22 | List(Student("ravali", 27, "female"), Student("abc", 34, "male"))))
23 |
24 | df.saveToMongodb(saveConfig.build)
25 |
26 | //fromMongoDB() function to read from MongoDB and transform it to a DataFrame
27 | val builder = MongodbConfigBuilder(Map(Host -> List("localhost:27017"), Database -> "dev", Collection -> "students", SamplingRatio -> 1.0, WriteConcern -> "normal"))
28 | val readConfig = builder.build
29 | val mongoRDD = spark.sqlContext.fromMongoDB(readConfig)
30 | mongoRDD.createTempView("students")
31 |
32 | val dataFrame = spark.sql("SELECT name, age,gender FROM students")
33 | dataFrame.show
34 |
35 | //Using DataFrameWriter
36 | import org.apache.spark.sql._
37 | val options = Map("host" -> "localhost:27017", "database" -> "dev", "collection" -> "students")
38 | val dfw: DataFrame = spark.createDataFrame(spark.sparkContext.parallelize(List(Student("ravi", 30, "female"))))
39 | dfw.write.format("com.stratio.datasource.mongodb").mode(SaveMode.Append).options(options).save()
40 | val resDF = spark.read.format("com.stratio.datasource.mongodb").options(options).load
41 | resDF.show
42 |
43 | }
44 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/parquet/FileCompression.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.parquet
2 |
3 | import org.apache.spark.sql.{ SQLContext, SparkSession }
4 | import org.apache.spark.{ SparkConf, SparkContext }
5 | object FileCompression {
6 |
7 | case class DataFrameSample(name: String, actor: String, episodeDebut: String)
8 |
9 | def main(args: Array[String]): Unit = {
10 |
11 | val spark = SparkSession
12 | .builder()
13 | .appName("Spark File Compression Handling")
14 | .master("local[1]")
15 | .getOrCreate()
16 |
17 | val df = spark.createDataFrame(
18 | DataFrameSample("Homer", "Dan Castellaneta", "Good Night") ::
19 | DataFrameSample("Marge", "Julie Kavner", "Good Night") ::
20 | DataFrameSample("Bart", "Nancy Cartwright", "Good Night") ::
21 | DataFrameSample("Lisa", "Yeardley Smith", "Good Night") ::
22 | DataFrameSample("Maggie", "Liz Georges and more", "Good Night") ::
23 | DataFrameSample("Sideshow Bob", "Kelsey Grammer", "The Telltale Head") ::
24 | Nil).toDF().cache()
25 |
26 | df.write.mode("overwrite").format("parquet").option("compression", "none").mode("overwrite").save("/tmp/file_no_compression_parq")
27 | df.write.mode("overwrite").format("parquet").option("compression", "gzip").mode("overwrite").save("/tmp/file_with_gzip_parq")
28 | df.write.mode("overwrite").format("parquet").option("compression", "snappy").mode("overwrite").save("/tmp/file_with_snappy_parq")
29 | //lzo - requires a different method in terms of implementation.
30 |
31 | df.write.mode("overwrite").format("orc").option("compression", "none").mode("overwrite").save("/tmp/file_no_compression_orc")
32 | df.write.mode("overwrite").format("orc").option("compression", "snappy").mode("overwrite").save("/tmp/file_with_snappy_orc")
33 | df.write.mode("overwrite").format("orc").option("compression", "zlib").mode("overwrite").save("/tmp/file_with_zlib_orc")
34 | }
35 |
36 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/parquet/ParquetCompactor.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.parquet
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object ParquetCompactor {
6 | case class Purchase(customer_id: Int, purchase_id: Int, date: String, time: String, tz: String, amount: Double)
7 |
8 | def main(args: Array[String]): Unit = {
9 |
10 | val session = SparkSession.builder().appName("ParquetCompactor").master("local[1]").getOrCreate()
11 |
12 | import session.implicits._
13 | val purchaseDF = List(
14 | Purchase(121, 234, "2017-04-19", "20:50", "UTC", 500.99),
15 | Purchase(122, 247, "2017-04-19", "15:30", "PST", 300.22),
16 | Purchase(185, 254, "2017-04-19", "00:50", "EST", 122.19),
17 | Purchase(186, 299, "2017-04-19", "07:30", "UTC", 524.37)).toDF()
18 |
19 | purchaseDF.write.parquet("input/parqOut")
20 |
21 | val df = session.read.parquet("input/parqOut")
22 |
23 | df.show()
24 | print("count before dropping :" + df.count())
25 |
26 | //dropping the duplicate rows based on customer_id
27 | val dropedDF = df.dropDuplicates("customer_id")
28 |
29 | println("count after dropping :" + dropedDF.count())
30 | dropedDF.show()
31 |
32 | }
33 | }
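
The listing stops after deduplicating; a typical compaction job would also write the result back out as a small number of files. A sketch continuing from the `dropedDF` above (the output path is illustrative):

    // Persist the compacted data as a single parquet file under a new path.
    dropedDF.coalesce(1).write.mode("overwrite").parquet("input/parqOut_compacted")
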
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/parquet/TestDataFrame.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.parquet
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.sql.DataFrame
6 | import org.apache.spark.sql.Row
7 | import org.apache.spark.sql.catalyst.InternalRow
8 | import org.apache.spark.unsafe.types.UTF8String
9 | import org.apache.spark.sql.catalyst.expressions.UnsafeRow
10 | import org.apache.spark.sql.types.StructType
11 | import org.apache.spark.sql.types.StructField
12 | import org.apache.spark.sql.types.IntegerType
13 | import org.apache.spark.sql.types.StringType
14 | import org.apache.spark.sql.SQLContext
15 | import org.apache.spark.sql.SparkSession
16 | import org.apache.spark.sql.Column
17 | import org.apache.spark.sql.Dataset
18 |
19 | case class Person(name: String, empId: Int)
20 | case class Employee(empId: Int, emp_name: String)
21 |
22 | object TestDataFrame {
23 |
24 | def main(args: Array[String]): Unit = {
25 |
26 | val session = SparkSession.builder().appName("test").master("local[1]").getOrCreate()
27 |
28 | val person = Array(Person("John", 1), Person("Mike", 2))
29 |
30 | val employee = Array(Employee(1, "Aruba"))
31 |
32 | val personDf = session.createDataFrame(person)
33 |
34 | val employeeDf = session.createDataFrame(employee)
35 |
36 | val joinDf = personDf.join(employeeDf, Seq("empId"), "left")
37 |
38 | joinDf.write.partitionBy("name").parquet("output/test")
39 |
40 | joinDf.show()
41 |
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/problemstatement/ProblemStatement.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.problemstatement
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions._
5 | object ProblemStatement {
6 |
7 | def main(args: Array[String]) {
8 | val spark =
9 | SparkSession.builder()
10 | .appName("DataFrame-ProblemStatement")
11 | .master("local[2]")
12 | .getOrCreate()
13 | import spark.implicits._
14 |
15 | //What are the best-salary and the second best-salary of employees in every department?
16 |
17 | val dataRDD = spark.read.format("csv").option("header", "true").load("input/pbs.csv").rdd
18 | val filteredDF = dataRDD.map(x => (x(2).toString(), x(3).toString().replace("$", "").toDouble)).toDF("dept", "salary").dropDuplicates().toDF()
19 |
20 | val maxSalDF = filteredDF.groupBy("dept").agg(max(filteredDF.col("salary")).as("MaxSal")).sort("dept")
21 | maxSalDF.show
22 |
23 | val subDF = filteredDF.except(maxSalDF)
24 |
25 | val ScndMaxSalDF = subDF.groupBy("dept").agg(max(subDF.col("salary")).as("SecMaxSal")).sort("dept")
26 | ScndMaxSalDF.show
27 |
28 |     val problem1ResDF = maxSalDF.join(ScndMaxSalDF, Seq("dept")).sort("dept").toDF()
29 |     problem1ResDF.show
30 |     problem1ResDF.coalesce(1).write.option("header", "true").csv("/Users/revanthreddy/Desktop/Docs/file1.csv")
31 | 
32 |     //What is the difference between the salary of each employee and the highest salary of an employee in the same department?
33 | 
34 |     val problem2DF = dataRDD.map(x => (x(0).toString(), x(2).toString(), x(3).toString().replace("$", "").toDouble)).toDF("name", "dept", "salary").dropDuplicates().toDF()
35 | 
36 |     val resDF = problem2DF.join(maxSalDF, Seq("dept")).sort("dept").toDF()
37 | 
38 |     val problem2ResDF = resDF.withColumn("diffSal", (resDF.col("MaxSal") - resDF.col("salary")))
39 |     problem2ResDF.coalesce(1).write.option("header", "true").csv("/Users/revanthreddy/Desktop/Docs/file2.csv")
40 |
41 | }
42 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/streaming/Spark_Kafka_Streaming.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.streaming
2 |
3 | import org.apache.spark.streaming.kafka.KafkaUtils
4 | import kafka.serializer.StringDecoder
5 | import org.apache.spark.streaming.StreamingContext
6 | import org.apache.spark.SparkConf
7 | import org.apache.spark.SparkContext
8 | import org.apache.spark.streaming.Seconds
9 |
10 | object Spark_Kafka_Streaming {
11 |
12 | def main(args: Array[String]) {
13 | val SLIDE_INTERVAL = 20 //in seconds
14 | val topics = "demo"
15 | val zkQuorum = "localhost:9092"
16 | val sc = new SparkContext(new SparkConf().setAppName("Spark-Kafka-Streaming").setMaster("local[2]"))
17 | val ssc = new StreamingContext(sc, Seconds(SLIDE_INTERVAL))
18 |
19 | val topicsSet = topics.split(",").toSet
20 | println("Streaming topics : " + topicsSet)
21 |
22 |     val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
23 |
24 | val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
25 | ssc, kafkaParams, topicsSet).map(_._2)
26 |
27 | messages.foreachRDD(
28 | rdd => {
29 | if (!rdd.isEmpty()) {
30 | println("First record : " + rdd.first())
31 | println("rdd count : " + rdd.count())
32 | } else {
33 | println("Data is not yet recevied from the producer....")
34 | }
35 | })
36 |
37 | ssc.start()
38 | ssc.awaitTermination()
39 | }
40 | }
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/window/functions/ApStats.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.window.functions
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions._
5 | import org.apache.spark.sql.types._
6 |
7 | object ApStats extends App {
8 |
9 | val spark = SparkSession.builder().appName("ApStats").master("local[1]").getOrCreate()
10 |
11 | import spark.implicits._
12 |
13 | val input_switch_cpu = "input/iap_ap_info_records/date_month=2020-01/date_hour=2020-01-24-11/*"
14 |
15 | val df = spark.read.parquet(input_switch_cpu)
16 |
17 | df.printSchema()
18 |
19 | val stats = df.select($"tid", $"ap_name", $"ap_mac".as("apmac"), $"cpu_usage", $"memory_total", $"memory_free", $"ts")
20 | .withColumn(
21 | "memory_usage",
22 | lit(((col("memory_total") - col("memory_free")) / col("memory_total")) * 100))
23 | .withColumn("temp_ts", col("ts").divide(1000).cast("timestamp"))
24 | .select("tid", "ap_name", "apmac", "cpu_usage", "memory_usage", "temp_ts")
25 | .withColumn("cpu_timeseries", struct(
26 | $"temp_ts".cast("long").as("ts"),
27 | $"cpu_usage".cast("float").as("avg"),
28 | $"cpu_usage".cast("float").as("max")))
29 | .withColumn("memory_timeseries", struct(
30 | $"temp_ts".cast("long").as("ts"),
31 | $"memory_usage".cast("float").as("avg"),
32 | $"memory_usage".cast("float").as("max")))
33 | .groupBy(col("tid"), col("apmac"),
34 | window(col("temp_ts"), "1 hour").alias("ts")).
35 | agg(
36 | avg("cpu_usage").as("cl_ap_system_stats_cpu_util"),
37 | avg("memory_usage").as("cl_ap_system_stats_mem_util"),
38 | collect_list($"cpu_timeseries").as("cpu_timeseries"),
39 | collect_list($"memory_timeseries").as("memory_timeseries"))
40 | .withColumn("ts_hr", hour($"ts.start"))
41 |
42 |
43 | stats.printSchema()
44 | stats.show(5, false)
45 | }
46 |
--------------------------------------------------------------------------------
/Spark-2.1/src/main/scala/com/spark2/window/functions/CPUTidSiteRollup.scala:
--------------------------------------------------------------------------------
1 | package com.spark2.window.functions
2 | import org.apache.spark.sql.SparkSession
3 | import java.sql.Date
4 | import org.apache.spark.sql.expressions.Window
5 | import org.apache.spark.sql.functions._
6 | import org.apache.spark.sql.types._
7 |
8 | object CPUTidSiteRollup extends App {
9 |
10 | val spark = SparkSession.builder().appName("SwitchCPU").master("local[1]").getOrCreate()
11 | import spark.implicits._
12 |
13 | val input_switch_cpu = "input/iap_sw_cpu_mem_stats_rollup"
14 |
15 | val dfIntermed = spark.read.parquet(input_switch_cpu)
16 | dfIntermed.show(3, false)
17 | dfIntermed.printSchema()
18 |
19 | var dfRollup = dfIntermed
20 | .groupBy("tid")
21 | .agg(countDistinct("serial_number").cast(IntegerType).as("num_switches_impacted"))
22 | .withColumn("data_type", lit("iap_insight"))
23 |
24 | dfRollup.show(5, false)
25 |
26 |   val dfModelRollUp = dfIntermed
27 | .groupBy("tid", "model")
28 | .agg(countDistinct("serial_number").alias("num_switches_impacted"))
29 | .withColumn("model_switch_count", struct(
30 | $"model".as("model"),
31 | $"num_switches_impacted".as("count")))
32 | .groupBy("tid")
33 | .agg(collect_list("model_switch_count").alias("model_switch_count_list"))
34 |
35 |   dfModelRollUp.show(5, false)
36 |
37 | val dfFirmwareRollup = dfIntermed
38 | .groupBy("tid", "firmware")
39 | .agg(countDistinct("serial_number").alias("num_switches_impacted"))
40 | .withColumn("firmware_switch_count", struct(
41 | $"firmware".as("firmware"),
42 | $"num_switches_impacted".as("count")))
43 | .groupBy("tid")
44 | .agg(collect_list("firmware_switch_count").alias("firmware_switch_count_list"))
45 |
46 | dfRollup = dfRollup.join(
47 |     dfModelRollUp,
48 | Seq("tid"), "left_outer")
49 | .join(
50 | dfFirmwareRollup,
51 | Seq("tid"), "left_outer")
52 | .withColumn("timeline_metric", $"num_switches_impacted".cast(FloatType))
53 |
54 | dfRollup.show(5, false)
55 | }
56 |
--------------------------------------------------------------------------------
/Spark-2.1/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=WARN, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %t %c{1}:%L - %m%n
9 |
10 | log4j.logger.com.rasa.cloud=DEBUG,stdout
11 | log4j.logger.com.rasa.cloud.nade=DEBUG,stdout
12 | log4j.additivity.com.rasa.cloud.nade=false
--------------------------------------------------------------------------------
/Spark-2.1/src/test/scala/test/MetricsTest.scala:
--------------------------------------------------------------------------------
1 | package test
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import com.datadog.metrics.MetricsCollectorFactory
5 |
6 | object MetricsTest {
7 | def main(args: Array[String]) {
8 |
9 | val sparkSession = SparkSession.builder.
10 | master("local[1]")
11 | .appName("Spark_Accumulator_Metrics_To_DataDog")
12 | .getOrCreate()
13 |
14 | val sc = sparkSession.sparkContext
15 |
16 | val accum = sc.longAccumulator("total.charecters.counter")
17 | val words = sc.textFile("input/lines").flatMap(_.split(" "))
18 | words.foreach(w => accum.add(w.length))
19 |
20 | //setting the metrics tags
21 | var metricsTags = collection.mutable.Map[String, Any]()
22 | metricsTags.put("accum name", accum.name.get)
23 | metricsTags.put("accum value", accum.value)
24 |
25 | //initializing the metrics collector
26 | val metricsCollector = MetricsCollectorFactory.getDatadogCollector("947d12f46dead405bf019033434f0cba", "dev")
27 |
28 | //sending accumulator values as metrics to DataDog
29 | metricsCollector.sendMetrics(accum.name.get, accum.value, metricsTags)
30 |
31 | val badRecords = sc.longAccumulator("bad.records.counter")
32 | val baddata = sc.textFile("input/badrecords").map(v => v.split(","))
33 | baddata.foreach(r => { try { r(2).toInt } catch { case e: NumberFormatException => badRecords.add(1) } })
34 |
35 | //sending accumulator values as metrics to DataDog
36 | metricsCollector.sendMetrics(badRecords.name.get, badRecords.value, null)
37 |
38 | val acc = sc.longAccumulator("counter.test")
39 | val baddata1 = sc.textFile("input/badrecords").map(x => acc.add(1))
40 | baddata1.collect()
41 |
42 | //setting the event tags
43 | var eventTags = collection.mutable.Map[String, Any]()
44 | eventTags.put("accum name", acc.name.get)
45 | eventTags.put("accum value", acc.value)
46 |
47 | //sending events to DataDog
48 | metricsCollector.sendEvents("DataDog Event Test", "Sending events", "normal", "info", eventTags)
49 |
50 | sc.stop()
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/Spark-Zeppelin/FirstSparkCassandraApp.git.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Spark-Zeppelin/README.md:
--------------------------------------------------------------------------------
1 | # Setting up your Spark Cassandra Dev Environment
2 |
3 | A quick workshop on building your first stand-alone Spark Cassandra application. In this workshop we will
4 | walk through setting up our Spark and Cassandra dev environment with IntelliJ. In addition, we will set up Zeppelin
5 | to use as a Spark interpreter.
6 |
7 | 1. [Setup and Download Components](Setup.md)
8 | 2. [Zeppelin Tutorial](Zeppelin.md)
9 | 3. [Stand Alone App Tutorial](StandAloneApp.md)
10 |
11 |
12 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/Spark-Zeppelin/Setup.md:
--------------------------------------------------------------------------------
1 | # Setup for Workshop
2 |
3 | ## Downloading our components
4 |
5 | [Download Apache Spark 2.2.1](https://www.apache.org/dyn/closer.lua/spark/spark-2.2.1/spark-2.2.1-bin-hadoop2.7.tgz)
6 |
7 | [Download Apache Cassandra 3.0.15](http://www.apache.org/dyn/closer.lua/cassandra/3.0.15/apache-cassandra-3.0.15-bin.tar.gz)
8 |
9 | [Download Apache Zeppelin 0.7.3](http://mirrors.gigenet.com/apache/zeppelin/zeppelin-0.7.3/zeppelin-0.7.3-bin-netinst.tgz)
10 |
11 |
12 | ## Let's start by Setting up Cassandra
13 |
14 | ### Start up Cassandra
15 |
16 | tar -xvf apache-cassandra-3.0.15-bin.tar.gz
17 | cd apache-cassandra-3.0.15
18 | ./bin/cassandra
19 |
20 | ### Test out our Cassandra Connection
21 |
22 | ./bin/cqlsh
23 |
24 | CREATE KEYSPACE test WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1 };
25 | use test;
26 | CREATE TABLE tab (k int, c int, v int, PRIMARY KEY (k, c));
27 | INSERT INTO tab (k, c, v) VALUES (1, 1, 1);
28 | SELECT * FROM test.tab;
29 |
30 | k | c | v
31 | ---+---+---
32 | 1 | 1 | 1
33 |
34 | ### What does our Cassandra Table look like?
35 |
36 | k :: partition key
37 | c :: clustering key
38 | v :: a value
39 |
40 | On disk this looks like
41 |
42 | k1 -> (c1,v1) , (c2,v2), (c3,v3)
43 | k2 -> (c1,v1) , (c2,v2), (c3,v3)
44 |
45 |
46 | #### Important Cassandra Concepts
47 | For more information on Cassandra and data layout, the key concepts are:
48 | * Tokens : where data lives in the cluster
49 | * Data Modeling : how data is laid out on disk
50 | * Replication : how many copies of the data are kept across the cluster
51 | * Consistency Level : how many acknowledgements the client needs before an operation counts as successful
52 |
53 | Study more later!
54 | [Datastax Academy](https://academy.datastax.com/)
55 |
56 | #### Set up Spark
57 |
58 | tar -xvf spark-2.2.1-bin-hadoop2.7.tgz
59 |
60 |
61 | #### Add Spark-Cassandra-Connector
62 |
63 | Edit our environment: in the conf/ directory of the Spark install, copy the template configuration
64 | 
65 |     cd spark-2.2.1-bin-hadoop2.7/conf
66 |     cp spark-defaults.conf.template spark-defaults.conf
67 | 
68 | then add the connector package to spark-defaults.conf:
69 | 
70 |     spark.jars.packages datastax:spark-cassandra-connector:2.0.7-s_2.11
71 | 
72 |
73 | [Spark Cassandra Connector at Spark Packages](https://spark-packages.org/package/datastax/spark-cassandra-connector)
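
To sanity-check the setup, a quick optional smoke test: with the package line above in spark-defaults.conf and Cassandra still running, spark-shell launched from the Spark install directory should be able to read the test.tab table created earlier. This is only a sketch for a single local node; the explicit connection host below is the connector's default (localhost) and can be omitted.

    ./bin/spark-shell --conf spark.cassandra.connection.host=127.0.0.1

    // inside the shell: read test.tab through the connector as a DataFrame
    val df = spark.read
      .format("org.apache.spark.sql.cassandra")
      .options(Map("keyspace" -> "test", "table" -> "tab"))
      .load()
    df.show()   // expect the single row (1, 1, 1) inserted above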
74 |
75 |
--------------------------------------------------------------------------------
/Spark-Zeppelin/Zeppelin.md:
--------------------------------------------------------------------------------
1 | ## Working with Zeppelin
2 |
3 | ### Set up Zeppelin
4 |
5 | tar -xvf zeppelin-0.7.3-bin-netinst.tgz
6 |
7 | Point Zeppelin's SPARK_HOME at our Spark install by editing
8 | zeppelin-0.7.3-bin-netinst/conf/zeppelin-env.sh:
9 |
10 | export SPARK_HOME="/Users/russellspitzer/SparkInstalls/spark-2.2.1-bin-hadoop2.7" ## Replace this with your install directory
11 |
12 |
13 | #### Start Zeppelin
14 |
15 | ./zeppelin-0.7.3-bin-netinst/bin/zeppelin.sh
16 |
17 | ### Zeppelin Home Screen
18 |
19 | [Local Zeppelin](http://localhost:8080/#/)
20 | 
21 |
22 | * Name is just a label to help us identify the notebook
23 | * Interpreter is the code execution engine used for the notebook's snippets
24 |
25 | ### Setting up the Interpreter
26 |
27 | 
28 | 
29 |
30 | Important for us:
31 | 
32 | * master : the Spark resource manager used for our application (see the example settings below)
33 |
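Since SPARK_HOME points at the Spark install whose spark-defaults.conf already pulls in the Cassandra connector package, the interpreter mostly just needs a master. As a sketch for a laptop workshop (these values are assumptions for a purely local setup), the spark interpreter properties could look like:

    master                            local[*]
    spark.cassandra.connection.host   127.0.0.1
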
34 | ### Go over some basics with Dataframes
35 |
36 | 
37 |
38 |
39 | [Dataframe Notebook](notebooks/Spark%20Cassandra%20Note.json)
40 |
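If the notebook is not handy, here is a minimal sketch along the same lines (assuming the interpreter settings above and the test.tab table from Setup.md):

    // read the Cassandra table into a DataFrame via the connector
    val tab = spark.read
      .format("org.apache.spark.sql.cassandra")
      .options(Map("keyspace" -> "test", "table" -> "tab"))
      .load()

    tab.show()
    tab.where("k = 1").count()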
41 |
42 | ### Zeppelin Challenges
43 |
44 | [Challenges Notebook](notebooks/Spark%20Cassandra%20Challenges.json)
--------------------------------------------------------------------------------
/Spark-Zeppelin/images/Interpreter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-Zeppelin/images/Interpreter.png
--------------------------------------------------------------------------------
/Spark-Zeppelin/images/SetupImplicits.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-Zeppelin/images/SetupImplicits.png
--------------------------------------------------------------------------------
/Spark-Zeppelin/images/SparkOptions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-Zeppelin/images/SparkOptions.png
--------------------------------------------------------------------------------
/Spark-Zeppelin/images/makenote.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-Zeppelin/images/makenote.png
--------------------------------------------------------------------------------