├── README.md ├── Spark-1.5 ├── README.md ├── pom.xml └── src │ └── main │ ├── resources │ ├── TravelData.txt │ ├── baby_names.txt │ ├── books.xml │ ├── campaign.csv │ ├── cars.txt │ ├── dialy_show_guests.txt │ ├── ebay.csv │ ├── emp.txt │ ├── flights.csv │ ├── log.txt │ ├── log4j_conf.xml │ ├── olaCabData.txt │ ├── olympics_data.txt │ ├── partitioner.txt │ ├── person.txt │ ├── sales.json │ ├── sensoranalytics.txt │ ├── users.txt │ ├── wordcount.txt │ ├── yahoo_stocks.csv │ └── youtubedata.txt │ └── scala │ └── com │ └── spark │ ├── cassandra │ ├── CassandraCQL.scala │ ├── Cassandra_SparkStreaming.scala │ ├── KafkaConsumerToCassandra.scala │ ├── Spark_Cassandra .scala │ └── Spark_Cassandra_Delete.scala │ ├── customudf │ ├── CustomMean.scala │ ├── CustomUDAF.scala │ ├── CustomUDF.scala │ └── SparkUDF.scala │ ├── examples │ ├── CustomPartitioner.scala │ ├── CustomReceiver.scala │ ├── DataFrame.scala │ ├── DataFrame_Joins.scala │ ├── DataFramesRollup.scala │ ├── KafkaConsumer.scala │ ├── KafkaConsumerToCassandra.scala │ ├── KafkaConsumerToHDFS.scala │ ├── KafkaProducer.scala │ ├── MovingAverage.scala │ ├── ReadHDFSFolders.scala │ ├── ReadMultipleFiles.scala │ ├── SparkFileStreaming.scala │ ├── SparkJDBC.scala │ ├── SparkStructType.scala │ ├── Spark_Avro.scala │ ├── Spark_CSV_Reader.scala │ ├── Spark_CustomReceiver.scala │ ├── Spark_Hive.scala │ ├── Spark_Hive_ORC.scala │ ├── Spark_Joins.scala │ ├── Spark_Json_Reader.scala │ ├── Spark_SequenceFiles.scala │ ├── Spark_StructType.scala │ ├── Spark_XML.scala │ ├── Stateful_WordCount.scala │ ├── WindowBasedStreaming.scala │ ├── Window_Sliding_Interval.scala │ └── WordCount.scala │ ├── mangodb │ └── Spark_MangoDB.scala │ ├── transformations │ ├── AggregateByKey.scala │ ├── Cars.scala │ ├── Cogroup.scala │ ├── Filter.scala │ ├── FoldByKey.scala │ ├── GroupBY_ReduceBY.scala │ ├── MapvsFlatMap.scala │ └── Reduce.scala │ ├── usecases │ ├── FlightDataAnalysis.scala │ ├── NamesAnalysis.scala │ ├── OlaDataAnalysis.scala │ ├── OlympicsDataAnalysis.scala │ ├── TVShowDataAnalysis.scala │ ├── TravelDataAnalysis.scala │ ├── YoutubeDataAnalysis.scala │ ├── loganalysis │ │ ├── ApacheAccessLog.scala │ │ ├── LogAnalyzer.scala │ │ └── LogAnalyzerSQL.scala │ ├── sensoranalytics │ │ ├── Models.scala │ │ ├── SchemaParser.scala │ │ └── SensorAnalytics.scala │ └── twitteranalytics │ │ ├── TwitterAnalytics.scala │ │ └── twitterclient.scala │ └── util │ ├── LogHelper.scala │ └── Utills .scala ├── Spark-2.1 ├── README.md ├── input │ ├── Fire_Department_Calls.csv │ ├── Fire_Incidents.csv │ ├── README.md │ ├── Real_Estate_Data.csv │ ├── badrecords │ ├── books.xml │ ├── conf.properties │ ├── emp.txt │ ├── empData.csv │ ├── iap_sw_cpu_mem_stats_rollup │ │ ├── part-00000-03d43b04-1dda-472c-8601-e7a8914e6097.snappy.parquet │ │ └── part-00001-03d43b04-1dda-472c-8601-e7a8914e6097.snappy.parquet │ ├── lines │ ├── nested.json │ ├── one.xml │ ├── pbs.csv │ ├── product │ ├── purchases.log │ ├── schools.json │ ├── sw_hp_system_cpu_stats_records │ │ └── date_month=2020-01 │ │ │ ├── date_hour=2020-01-13-04 │ │ │ └── data.parquet │ │ │ └── date_hour=2020-01-13-05 │ │ │ └── data.parquet │ ├── sw_hp_system_info_stats_records │ │ └── date_month=2020-01 │ │ │ ├── date_hour=2020-01-13-04 │ │ │ └── data.parquet │ │ │ └── date_hour=2020-01-13-05 │ │ │ └── data.parquet │ └── sw_hp_system_memory_stats_records │ │ └── date_month=2020-01 │ │ ├── date_hour=2020-01-13-04 │ │ └── data.parquet │ │ └── date_hour=2020-01-13-05 │ │ └── data.parquet ├── pom.xml └── src │ ├── main │ └── scala │ │ └── com │ │ 
├── datadog │ │ └── metrics │ │ │ ├── AbstractCaseClass.scala │ │ │ ├── CaseClasses.scala │ │ │ ├── DataDogHttpTransport.scala │ │ │ ├── DatadogCollector.scala │ │ │ ├── MetricsCollector.scala │ │ │ ├── MetricsCollectorFactory.scala │ │ │ ├── Spark_Accumulator.scala │ │ │ └── Transport.scala │ │ └── spark2 │ │ ├── aws │ │ └── Spark_AWS_S3.scala │ │ ├── cassandra │ │ ├── ChangeDFTypes.scala │ │ ├── ConvetTimestampToLong.scala │ │ ├── ExportCassandraData.scala │ │ ├── FilterCassandraData.scala │ │ ├── SparkStreaming_Cassandra.scala │ │ ├── Spark_To_Caasandra.scala │ │ ├── Writting_DF_To_Cassandra.scala │ │ └── export │ │ │ ├── CassandraYaml.scala │ │ │ ├── ExportCassandraData.scala │ │ │ ├── Export_Cassandra_Data.scala │ │ │ ├── Export_Cassandra_Table_Data.scala │ │ │ ├── Utils.scala │ │ │ ├── YamlProps.scala │ │ │ └── cassandra-table-export.yml │ │ ├── custom │ │ ├── CustomPartitioner.scala │ │ ├── HashJoin.scala │ │ ├── PairRDD.scala │ │ ├── SemiStructuredUtilUDF.scala │ │ ├── UDAF.scala │ │ └── UDF.scala │ │ ├── dataframes │ │ ├── ComplexSchema.scala │ │ ├── DataFrame_DropDuplicates.scala │ │ ├── DatasetConversion.scala │ │ ├── DateTime.scala │ │ ├── DropColumns.scala │ │ ├── GenerateUniqueId.scala │ │ ├── GroupingAndAggregation.scala │ │ ├── HDFSFilesList.scala │ │ ├── HandlingNulls.scala │ │ ├── PartitionBy.scala │ │ ├── PartitionByColumn.scala │ │ ├── PartitionBy_WithUDF.scala │ │ ├── ProblemStatement.scala │ │ └── RecordsCount.scala │ │ ├── dataset │ │ ├── ComplexType.scala │ │ ├── DatasetBasic.scala │ │ ├── SemiStructuredData.scala │ │ └── WordCountDS.scala │ │ ├── elasticsearch │ │ ├── CsvToESLoad.scala │ │ ├── ESDeleteByQuery.scala │ │ ├── ESQuerying.scala │ │ ├── Read_And_Delete_From_ES.scala │ │ ├── Read_From_ES.scala │ │ └── Write_To_ES.scala │ │ ├── examples │ │ ├── ExplodeDemo.scala │ │ ├── Filter.scala │ │ ├── FilterEmpty.scala │ │ ├── LoadPropsFile.scala │ │ ├── ParquetCompactor.scala │ │ ├── Spark_Accumulator.scala │ │ ├── Spark_CatalogAPI.scala │ │ ├── Spark_To_Caasandra.scala │ │ └── Test.scala │ │ ├── fileformats │ │ ├── AvroToJson.scala │ │ ├── NestedJsonParser.scala │ │ ├── Simple_XMLParser.scala │ │ ├── Simple_XMLParser1.scala │ │ ├── Spark_To_ObjectFile.scala │ │ ├── Spark_To_SequenceFiles.scala │ │ ├── ToParquet.scala │ │ └── XMLParsing.scala │ │ ├── hive │ │ ├── AddHivePartitions.scala │ │ ├── Save_As_Hive_Parquet.scala │ │ ├── Save_To_Hive.scala │ │ ├── Save_To_Hive_Partitioned_External_Table.scala │ │ └── Spark_CatalogAPI.scala │ │ ├── jdbc │ │ └── Spark_To_Jdbc.scala │ │ ├── mangodb │ │ └── Spark_To_MangoDB.scala │ │ ├── parquet │ │ ├── FileCompression.scala │ │ ├── ParquetCompactor.scala │ │ └── TestDataFrame.scala │ │ ├── problemstatement │ │ ├── FireDepartmentCalls.scala │ │ └── ProblemStatement.scala │ │ ├── streaming │ │ └── Spark_Kafka_Streaming.scala │ │ └── window │ │ └── functions │ │ ├── ApStats.scala │ │ ├── CPUTidSiteRollup.scala │ │ └── SwitchCPUMemStats.scala │ └── test │ ├── resources │ └── log4j.properties │ └── scala │ └── test │ └── MetricsTest.scala └── Spark-Zeppelin ├── FirstSparkCassandraApp.git.iml ├── README.md ├── Setup.md ├── StandAloneApp.md ├── Zeppelin.md ├── images ├── Interpreter.png ├── SetupImplicits.png ├── SparkOptions.png └── makenote.png └── notebooks ├── Spark Cassandra Challenges.json └── Spark Cassandra Note.json /Spark-1.5/src/main/resources/campaign.csv: -------------------------------------------------------------------------------- 1 | Week,Campaign Type,Campaign,Account,Branded vs. 
Unbranded,Category,Impressions,Clicks,Cost,Engagements,Patient Journey,Device,Indication,Country,Region,Metro Area 2 | 5/9/2016,SDTC,Sylvant,Google,Branded,Branded,1,0,0,,Adherence,"Computers,revanth",Multicentric Castleman's Disease (MCD),United States,Nevada,Las Vegas NV 3 | 5/9/2016,SDTC,Sylvant,Google,Branded,Branded,1,0,0,,Adherence,Computers,Multicentric Castleman's Disease (MCD),United States,Texas,El Paso TX 4 | 5/23/2016,SDTC,Sylvant,Google,Branded,Branded,1,0,0,,Adherence,Computers,Multicentric Castleman's Disease (MCD),United States,Pennsylvania,Wilkes Barre-Scranton PA 5 | 5/16/2016,SDTC,Sylvant,Google,Branded,Branded,3,0,0,,Adherence,Computers,Multicentric Castleman's Disease (MCD),United States,Idaho,Boise ID 6 | 5/23/2016,SDTC,Sylvant,Google,Branded,Branded,1,0,0,,Adherence,Computers,Multicentric Castleman's Disease (MCD),United States,Delaware,Philadelphia PA 7 | 5/9/2016,SDTC,Sylvant,Google,Branded,Branded,1,0,0,,Adherence,Computers,Multicentric Castleman's Disease (MCD),United States,Nevada,Las Vegas NV 8 | 5/9/2016,SDTC,Sylvant,Google,Branded,Branded,1,0,0,,Adherence,Computers,Multicentric Castleman's Disease (MCD),United States,Texas,El Paso TX 9 | 5/23/2016,SDTC,Sylvant,Google,Branded,Branded,1,0,0,,Adherence,Computers,Multicentric Castleman's Disease (MCD),United States,Pennsylvania,Wilkes Barre-Scranton PA 10 | 5/16/2016,SDTC,Sylvant,Google,Branded,Branded,3,0,0,,Adherence,Computers,Multicentric Castleman's Disease (MCD),United States,Idaho,Boise ID 11 | 5/23/2016,SDTC,Sylvant,Google,Branded,Branded,1,0,0,,Adherence,Computers,Multicentric Castleman's Disease (MCD),United States,Delaware,Philadelphia PA 12 | 5/30/2016,SDTC,Lymph Nodes,Bing,Unbranded,Condition,99,0,0,,Diagnosis,Smartphone,Multicentric Castleman's Disease (MCD),United States,Indiana,"Chicago, IL" 13 | 5/30/2016,SDTC,Lymph Nodes,Bing,Unbranded,Condition,99,0,0,,Diagnosis,Smartphone,Multicentric Castleman's Disease (MCD),United States,Indiana,"Chicago, IL" 14 | -------------------------------------------------------------------------------- /Spark-1.5/src/main/resources/dialy_show_guests.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-1.5/src/main/resources/dialy_show_guests.txt -------------------------------------------------------------------------------- /Spark-1.5/src/main/resources/ebay.csv: -------------------------------------------------------------------------------- 1 | auctionid,bid,bidtime,bidder,bidderrate,openbid,price,item 2 | 1,175,2.230949,schadenfreud,0,99,100,xbox 3 | 2,100,2.600116,chuik,0,99,101,iphone 4 | 3,120,2.60081,kiwisstuff,2,99,102,oneplus 5 | 4,150,2.601076,kiwisstuff,2,99,103,xiomi 6 | 5,177.5,2.909826,eli.flint,4,99,104,sanbox 7 | 6,1,0.355856,bfalconb,2,1,105,mobile 8 | 7,1.25,0.484757,sbord,1,1,106,mouse 9 | 8,1.5,0.492639,bfalconb,2,1,107,keyboard 10 | 9,25,0.49463,sbord,1,1,108,laptop 11 | 1,175,2.230949,schadenfreud,0,99,200,xbox 12 | 2,100,2.600116,chuik,0,99,201,iphone 13 | 3,120,2.60081,kiwisstuff,2,99,202,oneplus 14 | 4,150,2.601076,kiwisstuff,2,99,203,xiomi 15 | 5,177.5,2.909826,eli.flint,4,99,204,sanbox 16 | 6,1,0.355856,bfalconb,2,1,205,mobile 17 | 7,1.25,0.484757,sbord,1,1,206,mouse 18 | 8,1.5,0.492639,bfalconb,2,1,207,keyboard 19 | 9,25,0.49463,sbord,1,1,208,laptop 20 | -------------------------------------------------------------------------------- /Spark-1.5/src/main/resources/emp.txt: 
-------------------------------------------------------------------------------- 1 | empid|name|Dept|salary|No of projects worked 2 | 328561|Revanth1|DPE|1000|5| 3 | 328562|Revanth2|DPE|2000|6| 4 | 328563|Revanth3|DPE|3000|3| 5 | 328564|Revanth4|DPE|4000|4| 6 | 328565|Revanth5|DPE|5000|6| 7 | 328566|Revanth6|DPE|6000|5| 8 | 9 | 328561|Revanth1|DPE|7000|1| 10 | 328562|Revanth2|DPE|18000|2| 11 | 328563|Revanth3|DPE|5000|4| 12 | 328564|Revanth4|DPE|3000|3| 13 | 328565|Revanth5|DPE|4000|5| 14 | 328566|Revanth6|DPE|7000|7| 15 | -------------------------------------------------------------------------------- /Spark-1.5/src/main/resources/log4j_conf.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /Spark-1.5/src/main/resources/olaCabData.txt: -------------------------------------------------------------------------------- 1 | dispatching_base_number,date,active_vehicles,trips 2 | B02512,3/21/2015,1,1 3 | B02513,3/21/2016,2,3 4 | B02764,3/22/2016,3,2 5 | B02512,3/21/2015,1,2 6 | B02513,3/21/2014,2,1 7 | B02764,3/22/2014,3,1 8 | B02512,3/21/2013,1,4 9 | B02513,3/21/2014,2,3 10 | B02764,3/22/2014,3,4 11 | B02764,3/22/2014,3,1 12 | B02764,3/22/2014,3,1 13 | B02764,3/22/2014,3,1 14 | B02682,2/26/2015,1465,13814 15 | B02512,2/26/2015,243,1797 16 | B02765,2/26/2015,745,6744 17 | B02764,2/26/2015,4101,36091 18 | B02765,2/27/2015,786,7563 19 | B02617,2/27/2015,1551,14677 20 | B02598,2/27/2015,1114,10755 21 | B02512,2/27/2015,272,2056 22 | B02764,2/27/2015,4253,38780 23 | B02682,2/27/2015,1510,14975 24 | B02598,2/28/2015,994,10319 25 | B02764,2/28/2015,3952,39812 26 | B02617,2/28/2015,1372,14022 27 | B02682,2/28/2015,1386,14472 28 | B02512,2/28/2015,230,1803 29 | B02765,2/28/2015,747,7753 -------------------------------------------------------------------------------- /Spark-1.5/src/main/resources/partitioner.txt: -------------------------------------------------------------------------------- 1 | venkat 2 | 1000 3 | suman 4 | 2000 5 | sachin 6 | 3000 7 | senthil 8 | 4000 9 | revanth 10 | hari 11 | ganesh -------------------------------------------------------------------------------- /Spark-1.5/src/main/resources/person.txt: -------------------------------------------------------------------------------- 1 | revanth,reddy,26 2 | shyam,sunder,22 3 | kiran,kumar,24 4 | revanth,reddy,26 5 | shyam,sunder,22 6 | kiran,kumar,24 -------------------------------------------------------------------------------- /Spark-1.5/src/main/resources/sales.json: -------------------------------------------------------------------------------- 1 | {"transactionId":111,"customerId":1,"itemId": 1,"itemName": "xbox","amountPaid": 1400.0} 2 | {"transactionId":112,"customerId":2,"itemId": 2,"itemName": "Iphonr","amountPaid": 5035.0} 3 | {"transactionId":113,"customerId":3,"itemId": 3,"itemName": "OnePlus","amountPaid": 5310.0} 4 | {"transactionId":114,"customerId":4,"itemId": 4,"itemName": "HP","amountPaid": 6000.0} 5 | {"transactionId":115,"customerId":1,"itemId": 2,"itemName": "Dell","amountPaid": 5010.0} 6 | {"transactionId":116,"customerId":1,"itemId": 2,"itemName": "Letv","amountPaid": 5020.0} 7 | {"transactionId":117,"customerId":1,"itemId": 2,"itemName": "Reuters","amountPaid": 
500.0} 8 | {"transactionId":118,"customerId":1,"itemId": 2,"itemName": "Wipro","amountPaid": 5400.0} 9 | {"transactionId":119,"customerId":2,"itemId": 3,"itemName": "Thomson","amountPaid": 5010.0} 10 | {"transactionId":120,"customerId":1,"itemId": 2,"itemName": "HTC","amountPaid": 5020.0} 11 | {"transactionId":121,"customerId":1,"itemId": 4,"itemName": "Nokia","amountPaid": 5034.0} 12 | {"transactionId":122,"customerId":1,"itemId": 2,"itemName": "Ericson","amountPaid": 5300.0} 13 | {"transactionId":123,"customerId":1,"itemId": 4,"itemName": "Samsung","amountPaid": 5050.0} 14 | {"transactionId":124,"customerId":1,"itemId": 2,"itemName": "Panasonic","amountPaid": 5060.0} -------------------------------------------------------------------------------- /Spark-1.5/src/main/resources/sensoranalytics.txt: -------------------------------------------------------------------------------- 1 | 2015/10/09 12:00:00.188 ,India,TN,Chennai,Success 2 | 2015/10/09 13:00:00.189 ,India,TN,Chennai,Failure 3 | 2015/10/09 15:00:00.233 ,India,TN,Chennai,Success 4 | 2015/10/09 16:00:00.268 ,India,TN,Chennai,Failure 5 | 2015/10/09 07:00:00.449 ,US,Washington,Seattle,Failure 6 | 2015/10/09 01:00:00.449 ,US,Washington,Seattle,Success 7 | 2015/10/09 04:00:00.449 ,US,Washington,Seattle,Failure 8 | 2015/10/09 05:00:00.449 ,US,Washington,Seattle,Success 9 | 2016/03/07 02:00:00.010 ,India,Karnataka,Banglore,Success 10 | 2016/03/07 07:00:00.053 ,India,Karnataka,Banglore,Failure 11 | 2016/03/07 02:00:00.010 ,India,Karnataka,Banglore,Success 12 | 2016/03/07 07:00:00.053 ,India,Karnataka,Banglore,Failure 13 | 2016/03/25 02:00:00.010 ,India,Karnataka,Banglore,Success 14 | 2016/03/25 07:00:00.053 ,India,Karnataka,Banglore,Failure 15 | 2016/03/25 02:00:00.010 ,India,Karnataka,Banglore,Success 16 | 2016/03/25 07:00:00.053 ,India,Karnataka,Banglore,Failure 17 | 2016/03/26 02:00:00.010 ,India,Karnataka,Banglore,Success 18 | 2016/03/26 07:00:00.053 ,India,Karnataka,Banglore,Failure 19 | 2016/03/26 02:00:00.010 ,India,Karnataka,Banglore,Success 20 | 2016/03/26 07:00:00.053 ,India,Karnataka,Banglore,Failure -------------------------------------------------------------------------------- /Spark-1.5/src/main/resources/users.txt: -------------------------------------------------------------------------------- 1 | 1201,satish,25 2 | 1202,krishna,28 3 | 1203,amith,39 4 | 1204,javed,23 5 | 1205,prudvi,23 -------------------------------------------------------------------------------- /Spark-1.5/src/main/resources/wordcount.txt: -------------------------------------------------------------------------------- 1 | Please note that here we have just defined RDD, data is not loaded still. 2 | This means that if you go to access the data in this RDD it could fail. 
3 | The computation to create the data in an RDD is only done when the data is referenced; 4 | for example, it is created by caching or writing out the RDD -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/cassandra/CassandraCQL.scala: -------------------------------------------------------------------------------- 1 | package com.spark.cassandra 2 | import org.apache.spark.{ SparkContext, SparkConf } 3 | import com.datastax.spark.connector._ 4 | import org.apache.spark._ 5 | import java.util.UUID 6 | import org.apache.spark.sql.cassandra.CassandraSQLContext 7 | import org.apache.spark.sql._ 8 | object CassandraCQL extends App { 9 | 10 | case class Emp(Id: Int, name: String, salary: String) 11 | 12 | val conf = new SparkConf(true).set("spark.cassandra.connection.host", "127.0.0.1").setAppName("CassandraCQL").setMaster("local[2]") 13 | val sc = new SparkContext(conf) 14 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 15 | 16 | //implementation using cassandra sql context 17 | val cc = new CassandraSQLContext(sc) 18 | val rdd = cc.sql("SELECT id,name,salary FROM spark_kafka_cassandra.employee where name like 'HARI%'") 19 | 20 | rdd.collect().foreach(println) 21 | 22 | /* //implementation using cassandra table converting to df 23 | val user_table = sc.cassandraTable("tutorial", "user") 24 | 25 | val df = sqlContext 26 | .read 27 | .format("org.apache.spark.sql.cassandra") 28 | .options(Map("table" -> "user", "keyspace" -> "tutorial")) 29 | .load() 30 | 31 | df.registerTempTable("user") 32 | val results = sqlContext.sql("SELECT empname,sum(empsal),sum(projno) FROM tutorial.user GROUP BY empid,empname,deptno") 33 | //results.collect().foreach(println) 34 | */ sc.stop 35 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/cassandra/Cassandra_SparkStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.spark.cassandra 2 | import org.apache.spark._ 3 | import org.apache.spark.streaming._ 4 | import com.datastax.spark.connector.streaming._ 5 | import org.apache.spark.rdd._ 6 | import org.apache.spark.streaming.dstream.ConstantInputDStream 7 | 8 | //Reading from Cassandra using Spark Streaming 9 | object Cassandra_SparkStreaming { 10 | case class Employee(Id: Int, name: String, salary: Int) 11 | 12 | def main(args: Array[String]) { 13 | val KEY_SPACE_NAME = "spark_kafka_cassandra" 14 | val TABLE_NAME = "employee" 15 | val conf = new SparkConf().setAppName("Cassandra_SparkStreaming").set("spark.cassandra.connection.host", "127.0.0.1") 16 | 17 | val ssc = new StreamingContext(conf, Seconds(10)) 18 | 19 | val cassandraRDD = ssc.cassandraTable[Employee](KEY_SPACE_NAME, TABLE_NAME).select("id", "name", "salary") 20 | 21 | val dstream = new ConstantInputDStream(ssc, cassandraRDD) 22 | 23 | dstream.foreachRDD { rdd => 24 | println("Total Records cont in DB : " + rdd.count) 25 | println(rdd.collect.mkString("\n")) 26 | } 27 | 28 | ssc.start() 29 | ssc.awaitTermination() 30 | } 31 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/cassandra/KafkaConsumerToCassandra.scala: -------------------------------------------------------------------------------- 1 | package com.spark.cassandra 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import 
org.apache.spark.streaming.Minutes 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.kafka.KafkaUtils 10 | import com.datastax.driver.core.Cluster 11 | import com.datastax.spark.connector.SomeColumns 12 | import com.datastax.spark.connector.toNamedColumnRef 13 | import com.datastax.spark.connector.toRDDFunctions 14 | 15 | import kafka.serializer.StringDecoder 16 | object KafkaConsumerToCassandra { 17 | val SLIDE_INTERVAL = 1 18 | def startStreaming(args: Array[String]): Unit = { 19 | try { 20 | val Array(zkQuorum, topics) = args 21 | val sc = new SparkContext(new SparkConf().setAppName("Spark-Kafka-Streaming").setMaster("local[2]").set("spark.cassandra.connection.host", "127.0.0.1")) 22 | val ssc = new StreamingContext(sc, Minutes(SLIDE_INTERVAL)) 23 | val topicsSet = topics.split(",").toSet 24 | val kafkaParams = Map[String, String]("metadata.broker.list" -> zkQuorum) 25 | val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder]( 26 | ssc, kafkaParams, topicsSet).map(_._2).map(line => line.split('|')) 27 | 28 | val cluster = Cluster.builder().addContactPoint("127.0.0.1").build() 29 | //Creating Session object 30 | val session = cluster.connect() 31 | session.execute("CREATE KEYSPACE IF NOT EXISTS spark_kafka_cassandra WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 };"); 32 | val query = "CREATE TABLE IF NOT EXISTS spark_kafka_cassandra.employee (id int PRIMARY KEY,name VARCHAR, salary int);" 33 | //Executing the query 34 | session.execute(query) 35 | 36 | messages.foreachRDD( 37 | rdd => { 38 | if (!rdd.isEmpty()) { 39 | println(rdd.first()) 40 | println("rdd count " + rdd.count()) 41 | val resRDD = rdd.map(line => (line(0), line(1), line(2))) 42 | .saveToCassandra("spark_kafka_cassandra", "employee", SomeColumns("id", "name", "salary")) 43 | } else { 44 | println("Data is not yet recevied from the producer....") 45 | } 46 | }) 47 | ssc.start() 48 | ssc.awaitTermination() 49 | } catch { 50 | case ex: Exception => { 51 | println(ex.getMessage) 52 | } 53 | } 54 | } 55 | 56 | def main(args: Array[String]) { 57 | /*if (args.length < 2) { 58 | System.err.println("Usage: KafkaConsumer ") 59 | System.exit(1) 60 | }*/ 61 | startStreaming(args) 62 | } 63 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/cassandra/Spark_Cassandra_Delete.scala: -------------------------------------------------------------------------------- 1 | package com.spark.cassandra 2 | 3 | import scala.reflect.runtime.universe 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import com.datastax.driver.core.BoundStatement 7 | import com.datastax.driver.core.Cluster 8 | import com.datastax.spark.connector.toSparkContextFunctions 9 | import org.apache.log4j.Logger 10 | import org.apache.log4j.Level 11 | object Spark_Cassandra_Delete { 12 | case class Employee(Id: Int, name: String, salary: Int) 13 | def main(args: Array[String]) { 14 | Logger.getLogger("org").setLevel(Level.WARN) 15 | val CASSANDRA_HOST = "127.0.0.1" 16 | val conf = new SparkConf(true).set("spark.cassandra.connection.host", CASSANDRA_HOST).setAppName("Spark-Cassandra-Delete").setMaster("local[2]") 17 | val sc = new SparkContext(conf) 18 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 19 | val KEY_SPACE_NAME = "spark_kafka_cassandra" 20 | val TABLE_NAME = "employee" 21 | 22 | //Creating Cluster object 23 | val cluster = 
Cluster.builder().addContactPoint(CASSANDRA_HOST).build() 24 | //Creating Session object 25 | val session = cluster.connect() 26 | 27 | try { 28 | val deleteQuery = " delete from " + KEY_SPACE_NAME + "." + TABLE_NAME + " WHERE id = 102 " 29 | val deletequeryprepared = session.prepare(deleteQuery) 30 | val deleteBoundStatement = new BoundStatement(deletequeryprepared) 31 | session.execute(deleteBoundStatement) 32 | 33 | //Displaying the records 34 | val rows = sc.cassandraTable[Employee](KEY_SPACE_NAME, TABLE_NAME) 35 | rows.toArray.foreach(println) 36 | 37 | } catch { 38 | case e: Exception => 39 | println(e) 40 | } finally { 41 | session.close() 42 | cluster.close() 43 | sc.stop() 44 | } 45 | 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/customudf/CustomMean.scala: -------------------------------------------------------------------------------- 1 | package com.spark.customudf 2 | import org.apache.spark.sql.Row 3 | import org.apache.spark.sql.expressions.{ MutableAggregationBuffer, UserDefinedAggregateFunction } 4 | import org.apache.spark.sql.types._ 5 | 6 | //Extend UserDefinedAggregateFunction to write custom aggregate function 7 | //You can also specify any constructor arguments. For instance you 8 | //can have CustomMean(arg1: Int, arg2: String) 9 | class CustomMean() extends UserDefinedAggregateFunction { 10 | 11 | // Input Data Type Schema 12 | def inputSchema: StructType = StructType(Array(StructField("item", DoubleType))) 13 | 14 | // Intermediate Schema 15 | def bufferSchema = StructType(Array( 16 | StructField("sum", DoubleType), 17 | StructField("cnt", LongType))) 18 | 19 | // Returned Data Type . 20 | def dataType: DataType = DoubleType 21 | 22 | // Self-explaining 23 | def deterministic = true 24 | 25 | // This function is called whenever key changes 26 | def initialize(buffer: MutableAggregationBuffer) = { 27 | buffer(0) = 0.toDouble // set sum to zero 28 | buffer(1) = 0L // set number of items to 0 29 | } 30 | 31 | // Iterate over each entry of a group 32 | def update(buffer: MutableAggregationBuffer, input: Row) = { 33 | buffer(0) = buffer.getDouble(0) + input.getDouble(0) 34 | buffer(1) = buffer.getLong(1) + 1 35 | } 36 | 37 | // Merge two partial aggregates 38 | def merge(buffer1: MutableAggregationBuffer, buffer2: Row) = { 39 | buffer1(0) = buffer1.getDouble(0) + buffer2.getDouble(0) 40 | buffer1(1) = buffer1.getLong(1) + buffer2.getLong(1) 41 | } 42 | 43 | // Called after all the entries are exhausted. 
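  // evaluate produces the final result from the merged buffer: sum / count.
  // A sketch of how this UDAF is applied (mirrors CustomUDAF.scala in this package):
  //   val customMean = new CustomMean()
  //   df.groupBy("key").agg(customMean(df.col("value")).as("custom_mean")).show()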
44 | def evaluate(buffer: Row) = { 45 | buffer.getDouble(0) / buffer.getLong(1).toDouble 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/customudf/CustomUDAF.scala: -------------------------------------------------------------------------------- 1 | package com.spark.customudf 2 | 3 | import org.apache.spark.sql.Row 4 | import org.apache.spark.sql.types.{ DoubleType, StringType, StructField, StructType } 5 | import org.apache.spark.{ SparkConf, SparkContext } 6 | import org.apache.spark.sql.functions._ 7 | 8 | object CustomUDAF { 9 | val sparkConf = new SparkConf().setAppName("Spark-CustomUDAF").setMaster("local[1]")//.set("spark.sql.warehouse.dir", "file:///D:/Spark-WorkSpace/Spark-Windows/spark-warehouse") 10 | val sc = new SparkContext(sparkConf) 11 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 12 | def main(args: Array[String]) { 13 | // define UDAF 14 | val customMean = new CustomMean() 15 | 16 | // create test dataset 17 | val data = (1 to 100).map { x: Int => 18 | x match { 19 | case t if t <= 50 => Row("A", t.toDouble) 20 | case t => Row("B", t.toDouble) 21 | } 22 | } 23 | 24 | // create schema of the test dataset 25 | val schema = StructType(Array( 26 | StructField("key", StringType), 27 | StructField("value", DoubleType))) 28 | 29 | // construct data frame 30 | val rdd = sc.parallelize(data) 31 | val df = sqlContext.createDataFrame(rdd, schema) 32 | 33 | // Calculate average value for each group 34 | df.groupBy("key").agg( 35 | customMean(df.col("value")).as("custom_mean"), 36 | avg("value").as("avg")).show() 37 | } 38 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/customudf/CustomUDF.scala: -------------------------------------------------------------------------------- 1 | package com.spark.customudf 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | 6 | object CustomUDF { 7 | val sparkConf = new SparkConf().setAppName("Spark-CustomUDF").setMaster("local[1]")//.set("spark.sql.warehouse.dir", "file:///D:/Spark-WorkSpace/Spark-Windows/spark-warehouse") 8 | val sc = new SparkContext(sparkConf) 9 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 10 | def main(args: Array[String]) { 11 | 12 | // Construct Dummy Data 13 | import util.Random 14 | import org.apache.spark.sql.Row 15 | implicit class Crossable[X](xs: Traversable[X]) { 16 | def cross[Y](ys: Traversable[Y]) = for { x <- xs; y <- ys } yield (x, y) 17 | } 18 | 19 | val students = Seq("John", "Mike", "Matt") 20 | val subjects = Seq("Math", "Sci", "Geography", "History") 21 | val random = new Random(1) 22 | val data = (students cross subjects).map { x => Row(x._1, x._2, random.nextInt(100)) }.toSeq 23 | 24 | data.foreach { x => println(x)} 25 | 26 | // Create Schema Object 27 | import org.apache.spark.sql.types.{ StructType, StructField, IntegerType, StringType } 28 | val schema = StructType(Array( 29 | StructField("student", StringType, nullable = false), 30 | StructField("subject", StringType, nullable = false), 31 | StructField("score", IntegerType, nullable = false))) 32 | 33 | // Create DataFrame 34 | val rdd = sc.parallelize(data) 35 | val df = sqlContext.createDataFrame(rdd, schema) 36 | // Define udf 37 | import org.apache.spark.sql.functions.udf 38 | def udfScoreToCategory = udf((score: Int) => { 39 | score match { 40 | case t if t >= 80 => "A" 41 | case t if t >= 60 => "B" 42 | case t if t >= 35 => 
"C" 43 | case _ => "D" 44 | } 45 | }) 46 | df.withColumn("category", udfScoreToCategory(df("score"))).show(10) 47 | } 48 | 49 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/customudf/SparkUDF.scala: -------------------------------------------------------------------------------- 1 | package com.spark.customudf 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | 6 | object SparkUDF { 7 | case class Purchase(customer_id: Int, purchase_id: Int, date: String, time: String, tz: String, amount: Double) 8 | 9 | def main(args: Array[String]) { 10 | 11 | val sc = new SparkContext(new SparkConf().setAppName("Spark-custom-UDF").setMaster("local[1]")) 12 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 13 | import sqlContext.implicits._ 14 | 15 | val x = sc.parallelize(Array( 16 | Purchase(123, 234, "2007-12-12", "20:50", "UTC", 500.99), 17 | Purchase(123, 247, "2007-12-12", "15:30", "PST", 300.22), 18 | Purchase(189, 254, "2007-12-13", "00:50", "EST", 122.19), 19 | Purchase(187, 299, "2007-12-12", "07:30", "UTC", 524.37))) 20 | 21 | val df = sqlContext.createDataFrame(x) 22 | df.registerTempTable("df") 23 | 24 | def makeDT(date: String, time: String, tz: String) = s"$date $time $tz" 25 | sqlContext.udf.register("makeDt", makeDT(_: String, _: String, _: String)) 26 | 27 | // Now we can use our function directly in SparkSQL. 28 | val res = sqlContext.sql("SELECT amount, makeDt(date, time, tz) from df").take(2) 29 | res.foreach { x => print(x) } 30 | 31 | // but not outsideit fails 32 | // df.select($"customer_id", makeDt($"date", $"time", $"tz"), $"amount").take(2) 33 | 34 | //You can see above that we can use it within SQL but not outside of it. 35 | //To do that we're going to have to create a different UDF using spark.sql.function.udf 36 | 37 | import org.apache.spark.sql.functions.udf 38 | val makeDt = udf(makeDT(_: String, _: String, _: String)) 39 | // now this works 40 | df.select($"customer_id", makeDt($"date", $"time", $"tz"), $"amount").take(2).foreach { x => print(x) } 41 | 42 | // In Spark version 1.5, functions to create date times were introduced. 43 | // Now we can leave our function the same however we're just going to create a format and wrap our MakeDT 44 | // function in the unix_timestampfunction call, we can do this both in and out of SparkSQL! 
45 | 46 | import org.apache.spark.sql.functions.unix_timestamp 47 | 48 | val fmt = "yyyy-MM-dd hh:mm z" 49 | df.select($"customer_id", unix_timestamp(makeDt($"date", $"time", $"tz"), fmt), $"amount").take(2).foreach { x => print(x) } 50 | 51 | sqlContext.sql(s"SELECT customer_id, unix_timestamp(makeDt(date, time, tz), '$fmt'), amount FROM df").take(2).foreach { x => print(x) } 52 | } 53 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/CustomPartitioner.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.Partitioner 5 | 6 | object CustomPartitioner { 7 | 8 | def main(args: Array[String]) { 9 | 10 | val sparkConf = new SparkConf().setAppName("Spark-Custom-Partitioner").setMaster("local[1]") 11 | val sc = new SparkContext(sparkConf) 12 | val inputFile = sc.textFile("E:/Software/Spark/data/partitioner.txt") 13 | 14 | //create paired RDD 15 | val pairedData = inputFile.flatMap(x => x.split(" ")).map(x => (x, 1)) 16 | 17 | //Define custom pertitioner for paired RDD 18 | val partitionedData = pairedData.partitionBy(new MyCustomerPartitioner(2)).map(f => f._1) 19 | 20 | //verify result using mapPartitionWithIndex 21 | val finalOut = partitionedData.mapPartitionsWithIndex { 22 | (partitionIndex, dataIterator) => dataIterator.map(dataInfo => (dataInfo + " is located in " + partitionIndex + " partition.")) 23 | } 24 | //Save Output in HDFS 25 | finalOut.saveAsTextFile("E:/Software/Spark/data/partitionOutput") 26 | 27 | } 28 | } 29 | class MyCustomerPartitioner(numParts: Int) extends Partitioner { 30 | override def numPartitions: Int = numParts 31 | 32 | override def getPartition(key: Any): Int = 33 | { 34 | val out = toInt(key.toString) 35 | out 36 | } 37 | 38 | override def equals(other: Any): Boolean = other match { 39 | case dnp: MyCustomerPartitioner => 40 | dnp.numPartitions == numPartitions 41 | case _ => 42 | false 43 | } 44 | 45 | def toInt(s: String): Int = 46 | { 47 | try { 48 | s.toInt 49 | 0 50 | } catch { 51 | case e: Exception => 1 52 | 53 | } 54 | } 55 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/CustomReceiver.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | 3 | import java.io.{ BufferedReader, InputStreamReader } 4 | import java.net.Socket 5 | import java.nio.charset.StandardCharsets 6 | 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.storage.StorageLevel 9 | import org.apache.spark.streaming.{ Seconds, StreamingContext } 10 | import org.apache.spark.streaming.receiver.Receiver 11 | 12 | object CustomReceiver { 13 | def main(args: Array[String]) { 14 | if (args.length < 2) { 15 | System.err.println("Usage: CustomReceiver ") 16 | System.exit(1) 17 | } 18 | 19 | // Create the context with a 10 second batch size 20 | val sparkConf = new SparkConf().setAppName("Spark-CustomReceiver") 21 | val ssc = new StreamingContext(sparkConf, Seconds(10)) 22 | 23 | // Create an input stream with the custom receiver on target ip:port and count the 24 | // words in input stream of \n delimited text (eg. 
generated by 'nc') 25 | val lines = ssc.receiverStream(new CustomReceiver(args(0), args(1).toInt)) 26 | 27 | 28 | val words = lines.flatMap(_.split(" ")) 29 | val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) 30 | wordCounts.print() 31 | ssc.start() 32 | ssc.awaitTermination() 33 | } 34 | } 35 | 36 | class CustomReceiver(host: String, port: Int) 37 | extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) { 38 | 39 | def onStart() { 40 | // Start the thread that receives data over a connection 41 | new Thread("Socket Receiver") { 42 | override def run() { receive() } 43 | }.start() 44 | } 45 | 46 | def onStop() { 47 | // There is nothing much to do as the thread calling receive() 48 | // is designed to stop by itself isStopped() returns false 49 | } 50 | 51 | /** Create a socket connection and receive data until receiver is stopped */ 52 | private def receive() { 53 | var socket: Socket = null 54 | var userInput: String = null 55 | try { 56 | socket = new Socket(host, port) 57 | val reader = new BufferedReader( 58 | new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8)) 59 | userInput = reader.readLine() 60 | while (!isStopped && userInput != null) { 61 | store(userInput) 62 | userInput = reader.readLine() 63 | } 64 | reader.close() 65 | socket.close() 66 | println("Stopped receiving") 67 | restart("Trying to connect again") 68 | } catch { 69 | case e: java.net.ConnectException => 70 | restart("Error connecting to " + host + ":" + port, e) 71 | case t: Throwable => 72 | restart("Error receiving data", t) 73 | } 74 | } 75 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/DataFrame.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql.functions._ 5 | import com.spark.util._ 6 | object DataFrame { 7 | case class Employee(empid: Int, name: String, dept: String, salary: Int, nop: Int) 8 | case class AggregatedEmpData(empid: Int, name: String, dept: String, sumsalary: Long, sumnop: Long, maxsalary: Int, date: String) 9 | def main(args: Array[String]) { 10 | val conf = new SparkConf().setAppName("Spark-DataFrame").setMaster("local[1]") 11 | val sc = new SparkContext(conf) 12 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 13 | import sqlContext.implicits._ 14 | 15 | val empDataRDD = sc.textFile(Utills.DATA_PATH +"emp.txt") //path to emp.txt 16 | val dropHeaderRDD = empDataRDD.mapPartitions(_.drop(1)) //remove the header information from the file 17 | 18 | /*println(dropHeaderRDD.first()) 19 | 20 | val df = empDataRDD.toDF("line") 21 | val errors = df.filter(col("line").like("%Revanth1%")) 22 | println(errors.count())*/ 23 | 24 | val empDF = dropHeaderRDD.filter { lines => lines.length() > 0 }. 25 | map(_.split("\\|")). 26 | map(p => Employee(p(0).trim.toInt, p(1), p(2), p(3).trim.toInt, p(4).trim.toInt)).toDF() 27 | 28 | empDF.show() 29 | /*val whereCond = "dept = 'DPE' and salary > 1000 or dept = 'MGF' and salary > 5000" 30 | val res = empDF.select("empid", "name", "salary", "dept").where(whereCond) 31 | res.show()*/ 32 | 33 | //Spark Aggregations 34 | val aggDF = empDF.groupBy("empid", "name", "dept"). 35 | agg(sum(empDF.col("salary")), sum(empDF.col("nop")), max(empDF.col("salary"))) 36 | aggDF.printSchema() 37 | 38 | //Adding extra column at the end .. 
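    // Note: in Spark 1.5, DataFrame.map(Row => T) returns an RDD[AggregatedEmpData], not a DataFrame.
    // A sketch that keeps the result as a DataFrame instead (functions._ is already imported):
    //   val finalDF = aggDF.withColumn("date", lit(Utills.getTime()))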
39 | val finalDF = aggDF.map(row => AggregatedEmpData(row.getInt(0), row.getString(1), row.getString(2), row.getLong(3), row.getLong(4), row.getInt(5), Utills.getTime())) 40 | println(finalDF.first()) 41 | 42 | //Saving data as text file 43 | aggDF.rdd.coalesce(1, false).saveAsTextFile("F:/Software/Spark/data/aggData/" + Utills.getTime()) 44 | 45 | empDF.groupBy("empid").agg(max(empDF.col("salary"))).show() 46 | empDF.select(max($"salary")).show() 47 | 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/KafkaConsumer.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.streaming.Seconds 5 | import org.apache.spark.streaming.StreamingContext 6 | import org.apache.spark.streaming.dstream.InputDStream 7 | import org.apache.spark.streaming.kafka.KafkaUtils 8 | import java.util.Properties 9 | import kafka.producer.KeyedMessage 10 | import kafka.producer.Producer 11 | import kafka.producer.ProducerConfig 12 | import kafka.serializer.StringDecoder 13 | object KafkaConsumer { 14 | 15 | def main(args: Array[String]) { 16 | try { 17 | val Array(brokerList, topics) = args 18 | val sc = new SparkContext(new SparkConf().setAppName("KafkaConsumer-Streaming").setMaster("local[2]")) 19 | val ssc = new StreamingContext(sc, Seconds(5)) 20 | val topicsSet = topics.split(",").toSet 21 | val kafkaParams = Map[String, String]("metadata.broker.list" -> brokerList) 22 | 23 | val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder]( 24 | ssc, kafkaParams, topicsSet).map(_._2) 25 | 26 | messages.foreachRDD(x => { 27 | if (!x.isEmpty()) { 28 | x.foreach { x => println(x) } 29 | println("--------------------------------------------------------") 30 | println(x.first()) 31 | }else{ 32 | println("Data is not received from the producer") 33 | } 34 | }) 35 | ssc.start() 36 | ssc.awaitTermination() 37 | 38 | } catch { 39 | case ex: Exception => { 40 | println(ex.printStackTrace()) 41 | } 42 | } 43 | } 44 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/KafkaConsumerToCassandra.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Minutes 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.kafka.KafkaUtils 10 | import com.datastax.driver.core.Cluster 11 | import com.datastax.spark.connector.SomeColumns 12 | import com.datastax.spark.connector.toNamedColumnRef 13 | import com.datastax.spark.connector.toRDDFunctions 14 | 15 | import kafka.serializer.StringDecoder 16 | object KafkaConsumerToCassandra { 17 | val SLIDE_INTERVAL = 1 18 | def startStreaming(args: Array[String]): Unit = { 19 | try { 20 | val Array(zkQuorum, topics) = args 21 | val sc = new SparkContext(new SparkConf().setAppName("Spark-Kafka-Streaming").setMaster("local[2]").set("spark.cassandra.connection.host", "127.0.0.1")) 22 | val ssc = new StreamingContext(sc, Minutes(SLIDE_INTERVAL)) 23 | val topicsSet = topics.split(",").toSet 24 | val kafkaParams = Map[String, String]("metadata.broker.list" -> zkQuorum) 25 | val 
messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder]( 26 | ssc, kafkaParams, topicsSet).map(_._2).map(line => line.split('|')) 27 | 28 | val cluster = Cluster.builder().addContactPoint("127.0.0.1").build() 29 | //Creating Session object 30 | val session = cluster.connect() 31 | session.execute("CREATE KEYSPACE IF NOT EXISTS spark_kafka_cassandra WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 };"); 32 | val query = "CREATE TABLE IF NOT EXISTS spark_kafka_cassandra.employee (id int PRIMARY KEY,name VARCHAR, salary int);" 33 | //Executing the query 34 | session.execute(query) 35 | 36 | messages.foreachRDD( 37 | rdd => { 38 | if (!rdd.isEmpty()) { 39 | println(rdd.first()) 40 | println("rdd count " + rdd.count()) 41 | val resRDD = rdd.map(line => (line(0), line(1), line(2))) 42 | .saveToCassandra("spark_kafka_cassandra", "employee", SomeColumns("id", "name", "salary")) 43 | } else { 44 | println("Data is not yet recevied from the producer....") 45 | } 46 | }) 47 | ssc.start() 48 | ssc.awaitTermination() 49 | } catch { 50 | case ex: Exception => { 51 | println(ex.getMessage) 52 | } 53 | } 54 | } 55 | 56 | def main(args: Array[String]) { 57 | /*if (args.length < 2) { 58 | System.err.println("Usage: KafkaConsumer ") 59 | System.exit(1) 60 | }*/ 61 | startStreaming(args) 62 | } 63 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/KafkaConsumerToHDFS.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | 3 | import org.apache.hadoop.conf.Configuration 4 | import org.apache.hadoop.fs.FileSystem 5 | import org.apache.hadoop.fs.Path 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.SparkContext 8 | import org.apache.spark.streaming.Minutes 9 | import org.apache.spark.streaming.StreamingContext 10 | import org.apache.spark.streaming.kafka.KafkaUtils 11 | import com.spark.util._ 12 | import kafka.serializer.StringDecoder 13 | object KafkaConsumerToHDFS { 14 | private val conf = new Configuration() 15 | val fs = FileSystem.get(conf) 16 | private val hdfsCoreSitePath = new Path("/home/centos/hadoop-2.6.0/etc/hadoop/core-site.xml") 17 | conf.addResource(hdfsCoreSitePath) 18 | val uri = conf.get("fs.default.name") 19 | val SLIDE_INTERVAL = 1 20 | def startStreaming(args: Array[String]): Unit = { 21 | try { 22 | val Array(zkQuorum, topics) = args 23 | val sc = new SparkContext(new SparkConf().setAppName("Spark-Kafka-Streaming").setMaster("local[2]")) 24 | val ssc = new StreamingContext(sc, Minutes(SLIDE_INTERVAL)) 25 | val topicsSet = topics.split(",").toSet 26 | val kafkaParams = Map[String, String]("metadata.broker.list" -> zkQuorum) 27 | val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder]( 28 | ssc, kafkaParams, topicsSet).map(_._2) 29 | println("Messages.count()" + messages.count()) 30 | messages.foreachRDD( 31 | rdd => { 32 | if (!rdd.isEmpty()) { 33 | println(rdd.first()) 34 | println("rdd count " + rdd.count()) 35 | println("URI = " + uri) 36 | val hdfsPath = uri + "/user/data/" + Utills.getTime() 37 | println("HDFS Path = " + hdfsPath) 38 | rdd.saveAsTextFile(hdfsPath) 39 | } else { 40 | println("Data is not yet recevied from the producer....") 41 | } 42 | }) 43 | ssc.start() 44 | ssc.awaitTermination() 45 | } catch { 46 | case ex: Exception => { 47 | println(ex.getMessage) 48 | } 49 | } 50 | } 51 | 52 | def main(args: Array[String]) { 53 | 
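    // The two expected arguments are forwarded to startStreaming above: a Kafka broker list
    // (named zkQuorum there but used as "metadata.broker.list") and a comma-separated topic list.
    // A hypothetical invocation (jar name, broker, and topic are placeholders):
    //   spark-submit --class com.spark.examples.KafkaConsumerToHDFS spark-examples.jar localhost:9092 test-topic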
/*if (args.length < 2) { 54 | System.err.println("Usage: KafkaConsumer ") 55 | System.exit(1) 56 | }*/ 57 | startStreaming(args) 58 | } 59 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/KafkaProducer.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | import java.util.Properties 3 | 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | 7 | import kafka.producer.KeyedMessage 8 | import kafka.producer.Producer 9 | import kafka.producer.ProducerConfig 10 | import scala.util.Random 11 | object KafkaProducer { 12 | def main(args: Array[String]) { 13 | val conf = new SparkConf().setAppName("Spark-Kafka-Producer").setMaster("local[1]") 14 | val sc = new SparkContext(conf) 15 | val Array(zkQuorum, topic) = args 16 | val props: Properties = new Properties() 17 | // props.put("metadata.broker.list", "10.220.11.171:9092") 18 | props.put("metadata.broker.list", zkQuorum) 19 | props.put("serializer.class", "kafka.serializer.StringEncoder") 20 | 21 | val config = new ProducerConfig(props) 22 | val producer = new Producer[String, String](config) 23 | var events = 0; 24 | var totalEvents = 10; 25 | // for loop execution with a range 26 | for (index <- 1 to totalEvents) { 27 | val salary = Random.nextInt(500000); 28 | val empId = Random.nextInt(1000); 29 | val empName = "Revanth-" + empId 30 | val msg = empId + "|" + empName + "|" + salary; 31 | producer.send(new KeyedMessage[String, String](topic, msg)) 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/MovingAverage.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql.expressions.Window 5 | import org.apache.spark.sql.functions._ 6 | 7 | object MovingAverage { 8 | 9 | def main(args: Array[String]) { 10 | 11 | val sc = new SparkContext(new SparkConf().setAppName("Spark-MovingAverage").setMaster("local[1]") 12 | //.set("spark.sql.warehouse.dir", "file:///E:/MyStuff/HadoopProj/Scala/WorkSpace/Spark/spark-warehouse") 13 | ) 14 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 15 | import sqlContext.implicits._ 16 | val customers = sc.parallelize(List( 17 | ("Alice", "2016-05-01", 50.00), 18 | ("Alice", "2016-05-03", 45.00), 19 | ("Alice", "2016-05-04", 55.00), 20 | ("Bob", "2016-05-01", 25.00), 21 | ("Bob", "2016-05-04", 29.00), 22 | ("Bob", "2016-05-06", 27.00))). 23 | toDF("name", "date", "amountSpent") 24 | 25 | // Create a window spec. 26 | val wSpec1 = Window.partitionBy("name").orderBy("date").rowsBetween(-1, 1) 27 | 28 | // Calculate the moving average 29 | customers.withColumn("movingAvg", 30 | avg(customers("amountSpent")).over(wSpec1)).show() 31 | 32 | val wSpec2 = Window.partitionBy("name").orderBy("date").rowsBetween(Long.MinValue, 0) 33 | 34 | // Create a new column which calculates the sum over the defined window frame. 35 | customers.withColumn("cumSum", 36 | sum(customers("amountSpent")).over(wSpec2)).show() 37 | 38 | // Window spec. No need to specify a frame in this case. 39 | val wSpec3 = Window.partitionBy("name").orderBy("date") 40 | 41 | // Use the lag function to look backwards by one row. 
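    // lag(col, 1) returns the previous row's value within each partition (ordered by date);
    // the first row of each name has no predecessor, so prevAmountSpent is null there.
    // For example, Alice's rows yield prevAmountSpent = null, 50.00, 45.00.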
42 | customers.withColumn("prevAmountSpent", 43 | lag(customers("amountSpent"), 1).over(wSpec3)).show() 44 | 45 | customers.withColumn("rank", rank().over(wSpec3)).show() 46 | } 47 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/ReadHDFSFolders.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | 3 | import org.apache.hadoop.conf.Configuration 4 | import org.apache.hadoop.fs.FileSystem 5 | import org.apache.hadoop.fs.Path 6 | import org.apache.hadoop.fs.FileStatus 7 | import org.apache.hadoop.fs.FileUtil 8 | 9 | import org.apache.spark.SparkConf 10 | import org.apache.spark.SparkContext 11 | 12 | object ReadHDFSFolders { 13 | private val conf = new Configuration() 14 | val fs = FileSystem.get(conf) 15 | val uri = conf.get("fs.default.name") 16 | 17 | def main(args: Array[String]) { 18 | val sc = new SparkContext(new SparkConf().setAppName("Spark-ReadHDFSFolders").setMaster("local[1]")) 19 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 20 | 21 | //Hdfs folder path 22 | var DATA_PATH = "/user/data/stocks" 23 | 24 | //No of Hdfs folders to read 25 | val intervalCount = 3 26 | 27 | var fileStatus: Array[FileStatus] = fs.listStatus(new Path(uri + DATA_PATH)) 28 | var paths: Array[Path] = FileUtil.stat2Paths(fileStatus) 29 | 30 | var filesWithInterval = getHDFSFoldersBasedOnModtime(intervalCount, fileStatus) 31 | 32 | if (fileStatus != null && filesWithInterval.length >= intervalCount) { 33 | val dataframeArray = filesWithInterval.map(folder => { 34 | sqlContext.read.parquet(folder.getPath.toString) 35 | }) 36 | 37 | //Union all the folders and form a single data frame. 38 | val combinedDataFrame = dataframeArray.reduce((x, y) => x.unionAll(y)) 39 | 40 | combinedDataFrame.printSchema() 41 | 42 | println("First Record --> " + combinedDataFrame.first()) 43 | } 44 | 45 | } 46 | 47 | //get the folders from HDFS based on the count provided. 
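  // Sorts by modification time (newest first) and keeps the first intervalCount entries;
  // the prepend inside the loop reverses their order, which does not matter for the union above.
  // A shorter equivalent (a sketch): sortedList.take(intervalCount).toArray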
48 | def getHDFSFoldersBasedOnModtime(intervalCount: Int, fileStatus: Array[FileStatus]): Array[FileStatus] = { 49 | var sortedList: List[FileStatus] = fileStatus.toList.sortWith(_.getModificationTime > _.getModificationTime) 50 | var returnList: List[FileStatus] = List() 51 | var itr: Int = 0 52 | var iterator = sortedList.iterator 53 | while (iterator.hasNext) { 54 | var value = iterator.next() 55 | if (itr < intervalCount) { 56 | returnList = returnList.::(value) 57 | itr = itr + 1 58 | } 59 | } 60 | returnList.toArray 61 | } 62 | 63 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/ReadMultipleFiles.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql.functions._ 5 | object ReadMultipleFiles { 6 | case class Employee(empid: Int, name: String, dept: String, salary: Int, nop: Int) 7 | 8 | def main(args: Array[String]) { 9 | val conf = new SparkConf().setAppName("Spark-ReadMultipleFiles").setMaster("local[1]") 10 | val sc = new SparkContext(conf) 11 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 12 | import sqlContext.implicits._ 13 | 14 | //Directory structure 15 | // employee/folder1/emp.txt (or) employee/emp.txt,emp1.txt 16 | // employee/folder2/emp1.txt (or) employee/folder2/emp1.txt,emp2.txt 17 | 18 | val empDataRDD = sc.textFile("E:/employee/*").coalesce(1, false) 19 | 20 | val filteredRDD = empDataRDD.filter(line => !line.contains("empid")) //removing the header section 21 | 22 | val empDF = filteredRDD.filter { lines => lines.length() > 0 }. 23 | map(_.split("\\|")). 24 | map(p => Employee(p(0).trim.toInt, p(1), p(2), p(3).trim.toInt, p(4).trim.toInt)).toDF() 25 | 26 | empDF.show() 27 | 28 | //val empDataRDD1 = sc.wholeTextFiles("E:/test/*") 29 | //empDataRDD1.collect().foreach { x => println(x._2) } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/SparkFileStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.streaming.{ Seconds, StreamingContext } 4 | import StreamingContext._ 5 | import org.apache.hadoop.conf._ 6 | import org.apache.hadoop.fs._ 7 | import org.apache.hadoop.io.LongWritable 8 | import org.apache.hadoop.io.Text 9 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 10 | object SparkFileStreaming { 11 | def main(args: Array[String]) { 12 | val sparkConf = new SparkConf().setAppName("Spark-FileStreaming").setMaster("local[2]") 13 | // Create the context 14 | val ssc = new StreamingContext(sparkConf, Seconds(3)) 15 | 16 | // Create the FileInputDStream on the directory and use the 17 | val lines = ssc.textFileStream("hdfs://sandbox.hortonworks.com:8020/user/data/") 18 | val words = lines.flatMap(_.split(" ")) 19 | val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) 20 | wordCounts.print() 21 | ssc.start() 22 | ssc.awaitTermination() 23 | } 24 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/SparkJDBC.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | 3 | import org.apache.spark.SparkConf 4 | import 
org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD 6 | 7 | object SparkJDBC { 8 | 9 | def getDetails(sc: SparkContext): Unit = { 10 | 11 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 12 | import sqlContext.implicits._ 13 | val url = "jdbc:oracle:thin:@localhost:1521:XE" 14 | val prop = new java.util.Properties 15 | prop.setProperty("driver", "oracle.jdbc.driver.OracleDriver"); 16 | prop.setProperty("user", "root") 17 | prop.setProperty("password", "root") 18 | val employee = sqlContext.read.jdbc(url, "emp", prop) 19 | employee.cache() 20 | employee.registerTempTable("emp") 21 | 22 | sqlContext.sql("select * from emp where NAME like 'HARI%' ").show() 23 | 24 | employee.select("EMPID", "NAME", "SALARY").show() 25 | 26 | employee.filter(employee("SALARY") > 7000).show() 27 | 28 | employee.groupBy("NAME").count().show() 29 | 30 | sc.stop() 31 | 32 | } 33 | 34 | def main(args: Array[String]) { 35 | val conf = new SparkConf().setAppName("Spark-JDBC").setMaster("local[1]") 36 | val sc = new SparkContext(conf); 37 | 38 | getDetails(sc) 39 | 40 | } 41 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/SparkStructType.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql.functions._ 5 | import org.apache.spark.sql.types.{ StructType, StructField, StringType, IntegerType } 6 | import com.spark.util._ 7 | object SparkStructType extends LogHelper { 8 | def main(args: Array[String]) { 9 | logger.info("SparkStructType.main()") 10 | val conf = new SparkConf().setAppName("Spark-StructType-Example").setMaster("local[1]") 11 | val sc = new SparkContext(conf) 12 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 13 | val person = sc.textFile(Utills.DATA_PATH +"person.txt") 14 | 15 | val schema = StructType(Array(StructField("firstName", StringType, true), StructField("lastName", StringType, true), StructField("age", IntegerType, true))) 16 | val rowRDD = person.map(_.split(",")).map(p => org.apache.spark.sql.Row(p(0), p(1), p(2).toInt)) 17 | val personDF = sqlContext.createDataFrame(rowRDD, schema) 18 | personDF.registerTempTable("person") 19 | sqlContext.sql("select * from person").foreach(println) 20 | 21 | //saving as parquet file 22 | val path = Utills.DATA_PATH +"person-" + Utills.getTime() 23 | personDF.coalesce(1).write.parquet(path) 24 | 25 | //saving DataFrame as Text file 26 | //personDF.rdd.coalesce(1, false).saveAsTextFile(path) 27 | 28 | //reading a parquet file 29 | val parqfileDF = sqlContext.read.parquet(path) 30 | parqfileDF.filter("age > 25").show() 31 | val df = parqfileDF.groupBy("firstName", "lastName").agg(sum(parqfileDF.col("age"))) 32 | df.show() 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/Spark_Avro.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql.functions._ 5 | import com.databricks.spark.avro._ 6 | import com.spark.util._ 7 | object Spark_Avro { 8 | case class Employee(empid: Int, name: String, dept: String, salary: Int, nop: Int) 9 | def main(args: Array[String]) { 10 | val conf = new 
SparkConf().setAppName("Spark-Avro").setMaster("local[1]") 11 | val sc = new SparkContext(conf) 12 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 13 | // configuration to use deflate compression 14 | sqlContext.setConf("spark.sql.avro.compression.codec", "deflate") 15 | sqlContext.setConf("spark.sql.avro.deflate.level", "5") 16 | import sqlContext.implicits._ 17 | 18 | val empDataRDD = sc.textFile(Utills.DATA_PATH + "emp.txt") //path to emp.txt 19 | val dropHeaderRDD = empDataRDD.mapPartitions(_.drop(1)) //remove the header information from the file 20 | 21 | val empDF = dropHeaderRDD.filter { lines => lines.length() > 0 }. 22 | map(_.split("\\|")). 23 | map(p => Employee(p(0).trim.toInt, p(1), p(2), p(3).trim.toInt, p(4).trim.toInt)).toDF() 24 | 25 | empDF.show() 26 | 27 | //write as avro file. 28 | empDF.write.avro("/user/data/Emp_avro") 29 | 30 | //reading from avro file. 31 | val df = sqlContext.read.avro("/user/data/Emp_avro") 32 | df.filter("salary > 1000").show() 33 | 34 | //Writing Partitioned Data 35 | val moviesDF = Seq( 36 | (2012, 8, "Batman", 9.8), 37 | (2012, 8, "Hero", 8.7), 38 | (2012, 7, "Robot", 5.5), 39 | (2011, 7, "Git", 2.0)).toDF("year", "month", "title", "rating") 40 | 41 | moviesDF.write.partitionBy("year", "month").avro("/user/data/movies") 42 | 43 | //Reading Partitioned Data 44 | val resultDF = sqlContext.read.avro("/user/data/movies") 45 | resultDF.printSchema() 46 | resultDF.filter("year = 2011").collect().foreach(println) 47 | } 48 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/Spark_CSV_Reader.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql.functions._ 5 | import com.spark.util._ 6 | object Spark_CSV_Reader { 7 | def main(args: Array[String]) { 8 | val conf = new SparkConf().setAppName("Spark-CSV-Example").setMaster("local[1]") 9 | val sc = new SparkContext(conf) 10 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 11 | val auctionDF = sqlContext.read 12 | .format("com.databricks.spark.csv") 13 | .option("header", "true") // Use first line of all files as header 14 | .option("inferSchema", "true") // Automatically infer data types 15 | .load(Utills.DATA_PATH +"ebay.csv") 16 | //auctionDF.printSchema() 17 | //auctionDF.select("auctionid", "bidder").show 18 | 19 | // How many auctions were held? 20 | val count = auctionDF.select("auctionid").distinct.count 21 | println("Distinct items : " + count) 22 | // How many bids per item? 23 | auctionDF.groupBy("auctionid", "item").count.sort("auctionid").show 24 | 25 | // What's the min number of bids per item? what's the average? what's the max? 26 | auctionDF.groupBy("item", "auctionid").count.agg(min("count"), avg("count"), max("count")).show 27 | // Get the auctions with closing price > 100 28 | auctionDF.filter("price > 100").sort("auctionid").show 29 | 30 | // register the DataFrame as a temp table 31 | auctionDF.registerTempTable("auction") 32 | // SQL statements can be run 33 | // How many bids per auction? 
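// The same aggregation can also be written with the DataFrame API; a minimal sketch, assuming the auctionDF above:
//   auctionDF.groupBy("auctionid", "item").agg(count("bid").as("BidCount")).show()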
34 | val results = sqlContext.sql("SELECT auctionid, item, count(bid) as BidCount FROM auction GROUP BY auctionid, item") 35 | // display dataframe in a tabular format 36 | results.sort("auctionid").show() 37 | 38 | sqlContext.sql("SELECT auctionid,item, MAX(price) as MaxPrice FROM auction GROUP BY item,auctionid").sort("auctionid").show() 39 | } 40 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/Spark_Hive.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | object Spark_Hive { 6 | def main(args: Array[String]) { 7 | val conf = new SparkConf().setAppName("Spark_Hive").setMaster("local[1]") 8 | val sc = new SparkContext(conf) 9 | 10 | //create hive context 11 | val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc) 12 | 13 | //Create Table and load data 14 | hiveContext.sql("CREATE EXTERNAL TABLE IF NOT EXISTS users(id INT, name STRING, age INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n'") 15 | hiveContext.sql("LOAD DATA LOCAL INPATH '/hdp/dev/hive/users.txt' INTO TABLE users") //specify path to file accordingly 16 | 17 | val result = hiveContext.sql("FROM users SELECT id, name, age").show() 18 | 19 | val rddFromSql = hiveContext.sql("SELECT id,name,age FROM users WHERE age > 25 ORDER BY age") 20 | rddFromSql.show() 21 | 22 | // The results of SQL queries are themselves RDDs and support all normal RDD functions.The 23 | // items in the RDD are of type Row, which allows you to access each column by ordinal. 24 | println("Result of RDD.map:") 25 | val rddAsStrings = rddFromSql.rdd.map { 26 | case Row(id: Int, name: String, age: Int) => s"Id: $id, Name: $name, Age: $age" 27 | } 28 | rddAsStrings.foreach { x => println(x) } 29 | 30 | // Aggregation queries are also supported. 
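// Other HiveQL aggregates follow the same pattern; a minimal sketch against the same users table (id, name, age):
//   hiveContext.sql("SELECT age, COUNT(*) AS cnt FROM users GROUP BY age").show()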
31 | val count = hiveContext.sql("SELECT COUNT(*) FROM users").collect().head.getLong(0) 32 | println(s"count is : $count") 33 | 34 | // Queries are expressed in HiveQL 35 | println("Result of 'SELECT *': ") 36 | hiveContext.sql("SELECT * FROM users").collect().foreach(println) 37 | 38 | } 39 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/Spark_Hive_ORC.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql.hive.orc._ 5 | import org.apache.spark.sql._ 6 | object Spark_Hive_ORC { 7 | case class YahooStockPrice(date: String, open: Float, high: Float, low: Float, close: Float, volume: Integer, adjClose: Float) 8 | def main(args: Array[String]) { 9 | val conf = new SparkConf().setAppName("Spark_Hive_ORC").setMaster("local[1]") 10 | val sc = new SparkContext(conf) 11 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 12 | import sqlContext.implicits._ 13 | 14 | //create hive context 15 | val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc) 16 | 17 | //Create ORC Table and load data 18 | hiveContext.sql("create EXTERNAL table yahoo_orc_table (date STRING, open_price FLOAT, high_price FLOAT, low_price FLOAT, close_price FLOAT, volume INT, adj_price FLOAT) stored as orc") 19 | 20 | val yahoo_stocks = sc.textFile("hdfs://sandbox.hortonworks.com:8020/user/data/yahoo_stocks.csv") 21 | 22 | //filter out the header information 23 | val header = yahoo_stocks.first 24 | val data = yahoo_stocks.filter(_ != header) 25 | 26 | //Map the data to a case class and register it as a temp table. 27 | val stockprice = data.map(_.split(",")).map(row => YahooStockPrice(row(0), row(1).trim.toFloat, row(2).trim.toFloat, row(3).trim.toFloat, row(4).trim.toFloat, row(5).trim.toInt, row(6).trim.toFloat)).toDF() 28 | stockprice.registerTempTable("yahoo_stocks_temp") 29 | val results = sqlContext.sql("SELECT * FROM yahoo_stocks_temp") 30 | 31 | results.map(t => "Stock Entry: " + t.toString).collect().foreach(println) 32 | 33 | //save the data to HDFS in ORC file format. 34 | results.coalesce(1).write.format("orc").save("/user/data/yahoo_stocks_orc") 35 | 36 | //load the data in ORC format to visualize it . 37 | val yahoo_stocks_orc = hiveContext.read.format("orc").load("/user/data/yahoo_stocks_orc") 38 | yahoo_stocks_orc.registerTempTable("orcTest") 39 | hiveContext.sql("SELECT * from orcTest").collect.foreach(println) 40 | 41 | //load the ORC data in to ORC hive table created at the top. 
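// Note: LOAD DATA INPATH (without LOCAL) moves the ORC files from /user/data/yahoo_stocks_orc into the
// table's warehouse location rather than copying them, so the source directory is left empty afterwards.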
42 | hiveContext.sql("LOAD DATA INPATH '/user/data/yahoo_stocks_orc' INTO TABLE yahoo_orc_table") 43 | val orcResults = hiveContext.sql("FROM yahoo_orc_table SELECT date, open_price,high_price") 44 | orcResults.show 45 | 46 | } 47 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/Spark_Joins.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.rdd.RDD 5 | 6 | object Spark_Joins { 7 | def main(args: Array[String]) { 8 | val conf = new SparkConf().setAppName("Spark-Joins").setMaster("local[2]") 9 | val sc = new SparkContext(conf) 10 | // Create emp RDD 11 | val emp = sc.parallelize(Seq((1, "revanth", 10), (2, "dravid", 20), (3, "kiran", 30), (4, "nanda", 35), (5, "kishore", 30))) 12 | 13 | // Create dept RDD 14 | val dept = sc.parallelize(Seq(("hadoop", 10), ("spark", 20), ("hive", 30), ("sqoop", 40))) 15 | 16 | // Establishing that the third field is to be considered as the Key for the emp RDD 17 | val manipulated_emp = emp.keyBy(t => t._3) 18 | 19 | // Establishing that the second field need to be considered as the Key for dept RDD 20 | val manipulated_dept = dept.keyBy(t => t._2) 21 | 22 | // Inner Join 23 | val join_data = manipulated_emp.join(manipulated_dept) 24 | 25 | // Left Outer Join 26 | val left_outer_join_data = manipulated_emp.leftOuterJoin(manipulated_dept) 27 | left_outer_join_data.collect().foreach(f => println(f)) 28 | 29 | // Right Outer Join 30 | val right_outer_join_data = manipulated_emp.rightOuterJoin(manipulated_dept) 31 | right_outer_join_data.collect().foreach(f => println(f)) 32 | 33 | // Full Outer Join 34 | val full_outer_join_data = manipulated_emp.fullOuterJoin(manipulated_dept) 35 | full_outer_join_data.collect().foreach(f => println(f)) 36 | 37 | // Formatting the Joined Data for better understandable (using map) 38 | val cleaned_joined_data = join_data.map(t => (t._2._1._1, t._2._1._2, t._1, t._2._2._1)) 39 | 40 | cleaned_joined_data.collect().foreach(f => println(f)) 41 | } 42 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/Spark_Json_Reader.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql.functions._ 5 | import com.spark.util._ 6 | object Spark_Json_Reader { 7 | def main(args: Array[String]) { 8 | val conf = new SparkConf().setAppName("Spark_Json_Reader").setMaster("local[1]") 9 | val sc = new SparkContext(conf) 10 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 11 | 12 | val path = Utills.DATA_PATH +"sales.json" 13 | val salesDF = sqlContext.read.json(path) 14 | salesDF.registerTempTable("sales") 15 | val aggDF = sqlContext.sql("select sum(amountPaid) from sales") 16 | println(aggDF.collectAsList()) 17 | 18 | val results = sqlContext.sql("SELECT customerId,itemName FROM sales ORDER BY itemName") 19 | // display dataframe in a tabular format 20 | results.show() 21 | } 22 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/Spark_SequenceFiles.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | 
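// Demonstrates writing a pair RDD of (String, String) out as a Hadoop SequenceFile and reading it back via sc.sequenceFile.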
3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD 6 | 7 | object Spark_SequenceFiles { 8 | 9 | def main(args: Array[String]) { 10 | 11 | val sc = new SparkContext(new SparkConf().setAppName("Spark-Sequence-Files").setMaster("local[1]")) 12 | 13 | val data = sc.textFile("file:////data/Spark/spark-scala/src/main/resources/olympics_data.txt") 14 | 15 | data.map(x => x.split(",")).map(x => (x(1).toString(), x(2).toString())).foreach(f => print(f)) 16 | 17 | val pairs: RDD[(String, String)] = data.map(x => x.split(",")).map(x => (x(1).toString(), x(2).toString())) 18 | 19 | pairs.saveAsSequenceFile("/data/spark/rdd_to_seq") 20 | 21 | //Loading sequenceFiles into an RDD in Spark 22 | 23 | val data1: RDD[(String, String)] = sc.sequenceFile("/data/spark/rdd_to_seq") 24 | 25 | data1.take(5).foreach(f => print(f)) 26 | } 27 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/Spark_StructType.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.sql._ 6 | import org.apache.spark.sql.types._ 7 | import com.spark.util._ 8 | object Spark_StructType { 9 | def main(args: Array[String]) { 10 | val conf = new SparkConf().setAppName("Spark_StructType_Example").setMaster("local[1]") 11 | val sc = new SparkContext(conf) 12 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 13 | import sqlContext.implicits._ 14 | 15 | val people = sc.textFile(Utills.DATA_PATH +"person.txt") 16 | val schemaString = "firstName lastName age" 17 | 18 | val schema = StructType(schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true))) 19 | val rowRDD = people.map(_.split(",")).map(p => Row(p(0), p(1), p(2).trim)) 20 | val peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema) 21 | 22 | peopleDataFrame.registerTempTable("people") 23 | 24 | val results = sqlContext.sql("SELECT firstName,age FROM people") 25 | 26 | results.map(t => "Name: " + t(0) + "," + "Age: " + t(1)).collect().foreach(println) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/Spark_XML.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql.functions._ 5 | object Spark_XML { 6 | 7 | //Reference ---> https://github.com/databricks/spark-xml 8 | 9 | def main(args: Array[String]) { 10 | 11 | val conf = new SparkConf().setAppName("Spark_XML_Parsing").setMaster("local[1]") 12 | val sc = new SparkContext(conf) 13 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 14 | import sqlContext.implicits._ 15 | 16 | val df = sqlContext.read.format("com.databricks.spark.xml") 17 | .option("rowTag", "book").load("/user/data/books.xml") 18 | 19 | val selectedData = df.select("author", "title", "_id") 20 | selectedData.show() 21 | 22 | //You can manually specify the schema when reading data: 23 | 24 | import org.apache.spark.sql.SQLContext 25 | import org.apache.spark.sql.types.{ StructType, StructField, StringType, DoubleType }; 26 | 27 | val customSchema = StructType(Array( 28 | StructField("_id", StringType, nullable = true), 29 | StructField("author", 
StringType, nullable = true), 30 | StructField("description", StringType, nullable = true), 31 | StructField("genre", StringType, nullable = true), 32 | StructField("price", DoubleType, nullable = true), 33 | StructField("publish_date", StringType, nullable = true), 34 | StructField("title", StringType, nullable = true))) 35 | 36 | val df1 = sqlContext.read 37 | .format("com.databricks.spark.xml") 38 | .option("rowTag", "book") 39 | .schema(customSchema) 40 | .load("/user/data/books.xml") 41 | 42 | val selectedData1 = df1.select("author", "_id") 43 | selectedData1.show() 44 | 45 | } 46 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/Stateful_WordCount.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | import org.apache.spark._ 3 | import org.apache.spark.streaming._ 4 | import org.apache.spark.streaming.StreamingContext._ 5 | 6 | object Stateful_WordCount extends App { 7 | 8 | val updateFunc = (values: Seq[Int], state: Option[Int]) => { 9 | val currentCount = values.foldLeft(0)(_ + _) 10 | 11 | val previousCount = state.getOrElse(0) 12 | 13 | Some(currentCount + previousCount) 14 | } 15 | 16 | val conf = new SparkConf().setAppName("Stateful_WordCount").setMaster("local[*]") 17 | val sc = new SparkContext(conf) 18 | val ssc = new StreamingContext(sc, Seconds(10)) 19 | 20 | ssc.checkpoint("/user/data/checkpoints/") 21 | 22 | val lines = ssc.socketTextStream("localhost", 9999) 23 | val words = lines.flatMap(_.split(" ")) 24 | val pairs = words.map(word => (word, 1)) 25 | 26 | val windowedWordCounts = pairs.updateStateByKey(updateFunc) 27 | windowedWordCounts.saveAsTextFiles("/user/data/result") 28 | 29 | ssc.start() 30 | ssc.awaitTermination() 31 | 32 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/WindowBasedStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.streaming.StreamingContext 6 | import org.apache.spark.streaming.Seconds 7 | import com.spark.util._ 8 | object WindowBasedStreaming { 9 | 10 | //nc -lk 9999 11 | 12 | def main(args: Array[String]) { 13 | val conf = new SparkConf().setAppName("Window-Based-Streaming").setMaster("local[*]") 14 | val sc = new SparkContext(conf) 15 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 16 | import sqlContext.implicits._ 17 | sc.setLogLevel("WARN") 18 | val ssc = new StreamingContext(sc, Seconds(10)) 19 | 20 | //ssc.checkpoint("/user/data/checkpoints/") 21 | 22 | val lines = ssc.socketTextStream("localhost", 9999) 23 | 24 | val messages = lines.window(Seconds(30), Seconds(10)) 25 | 26 | messages.foreachRDD( 27 | rdd => { 28 | if (!rdd.isEmpty()) { 29 | println("rdd count " + rdd.count()) 30 | val path = "file:///opt/home/data/" + Utills.getTime() 31 | rdd.coalesce(1, false).saveAsTextFile(path) 32 | } else { 33 | println("Data is not yet recevied from the producer....") 34 | } 35 | }) 36 | 37 | ssc.start() 38 | ssc.awaitTermination() 39 | } 40 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/Window_Sliding_Interval.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | 
import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.streaming.StreamingContext 5 | import org.apache.spark.streaming.Seconds 6 | 7 | object Window_Sliding_Interval { 8 | 9 | //nc -lk 9999 10 | 11 | def main(args: Array[String]) { 12 | val conf = new SparkConf().setAppName("Window_Sliding_Interval").setMaster("local[*]") 13 | val sc = new SparkContext(conf) 14 | val ssc = new StreamingContext(sc, Seconds(10)) 15 | 16 | ssc.checkpoint("/user/data/checkpoints/") 17 | 18 | val lines = ssc.socketTextStream("localhost", 9999) 19 | val words = lines.flatMap(_.split(" ")) 20 | val pairs = words.map(word => (word, 1)) 21 | 22 | // Reduce last 30 seconds of data, every 10 seconds 23 | val windowedWordCounts = pairs.reduceByKeyAndWindow((a: Int, b: Int) => (a + b), Seconds(30), Seconds(10)) 24 | windowedWordCounts.print() 25 | 26 | ssc.start() 27 | ssc.awaitTermination() 28 | } 29 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/examples/WordCount.scala: -------------------------------------------------------------------------------- 1 | package com.spark.examples 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import java.util.Calendar 5 | import org.apache.spark.sql.SQLContext 6 | 7 | object WordCount { 8 | def main(args: Array[String]) { 9 | 10 | val sc = new SparkContext(new SparkConf().setAppName("Spark Count").setMaster("local[1]")) 11 | 12 | val today = Calendar.getInstance().getTime() 13 | 14 | val threshold = 2 15 | 16 | // split each document into words 17 | val tokenized = sc.textFile(args(0)).flatMap(_.split(" ")) 18 | 19 | // count the occurrence of each word 20 | val wordCounts = tokenized.map((_, 1)).reduceByKey(_ + _) 21 | 22 | // filter out words with less than threshold occurrences 23 | val filtered = wordCounts.filter(_._2 >= threshold) 24 | 25 | // count characters 26 | val charCounts = filtered.flatMap(_._1.toCharArray).map((_, 1)).reduceByKey(_ + _) 27 | 28 | //wordCounts.saveAsTextFile(args(1)) 29 | println("---------------------------------------------------") 30 | println(charCounts.collect().mkString(", ")) 31 | 32 | } 33 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/transformations/AggregateByKey.scala: -------------------------------------------------------------------------------- 1 | package com.spark.transformations 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 5 | 6 | object AggregateByKey { 7 | 8 | def myfunc(index: Int, iter: Iterator[(String, Int)]): Iterator[String] = { 9 | iter.toList.map(x => "[partID:" + index + ", val: " + x + "]").iterator 10 | } 11 | 12 | def main(args: Array[String]) { 13 | val conf = new SparkConf().setAppName("AggregateByKey-Example").setMaster("local[1]") 14 | val sc = new SparkContext(conf) 15 | 16 | val pairRDD = sc.parallelize(List(("cat", 2), ("cat", 5), ("mouse", 4), ("cat", 12), ("dog", 12), ("mouse", 2)), 2) 17 | 18 | //lets have a look at what is in the partitions 19 | pairRDD.mapPartitionsWithIndex(myfunc).collect.foreach(f => println(f)) 20 | println("***********************************************") 21 | 22 | pairRDD.aggregateByKey(0)(math.max(_, _), _ + _).collect.foreach(f => println(f)) 23 | println("-----------------------------------------------") 24 | 25 | pairRDD.aggregateByKey(100)(math.max(_, _), _ + 
_).collect.foreach(f => println(f)) 26 | } 27 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/transformations/Cars.scala: -------------------------------------------------------------------------------- 1 | package com.spark.transformations 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import com.spark.util._ 5 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 6 | object Cars { 7 | def main(args: Array[String]) { 8 | case class cars(make: String, model: String, mpg: String, cylinders: Integer, engine_disp: Integer, horsepower: Integer, weight: Integer, accelerate: Double, year: Integer, origin: String) 9 | val conf = new SparkConf().setAppName("Transformations").setMaster("local[1]") 10 | val sc = new SparkContext(conf) 11 | val rawData = sc.textFile(Utills.DATA_PATH +"cars.txt") //"path to/cars.txt" 12 | 13 | rawData.take(5).foreach { x => println(x) } 14 | 15 | val carsData = rawData.map(x => x.split("\t")) 16 | .map(x => cars(x(0).toString, x(1).toString, x(2).toString, x(3).toInt, x(4).toInt, x(5).toInt, x(6).toInt, x(7).toDouble, x(8).toInt, x(9).toString)) 17 | 18 | carsData.take(2).foreach { x => println(x) } 19 | //persist to memory 20 | carsData.cache() 21 | 22 | //count cars origin wise 23 | val originWiseCount = carsData.map(x => (x.origin, 1)).reduceByKey((x, y) => x + y) 24 | println("originWiseCount :" + originWiseCount.collect().mkString(", ")) 25 | //filter out american cars 26 | val americanCars = carsData.filter(x => (x.origin == "American")) 27 | 28 | //count total american cars 29 | println("americanCars count : " + americanCars.count()) 30 | 31 | // take sum of weights according to make 32 | val makeWeightSum = americanCars.map(x => (x.make, x.weight.toInt)).combineByKey((x: Int) => (x, 1), 33 | (acc: (Int, Int), x) => (acc._1 + x, acc._2 + 1), 34 | (acc1: (Int, Int), acc2: (Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2)) 35 | 36 | println("americanCars makeWeightSum : " + makeWeightSum.collect().mkString(", ")) 37 | 38 | // take average 39 | val makeWeightAvg = makeWeightSum.map(x => (x._1, (x._2._1 / x._2._2))) 40 | 41 | 42 | println("americanCars makeWeightAvg : " +makeWeightAvg.collect().mkString(", ")) 43 | } 44 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/transformations/Cogroup.scala: -------------------------------------------------------------------------------- 1 | package com.spark.transformations 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 5 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 6 | object Cogroup { 7 | 8 | def main(args: Array[String]) { 9 | val conf = new SparkConf().setAppName("Transformations-Example").setMaster("local[1]") 10 | val sc = new SparkContext(conf) 11 | 12 | // cartesian 13 | 14 | /*Computes the cartesian product between two RDDs (i.e. Each item of the first RDD is joined with each item of the second RDD) 15 | and returns them as a new RDD. (Warning: Be careful when using this function.! Memory consumption can quickly become an issue!) 
16 | */ 17 | val x = sc.parallelize(List(1, 2, 3, 4, 5)) 18 | val y = sc.parallelize(List(6, 7, 8, 9, 10)) 19 | x.cartesian(y).collect.foreach(f => println(f)) 20 | 21 | //cogroup 22 | println("cogroup ---cogroup----cogroup") 23 | val a = sc.parallelize(List((1, "apple"), (2, "banana"), (3, "orange"), (4, "kiwi")), 2) 24 | val b = sc.parallelize(List((1, "apple"), (5, "computer"), (1, "laptop"), (1, "desktop"), (4, "iPad")), 2) 25 | 26 | a.cogroup(b).collect.foreach(f => println(f)) 27 | 28 | //subtract 2 RRD's 29 | val diff = a.subtract(b) 30 | diff.collect().foreach(f => println(f._2)) 31 | 32 | //collectAsMap 33 | println("collectAsMap ---collectAsMap----collectAsMap") 34 | val c = sc.parallelize(List(1, 2, 1, 3), 1) 35 | val c2 = sc.parallelize(List(5, 6, 5, 7), 1) 36 | val d = c.zip(c2) 37 | d.collectAsMap.foreach(f => println(f)) 38 | 39 | //combineByKey 40 | println("combineByKey ---combineByKey----combineByKey") 41 | val a1 = sc.parallelize(List("dog", "cat", "gnu", "salmon", "rabbit", "turkey", "wolf", "bear", "bee"), 3) 42 | val b1 = sc.parallelize(List(1, 1, 2, 2, 2, 1, 2, 2, 2), 3) 43 | val c1 = b1.zip(a1) 44 | val d1 = c1.combineByKey(List(_), (x: List[String], y: String) => y :: x, (x: List[String], y: List[String]) => x ::: y) 45 | d1.collect.foreach(f => println(f)) 46 | 47 | //filterByRange [Ordered] 48 | println("filterByRange ---filterByRange----filterByRange") 49 | val randRDD = sc.parallelize(List((2, "cat"), (6, "mouse"), (7, "cup"), (3, "book"), (4, "tv"), (1, "screen"), (5, "heater")), 3) 50 | val sortedRDD = randRDD.sortByKey() 51 | 52 | sortedRDD.filterByRange(1, 3).collect.foreach(f => println(f)) 53 | } 54 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/transformations/Filter.scala: -------------------------------------------------------------------------------- 1 | package com.spark.transformations 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | object Filter { 5 | def main(args: Array[String]) { 6 | val conf = new SparkConf().setAppName("Spark Filter Example").setMaster("local[1]") 7 | val sc = new SparkContext(conf) 8 | val x = sc.parallelize(List("Transformation demo", "Test demo", "Filter demo", "Spark is powerfull", "Spark is faster", "Spark is in memory")) 9 | 10 | val lines1 = x.filter(line => line.contains("Spark") || line.contains("Transformation")) 11 | lines1.collect().foreach { line => println(line) } 12 | 13 | val lines = x.filter(line => !line.contains("Filter")) 14 | println("---------------------------------------------") 15 | lines.collect().foreach { line => println(line) } 16 | println("---------------------------------------------") 17 | val count = x.filter(line => line.contains("Spark")).count() 18 | println("count is : " + count) 19 | } 20 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/transformations/FoldByKey.scala: -------------------------------------------------------------------------------- 1 | package com.spark.transformations 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 6 | object FoldByKey { 7 | def main(args: Array[String]) { 8 | val conf = new SparkConf().setAppName("FoldByKey-Example").setMaster("local[1]") 9 | val sc = new SparkContext(conf) 10 | 11 | //Fold in spark 12 | val employeeData = List(("Jack", 1000.0), ("Bob", 2000.0), ("Carl", 7000.0)) 13 | 
val employeeRDD = sc.makeRDD(employeeData) 14 | 15 | val dummyEmployee = ("dummy", 0.0) 16 | 17 | val maxSalaryEmployee = employeeRDD.fold(dummyEmployee)((acc, employee) => { 18 | if (acc._2 < employee._2) employee else acc 19 | }) 20 | println("employee with maximum salary is" + maxSalaryEmployee) 21 | 22 | //Fold by key 23 | val deptEmployees = List( 24 | ("cs", ("jack", 1000.0)), 25 | ("cs", ("bron", 1200.0)), 26 | ("phy", ("sam", 2200.0)), 27 | ("phy", ("ronaldo", 500.0))) 28 | val empRDD = sc.makeRDD(deptEmployees) 29 | val dummyEmp = ("dummy", 0.0) 30 | val maxByDept = empRDD.foldByKey(dummyEmp)((acc, employee) => { 31 | if (acc._2 < employee._2) employee else acc 32 | }) 33 | println("maximum salaries in each dept" + maxByDept.collect().toList) 34 | 35 | //Fold by key 36 | var rdd1 = sc.makeRDD(Array(("A", 0), ("A", 2), ("B", 1), ("B", 2), ("C", 1))) 37 | rdd1.foldByKey(0)(_ + _).collect.foreach(f => println(f)) 38 | println("-----------------------------------------------") 39 | rdd1.foldByKey(1)(_ * _).collect.foreach(f => println(f)) 40 | 41 | 42 | } 43 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/transformations/GroupBY_ReduceBY.scala: -------------------------------------------------------------------------------- 1 | package com.spark.transformations 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 5 | object GroupBY_ReduceBY { 6 | def main(args: Array[String]) { 7 | val conf = new SparkConf().setAppName("Spark-GroupBY-ReduceBY-Example").setMaster("local[1]") 8 | val sc = new SparkContext(conf) 9 | val words = Array("a", "b", "b", "c", "d", "e", "a", "b", "b", "c", "d", "e", "b", "b", "c", "d", "e") 10 | val wordPairsRDD = sc.parallelize(words).map(word => (word, 1)) 11 | 12 | val wordCountsWithReduce = wordPairsRDD 13 | .reduceByKey(_ + _) 14 | .collect() 15 | wordCountsWithReduce.foreach(f => println(f)) 16 | 17 | //Avoid GroupByKey 18 | println("Avoid GroupByKey") 19 | val wordCountsWithGroup = wordPairsRDD 20 | .groupByKey() 21 | .map(t => (t._1, t._2.sum)) 22 | .collect() 23 | wordCountsWithGroup.foreach(f => println(f)) 24 | } 25 | 26 | //https://databricks.gitbooks.io/databricks-spark-knowledge-base/content/best_practices/prefer_reducebykey_over_groupbykey.html 27 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/transformations/MapvsFlatMap.scala: -------------------------------------------------------------------------------- 1 | package com.spark.transformations 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | object MapvsFlatMap { 5 | def main(args: Array[String]) { 6 | 7 | val conf = new SparkConf().setAppName("MapvsFlatMap").setMaster("local[1]") 8 | val sc = new SparkContext(conf) 9 | val x = sc.parallelize(List("spark rdd example", "sample example")) 10 | 11 | // map operation will return Array of Arrays in following case : check type of result 12 | val y = x.map(x => x.split(" ")) // split(" ") returns an array of words 13 | //result -> Array[Array[String]] = Array(Array(spark, rdd, example), Array(sample, example)) 14 | 15 | /*Similar to map, but each input item can be mapped to 0 or more output items 16 | (so func should return a Seq rather than a single item).*/ 17 | 18 | // flatMap operation will return Array of words in following case : Check type of result 19 | val z = x.flatMap(x => 
x.split(" ")) 20 | z.collect().foreach { x => println(x) } 21 | //result -> Array[String] = Array(spark, rdd, example, sample, example) 22 | 23 | } 24 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/transformations/Reduce.scala: -------------------------------------------------------------------------------- 1 | package com.spark.transformations 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | object Reduce { 5 | def main(args: Array[String]) { 6 | val conf = new SparkConf().setAppName("Reduce-Example").setMaster("local[1]") 7 | val sc = new SparkContext(conf) 8 | 9 | /*val data = Array(1, 2, 3, 4, 5) 10 | val distData = sc.parallelize(data) 11 | distData.collect().foreach { x => println(x) } 12 | val red = distData.reduce((a, b) => a + b) 13 | println(red)*/ 14 | 15 | val distFile = sc.textFile("F:\\Software\\Spark\\input.txt") 16 | 17 | val fil = distFile.map { x => x.split(" ").size } 18 | val rdd = distFile.reduce((a, b) => a + b) 19 | println(rdd) 20 | 21 | val res = distFile.map(s => s.length).reduce((a, b) => a + b) 22 | val res1 = distFile.reduce((a, b) => a + b) 23 | println(res) 24 | println(res1) 25 | 26 | } 27 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/usecases/NamesAnalysis.scala: -------------------------------------------------------------------------------- 1 | package com.spark.usecases 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import com.spark.util.Utills 5 | object NamesAnalysis { 6 | def main(args: Array[String]) { 7 | val conf = new SparkConf().setAppName("Names-Analysis").setMaster("local[1]") 8 | val sc = new SparkContext(conf) 9 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 10 | import sqlContext.implicits._ 11 | 12 | val babyNamesRDD = sc.textFile(Utills.DATA_PATH + "/baby_names.txt") 13 | 14 | //remove the header information from the file 15 | val dropHeaderRDD = babyNamesRDD.mapPartitions(_.drop(1)) 16 | val rows = dropHeaderRDD.map(line => line.split(",")) 17 | 18 | //unique counties over the years of data collect 19 | val count = rows.map(row => row(2)).distinct.count 20 | println("unique counties count --> " + count) 21 | //rows.collect().foreach { x => println(x(0) + " : " +x(1) + " : " + x(2) + " : " + x(3) + " : " + x(4)) } 22 | 23 | //rows containing the name "SACHIN" 24 | val sachinRows = rows.filter(row => row(1).contains("SACHIN")) 25 | 26 | //Number of rows where NAME "SACHIN" has a "Count" greater than 10 27 | sachinRows.filter(row => row(4).toInt > 10).count() 28 | 29 | val uniqueCounties = sachinRows.filter(row => row(4).toInt > 10).map(r => r(2)).distinct 30 | println("-------- unique country names which have had the name 'SACHIN' ---------") 31 | uniqueCounties.foreach { x => println(x) } 32 | // unique counties which have had the name SACHIN over 10 times in a given year 33 | val uniCountryCount = sachinRows.filter(row => row(4).toInt > 10).map(r => r(2)).distinct.count 34 | println("unique counties which have had the name SACHIN --> " + uniCountryCount) 35 | 36 | val names = rows.map(name => (name(1), 1)) 37 | // shows number of times each name appears in file 38 | names.reduceByKey((a, b) => a + b).sortBy(_._2).foreach(println _) 39 | 40 | //Another way to filter the header information 41 | val filteredRows = babyNamesRDD.filter(line => !line.contains("Count")).map(line => line.split(",")) 42 | 
filteredRows.map(n => (n(1), n(4).toInt)).reduceByKey((a, b) => a + b).sortBy(_._2).foreach(println _) 43 | } 44 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/usecases/OlaDataAnalysis.scala: -------------------------------------------------------------------------------- 1 | package com.spark.usecases 2 | import org.apache.spark.SparkConf 3 | import com.spark.util.Utills 4 | import org.apache.spark.SparkContext 5 | object OlaDataAnalysis { 6 | def main(args: Array[String]) { 7 | val conf = new SparkConf().setAppName("Ola-Cab-Data-Analysis").setMaster("local[1]") 8 | val sc = new SparkContext(conf) 9 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 10 | val dataset = sc.textFile(Utills.DATA_PATH + "/olaCabData.txt") 11 | val header = dataset.first() 12 | val format = new java.text.SimpleDateFormat("MM/dd/yyyy") 13 | var days = Array("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat") 14 | val eliminate = dataset.filter(line => line != header) 15 | 16 | //Find the days on which each basement has more trips. 17 | 18 | val split = eliminate.map(line => line.split(",")).map { x => (x(0), format.parse(x(1)), x(3)) } 19 | split.foreach(f => println(f)) 20 | 21 | val combine = split.map(x => (x._1 + " " + days(x._2.getDay), x._3.toInt)) 22 | combine.foreach(f => println(f)) 23 | 24 | val arrange = combine.reduceByKey(_ + _).map(item => item.swap).sortByKey(false).collect.foreach(println) 25 | } 26 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/usecases/OlympicsDataAnalysis.scala: -------------------------------------------------------------------------------- 1 | package com.spark.usecases 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import com.spark.util.Utills 5 | object OlympicsDataAnalysis { 6 | def main(args: Array[String]) { 7 | val conf = new SparkConf().setAppName("Travel-Data-Analysis").setMaster("local[1]") 8 | val sc = new SparkContext(conf) 9 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 10 | val textFile = sc.textFile(Utills.DATA_PATH + "/olympics_data.txt") 11 | val olympicsDataRDD = textFile.mapPartitions(_.drop(1)) //remove the header information from the file 12 | val lines = olympicsDataRDD.filter { x => { if (x.toString().split(",").length >= 10) true else false } } 13 | .map(line => { line.toString().split(",") }) 14 | 15 | //Find the total number of medals won by each country in swimming. 16 | val filteredRDD = lines.filter(x => { if (x(5).equalsIgnoreCase("swimming") && (x(9).matches(("\\d+")))) true else false }) 17 | val results = filteredRDD.map(x => (x(2), x(9).toInt)) 18 | val totalNoMedals = results.reduceByKey(_ + _).collect() 19 | println("---Total number of medals won by each country in swimming---") 20 | totalNoMedals.foreach(f => println(f)) 21 | 22 | //Find the number of medals that won by India year wise. 23 | val filteredIndiaRDD = lines.filter(x => { if (x(2).equalsIgnoreCase("india") && (x(9).matches(("\\d+")))) true else false }) 24 | val indiaResults = filteredIndiaRDD.map(x => (x(3), x(9).toInt)) 25 | val indiaMedals = indiaResults.reduceByKey(_ + _).collect() 26 | println("---Number of medals that won by India year wise---") 27 | indiaMedals.foreach(f => println(f)) 28 | 29 | //Find the total number of medals won by each country. 
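// The filter below keeps only rows whose medal-count column (index 9) is numeric; reduceByKey then sums those counts per country (index 2).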
30 | val filteredLines = lines.filter(x => { if ((x(9).matches(("\\d+")))) true else false }) 31 | val filteredResults = filteredLines.map(x => (x(2), x(9).toInt)) 32 | val medalsCountryWise = filteredResults.reduceByKey(_ + _).collect() 33 | println("---Total number of medals won by each country---") 34 | medalsCountryWise.foreach(f => println(f)) 35 | 36 | } 37 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/usecases/TVShowDataAnalysis.scala: -------------------------------------------------------------------------------- 1 | package com.spark.usecases 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import com.spark.util.Utills 5 | object TVShowDataAnalysis { 6 | def main(args: Array[String]) { 7 | val conf = new SparkConf().setAppName("TV-Show-Data-Analysis").setMaster("local[1]") 8 | val sc = new SparkContext(conf) 9 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 10 | val format = new java.text.SimpleDateFormat("MM/dd/yy") 11 | val textFile = sc.textFile(Utills.DATA_PATH + "/dialy_show_guests.txt") 12 | val tvDataRDD = textFile.mapPartitions(_.drop(1)) //remove the header information from the file 13 | 14 | //Find the top 5 kinds of GoogleKnowlege_Occupation people guested the show in a particular time period. 15 | 16 | val splitedRDD = tvDataRDD.map(line => line.split(",")) 17 | val pair = splitedRDD.map(line => (line(1), format.parse(line(2)))) 18 | val fil = pair.filter(x => { if (x._2.after(format.parse("1/11/99")) && x._2.before(format.parse("6/11/99"))) true else false }) 19 | val top5GuestsRDD = fil.map(x => (x._1, 1)).reduceByKey(_ + _).map(item => item.swap).sortByKey(false).take(5) 20 | 21 | top5GuestsRDD.foreach(f => println(f)) 22 | } 23 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/usecases/TravelDataAnalysis.scala: -------------------------------------------------------------------------------- 1 | package com.spark.usecases 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql.functions._ 5 | import com.spark.util.Utills 6 | object TravelDataAnalysis { 7 | def main(args: Array[String]) { 8 | val conf = new SparkConf().setAppName("Travel-Data-Analysis").setMaster("local[1]") 9 | val sc = new SparkContext(conf) 10 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 11 | val textFile = sc.textFile(Utills.DATA_PATH + "/TravelData.txt") 12 | val travelDataRDD = textFile.mapPartitions(_.drop(1)) //remove the header information from the file 13 | 14 | //Top 20 destination people travel the most 15 | val top20DestinationRDD = travelDataRDD.map(lines => lines.split('\t')). 16 | map(x => (x(2), 1)).reduceByKey(_ + _). 17 | map(item => item.swap).sortByKey(false).take(20) 18 | top20DestinationRDD.foreach(f => println(f)) 19 | 20 | //Top 20 locations from where people travel the most 21 | val top20LocationsRDD = travelDataRDD.map(lines => lines.split('\t')).map(x => (x(1), 1)). 
22 | reduceByKey(_ + _).map(item => item.swap).sortByKey(false).take(20) 23 | 24 | top20LocationsRDD.foreach(f => println(f)) 25 | 26 | //Top 20 cities that generate high airline revenues for travel 27 | val fil = travelDataRDD.map(x => x.split('\t')).filter(x => { if ((x(3).matches(("1")))) true else false }) 28 | // fil.collect().foreach { x => println(x(2)) } 29 | val Top20Cities = fil.map(x => (x(2), 1)).reduceByKey(_ + _).map(item => item.swap).sortByKey(false).take(20) 30 | Top20Cities.foreach(f => println(f)) 31 | } 32 | 33 | //https://acadgild.com/blog/spark-use-case-travel-data-analysis/ 34 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/usecases/YoutubeDataAnalysis.scala: -------------------------------------------------------------------------------- 1 | package com.spark.usecases 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql.functions._ 5 | import com.spark.util.Utills 6 | object YoutubeDataAnalysis { 7 | 8 | def main(args: Array[String]) { 9 | val conf = new SparkConf().setAppName("Youtube-Data-Analysis").setMaster("local[1]") 10 | val sc = new SparkContext(conf) 11 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 12 | val textFile = sc.textFile(Utills.DATA_PATH + "/youtubedata.txt") 13 | 14 | //Here, we will find out what are the top five categories with maximum number of videos uploaded. 15 | // val counts = textFile.map(line => { var YoutubeRecord = ""; val temp = line.split("\t"); ; if (temp.length >= 3) { YoutubeRecord = temp(3) }; YoutubeRecord }) 16 | val counts = textFile.map(_.split("\t")).filter(_.length >= 3).map(_(3)) 17 | val test = counts.map(x => (x, 1)) 18 | val res = test.reduceByKey(_ + _).map(item => item.swap).sortByKey(false).take(5) 19 | res.foreach(f => println(f)) 20 | 21 | //In this problem statement, we will find the top 10 rated videos in YouTube. 
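// The rating lives at index 6, so each row needs at least 7 tab-separated fields; the length >= 6 guard below is slightly loose and assumes well-formed rows.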
22 | val counts1 = textFile.filter { x => { if (x.toString().split("\t").length >= 6) true else false } }.map(line => { line.toString().split("\t") }) 23 | val pairs = counts1.map(x => { (x(0), x(6).toDouble) }) 24 | val res1 = pairs.reduceByKey(_ + _).map(item => item.swap).sortByKey(false).take(10) 25 | res1.foreach(f => println(f)) 26 | 27 | } 28 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/usecases/loganalysis/ApacheAccessLog.scala: -------------------------------------------------------------------------------- 1 | package com.spark.usecases.loganalysis 2 | 3 | case class ApacheAccessLog(ipAddress: String, clientIdentd: String, 4 | userId: String, dateTime: String, method: String, 5 | endpoint: String, protocol: String, 6 | responseCode: Int, contentSize: Long) { 7 | 8 | } 9 | 10 | object ApacheAccessLog { 11 | val PATTERN = """^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+)""".r 12 | 13 | def parseLogLine(log: String): ApacheAccessLog = { 14 | val res = PATTERN.findFirstMatchIn(log) 15 | if (res.isEmpty) { 16 | throw new RuntimeException("Cannot parse log line: " + log) 17 | } 18 | val m = res.get 19 | ApacheAccessLog(m.group(1), m.group(2), m.group(3), m.group(4), 20 | m.group(5), m.group(6), m.group(7), m.group(8).toInt, m.group(9).toLong) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/usecases/loganalysis/LogAnalyzer.scala: -------------------------------------------------------------------------------- 1 | package com.spark.usecases.loganalysis 2 | import org.apache.spark.{ SparkContext, SparkConf } 3 | import org.apache.spark.SparkContext._ 4 | import com.spark.util._ 5 | object LogAnalyzer { 6 | 7 | object SecondValueOrdering extends Ordering[(String, Int)] { 8 | def compare(a: (String, Int), b: (String, Int)) = { 9 | a._2 compare b._2 10 | } 11 | } 12 | 13 | def main(args: Array[String]) { 14 | val sparkConf = new SparkConf().setAppName("Log Analysis").setMaster("local[1]") 15 | val sc = new SparkContext(sparkConf) 16 | 17 | val accessLogs = sc.textFile(Utills.DATA_PATH + "log.txt") 18 | .map(ApacheAccessLog.parseLogLine).cache() 19 | 20 | // Any IPAddress that has accessed the server more than 2 times. 21 | val ipAddresses = accessLogs 22 | .map(log => (log.ipAddress, 1)) 23 | .reduceByKey(_ + _) 24 | .filter(_._2 > 4) 25 | .map(_._1) 26 | .take(10) 27 | println(s"""IPAddresses > 2 times: ${ipAddresses.mkString("[", ",", "]")}""") 28 | 29 | // Finding top 5 hits. 30 | val ipAddressesTop5 = accessLogs 31 | .map(log => (log.ipAddress, 1)) 32 | .reduceByKey(_ + _) 33 | .top(5)(SecondValueOrdering) 34 | 35 | println(s"""Top 5 hits : ${ipAddressesTop5.mkString("[", ",", "]")}""") 36 | 37 | // Top Endpoints. 38 | val topEndpoints = accessLogs 39 | .map(log => (log.endpoint, 1)) 40 | .reduceByKey(_ + _) 41 | .top(10)(SecondValueOrdering) 42 | println(s"""Top Endpoints: ${topEndpoints.mkString("[", ",", "]")}""") 43 | 44 | // Calculate statistics based on the content size. 45 | val contentSizes = accessLogs.map(log => log.contentSize).cache() 46 | println("Content Size Avg: %s, Min: %s, Max: %s".format( 47 | contentSizes.reduce(_ + _) / contentSizes.count, 48 | contentSizes.min, 49 | contentSizes.max)) 50 | 51 | // Compute Response Code to Count. 
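// Mapping to (responseCode, 1) and reducing by key gives hits per HTTP response code; take(100) merely bounds how many (code, count) pairs are brought back to the driver for printing.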
52 | val responseCodeToCount = accessLogs 53 | .map(log => (log.responseCode, 1)) 54 | .reduceByKey(_ + _) 55 | .take(100) 56 | println(s"""Response code counts: ${responseCodeToCount.mkString("[", ",", "]")}""") 57 | } 58 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/usecases/loganalysis/LogAnalyzerSQL.scala: -------------------------------------------------------------------------------- 1 | package com.spark.usecases.loganalysis 2 | import org.apache.spark.sql.SQLContext 3 | import org.apache.spark.{ SparkContext, SparkConf } 4 | import com.spark.util._ 5 | object LogAnalyzerSQL { 6 | def main(args: Array[String]) { 7 | val sparkConf = new SparkConf().setAppName("Log Analyzer SQL").setMaster("local[1]") 8 | val sc = new SparkContext(sparkConf) 9 | val sqlContext = new SQLContext(sc) 10 | import sqlContext.implicits._ 11 | 12 | val accessLogs = sc.textFile(Utills.DATA_PATH + "log.txt").map(ApacheAccessLog.parseLogLine).toDF() 13 | accessLogs.registerTempTable("Logs") 14 | sqlContext.cacheTable("Logs"); 15 | 16 | // Calculate statistics based on the content size. 17 | val contentSizeStats = sqlContext 18 | .sql("SELECT SUM(contentSize), COUNT(*), MIN(contentSize), MAX(contentSize) FROM Logs") 19 | .first() 20 | println("Content Size Avg: %s, Min: %s, Max: %s".format( 21 | contentSizeStats.getLong(0) / contentSizeStats.getLong(1), 22 | contentSizeStats(2), 23 | contentSizeStats(3))) 24 | 25 | // Compute Response Code to Count. 26 | val responseCodeToCount = sqlContext 27 | .sql("SELECT responseCode, COUNT(*) FROM Logs GROUP BY responseCode LIMIT 1000") 28 | .map(row => (row.getInt(0), row.getLong(1))) 29 | .collect() 30 | println(s"""Response code counts: ${responseCodeToCount.mkString("[", ",", "]")}""") 31 | 32 | // Any IPAddress that has accessed the server more than 10 times. 
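// HAVING filters the groups after aggregation, so only addresses with more than 10 requests remain; LIMIT 1000 bounds the result set pulled back to the driver.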
33 | val ipAddresses = sqlContext 34 | .sql("SELECT ipAddress, COUNT(*) AS total FROM Logs GROUP BY ipAddress HAVING total > 10 LIMIT 1000") 35 | .map(row => row.getString(0)) 36 | .collect() 37 | println(s"""IPAddresses > 10 times: ${ipAddresses.mkString("[", ",", "]")}""") 38 | 39 | val topEndpoints = sqlContext 40 | .sql("SELECT endpoint, COUNT(*) AS total FROM Logs GROUP BY endpoint ORDER BY total DESC LIMIT 10") 41 | .map(row => (row.getString(0), row.getLong(1))) 42 | .collect() 43 | println(s"""Top Endpoints: ${topEndpoints.mkString("[", ",", "]")}""") 44 | 45 | sc.stop() 46 | } 47 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/usecases/sensoranalytics/Models.scala: -------------------------------------------------------------------------------- 1 | package com.spark.usecases.sensoranalytics 2 | 3 | import java.sql.{Date, Timestamp} 4 | import java.text.SimpleDateFormat 5 | import java.util.Calendar 6 | 7 | import org.joda.time.DateTime 8 | 9 | 10 | case class SensorRecord(dateTime: DateTime, 11 | country:String, 12 | state:String, 13 | city:String, 14 | sensorStatus:String) 15 | 16 | case class CountryWiseStats(date: DateTime,country:String, count: BigInt) 17 | 18 | case class StateWiseStats(date: DateTime,country:String,state:String, count: BigInt) 19 | 20 | case class CityWiseStats(date: DateTime,city:String,sensorStatus:String, count: BigInt) -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/usecases/sensoranalytics/SchemaParser.scala: -------------------------------------------------------------------------------- 1 | package com.spark.usecases.sensoranalytics 2 | 3 | import org.joda.time.DateTimeZone 4 | import org.joda.time.format.DateTimeFormat 5 | 6 | import scala.util.Try 7 | 8 | object SchemaParser { 9 | 10 | def parse(eachRow: String): Option[SensorRecord] = { 11 | val columns = eachRow.split(",") 12 | //println("columns --->" + columns(0) +","+ columns(1) +","+ columns(2) +","+ columns(3)) 13 | Try { 14 | if (columns.length == 5) { 15 | Option(SensorRecord(createDate(columns(0)), columns(1), columns(2), columns(3), columns(4))) 16 | } else { 17 | None 18 | } 19 | }.getOrElse(None) 20 | } 21 | 22 | def createDate(input: String) = { 23 | val columns = input.split(" ") 24 | val pattern = "YYYY/MM/dd HH" 25 | DateTimeFormat.forPattern(pattern) 26 | .parseDateTime(columns(0) + " " + columns(1).split(":")(0)) 27 | .withZone(DateTimeZone.getDefault()) 28 | } 29 | 30 | def createDelay(input: String): Double = { 31 | val delay_regex = """[^\d|.]*([0-9\\.]+)\s*(ms|.*)""".r 32 | 33 | input match { 34 | case delay_regex(value, unit) => { 35 | if (unit.equalsIgnoreCase("ms")) { 36 | value.toDouble 37 | } else { 38 | 0 39 | } 40 | } 41 | } 42 | } 43 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/usecases/twitteranalytics/TwitterAnalytics.scala: -------------------------------------------------------------------------------- 1 | package com.spark.usecases.twitteranalytics 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.streaming._ 6 | import org.apache.spark.streaming.StreamingContext._ 7 | import org.apache.spark.streaming.twitter._ 8 | 9 | object TwitterAnalytics extends App { 10 | val conf = new SparkConf().setAppName("myStream").setMaster("local[2]") 11 | val sc = new SparkContext(conf) 12 | sc.setLogLevel("WARN") 
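// TwitterUtils below comes from the external spark-streaming-twitter module and authenticates with the twitter4j OAuth credentials built by the twitterclient helper.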
13 | val ssc = new StreamingContext(sc, Seconds(2)) 14 | val client = new twitterclient() 15 | val tweetauth = client.start() 16 | val inputDstream = TwitterUtils.createStream(ssc, Option(tweetauth.getAuthorization)) 17 | 18 | // Split the stream on space and extract hashtags 19 | val hashTags = inputDstream.flatMap(status => status.getText.split(" ").filter(_.startsWith("#"))) 20 | 21 | // Get the top hashtags over the previous 60 sec window 22 | val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60)) 23 | .map { case (topic, count) => (count, topic) } 24 | .transform(_.sortByKey(false)) 25 | 26 | // Get the top hashtags over the previous 10 sec window 27 | val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10)) 28 | .map { case (topic, count) => (count, topic) } 29 | .transform(_.sortByKey(false)) 30 | 31 | // print tweets in the currect DStream 32 | inputDstream.print() 33 | 34 | // Print popular hashtags 35 | topCounts60.foreachRDD(rdd => { 36 | val topList = rdd.take(10) 37 | println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count())) 38 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) } 39 | }) 40 | topCounts10.foreachRDD(rdd => { 41 | val topList = rdd.take(10) 42 | println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count())) 43 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) } 44 | }) 45 | 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/usecases/twitteranalytics/twitterclient.scala: -------------------------------------------------------------------------------- 1 | package com.spark.usecases.twitteranalytics 2 | 3 | import twitter4j.Twitter 4 | import twitter4j.TwitterException; 5 | import twitter4j.TwitterFactory; 6 | import twitter4j.auth.AccessToken; 7 | import twitter4j.auth.RequestToken; 8 | class twitterclient { 9 | val CONSUMER_KEY: String = "Tn6mCikBNxLviA6znN4FgIXfY" 10 | val CONSUMER_KEY_SECRET: String = "JoRN26wNoPUuUYsgR4zKwre82zTY53r8rDzy6nLSrS4cMqiRzg" 11 | val ACCESS_TOKEN = "199435611-ancQT2HKivvIrlrKg2FYLTBoQyA0zsISGhDbO7ug" 12 | val ACCESS_TOKEN_SECRET = "wHaw4X7ok2uWXVGvOAOzaSgZvRovK4xFY4CAMLoNuMOy8" 13 | def start(): Twitter = { 14 | val twitter: Twitter = new TwitterFactory().getInstance(); 15 | twitter.setOAuthConsumer(CONSUMER_KEY, CONSUMER_KEY_SECRET); 16 | twitter.setOAuthAccessToken(new AccessToken(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)) 17 | twitter 18 | } 19 | } -------------------------------------------------------------------------------- /Spark-1.5/src/main/scala/com/spark/util/LogHelper.scala: -------------------------------------------------------------------------------- 1 | package com.spark.util 2 | 3 | import org.apache.log4j.Logger 4 | import org.apache.log4j.xml.DOMConfigurator 5 | 6 | trait LogHelper { 7 | DOMConfigurator.configure(Utills.DATA_PATH +"log4j_conf.xml") 8 | val loggerName = this.getClass.getName 9 | final val logger = Logger.getLogger(loggerName) 10 | } -------------------------------------------------------------------------------- /Spark-2.1/README.md: -------------------------------------------------------------------------------- 1 | # Spark-2.1 2 | Apache Spark is a fast and general-purpose cluster computing system. It provides high-level APIs in Java, Scala, Python and R, and an optimized engine that supports general execution graphs. 
It also supports a rich set of higher-level tools including Spark SQL for SQL and structured data processing, MLlib for machine learning, GraphX for graph processing, and Spark Streaming. 3 | 4 | Topics Covered : 5 | ---------------- 6 | Implementing custom UDF,UDAF,Partitioner using Spark-2.1 7 | Working with DataFrames (ComplexSchema,DropDuplicates,DatasetConversion,GroupingAndAggregation) 8 | Working with DataSets 9 | Working with Parquet files 10 | Working with Spark Catalog API to access Hive tables 11 | Loading Data from Cassnadra table using Spark 12 | CRUD operations on Cassandra Using Spark 13 | Reading/Writing to S3 buckets Using Spark 14 | Spark MangoDB Integration 15 | 16 | Pushing Spark Accumulator Values as metrics to DataDog API 17 | 18 | 19 | 20 | ------------------------------------------------------------------------------------------------------------------------------------ 21 | 22 | You can reach me for any suggestions/clarifications on : revanthkumar95@gmail.com 23 | Feel free to share any insights or constructive criticism. Cheers!! 24 | #Happy Sparking!!!.. 25 | -------------------------------------------------------------------------------- /Spark-2.1/input/README.md: -------------------------------------------------------------------------------- 1 | Developers have always loved Apache Spark for providing APIs that are simple yet powerful, a combination of traits that makes complex analysis possible with minimal programmer effort. At Databricks, we have continued to push Spark’s usability and performance envelope through the introduction of DataFrames and Spark SQL. These are high-level APIs for working with structured data (e.g. database tables, JSON files), which let Spark automatically optimize both storage and computation. Behind these APIs, the Catalyst optimizer and Tungsten execution engine optimize applications in ways that were not possible with Spark’s object-oriented (RDD) API, such as operating on data in a raw binary form. 
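As a small illustration of the DataFrame/Catalyst point made above, here is a hedged sketch (added for this write-up, not part of the original sources); it assumes only the bundled input/schools.json file and the standard SparkSession API:

import org.apache.spark.sql.SparkSession

object CatalystSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Catalyst-Sketch")
      .master("local[1]")
      .getOrCreate()

    // Reading through the DataFrame API hands Catalyst a logical plan it can
    // optimise (column pruning, predicate pushdown) before Tungsten executes it.
    val schools = spark.read.json("input/schools.json")
    val oldest = schools.filter("yearFounded < 1900").select("name", "yearFounded")

    oldest.explain(true) // prints the parsed, analysed, optimised and physical plans
    oldest.show()

    spark.stop()
  }
}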
-------------------------------------------------------------------------------- /Spark-2.1/input/badrecords: -------------------------------------------------------------------------------- 1 | Barack,Obama,53 2 | George,Bush,68 3 | Hillary,Clinton,F 4 | Bill,Clinton,68 5 | Tom,Cruise,M 6 | Barack,Obama,53 7 | George,Bush,68 8 | Hillary,Clinton,F 9 | Bill,Clinton,68 10 | Tom,Cruise,M -------------------------------------------------------------------------------- /Spark-2.1/input/conf.properties: -------------------------------------------------------------------------------- 1 | tidList = 102:1,104:1,106:9 2 | topics = topic1,topic2,topic3 -------------------------------------------------------------------------------- /Spark-2.1/input/emp.txt: -------------------------------------------------------------------------------- 1 | 100,Steven,King,SKING,515.123.4567,1987-06-17,AD_PRES,24000.00,null,null,90 2 | 101,Neena,Kochhar,NKOCHHAR,515.123.4568,1989-09-21,AD_VP,17000.00,null,100,90 -------------------------------------------------------------------------------- /Spark-2.1/input/iap_sw_cpu_mem_stats_rollup/part-00000-03d43b04-1dda-472c-8601-e7a8914e6097.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-2.1/input/iap_sw_cpu_mem_stats_rollup/part-00000-03d43b04-1dda-472c-8601-e7a8914e6097.snappy.parquet -------------------------------------------------------------------------------- /Spark-2.1/input/iap_sw_cpu_mem_stats_rollup/part-00001-03d43b04-1dda-472c-8601-e7a8914e6097.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-2.1/input/iap_sw_cpu_mem_stats_rollup/part-00001-03d43b04-1dda-472c-8601-e7a8914e6097.snappy.parquet -------------------------------------------------------------------------------- /Spark-2.1/input/lines: -------------------------------------------------------------------------------- 1 | Note that this information is only available for the duration of the application by default. To view the web UI after the fact, set spark.eventLog.enabled to true before starting the application. This configures Spark to log Spark events that encode the information displayed in the UI to persisted storage. 
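A minimal sketch of the configuration the note above refers to (added here as an illustration; spark.eventLog.enabled is the key quoted in the text, while the log directory path is an assumed example value):

import org.apache.spark.sql.SparkSession

// Enable Spark event logging so the web UI can be reconstructed after the job ends.
val spark = SparkSession.builder()
  .appName("EventLog-Sketch")
  .master("local[1]")
  .config("spark.eventLog.enabled", "true")          // key taken from the note above
  .config("spark.eventLog.dir", "/tmp/spark-events") // assumed, illustrative location
  .getOrCreate()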
-------------------------------------------------------------------------------- /Spark-2.1/input/nested.json: -------------------------------------------------------------------------------- 1 | {"queryResults":{"searchResponse":{"response":{"docs":[{"transactions":[{"recordDate":"2010-02-02 00:00:00","code":"PGM/","description":"Recordation of Patent Grant Mailed"},{"recordDate":"2010-01-13 00:00:00","code":"WPIR","description":"Issue Notification Mailed"},{"recordDate":"2009-12-17 00:00:00","code":"R1021","description":"Receipt into Pubs"}]}]}}}} -------------------------------------------------------------------------------- /Spark-2.1/input/pbs.csv: -------------------------------------------------------------------------------- 1 | Name,Position Title,Department,Employee Annual Salary 2 | "AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00 3 | "AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90745.00 4 | "AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00 5 | "AARON, KARINA",POLICE OFFICER,POLICE,$84450.00 6 | "AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00 7 | "ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00 8 | "ABARCA, ANABEL",ASST TO THE ALDERMAN,CITY COUNCIL,$70764.00 9 | "ABARCA, EMMANUEL",GENERAL LABORER - DSS,STREETS & SAN,$41849.60 10 | "ABASCAL, REECE E",TRAFFIC CONTROL AIDE-HOURLY,OEMC,$20051.20 -------------------------------------------------------------------------------- /Spark-2.1/input/product: -------------------------------------------------------------------------------- 1 | 217,11,Fitness Gear 300 lb Olympic Weight Set,desc,209.99,http://images.acmesports.sports/Fitness+Gear+300+lb+Olympic+Weight+Set 2 | 218,11,Elevation Training Mask 2.0,,,http://images.acmesports.sports/Elevation+Training+Mask+2.0 3 | 219,11,Fitness Gear Pro Utility Bench,,179.99,http://images.acmesports.sports/Fitness+Gear+Pro+Utility+Bench 4 | 220,11,Teeter Hang Ups NXT-S Inversion Table,,299.99,http://images.acmesports.sports/Teeter+Hang+Ups+NXT-S+Inversion+Table -------------------------------------------------------------------------------- /Spark-2.1/input/purchases.log: -------------------------------------------------------------------------------- 1 | Kolkata Central Avenue Groceries 233.65 2 | 3 | Kolkata Bowbazar Hair Care 198.99 4 | Bad data packet 5 | Kolkata Amherst Street Beverages 92.75 6 | Kolkata Amherst Street Beverages 0 7 | Kolkata Amherst Street Groceries 92.75 8 | Kolkata Amherst Street Beverages 92.75 9 | Kolkata Central Avenue 233.65 10 | Kolkata Amherst Street Hair Care 92.75 11 | Bad data packet 12 | Kolkata Bowbazar Groceries 198.99 13 | Kolkata Bowbazar Groceries 198.99 14 | Kolkata Bowbazar Hair Care 198.99 15 | Bad data packet 16 | Kolkata Bowbazar Groceries 198.99 17 | Kolkata Bowbazar Groceries 198.99 18 | Kolkata Bowbazar Beverages 198.99 19 | Kolkata Central Avenue Hair Care 0 20 | Bad data packet 21 | Kolkata Central Avenue Beverages 0 22 | Bad data packet 23 | Kolkata Central Avenue Groceries 233.65 24 | Kolkata Central Avenue Groceries 233.65 25 | Kolkata Central Avenue Beverages 233.65 -------------------------------------------------------------------------------- /Spark-2.1/input/schools.json: -------------------------------------------------------------------------------- 1 | {"name": "UC Berkeley", "yearFounded": 1868,"numStudents": 37581} 2 | {"name": "MIT", "yearFounded": 1860, "numStudents": 11318} 3 | {"name": "JNTU-A", "yearFounded": 1950,"numStudents": 37581} 4 | {"name": "BITIT", "yearFounded": 1999, 
"numStudents": 11318} 5 | {"name": "VIT", "yearFounded": 1900,"numStudents": 37581} 6 | {"name": "VTU", "yearFounded": 1900, "numStudents": 11318} 7 | {"name": "SRM", "yearFounded": 1968,"numStudents": 37581} 8 | {"name": "SASTRA", "yearFounded": 1990, "numStudents": 11318} -------------------------------------------------------------------------------- /Spark-2.1/input/sw_hp_system_cpu_stats_records/date_month=2020-01/date_hour=2020-01-13-04/data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-2.1/input/sw_hp_system_cpu_stats_records/date_month=2020-01/date_hour=2020-01-13-04/data.parquet -------------------------------------------------------------------------------- /Spark-2.1/input/sw_hp_system_cpu_stats_records/date_month=2020-01/date_hour=2020-01-13-05/data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-2.1/input/sw_hp_system_cpu_stats_records/date_month=2020-01/date_hour=2020-01-13-05/data.parquet -------------------------------------------------------------------------------- /Spark-2.1/input/sw_hp_system_info_stats_records/date_month=2020-01/date_hour=2020-01-13-04/data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-2.1/input/sw_hp_system_info_stats_records/date_month=2020-01/date_hour=2020-01-13-04/data.parquet -------------------------------------------------------------------------------- /Spark-2.1/input/sw_hp_system_info_stats_records/date_month=2020-01/date_hour=2020-01-13-05/data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-2.1/input/sw_hp_system_info_stats_records/date_month=2020-01/date_hour=2020-01-13-05/data.parquet -------------------------------------------------------------------------------- /Spark-2.1/input/sw_hp_system_memory_stats_records/date_month=2020-01/date_hour=2020-01-13-04/data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-2.1/input/sw_hp_system_memory_stats_records/date_month=2020-01/date_hour=2020-01-13-04/data.parquet -------------------------------------------------------------------------------- /Spark-2.1/input/sw_hp_system_memory_stats_records/date_month=2020-01/date_hour=2020-01-13-05/data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-2.1/input/sw_hp_system_memory_stats_records/date_month=2020-01/date_hour=2020-01-13-05/data.parquet -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/datadog/metrics/AbstractCaseClass.scala: -------------------------------------------------------------------------------- 1 | package com.datadog.metrics 2 | 3 | abstract class AbstractCaseClass -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/datadog/metrics/CaseClasses.scala: 
-------------------------------------------------------------------------------- 1 | package com.datadog.metrics 2 | 3 | object CaseClasses { 4 | 5 | //case classes for events and metrics construction 6 | case class SeriesList(series: List[Series]) extends AbstractCaseClass 7 | case class Series(metric: String, `type`: String, points: List[List[Long]], tags: List[String]) extends AbstractCaseClass 8 | case class Event(title: String, text: String, priority: String, alert_type: String, date_happened: Long, tags: List[String]) extends AbstractCaseClass 9 | 10 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/datadog/metrics/DataDogHttpTransport.scala: -------------------------------------------------------------------------------- 1 | package com.datadog.metrics 2 | 3 | import org.apache.http.HttpHost 4 | import org.apache.http.HttpResponse 5 | import org.apache.http.client.fluent.Request.Post 6 | import org.apache.http.client.fluent.Response 7 | import org.apache.http.entity.ContentType 8 | import org.apache.log4j.Logger 9 | 10 | /** 11 | * Uses the datadog http webservice to push metrics. 12 | * 13 | * @see API docs 14 | */ 15 | 16 | class DataDogHttpTransport(apiKey: String, 17 | private val connectTimeout: Int, 18 | private val socketTimeout: Int, 19 | private val proxy: HttpHost, 20 | isMetrics: java.lang.Boolean) 21 | extends Transport { 22 | 23 | val logger: Logger = Logger.getLogger(classOf[DataDogHttpTransport]) 24 | 25 | private val BASE_URL: String = "https://app.datadoghq.com/api/v1" 26 | 27 | /** 28 | * seriesUrl gets constructed based on the 'isMetrics' value 29 | */ 30 | private val seriesUrl: String = 31 | if (isMetrics) String.format("%s/series?api_key=%s", BASE_URL, apiKey) 32 | else String.format("%s/events?api_key=%s", BASE_URL, apiKey) 33 | 34 | /** 35 | * This method is used to send Metrics/Events to DataDog. 36 | * @return httpResponseCode 37 | */ 38 | def sendToDataDog(transport: DataDogHttpTransport, jsonData: String): Int = { 39 | val request: org.apache.http.client.fluent.Request = 40 | Post(transport.seriesUrl) 41 | .useExpectContinue() 42 | .connectTimeout(transport.connectTimeout) 43 | .socketTimeout(transport.socketTimeout) 44 | .bodyString(jsonData, ContentType.APPLICATION_JSON) 45 | if (transport.proxy != null) { 46 | request.viaProxy(transport.proxy) 47 | } 48 | val response: Response = request.execute() 49 | val httpResponse: HttpResponse = response.returnResponse() 50 | httpResponse.getStatusLine.getStatusCode 51 | } 52 | 53 | /** 54 | * This method is used to send the Json request. 55 | * @return httpResponseCode 56 | */ 57 | def send(jsonData: String) = sendToDataDog(this, jsonData) 58 | 59 | def close(): Unit = {} 60 | } 61 | -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/datadog/metrics/MetricsCollector.scala: -------------------------------------------------------------------------------- 1 | package com.datadog.metrics 2 | 3 | /** 4 | * @author revanthreddy 5 | */ 6 | trait MetricsCollector { 7 | 8 | 9 | /** 10 | * This method is used to send metrics to DataDog . 11 | */ 12 | def sendMetrics(metricName: String, metricValue: Long, tags: collection.mutable.Map[String, Any]) 13 | 14 | /** 15 | * This method is used to send events to DataDog. 
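 * Example (this is the call made from Spark_Accumulator below):
 *   metricsCollector.sendEvents("Spark-Events", "Test Run", "normal", "info", tags)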
16 | */ 17 | def sendEvents(title: String, text: String, priority: String, alert_type: String, tags: collection.mutable.Map[String, Any]) 18 | 19 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/datadog/metrics/MetricsCollectorFactory.scala: -------------------------------------------------------------------------------- 1 | package com.datadog.metrics 2 | 3 | /** 4 | * @author revanthreddy 5 | */ 6 | object MetricsCollectorFactory { 7 | 8 | def getDatadogCollector(apikey: String, env: String): MetricsCollector = new DataDogCollector(apikey, env) 9 | 10 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/datadog/metrics/Spark_Accumulator.scala: -------------------------------------------------------------------------------- 1 | package com.datadog.metrics 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object Spark_Accumulator { 6 | def main(args: Array[String]) { 7 | 8 | val sparkSession = SparkSession.builder. 9 | master("local[1]") 10 | .appName("Spark_Accumulator_Metrics_To_DataDog") 11 | .getOrCreate() 12 | 13 | val sc = sparkSession.sparkContext 14 | 15 | val accum = sc.longAccumulator("total.charecters.counter") 16 | val words = sc.textFile("input/lines").flatMap(_.split(" ")) 17 | words.foreach(w => accum.add(w.length)) 18 | 19 | //setting the metrics tags 20 | var tags = collection.mutable.Map[String, Any]() 21 | tags.put("counter", accum.name.get) 22 | tags += ("class" -> getClass) 23 | tags += ("count" -> accum.value, "accum name" -> "total.charecters.counter") 24 | 25 | //DataDog API Key needs to be generted by creating account in DataDog 26 | val apiKey="947d12f46dead405bf019033434f0xxx" 27 | //initializing the metrics collector 28 | val metricsCollector = MetricsCollectorFactory.getDatadogCollector(apiKey, "dev") 29 | 30 | //sending accumulator values as metrics to DataDog 31 | metricsCollector.sendMetrics(accum.name.get, accum.value, null) 32 | 33 | val badRecords = sc.longAccumulator("bad.records.counter") 34 | val baddata = sc.textFile("input/badrecords").map(v => v.split(",")) 35 | baddata.foreach(r => { try { r(2).toInt } catch { case e: NumberFormatException => badRecords.add(1) } }) 36 | 37 | //sending accumulator values as metrics to DataDog 38 | metricsCollector.sendMetrics(badRecords.name.get, badRecords.value, tags) 39 | 40 | val acc = sc.longAccumulator("counter.test") 41 | val baddata1 = sc.textFile("input/badrecords").map(x => acc.add(1)) 42 | baddata1.collect() 43 | 44 | //sending events to DataDog 45 | metricsCollector.sendEvents("Spark-Events", "Test Run", "normal", "info", tags) 46 | 47 | sc.stop() 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/datadog/metrics/Transport.scala: -------------------------------------------------------------------------------- 1 | package com.datadog.metrics 2 | import java.io.Closeable 3 | 4 | 5 | /** 6 | * The transport layer for pushing metrics to datadog 7 | */ 8 | trait Transport extends Closeable { 9 | 10 | /** 11 | * Build a request context. 
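 * Accepts the metrics/events JSON payload as a string and returns the HTTP
 * status code reported by the Datadog endpoint (see DataDogHttpTransport.sendToDataDog).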
12 | */ 13 | def send(jsonData: String): Int 14 | 15 | /** 16 | * Send the request to datadog 17 | */ 18 | def sendToDataDog(transport: DataDogHttpTransport,jsonData: String): Int 19 | 20 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/aws/Spark_AWS_S3.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.aws 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.SaveMode 5 | 6 | object Spark_AWS_S3 extends App { 7 | case class Employee(empid: Int, name: String, dept: String, salary: Double, nop: Int, dttime: String) 8 | 9 | val spark = SparkSession.builder().appName("Spark_AWS_S3").master("local[1]").getOrCreate() 10 | val sc = spark.sparkContext 11 | 12 | sc.hadoopConfiguration.set("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem") 13 | sc.hadoopConfiguration.set("fs.s3n.awsAccessKeyId", "xxxxxxxxxx") 14 | sc.hadoopConfiguration.set("fs.s3n.awsSecretAccessKey", "xxxxxxxxxxxx") 15 | 16 | import spark.implicits._ 17 | 18 | val empDF = Seq( 19 | Employee(123, "revanth", "cloud", 1000, 2, "07-06-2016-06-08-27"), 20 | Employee(124, "shyam", "finance", 3000, 2, "07-06-2016-06-08-27"), 21 | Employee(125, "hari", "TAG", 6000, 2, "07-06-2016-06-08-27"), 22 | Employee(126, "kiran", "cloud", 2000, 2, "08-06-2016-07-08-27"), 23 | Employee(127, "nandha", "sales", 1000, 2, "08-06-2016-07-08-27"), 24 | Employee(128, "pawan", "cloud", 1000, 2, "08-06-2016-07-08-27"), 25 | Employee(129, "kalyan", "conectivity", 1000, 2, "09-06-2016-08-08-27"), 26 | Employee(121, "satish", "finance", 1000, 2, "09-06-2016-08-08-27"), 27 | Employee(131, "arun", "cloud", 1000, 2, "09-06-2016-08-08-27"), 28 | Employee(132, "ram", "cloud", 1000, 2, "10-06-2016-08-08-27"), 29 | Employee(133, "suda", "conectivity", 1000, 2, "10-06-2016-08-08-27"), 30 | Employee(134, "sunder", "sales", 1000, 2, "10-06-2016-08-08-27"), 31 | Employee(135, "charan", "TAG", 1000, 2, "12-06-2016-08-08-27"), 32 | Employee(136, "ravi", "TAG", 1000, 2, "11-06-2016-08-08-27"), 33 | Employee(137, "arjun", "cloud", 1000, 2, "11-06-2016-08-08-27")).toDF() 34 | 35 | empDF.coalesce(1).write.format("org.apache.spark.sql.json").mode(SaveMode.Append).save("s3n://snanpsat/emp") 36 | 37 | val empS3DF = spark.read.json("s3n://snanpsat/emp") 38 | empS3DF.printSchema() 39 | 40 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/cassandra/ChangeDFTypes.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.cassandra 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.types.LongType 5 | 6 | object ChangeDFTypes extends App { 7 | 8 | val spark = SparkSession.builder().appName("ChangeDFTypes-Job").master("local[1]") 9 | .config("spark.cassandra.connection.host", "127.0.0.1").getOrCreate() 10 | 11 | var testDF = (spark.read.format("org.apache.spark.sql.cassandra") 12 | .options(Map("table" -> "test", "keyspace" -> "dev")) 13 | .load) 14 | 15 | println("schema and data before conversion....") 16 | testDF.printSchema() 17 | testDF.show(3, false) 18 | 19 | val newTestDF = testDF.dtypes 20 | 21 | //converting all the timestamp columns in the dataframe to long type 22 | newTestDF.foreach { f => 23 | val columnName = f._1 24 | val columnType = f._2 25 | 26 | if (columnType.equals("TimestampType")) { 27 | testDF = testDF.withColumn(columnName, 
testDF(columnName).cast(LongType)) 28 | 29 | } 30 | } 31 | println("schema and data after conversion....") 32 | testDF.printSchema() 33 | testDF.show(3, false) 34 | } 35 | //CREATE TABLE TEST (ID TEXT, NAME TEXT, VALUE TEXT, LAST_MODIFIED_DATE TIMESTAMP,CREATED_DATE timestamp, PRIMARY KEY (ID)); 36 | //INSERT INTO TEST (ID, NAME, VALUE, LAST_MODIFIED_DATE,CREATED_DATE) VALUES ('1', 'orange', 'fruit', toTimestamp(now()),toTimestamp(now())); 37 | //INSERT INTO TEST (ID, NAME, VALUE, LAST_MODIFIED_DATE,CREATED_DATE) VALUES ('2', 'elephant', 'animal', toTimestamp(now()),toTimestamp(now())); -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/cassandra/ConvetTimestampToLong.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.cassandra 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.types.LongType 5 | import org.apache.spark.sql.functions.{ col, udf } 6 | import java.sql.Timestamp 7 | object ConvetTimestampToLong extends App { 8 | 9 | val spark = SparkSession.builder().appName("ConvetTimestampToLong-Job").master("local[1]") 10 | .config("spark.cassandra.connection.host", "127.0.0.1").getOrCreate() 11 | 12 | var testDF = (spark.read.format("org.apache.spark.sql.cassandra") 13 | .options(Map("table" -> "test", "keyspace" -> "dev")) 14 | .load) 15 | 16 | println("schema and data before conversion....") 17 | testDF.printSchema() 18 | testDF.show(3, false) 19 | 20 | /* convert Spark Timestamp column type to Long in epoch-msecs */ 21 | protected val udfTimestampToEpochMsecLong = udf { (ts: Timestamp) => 22 | Option(ts) match { 23 | case Some(ts) => Some(ts.getTime()) 24 | case _ => None 25 | } 26 | } 27 | 28 | val newTestDF = testDF.dtypes 29 | 30 | //converting all the timestamp columns in the dataframe to long type 31 | newTestDF.foreach { f => 32 | val columnName = f._1 33 | val columnType = f._2 34 | /* for consistency with Parquet schema, convert it to Long (in epoch-millisecs). 35 | * -> Note: DO NOT directly cast to long, that returns epoch-seconds, which is 3 digits shorter! 
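 * e.g. (assuming UTC) 2017-09-26 05:00:00 -> 1506402000000L from Timestamp.getTime,
 *      whereas a plain cast(LongType) would give only 1506402000.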
*/ 36 | if (columnType.equals("TimestampType")) { 37 | testDF = testDF.withColumn(columnName, udfTimestampToEpochMsecLong(col(columnName))) 38 | } 39 | } 40 | println("schema and data after conversion....") 41 | testDF.printSchema() 42 | testDF.show(3, false) 43 | } 44 | //CREATE TABLE TEST (ID TEXT, NAME TEXT, VALUE TEXT, LAST_MODIFIED_DATE TIMESTAMP,CREATED_DATE timestamp, PRIMARY KEY (ID)); 45 | //INSERT INTO TEST (ID, NAME, VALUE, LAST_MODIFIED_DATE,CREATED_DATE) VALUES ('1', 'orange', 'fruit', toTimestamp(now()),toTimestamp(now())); 46 | //INSERT INTO TEST (ID, NAME, VALUE, LAST_MODIFIED_DATE,CREATED_DATE) VALUES ('2', 'elephant', 'animal', toTimestamp(now()),toTimestamp(now())); -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/cassandra/FilterCassandraData.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.cassandra 2 | import org.apache.spark.sql.SparkSession 3 | 4 | object FilterCassandraData extends App { 5 | 6 | case class Employee(id: Int, name: String, salary: Int) 7 | 8 | val spark = SparkSession.builder().appName("Spark_To_Caasandra").master("local[1]").getOrCreate() 9 | import spark.implicits._ 10 | 11 | spark.conf.set("spark.sql.shuffle.partitions", "2") 12 | spark.conf.set("spark.cassandra.connection.host", "127.0.0.1") 13 | 14 | val KEY_SPACE_NAME = "dev" 15 | val TABLE_NAME = "employee" 16 | val TABLE_NAME1 = "master_collection1" 17 | 18 | //loading data from cassandra table 19 | val df = spark.read.format("org.apache.spark.sql.cassandra").option("table", TABLE_NAME) 20 | .option("keyspace", KEY_SPACE_NAME) 21 | .load() 22 | // df.printSchema() 23 | // df.show() 24 | 25 | val masterdf = spark.read.format("org.apache.spark.sql.cassandra").option("table", TABLE_NAME1) 26 | .option("keyspace", KEY_SPACE_NAME) 27 | .load() 28 | val tidfiltDF = masterdf.select("id").where(masterdf("disable") === "0") 29 | tidfiltDF.show() 30 | val tidList = tidfiltDF.select("id").map(r => r.getInt(0)).collect.toList 31 | val filt = tidList.mkString("id in (", ",", ")") 32 | println(filt) 33 | 34 | val finalfildf = df.filter(filt) 35 | finalfildf.show() 36 | finalfildf.select("id").distinct.show() 37 | 38 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/cassandra/SparkStreaming_Cassandra.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.cassandra 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.spark.streaming.dstream.ConstantInputDStream 9 | import com.datastax.spark.connector.streaming.toStreamingContextFunctions 10 | import com.datastax.spark.connector.toNamedColumnRef 11 | 12 | /** 13 | * Reading from Cassandra using Spark Streaming 14 | */ 15 | object SparkStreaming_Cassandra extends App { 16 | case class Employee(id: Int, name: String, salary: Int) 17 | 18 | val spark = SparkSession.builder().appName("Spark_Streaming_Cassandra").master("local[*]").getOrCreate() 19 | 20 | spark.conf.set("spark.sql.shuffle.partitions", "2") 21 | spark.conf.set("spark.cassandra.connection.host", "127.0.0.1") 22 | 23 | val KEY_SPACE_NAME = "dev" 24 | val TABLE_NAME = "employee" 25 | 26 | val ssc = new StreamingContext(spark.sparkContext, Seconds(10)) 27 | val cassandraRDD = 
ssc.cassandraTable[Employee](KEY_SPACE_NAME, TABLE_NAME).select("id", "name", "salary") 28 | 29 | val dstream = new ConstantInputDStream(ssc, cassandraRDD) 30 | 31 | dstream.foreachRDD { rdd => 32 | println("Total Records cont in DB : " + rdd.count) 33 | 34 | println(rdd.collect.mkString("\n")) 35 | } 36 | 37 | ssc.start() 38 | ssc.awaitTermination() 39 | 40 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/cassandra/Writting_DF_To_Cassandra.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.cassandra 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.SaveMode 5 | 6 | object Writting_DF_To_Cassandra extends App { 7 | 8 | case class Emp(id: Int, name: String, salary: Int) 9 | val spark = SparkSession.builder().appName("Spark_To_Caasandra").master("local[1]").getOrCreate() 10 | 11 | spark.conf.set("spark.sql.shuffle.partitions", "2") 12 | spark.conf.set("spark.cassandra.connection.host", "127.0.0.1") 13 | 14 | val KEY_SPACE_NAME = "dev" 15 | val TABLE_NAME = "employee" 16 | 17 | val emps = List( 18 | Emp(1, "Mike", 1032230), 19 | Emp(2, "Shyam", 1322200), 20 | Emp(3, "Revanth", 2223300), 21 | Emp(4, "Raghu", 2773666), 22 | Emp(5, "naga", 2002233), 23 | Emp(6, "siva", 2773666)) 24 | 25 | val empDF = spark.createDataFrame(emps) 26 | 27 | 28 | empDF.write.format("org.apache.spark.sql.cassandra").option("table", TABLE_NAME) 29 | .option("keyspace", KEY_SPACE_NAME).mode(SaveMode.Append).save() 30 | 31 | println("done .......") 32 | } 33 | 34 | //CREATE TABLE dev.employee ( 35 | // id int PRIMARY KEY, 36 | // name text, 37 | // salary int 38 | //); -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/cassandra/export/CassandraYaml.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.cassandra.export 2 | 3 | import scala.beans.BeanProperty 4 | class CassandraYaml { 5 | @BeanProperty var cassandra_table_export = new java.util.ArrayList[YamlProps]() 6 | } 7 | -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/cassandra/export/Utils.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.cassandra.export 2 | 3 | object Utils { 4 | 5 | /** 6 | * Method to parse the input arguments 7 | * @param args: Array[String] 8 | * @return java.util.HashMap[String, String] 9 | */ 10 | def argsParser(args: Array[String]): java.util.HashMap[String, String] = { 11 | val result = new java.util.HashMap[String, String]() 12 | var index = 0 13 | for (arg <- args) { 14 | index += 1 15 | val trimmedArg = arg.trim() 16 | if (trimmedArg.startsWith("--")) { 17 | val key = trimmedArg.replaceAll("--", "") 18 | if (index < args.length) { 19 | val value = args(index).trim() 20 | result.put(key, value) 21 | } 22 | } 23 | } 24 | result 25 | } 26 | 27 | /** 28 | * This method is used to parse the timeStamp(2017-09-26 05:00:00.0) 29 | * @param String: timeStamp 30 | * @return String: 2017-09 31 | */ 32 | val setDateMonth: (String) => String = (timeStamp: String) => { 33 | var date_hour_list = timeStamp.split(" ") 34 | var date = date_hour_list(0) 35 | var month = date.split("-") 36 | month(0) + "-" + month(1) 37 | } 38 | 39 | /** 40 | * This method is used to parse the timeStamp(2017-09-26 05:00:00.0) 41 | * @param String: timeStamp 42 
| * @return String: 2017-09-26-05 43 | */ 44 | val setDateHour: (String) => String = (timeStamp: String) => { 45 | var date_hour_list = timeStamp.split(" ") 46 | var date = date_hour_list(0) 47 | var month = date.split("-") 48 | month(0) + "-" + month(1) 49 | var hour_min_sec = date_hour_list(1).split(":") 50 | date + "-" + hour_min_sec(0) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/cassandra/export/YamlProps.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.cassandra.export 2 | 3 | import scala.beans.BeanProperty 4 | 5 | class YamlProps { 6 | @BeanProperty var table_name = "" 7 | @BeanProperty var keyspace = "" 8 | @BeanProperty var output_location = "" 9 | @BeanProperty var duration_in_hour = "" 10 | } 11 | -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/cassandra/export/cassandra-table-export.yml: -------------------------------------------------------------------------------- 1 | # Configurations to backup cassandra tables 2 | cassandra_table_export: 3 | 4 | - table_name : insight 5 | keyspace: demo_database 6 | output_location: backup/cassandra/parquet_insight 7 | duration_in_hour: 24 8 | 9 | - table_name : insight_rc_data 10 | keyspace: demo_database 11 | output_location: backup/cassandra/parquet_insight_rc_data 12 | duration_in_hour: 24 13 | 14 | - table_name : insight_rc_data_count 15 | keyspace: demo_database 16 | output_location: backup/cassandra/parquet_insight_rc_data_count 17 | duration_in_hour: 24 -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/custom/CustomPartitioner.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.custom 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.{ Partitioner, SparkContext, SparkConf } 5 | import org.apache.spark.SparkContext._ 6 | 7 | class SpecialPartitioner extends Partitioner { 8 | def numPartitions = 10 9 | 10 | def getPartition(key: Any): Int = { 11 | key match { 12 | case (x, y: Int, z) => y % numPartitions 13 | case _ => throw new ClassCastException 14 | } 15 | } 16 | } 17 | 18 | object CustomPartitioner { 19 | def analyze[T](r: RDD[T]): Unit = { 20 | val partitions = r.glom() 21 | println(partitions.count() + " parititons") 22 | 23 | // use zipWithIndex() to see the index of each partition 24 | // we need to loop sequentially so we can see them in order: use collect() 25 | partitions.zipWithIndex().collect().foreach { 26 | case (a, i) => { 27 | println("Partition " + i + " contents (count " + a.count(_ => true) + "):" + 28 | a.foldLeft("")((e, s) => e + " " + s)) 29 | } 30 | } 31 | } 32 | 33 | def main(args: Array[String]) { 34 | val conf = new SparkConf().setAppName("Streaming").setMaster("local[4]") 35 | val sc = new SparkContext(conf) 36 | 37 | val triplets = 38 | for (x <- 1 to 3; y <- 1 to 20; z <- 'a' to 'd') 39 | yield ((x, y, z), x * y) 40 | 41 | // Spark has the good sense to use the first tuple element 42 | // for range partitioning, but for this data-set it makes a mess 43 | val defaultRDD = sc.parallelize(triplets, 10) 44 | println("with default partitioning") 45 | analyze(defaultRDD) 46 | 47 | // out custom partitioner uses the second tuple element 48 | val deliberateRDD = defaultRDD.partitionBy(new SpecialPartitioner()) 49 | println("with deliberate 
partitioning") 50 | analyze(deliberateRDD) 51 | 52 | } 53 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/custom/HashJoin.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.custom 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.{ SparkContext, SparkConf } 5 | 6 | import scala.collection.mutable 7 | 8 | // This gives is access to the PairRDDFunctions 9 | import org.apache.spark.SparkContext._ 10 | 11 | // encapsulate a small sequence of pairs to be joined with pair RDDs -- 12 | // making this serializable effectively allows the hash table to be 13 | // broadcast to each worker 14 | // Reference: http://en.wikipedia.org/wiki/Hash_join 15 | // (this is specifically an inner equi-join on pairs) 16 | class HashJoiner[K, V](small: Seq[(K, V)]) extends java.io.Serializable { 17 | 18 | // stash it as a hash table, remembering that the keys may not be unique, 19 | // so we need to collect values for each key in a list 20 | val m = new mutable.HashMap[K, mutable.ListBuffer[V]]() 21 | small.foreach { 22 | case (k, v) => if (m.contains(k)) m(k) += v else m(k) = mutable.ListBuffer(v) 23 | } 24 | 25 | // when joining the RDD, remember that each key in it may or may not have 26 | // a matching key in the array, and we need a result tuple for each value 27 | // in the list contained in the corresponding hash table entry 28 | def joinOnLeft[U](large: RDD[(K, U)]): RDD[(K, (U, V))] = { 29 | large.flatMap { 30 | case (k, u) => 31 | m.get(k).flatMap(ll => Some(ll.map(v => (k, (u, v))))).getOrElse(mutable.ListBuffer()) 32 | } 33 | } 34 | } 35 | 36 | object HashJoin { 37 | def main(args: Array[String]) { 38 | val conf = new SparkConf().setAppName("HashJoin").setMaster("local[4]") 39 | val sc = new SparkContext(conf) 40 | 41 | val smallRDD = sc.parallelize( 42 | Seq((1, 'a'), (1, 'c'), (2, 'a'), (3, 'x'), (3, 'y'), (4, 'a')), 43 | 4) 44 | 45 | val largeRDD = sc.parallelize( 46 | for (x <- 1 to 10000) yield (x % 4, x), 47 | 4) 48 | 49 | // simply joining the two RDDs will be slow as it requires 50 | // lots of communication 51 | val joined = largeRDD.join(smallRDD) 52 | joined.collect().foreach(println) 53 | 54 | // If the smaller RDD is small enough we're better of with it not 55 | // being an RDD -- and we can implement a hash join by hand, effectively 56 | // broadcasting the hash table to each worker 57 | println("hash join result") 58 | // NOTE: it may be tempting to use "collectAsMap" below instead of "collect", 59 | // and simplify the joiner accordingly, but that only works if the keys 60 | // are unique 61 | val joiner = new HashJoiner(smallRDD.collect()) 62 | val hashJoined = joiner.joinOnLeft(largeRDD) 63 | hashJoined.collect().foreach(println) 64 | 65 | } 66 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/custom/SemiStructuredUtilUDF.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.custom 2 | 3 | import org.apache.spark.sql.{ Row, SQLContext, SparkSession } 4 | import org.apache.spark.{ SparkConf, SparkContext } 5 | 6 | import scala.collection.mutable.ArrayBuffer 7 | 8 | // 9 | // Show various ways to query in SQL using user-defined functions UDFs. 
10 | // 11 | 12 | object SemiStructuredUtilUDF { 13 | 14 | def isAtomic(o: AnyRef): Boolean = { 15 | o match { 16 | case l: ArrayBuffer[_] => false 17 | case _ => true 18 | } 19 | } 20 | 21 | def isString(o: AnyRef): Boolean = { 22 | o match { 23 | case s: String => true 24 | case _ => false 25 | } 26 | } 27 | 28 | //def isInt(o:AnyRef) : Boolean = { 29 | // o match { 30 | // case i:Int => true 31 | // case _ => false 32 | // } 33 | //} 34 | 35 | def isArray(o: AnyRef): Boolean = { 36 | o match { 37 | case l: ArrayBuffer[_] => true 38 | case _ => false 39 | } 40 | } 41 | 42 | def arrayLength(o: AnyRef): Int = { 43 | o match { 44 | case l: ArrayBuffer[_] => l.size 45 | case null => 0 46 | case _ => 1 47 | } 48 | } 49 | 50 | def isStruct(o: AnyRef): Boolean = { 51 | o match { 52 | case r: Row => true 53 | case _ => false 54 | } 55 | } 56 | 57 | def arrayContains(a: AnyRef, v: AnyRef): Boolean = { 58 | a match { 59 | case l: ArrayBuffer[_] => l.contains(v) 60 | case _ => false 61 | } 62 | } 63 | 64 | def struct(a: AnyRef): Boolean = { 65 | println("hello") 66 | true 67 | } 68 | 69 | def main(args: Array[String]) { 70 | val spark = 71 | SparkSession.builder() 72 | .appName("Custom") 73 | .master("local[4]") 74 | .getOrCreate() 75 | 76 | val transactions = spark.read.json("src/main/resources/data/mixed.json") 77 | transactions.printSchema() 78 | transactions.createOrReplaceTempView("transactions") 79 | 80 | spark.udf.register("struct", struct _) 81 | 82 | val all = 83 | spark.sql("SELECT a, id, struct(address) FROM transactions") 84 | all.foreach(r => println(r)) 85 | 86 | spark.udf.register("isAtomic", isAtomic _) 87 | spark.udf.register("arrayLength", arrayLength _) 88 | 89 | val lotsOfOrders = 90 | spark.sql("SELECT id FROM transactions WHERE arrayLength(orders) > 2") 91 | //lotsOfOrders.foreach(println) 92 | } 93 | 94 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/dataframes/DataFrame_DropDuplicates.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.dataframes 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object DataFrame_DropDuplicates { 6 | def main(args: Array[String]) { 7 | val spark = 8 | SparkSession.builder() 9 | .appName("DataFrame-DropDuplicates") 10 | .master("local[4]") 11 | .getOrCreate() 12 | 13 | import spark.implicits._ 14 | 15 | // create an RDD of tuples with some data 16 | val custs = Seq( 17 | (1, "Widget Co", 120000.00, 0.00, "AZ"), 18 | (2, "Acme Widgets", 410500.00, 500.00, "CA"), 19 | (3, "Widgetry", 410500.00, 200.00, "CA"), 20 | (4, "Widgets R Us", 410500.00, 0.0, "CA"), 21 | (3, "Widgetry", 410500.00, 200.00, "CA"), 22 | (5, "Ye Olde Widgete", 500.00, 0.0, "MA"), 23 | (6, "Widget Co", 12000.00, 10.00, "AZ")) 24 | val customerRows = spark.sparkContext.parallelize(custs, 4) 25 | 26 | // convert RDD of tuples to DataFrame by supplying column names 27 | val customerDF = customerRows.toDF("id", "name", "sales", "discount", "state") 28 | 29 | println("*** Here's the whole DataFrame with duplicates") 30 | 31 | customerDF.printSchema() 32 | 33 | customerDF.show() 34 | 35 | // drop fully identical rows 36 | val withoutDuplicates = customerDF.dropDuplicates() 37 | 38 | println("*** Now without duplicates") 39 | 40 | withoutDuplicates.show() 41 | 42 | // drop fully identical rows 43 | val withoutPartials = customerDF.dropDuplicates(Seq("name", "state")) 44 | 45 | println("*** Now without partial duplicates too") 46 | 47 | 
withoutPartials.show() 48 | 49 | } 50 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/dataframes/DateTime.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.dataframes 2 | 3 | import java.sql.{ Date, Timestamp } 4 | 5 | import org.apache.spark.sql.{ Row, SparkSession } 6 | import org.apache.spark.sql.types._ 7 | import org.apache.spark.sql.functions._ 8 | 9 | // 10 | // Functions for querying against columns of DateType and TimestampType in 11 | // a DataFrame. 12 | // 13 | object DateTime { 14 | def main(args: Array[String]) { 15 | val spark = 16 | SparkSession.builder() 17 | .appName("DataFrame-DateTime") 18 | .master("local[4]") 19 | .getOrCreate() 20 | 21 | import spark.implicits._ 22 | 23 | val schema = StructType( 24 | Seq( 25 | StructField("id", IntegerType, true), 26 | StructField("dt", DateType, true), 27 | StructField("ts", TimestampType, true))) 28 | val rows = spark.sparkContext.parallelize( 29 | Seq( 30 | Row( 31 | 1, 32 | Date.valueOf("1999-01-11"), 33 | Timestamp.valueOf("2011-10-02 09:48:05.123456")), 34 | Row( 35 | 1, 36 | Date.valueOf("2004-04-14"), 37 | Timestamp.valueOf("2011-10-02 12:30:00.123456")), 38 | Row( 39 | 1, 40 | Date.valueOf("2008-12-31"), 41 | Timestamp.valueOf("2011-10-02 15:00:00.123456"))), 4) 42 | val tdf = spark.createDataFrame(rows, schema) 43 | 44 | println("DataFrame with both DateType and TimestampType") 45 | tdf.show() 46 | 47 | println("Pull a DateType apart when querying") 48 | tdf.select($"dt", year($"dt"), quarter($"dt"), month($"dt"), 49 | weekofyear($"dt"), dayofyear($"dt"), dayofmonth($"dt")).show() 50 | 51 | println("Date arithmetic") 52 | tdf.select($"dt", datediff(current_date(), $"dt"), 53 | date_sub($"dt", 20), 54 | date_add($"dt", 10), 55 | add_months($"dt", 6)).show() 56 | 57 | println("Date truncation") 58 | tdf.select($"dt", trunc($"dt", "YYYY"), trunc($"dt", "YY"), 59 | trunc($"dt", "MM")).show() 60 | 61 | println("Date formatting") 62 | tdf.select($"dt", date_format($"dt", "MMM dd, YYYY")).show() 63 | 64 | println("Pull a Timestamp type apart when querying") 65 | tdf.select($"ts", year($"ts"), hour($"ts"), minute($"ts"), second($"ts")).show() 66 | } 67 | 68 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/dataframes/DropColumns.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.dataframes 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object DropColumns { 6 | def main(args: Array[String]) { 7 | val spark = 8 | SparkSession.builder() 9 | .appName("DataFrame-DropColumns") 10 | .master("local[4]") 11 | .getOrCreate() 12 | 13 | import spark.implicits._ 14 | 15 | // create an RDD of tuples with some data 16 | val custs = Seq( 17 | (1, "Widget Co", 120000.00, 0.00, "AZ"), 18 | (2, "Acme Widgets", 410500.00, 500.00, "CA"), 19 | (3, "Widgetry", 410500.00, 200.00, "CA"), 20 | (4, "Widgets R Us", 410500.00, 0.0, "CA"), 21 | (5, "Ye Olde Widgete", 500.00, 0.0, "MA")) 22 | val customerRows = spark.sparkContext.parallelize(custs, 4) 23 | 24 | // convert RDD of tuples to DataFrame by supplying column names 25 | val customerDF = customerRows.toDF("id", "name", "sales", "discount", "state") 26 | 27 | println("*** Here's the whole DataFrame") 28 | 29 | customerDF.printSchema() 30 | 31 | customerDF.show() 32 | 33 | // remove a couple of columns 34 | val fewerCols = 
customerDF.drop("sales").drop("discount") 35 | 36 | println("*** Now with fewer columns") 37 | 38 | fewerCols.printSchema() 39 | 40 | fewerCols.show() 41 | 42 | } 43 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/dataframes/GenerateUniqueId.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.dataframes 2 | import org.apache.spark.sql.functions._ 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.Row 5 | 6 | object GenerateUniqueId { 7 | def main(args: Array[String]): Unit = { 8 | 9 | val session = SparkSession.builder().appName("GenerateUniqueId").master("local[1]").getOrCreate() 10 | val sc = session.sparkContext 11 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 12 | import session.implicits._ 13 | val df = sc.parallelize(Seq(("Databricks", 20000), ("Spark", 100000), ("Hadoop", 3000))).toDF("word", "count") 14 | 15 | //Option 1 => Using MontotonicallyIncreasingID or ZipWithUniqueId methods 16 | df.withColumn("uniqueID", monotonicallyIncreasingId).show() 17 | 18 | import org.apache.spark.sql.types.{ StructType, StructField, LongType } 19 | val df1 = sc.parallelize(Seq(("Databricks", 20000), ("Spark", 100000), ("Hadoop", 3000))).toDF("word", "count") 20 | val wcschema = df1.schema 21 | val inputRows = df1.rdd.zipWithUniqueId.map { 22 | case (r: Row, id: Long) => Row.fromSeq(id +: r.toSeq) 23 | } 24 | val wcID = sqlContext.createDataFrame(inputRows, StructType(StructField("id", LongType, false) +: wcschema.fields)) 25 | 26 | wcID.show() 27 | 28 | //Option 2 => Use Row_Number Function 29 | 30 | //With PartitionBy Column: 31 | 32 | val df2 = sc.parallelize(Seq(("Databricks", 20000), ("Spark", 100000), ("Hadoop", 3000))).toDF("word", "count") 33 | df2.createOrReplaceTempView("wordcount") 34 | val tmpTable = sqlContext.sql("select row_number() over (partition by word order by count) as rnk,word,count from wordcount") 35 | tmpTable.show() 36 | 37 | //Without PartitionBy Column: 38 | val tmpTable1 = sqlContext.sql("select row_number() over (order by count) as rnk,word,count from wordcount") 39 | tmpTable1.show() 40 | } 41 | 42 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/dataframes/HDFSFilesList.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.dataframes 2 | 3 | import org.apache.hadoop.conf.Configuration 4 | import org.apache.hadoop.fs.FileStatus 5 | import org.apache.hadoop.fs.FileSystem 6 | import org.apache.hadoop.fs.FileUtil 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.spark.sql.SparkSession 9 | 10 | object HDFSFilesList { 11 | private val conf = new Configuration() 12 | val fs = FileSystem.get(conf) 13 | val uri = conf.get("fs.default.name") 14 | 15 | def main(args: Array[String]) { 16 | 17 | val spark = SparkSession.builder().appName("Spark-Read-HDFS-Folders").master("local[*]") 18 | .getOrCreate() 19 | 20 | //Hdfs folder path 21 | var DATA_PATH = args(0) 22 | 23 | //No of Hdfs folders to read 24 | val intervalCount = 1 25 | 26 | var fileStatus: Array[FileStatus] = fs.listStatus(new Path(uri + DATA_PATH)) 27 | var paths: Array[Path] = FileUtil.stat2Paths(fileStatus) 28 | 29 | var filesWithInterval = getHDFSFoldersBasedOnModtime(intervalCount, fileStatus) 30 | 31 | if (fileStatus != null && filesWithInterval.length >= intervalCount) { 32 | val dataframeArray = 
filesWithInterval.map(folder => { 33 | println(folder.getPath.toString) 34 | val path = folder.getPath.toString 35 | //spark.read.parquet(folder.getPath.toString) 36 | }) 37 | } 38 | 39 | } 40 | 41 | //get the folders from HDFS based on the count provided. 42 | def getHDFSFoldersBasedOnModtime(intervalCount: Int, fileStatus: Array[FileStatus]): Array[FileStatus] = { 43 | var sortedList: List[FileStatus] = fileStatus.toList.sortWith(_.getModificationTime > _.getModificationTime) 44 | var returnList: List[FileStatus] = List() 45 | var itr: Int = 0 46 | var iterator = sortedList.iterator 47 | while (iterator.hasNext) { 48 | var value = iterator.next() 49 | if (itr < intervalCount) { 50 | returnList = returnList.::(value) 51 | itr = itr + 1 52 | } 53 | } 54 | returnList.toArray 55 | } 56 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/dataframes/HandlingNulls.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.dataframes 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object HandlingNulls { 6 | 7 | val spark = SparkSession.builder().appName("Handling-Nulls").master("local[*]") 8 | .getOrCreate() 9 | 10 | case class employee( 11 | employee_id: Int, first_name: String, last_name: String, 12 | email: String, phone_number: String, hire_date: String, 13 | job_id: String, salary: Float, commission_pct: Float, 14 | manager_id: Int, department_id: Int) 15 | 16 | private def checkNullForFloat(value: String): Float = { 17 | if (!"null".equals(value)) { 18 | return value.toFloat; 19 | } else if (!"".equals(value)) { 20 | return value.toFloat; 21 | } 22 | return 0f; 23 | } 24 | private def checkNullForInt(value: String): Int = { 25 | if (!"null".equals(value)) { 26 | return value.toInt; 27 | } else if (!"".equals(value)) { 28 | return value.toInt; 29 | } 30 | return 0; 31 | } 32 | def main(args: Array[String]): Unit = { 33 | 34 | val employeeData = spark.sparkContext.textFile("input/emp.txt") 35 | import spark.implicits._ 36 | 37 | val employeeDF = employeeData.map( 38 | rec => { 39 | var d = rec.split(",") 40 | employee(d(0).toInt, 41 | d(1), d(2), d(3), d(4), d(5), d(6), d(7).toFloat, 42 | checkNullForFloat(d(8)), 43 | checkNullForInt(d(9)), 44 | checkNullForInt(d(10))) 45 | 46 | }).toDF() 47 | 48 | 49 | //or another way of filtering null columns 50 | val employeeDF1 = employeeData.map( 51 | rec => { 52 | var d = rec.split(",") 53 | employee(d(0).toInt, 54 | d(1), d(2), d(3), d(4), d(5), d(6), d(7).toFloat, 55 | if (d(8).asInstanceOf[Any] != "null") d(8).toFloat else 0F, 56 | if (d(9).asInstanceOf[Any] != "null") d(9).toInt else 0, 57 | if (d(10).asInstanceOf[Any] != "null") d(10).toInt else 0) 58 | }).toDF() 59 | 60 | employeeDF.show() 61 | } 62 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/dataframes/PartitionBy.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.dataframes 2 | 3 | import org.apache.spark.sql.SparkSession 4 | object PartitionBy { 5 | 6 | case class Purchase(customer_id: Int, purchase_id: Int, date: String, time: String, tz: String, amount: Double) 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val session = SparkSession.builder().appName("Spark-PartitionBy").master("local[1]").getOrCreate() 11 | 12 | import session.implicits._ 13 | val purchaseDF = List( 14 | Purchase(121, 234, "2017-04-19", "20:50", 
"UTC", 500.99), 15 | Purchase(122, 247, "2017-04-19", "15:30", "PST", 300.22), 16 | Purchase(123, 254, "2017-04-19", "00:50", "EST", 122.19), 17 | Purchase(124, 234, "2017-04-19", "20:50", "UTC", 500.99), 18 | Purchase(125, 247, "2017-04-19", "15:30", "PST", 300.22), 19 | Purchase(126, 254, "2017-04-19", "00:50", "EST", 122.19), 20 | Purchase(125, 250, "2017-04-19", "15:30", "PST", 300.22), 21 | Purchase(126, 251, "2017-04-19", "00:50", "EST", 122.19), 22 | Purchase(127, 299, "2017-04-19", "07:30", "UTC", 524.37)).toDF() 23 | 24 | purchaseDF.coalesce(1).write.parquet("input/parqOut1") 25 | 26 | val df = session.read.parquet("input/parqOut1") 27 | 28 | val duplicated = df.withColumn("_cust_id", $"customer_id") 29 | 30 | duplicated.coalesce(1).write.partitionBy("_cust_id").csv("input/csv/") 31 | } 32 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/dataframes/PartitionByColumn.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.dataframes 2 | 3 | import org.apache.spark.sql.SparkSession 4 | /** 5 | * Partition the data by a specific column and store it partition wise. 6 | */ 7 | object PartitionByColumn { 8 | 9 | case class Emp(empId: Int, emp_name: String, deptId: Int, deptName: String, location: String) 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | val session = SparkSession.builder().appName("PartitionByColumn").master("local[1]").getOrCreate() 14 | 15 | val emps = List( 16 | Emp(1, "Mike", 1, "Cloud", "BNGL"), 17 | Emp(2, "Shyam", 1, "Cloud", "HYD"), 18 | Emp(3, "Revanth", 2, "Bigdata", "BNGL"), 19 | Emp(4, "Raghu", 2, "Bigdata", "HYD"), 20 | Emp(6, "Apporva", 3, "Apac", "BNGL"), 21 | Emp(5, "Naga", 3, "Apac", "HYD")) 22 | 23 | val empDF = session.createDataFrame(emps) 24 | 25 | //Partitioning the data by deptName 26 | empDF.write.partitionBy("deptName").csv("output/test") 27 | 28 | //Partitioning the data by deptName,location 29 | empDF.write.partitionBy("deptName", "location").csv("output/test1") 30 | 31 | println("Done ....") 32 | } 33 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/dataframes/PartitionBy_WithUDF.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.dataframes 2 | import org.apache.spark.sql.functions._ 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.SaveMode 5 | 6 | object PartitionBy_withUDF { 7 | 8 | case class Purchase(customer_id: Int, purchase_id: Int, date: String, time: String, tz: String, amount: Double) 9 | 10 | val setDateMonth: (String) => String = (timeStamp: String) => { 11 | var date_hour_list = timeStamp.split(" ") 12 | var date = date_hour_list(0) 13 | var month = date.split("-") 14 | month(0) + "-" + month(1) 15 | } 16 | val setDateHour: (String) => String = (timeStamp: String) => { 17 | var date_hour_list = timeStamp.split(" ") 18 | var date = date_hour_list(0) 19 | var month = date.split("-") 20 | month(0) + "-" + month(1) 21 | var hour_min_sec = date_hour_list(1).split(":") 22 | date + "-" + hour_min_sec(0) 23 | } 24 | val getDateMonth = udf(setDateMonth) 25 | val getDateHour = udf(setDateHour) 26 | 27 | def main(args: Array[String]): Unit = { 28 | 29 | val session = SparkSession.builder().appName("Spark-PartitionBy").master("local[1]").getOrCreate() 30 | 31 | import session.implicits._ 32 | val purchaseDF = List( 33 | Purchase(121, 234, 
"2017-09-26 05:00:00.0", "20:50", "UTC", 500.99), 34 | Purchase(122, 247, "2017-07-26 05:00:00.0", "15:30", "PST", 300.22), 35 | Purchase(123, 254, "2017-09-26 05:00:00.0", "00:50", "EST", 122.19), 36 | Purchase(124, 234, "2017-09-26 04:00:00.0", "20:50", "UTC", 500.99), 37 | Purchase(125, 247, "2017-08-26 05:00:00.0", "15:30", "PST", 300.22), 38 | Purchase(126, 254, "2017-08-26 05:00:00.0", "00:50", "EST", 122.19), 39 | Purchase(125, 250, "2017-08-26 05:00:00.0", "15:30", "PST", 300.22), 40 | Purchase(121, 251, "2017-07-26 07:00:00.0", "00:50", "EST", 122.19), 41 | Purchase(127, 299, "2017-07-26 05:00:00.0", "07:30", "UTC", 524.37)).toDF() 42 | 43 | purchaseDF.coalesce(1).write.parquet("input/parqOut1") 44 | 45 | val df = session.read.parquet("input/parqOut1") 46 | 47 | df.printSchema() 48 | 49 | val finalDF = df.withColumn("date_month", getDateMonth(df.col("date"))).withColumn("date_hour", getDateHour(df.col("date"))) 50 | 51 | finalDF.coalesce(1).write.mode(SaveMode.Overwrite).partitionBy("date_month", "date_hour", "customer_id").csv("input/csv/") 52 | } 53 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/dataframes/ProblemStatement.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.dataframes 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | object ProblemStatement { 6 | 7 | def main(args: Array[String]) { 8 | val spark = 9 | SparkSession.builder() 10 | .appName("DataFrame-ProblemStatement") 11 | .master("local[2]") 12 | .getOrCreate() 13 | import spark.implicits._ 14 | 15 | //What are the best-salary and the second best-salary of employees in every department? 16 | 17 | val dataRDD = spark.read.format("csv").option("header", "true").load("/Users/revanthreddy/Desktop/Docs/empData.csv").rdd 18 | val filteredDF = dataRDD.map(x => (x(2).toString(), x(3).toString().replace("$", "").toDouble)).toDF("dept", "salary").dropDuplicates().toDF() 19 | 20 | val maxSalDF = filteredDF.groupBy("dept").agg(max(filteredDF.col("salary")).as("MaxSal")).sort("dept") 21 | maxSalDF.show 22 | 23 | val subDF = filteredDF.except(maxSalDF) 24 | 25 | val ScndMaxSalDF = subDF.groupBy("dept").agg(max(subDF.col("salary")).as("SecMaxSal")).sort("dept") 26 | ScndMaxSalDF.show 27 | 28 | val pboblem1ResDF = maxSalDF.join(ScndMaxSalDF, Seq("dept")).sort("dept").toDF() 29 | pboblem1ResDF.show 30 | pboblem1ResDF.coalesce(1).write.option("header", "true").csv("/Users/revanthreddy/Desktop/Docs/file1.csv") 31 | 32 | //What is the difference between the salary of each employee and the highest salary of employee in the same department? 
33 | 34 | val pboblem2DF = dataRDD.map(x => (x(0).toString(), x(2).toString(), x(3).toString().replace("$", "").toDouble)).toDF("name", "dept", "salary").dropDuplicates().toDF() 35 | 36 | val resDF = pboblem2DF.join(maxSalDF, Seq("dept")).sort("dept").toDF() 37 | 38 | val pboblem2ResDF = resDF.withColumn("diffSal", (resDF.col("MaxSal") - resDF.col("salary"))) 39 | pboblem2ResDF.coalesce(1).write.option("header", "true").csv("/Users/revanthreddy/Desktop/Docs/file2.csv") 40 | 41 | } 42 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/dataset/DatasetBasic.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.dataset 2 | import org.apache.spark.sql.SparkSession 3 | 4 | // 5 | // Create Datasets of primitive type and tuple type ands show simple operations. 6 | // 7 | object DatasetBasic { 8 | // define a case class for the elements of the Dataset 9 | // NOTE: this needs to be outside the scope of the method where the 10 | // Dataset is created 11 | case class Number(i: Int, english: String, french: String) 12 | 13 | def main(args: Array[String]) { 14 | val spark = 15 | SparkSession.builder() 16 | .appName("Dataset-CaseClass") 17 | .master("local[4]") 18 | .getOrCreate() 19 | 20 | import spark.implicits._ 21 | 22 | val numbers = Seq( 23 | Number(1, "one", "un"), 24 | Number(2, "two", "deux"), 25 | Number(3, "three", "trois")) 26 | val numberDS = numbers.toDS() 27 | 28 | println("*** case class Dataset types") 29 | numberDS.dtypes.foreach(println(_)) 30 | 31 | // Since we used a case class we can query using the field names 32 | // as column names 33 | println("*** filter by one column and fetch another") 34 | numberDS.where($"i" > 2).select($"english", $"french").show() 35 | 36 | println("*** could have used SparkSession.createDataset() instead") 37 | val anotherDS = spark.createDataset(numbers) 38 | 39 | println("*** case class Dataset types") 40 | anotherDS.dtypes.foreach(println(_)) 41 | } 42 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/dataset/SemiStructuredData.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.dataset 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.SQLContext 5 | 6 | object SemiStructuredData { 7 | 8 | case class University(name: String, numStudents: Long, yearFounded: Long) 9 | def main(args: Array[String]): Unit = { 10 | 11 | val session = SparkSession.builder().appName("SemiStructuredData").master("local[1]").getOrCreate() 12 | 13 | import session.implicits._ 14 | val sc = session.sparkContext 15 | val sqlContext = new SQLContext(sc) 16 | 17 | val schools = sqlContext.read.json("input/schools.json").as[University] 18 | schools.printSchema() 19 | val res = schools.map(s => s"${s.name} is ${2017 - s.yearFounded} years old") 20 | res.foreach { x => println(x) } 21 | } 22 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/dataset/WordCountDS.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.dataset 2 | import org.apache.spark.sql.SQLContext 3 | import org.apache.spark.{ SparkConf, SparkContext } 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | 7 | object WordCountDS { 8 | def main(args: Array[String]): Unit 
= { 9 | 10 | val session = SparkSession.builder().appName("WordCountDS Application").master("local[1]").getOrCreate() 11 | 12 | import session.implicits._ 13 | val sc = session.sparkContext 14 | val sqlContext = new SQLContext(sc) 15 | 16 | //With the Spark Datasets API 17 | //Since the Dataset version of word count can take advantage of the built-in aggregate count, 18 | // this computation can not only be expressed with less code, but it will also execute significantly faster. 19 | 20 | val ds = sqlContext.read.text("input/README.md").as[String] 21 | val result = ds 22 | .flatMap(_.split(" ")) // Split on whitespace 23 | .filter(_ != "") // Filter empty words 24 | .toDF() // Convert to DataFrame to perform aggregation / sorting 25 | .groupBy($"value") // Count the number of occurrences of each word 26 | .agg(count("*") as "numOccurrences") 27 | .orderBy($"numOccurrences" desc) // Show most common words first 28 | 29 | result.foreach { x => println(x) } 30 | 31 | } 32 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/elasticsearch/CsvToESLoad.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.elasticsearch 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.elasticsearch.spark.sql._ 5 | 6 | object CsvToESLoad { 7 | 8 | def main(args: Array[String]) { 9 | 10 | val spark = SparkSession.builder().appName("CsvToESLoad").master("local[*]").getOrCreate() 11 | 12 | val esConfig = Map(("es.nodes", "localhost"), ("es.port", "9200"), 13 | ("es.index.auto.create", "true"), ("es.http.timeout", "5m"), 14 | ("es.nodes.wan.only" -> "true")) 15 | 16 | val index = "realestatedata/data" 17 | 18 | import spark.implicits._ 19 | 20 | val esdf = spark.read.format("com.databricks.spark.csv") 21 | .option("header", "true") 22 | .option("inferSchema", "true") 23 | .load("input/Real_Estate_Data.csv") 24 | 25 | esdf.show(2, false) 26 | 27 | //writing to the Elasticsearch index 28 | esdf.saveToEs(index, cfg = esConfig) 29 | 30 | //reading from the Elasticsearch index 31 | val df = spark.read.format("org.elasticsearch.spark.sql").load(index) 32 | df.show(10, false) 33 | } 34 | //https://www.elastic.co/guide/en/elasticsearch/reference/current/getting-started.html 35 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/elasticsearch/Read_From_ES.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.elasticsearch 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql.SparkSession 5 | object Read_From_ES { 6 | 7 | def main(args: Array[String]) { 8 | 9 | val spark = SparkSession.builder().appName("Spark-Read-ElasticSearch").master("local[*]").getOrCreate() 10 | 11 | val esConfig = Map(("es.nodes", "localhost"), ("es.port", "9200"), 12 | ("es.index.auto.create", "true"), ("es.http.timeout", "5m"), 13 | ("es.nodes.wan.only" -> "true")) 14 | 15 | // load the Elasticsearch index into a Spark DataFrame 16 | val df = spark.read.format("org.elasticsearch.spark.sql").load("blabla/joke") 17 | 18 | df.show(10, false) 19 | 20 | } 21 | 22 | //https://www.elastic.co/guide/en/elasticsearch/reference/current/getting-started.html 23 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/elasticsearch/Write_To_ES.scala: -------------------------------------------------------------------------------- 1 |
package com.spark2.elasticsearch 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.elasticsearch.spark.sql._ 5 | object Write_To_ES { 6 | case class SimpsonCharacter(name: String, actor: String, episodeDebut: String) 7 | 8 | def main(args: Array[String]) { 9 | 10 | val spark = SparkSession.builder().appName("Spark-Write-ElasticSearch").master("local[*]").getOrCreate() 11 | 12 | val index = "shows/data" 13 | 14 | val esConfig = Map(("es.nodes", "localhost"), ("es.port", "9200"), 15 | ("es.index.auto.create", "true"), ("es.http.timeout", "5m"), 16 | ("es.nodes.wan.only" -> "true")) 17 | 18 | import spark.implicits._ 19 | 20 | val simpsonsDF = spark.sparkContext.parallelize( 21 | SimpsonCharacter("Homer", "Dan Castellaneta", "Good Night") :: 22 | SimpsonCharacter("Marge", "Julie Kavner", "Good Night") :: 23 | SimpsonCharacter("Bart", "Nancy Cartwright", "Good Night") :: 24 | SimpsonCharacter("Lisa", "Yeardley Smith", "Good Night") :: 25 | SimpsonCharacter("Maggie", "Liz Georges and more", "Good Night") :: 26 | SimpsonCharacter("Sideshow Bob", "Kelsey Grammer", "The Telltale Head") :: 27 | Nil).toDF().repartition(1) 28 | 29 | //writing to ElasticSearch index 30 | simpsonsDF.saveToEs(index, cfg = esConfig) 31 | 32 | //reading from ElasticSearch index 33 | val df = spark.read.format("org.elasticsearch.spark.sql").load(index) 34 | df.show(10, false) 35 | } 36 | //https://www.elastic.co/guide/en/elasticsearch/reference/current/getting-started.html 37 | //https://marekj.github.io/2016/03/22/elasticsearch-mac-osx 38 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/examples/ExplodeDemo.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.examples 2 | import org.apache.spark.sql.{ Row, SQLContext } 3 | import org.apache.spark.{ SparkConf, SparkContext } 4 | import org.apache.spark.sql.functions._ 5 | object ExplodeDemo { 6 | 7 | def main(args: Array[String]): Unit = { 8 | val conf = new SparkConf() 9 | .setAppName("ExplodeDemo") 10 | .setMaster("local") 11 | val sc = new SparkContext(conf) 12 | val sqlContext = new SQLContext(sc) 13 | import sqlContext.implicits._ 14 | val df = sc.parallelize(Seq((1, Seq(2, 3, 4), Seq(5, 6, 7)), (2, Seq(3, 4, 5), Seq(6, 7, 8)), (3, Seq(4, 5, 6), Seq(7, 8, 9)))).toDF("a", "b", "c") 15 | 16 | val df1 = df.select(df("a"), explode(df("b")).alias("b_columns"), df("c")) 17 | 18 | val df2 = df1.select(df1("a"), df1("b_columns"), explode(df1("c").alias("c_columns"))) 19 | 20 | df.show() 21 | df1.show() 22 | df2.show() 23 | } 24 | 25 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/examples/Filter.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.examples 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | object Filter { 5 | def main(args: Array[String]) { 6 | val conf = new SparkConf().setAppName("Spark Filter Example") 7 | 8 | 9 | val sc = new SparkContext(conf) 10 | println("------------ Printing Spark configs ------------") 11 | 12 | sc.getConf.getAll.foreach(f => println(f)) 13 | val x = sc.parallelize(List("Transformation demo", "Test demo", "Filter demo", "Spark is powerfull", "Spark is faster", "Spark is in memory")) 14 | val lines1 = x.filter(line => line.contains("Spark") || line.contains("Transformation")) 15 | println("\n") 16 | 
println("---------------------------------------------") 17 | lines1.collect().foreach { line => println(line) } 18 | 19 | val lines = x.filter(line => !line.contains("Filter")) 20 | println("---------------------------------------------") 21 | lines.collect().foreach { line => println(line) } 22 | println("---------------------------------------------") 23 | val count = x.filter(line => line.contains("Spark")).count() 24 | println("count is : " + count) 25 | } 26 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/examples/FilterEmpty.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.examples 2 | import org.apache.spark.sql.SparkSession 3 | 4 | //The problem statement is remove the rows having product_price as null or empty 5 | object FilterEmpty extends App { 6 | 7 | private def checkNullForFloat(value: String): Float = { 8 | if (!"".equals(value)) { 9 | return value.toFloat; 10 | } 11 | return 0; 12 | } 13 | case class Product(product_id: Int, product_category_id: Int, product_name: String, product_description: String, product_price: Float, product_image: String) 14 | val session = SparkSession.builder().appName("Spark-FilterEmpty") 15 | .master("local[1]").getOrCreate() 16 | 17 | import session.implicits._ 18 | val rawRDD = session.sparkContext.textFile("input/product") 19 | 20 | val dummyRDD = rawRDD.map(_.split("\\,")).map(p => (p(0).toInt, p(1)toInt, p(2), p(3), p(4), p(5))) 21 | val filteredRDD = dummyRDD.filter(x => (x._5 != null) && (x._5.length > 0)) 22 | 23 | filteredRDD.map(f => Product(f._1, f._2, f._3, f._4, f._5.toFloat, f._6)).toDF() 24 | .sort($"product_price".desc).show() 25 | 26 | //OR 27 | val prodRDD = rawRDD.map(_.split("\\,")).map(p => Product(p(0).toInt, p(1)toInt, p(2), p(3), checkNullForFloat(p(4)), p(5))) 28 | 29 | //removing the products that have product_price = 0.0 30 | val resDF = prodRDD.filter(x => x.product_price != 0.0).toDF() 31 | 32 | resDF.sort($"product_price".desc).show() 33 | 34 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/examples/LoadPropsFile.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.dataframes 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import java.util.Properties; 5 | import org.apache.hadoop.fs.FSDataInputStream; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.spark.SparkFiles; 9 | import org.apache.hadoop.conf.Configuration 10 | 11 | object LoadPropsFile { 12 | def main(args: Array[String]): Unit = { 13 | 14 | val spark = SparkSession.builder().appName("Loading-PropsFile-Spark").master("local[*]") 15 | .getOrCreate() 16 | 17 | val hdfsConf = new Configuration() 18 | val fs = FileSystem.get(hdfsConf) 19 | 20 | //file should be in HDFS directory 21 | val is = fs.open(new Path("/user/centos/input/conf.properties")) 22 | val prop = new Properties() 23 | 24 | //load properties 25 | prop.load(is) 26 | 27 | //retrieve properties 28 | val tidList = prop.getProperty("tidList") 29 | println("tidList--> " + tidList) 30 | 31 | val topicsList = prop.getProperty("topics") 32 | println("topicsList--> " + topicsList) 33 | } 34 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/examples/ParquetCompactor.scala: 
-------------------------------------------------------------------------------- 1 | package com.spark2.examples 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object ParquetCompactor { 6 | case class Purchase(customer_id: Int, purchase_id: Int, date: String, time: String, tz: String, amount: Double) 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val session = SparkSession.builder().appName("ParquetCompactor").master("local[1]").getOrCreate() 11 | 12 | import session.implicits._ 13 | val purchaseDF = List( 14 | Purchase(121, 234, "2017-04-19", "20:50", "UTC", 500.99), 15 | Purchase(122, 247, "2017-04-19", "15:30", "PST", 300.22), 16 | Purchase(185, 254, "2017-04-19", "00:50", "EST", 122.19), 17 | Purchase(186, 299, "2017-04-19", "07:30", "UTC", 524.37)).toDF() 18 | 19 | //purchaseDF.write.parquet("input/parqOut1") 20 | 21 | // val df = session.read.parquet("input/parqOut") 22 | // 23 | // df.show() 24 | // print("count before :" + df.count()) 25 | // val dropedDF = df.dropDuplicates("customer_id") 26 | // println("count after :" + dropedDF.count()) 27 | // dropedDF.show() 28 | 29 | val df = session.read.parquet("/Users/revanthreddy/Desktop/date_month=2017-04") 30 | df.printSchema() 31 | println("count before :" + df.count()) 32 | //df.write.parquet("input/parqOut1") 33 | 34 | val resdf = session.read.parquet("input/parqOut1") 35 | println("count after :" + resdf.count()) 36 | } 37 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/examples/Spark_Accumulator.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.examples 2 | 3 | import org.apache.spark.{ SparkContext, SparkConf } 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.SparkSession 6 | 7 | object Spark_Accumulator { 8 | def main(args: Array[String]): Unit = { 9 | 10 | val sparkSession = SparkSession.builder. 11 | master("local") 12 | .appName("Spark_Accumulator") 13 | .getOrCreate() 14 | 15 | val sc = sparkSession.sparkContext 16 | 17 | val badPkts = sc.longAccumulator("bad.packets") 18 | val zeroValueSales = sc.longAccumulator("Zero.Value.Sales") 19 | val missingFields = sc.longAccumulator("Missing.Fields") 20 | val blankLines = sc.longAccumulator("Blank.Lines") 21 | 22 | sc.textFile("input/purchases.log", 4) 23 | .foreach { line => 24 | 25 | if (line.length() == 0) blankLines.add(1) 26 | else if (line.contains("Bad data packet")) badPkts.add(1) 27 | else { 28 | val fields = line.split("\t") 29 | if (fields.length != 4) missingFields.add(1) 30 | else if (fields(3).toFloat == 0) zeroValueSales.add(1) 31 | } 32 | } 33 | 34 | println("Purchase Log Analysis Counters:") 35 | println(s"\tBad Data Packets=${badPkts.value}") 36 | println(s"\tZero Value Sales=${zeroValueSales.value}") 37 | println(s"\tMissing Fields=${missingFields.value}") 38 | println(s"\tBlank Lines=${blankLines.value}") 39 | 40 | sc.stop 41 | } 42 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/examples/Spark_CatalogAPI.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.examples 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object Spark_CatalogAPI { 6 | 7 | def main(args: Array[String]) { 8 | 9 | val sparkSession = SparkSession.builder. 
10 | master("local[2]") 11 | .appName("Spark-Catalog-Example") 12 | .enableHiveSupport() 13 | .config("spark.sql.warehouse.dir", "/Users/revanthreddy/Project/Spark-2.1/spark-warehouse") 14 | .getOrCreate() 15 | 16 | //interacting with catalogue 17 | 18 | val catalog = sparkSession.catalog 19 | 20 | //print the databases 21 | 22 | catalog.listDatabases().foreach { x => println(x) } 23 | catalog.setCurrentDatabase("airline_db") 24 | catalog.listTables.show 25 | catalog.listColumns("airline").foreach { x => println(x) } 26 | 27 | import sparkSession.implicits._ 28 | import sparkSession.sql 29 | 30 | sql("SELECT * FROM airline limit 3").show() 31 | } 32 | 33 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/examples/Spark_To_Caasandra.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.examples 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.sql.SQLContext 6 | import org.apache.spark.sql.SparkSession 7 | object Spark_To_Caasandra extends App { 8 | 9 | val spark = SparkSession.builder().appName("Spark_To_Caasandra").master("local[1]").getOrCreate() 10 | 11 | spark.conf.set("spark.sql.shuffle.partitions", "2") 12 | spark.conf.set("spark.cassandra.connection.host", "127.0.0.1") 13 | 14 | val df = spark.read.format("org.apache.spark.sql.cassandra").option("table", "emp") 15 | .option("keyspace", "dev") 16 | .load() 17 | 18 | df.printSchema() 19 | 20 | df.show() 21 | 22 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/examples/Test.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.examples 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.sql.DataFrame 6 | import org.apache.spark.sql.Row 7 | import org.apache.spark.sql.catalyst.InternalRow 8 | import org.apache.spark.unsafe.types.UTF8String 9 | import org.apache.spark.sql.catalyst.expressions.UnsafeRow 10 | import org.apache.spark.sql.types.StructType 11 | import org.apache.spark.sql.types.StructField 12 | import org.apache.spark.sql.types.IntegerType 13 | import org.apache.spark.sql.types.StringType 14 | import org.apache.spark.sql.SQLContext 15 | import org.apache.spark.sql.SparkSession 16 | import org.apache.spark.sql.Column 17 | import org.apache.spark.sql.Dataset 18 | 19 | case class Person(name: String, empId: Int) 20 | case class Employee(empId: Int,emp_name:String) 21 | 22 | case class Select(cols: Column*) { 23 | def transform(ds: DataFrame) = ds.select(cols: _*) 24 | } 25 | 26 | object Test { 27 | 28 | def main(args: Array[String]): Unit = { 29 | 30 | val session = SparkSession.builder().appName("test").master("local[1]").getOrCreate() 31 | 32 | val person = Array(Person("John", 1), Person("Mike", 2)) 33 | 34 | val employee = Array(Employee(1,"Aruba")) 35 | 36 | val personDf = session.createDataFrame(person) 37 | 38 | val employeeDf = session.createDataFrame(employee) 39 | 40 | val joinDf = personDf.join(employeeDf, Seq("empId"), "left") 41 | 42 | joinDf.write.partitionBy("name").parquet("output/test") 43 | 44 | 45 | joinDf.show() 46 | 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/fileformats/AvroToJson.scala: 
-------------------------------------------------------------------------------- 1 | package com.spark2.fileformats 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import com.databricks.spark.avro._ 5 | object AvroToJson { 6 | case class Emp(empId: Int, emp_name: String, deptId: Int, deptName: String, location: String) 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val spark = SparkSession.builder().appName("Spark_ToAvro").master("local[1]").getOrCreate() 11 | spark.conf.set("spark.sql.avro.compression.codec", "snappy") 12 | import spark.implicits._ 13 | 14 | val empDF = List( 15 | Emp(1, "Mike", 1, "Cloud", "BNGL"), 16 | Emp(2, "Shyam", 1, "Cloud", "HYD"), 17 | Emp(3, "Revanth", 2, "Bigdata", "BNGL"), 18 | Emp(4, "Raghu", 2, "Bigdata", "HYD"), 19 | Emp(6, "Apporva", 3, "Apac", "BNGL"), 20 | Emp(5, "Naga", 3, "Apac", "HYD")).toDF() 21 | 22 | empDF.write.avro("output/to_avro") 23 | 24 | val avroDF = spark.read.avro("output/to_avro") 25 | avroDF.show 26 | 27 | avroDF.coalesce(1).write.option("compression", "gzip").json("output/avro_to_json") 28 | 29 | } 30 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/fileformats/NestedJsonParser.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.fileformats 2 | import org.apache.spark.sql.SparkSession 3 | import org.apache.spark.sql.types.ArrayType 4 | import org.apache.spark.sql.types.StringType 5 | import org.apache.spark.sql.types.StructField 6 | import org.apache.spark.sql.types.StructType 7 | import org.apache.spark.sql.functions._ 8 | object NestedJsonParser extends App { 9 | 10 | val session = SparkSession.builder().appName("Spark-JsonParser") 11 | .master("local[1]").getOrCreate() 12 | 13 | val schema = StructType(List( 14 | StructField("queryResults", StructType( 15 | List(StructField("searchResponse", StructType( 16 | List(StructField("response", StructType( 17 | List(StructField("docs", ArrayType(StructType( 18 | List( 19 | StructField("appCustNumber", StringType, nullable = true), 20 | StructField("transactions", ArrayType(StructType( 21 | List( 22 | StructField("code", StringType, nullable = true), 23 | StructField("description", StringType, nullable = true), 24 | StructField("recordDate", StringType, nullable = true)))))))))))))))))))) 25 | 26 | val dff = session.read.schema(schema).json("input/nested.json") 27 | println(dff.printSchema()) 28 | 29 | val dfContent = dff.select(explode(dff("queryResults.searchResponse.response.docs.transactions"))).toDF("transaction") 30 | val code = dfContent.select("transaction.code") 31 | code.show(false) 32 | 33 | val dfFinal = dfContent.select("transaction.code", "transaction.description", "transaction.recordDate") 34 | dfFinal.show(false) 35 | 36 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/fileformats/Simple_XMLParser.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.fileformats 2 | import org.apache.spark.sql.SparkSession 3 | 4 | object Simple_XMLParser { 5 | 6 | def main(args: Array[String]): Unit = { 7 | 8 | val xmlFilePath = args(0) 9 | val spark = SparkSession.builder().appName("Spark-XMLParsing").master("local[*]").getOrCreate() 10 | spark.conf.set("spark.debug.maxToStringFields", "10000000") 11 | 12 | val rawDataDF = spark.read.format("com.databricks.spark.xml") 13 | .option("rowTag", "book") 14 | 
.option("treatEmptyValuesAsNulls", true) 15 | .load(xmlFilePath) 16 | 17 | println("Total books count : " + rawDataDF.count()) 18 | 19 | val selectedData = rawDataDF.select("author", "_id") 20 | 21 | selectedData.show(10, false) 22 | 23 | } 24 | 25 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/fileformats/Simple_XMLParser1.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.fileformats 2 | import org.apache.spark.sql.SparkSession 3 | import org.apache.spark.sql.types.{ StructType, StructField, StringType, DoubleType }; 4 | 5 | object Simple_XMLParser1 { 6 | 7 | val customSchema = StructType(Array( 8 | StructField("_id", StringType, nullable = true), 9 | StructField("author", StringType, nullable = true), 10 | StructField("description", StringType, nullable = true), 11 | StructField("genre", StringType, nullable = true), 12 | StructField("price", DoubleType, nullable = true), 13 | StructField("publish_date", StringType, nullable = true), 14 | StructField("title", StringType, nullable = true))) 15 | 16 | def main(args: Array[String]): Unit = { 17 | 18 | val xmlFilePath = "input/books.xml" 19 | val spark = SparkSession.builder().appName("Spark-XMLParsing").master("local[*]").getOrCreate() 20 | spark.conf.set("spark.debug.maxToStringFields", "10000000") 21 | 22 | val rawDataDF = spark.read.format("com.databricks.spark.xml") 23 | .option("rowTag", "book") 24 | .option("treatEmptyValuesAsNulls", true) 25 | .schema(customSchema) 26 | .load(xmlFilePath) 27 | 28 | val selectedData = rawDataDF.select("author", "_id") 29 | 30 | selectedData.write 31 | .format("com.databricks.spark.xml") 32 | .option("rootTag", "books") 33 | .option("rowTag", "book") 34 | .save("output/newbooks.xml") 35 | 36 | } 37 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/fileformats/Spark_To_ObjectFile.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.fileformats 2 | import org.apache.spark.sql.SparkSession 3 | 4 | object Spark_To_ObjectFile { 5 | 6 | case class Emp(empId: Int, emp_name: String, deptId: Int, deptName: String, location: String) 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val session = SparkSession.builder().appName("Spark_To_ObjectFile").master("local[1]").getOrCreate() 11 | val sc = session.sparkContext 12 | 13 | val nums = sc.makeRDD(1 to 3).map(x => (x, "a" * x)) 14 | nums.saveAsObjectFile("output/test") 15 | 16 | // Try reading the output back as an object file 17 | val output = sc.objectFile[(Int, String)]("output/test") 18 | output.foreach(f => println(f)) 19 | 20 | val emps = List( 21 | Emp(1, "Mike", 1, "Cloud", "BNGL"), 22 | Emp(2, "Shyam", 1, "Cloud", "HYD"), 23 | Emp(3, "Revanth", 2, "Bigdata", "BNGL"), 24 | Emp(4, "Raghu", 2, "Bigdata", "HYD"), 25 | Emp(6, "Apporva", 3, "Apac", "BNGL"), 26 | Emp(5, "Naga", 3, "Apac", "HYD")) 27 | 28 | //Saving rdd as ObjectFile and reading back 29 | val empRDD = sc.parallelize(emps) 30 | empRDD.saveAsObjectFile("output/rdd_to_obj") 31 | 32 | val resRDD = sc.objectFile[Any]("output/rdd_to_obj") 33 | resRDD.foreach(f => println(f)) 34 | 35 | //Saving DataFrame as ObjectFile and reading back 36 | import session.implicits._ 37 | val empDF = emps.toDF() 38 | empDF.rdd.saveAsObjectFile("output/df_to_obj") 39 | 40 | val resDF = sc.objectFile[Any]("output/df_to_obj") 41 | resDF.foreach(f => 
println(f)) 42 | 43 | } 44 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/fileformats/Spark_To_SequenceFiles.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.fileformats 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.hadoop.io.compress.GzipCodec 5 | 6 | object Spark_To_SequenceFiles { 7 | case class Purchase(customer_id: Int, purchase_id: Int, date: String, time: String, tz: String, amount: Double) 8 | 9 | def main(args: Array[String]): Unit = { 10 | 11 | val session = SparkSession.builder().appName("Spark_To_SequenceFiles").master("local[1]").getOrCreate() 12 | 13 | import session.implicits._ 14 | val purchaseDF = List( 15 | Purchase(121, 234, "2017-04-19", "20:50", "UTC", 500.99), 16 | Purchase(122, 247, "2017-05-20", "15:30", "PST", 300.22), 17 | Purchase(123, 254, "2016-03-09", "00:50", "EST", 122.19), 18 | Purchase(124, 234, "2016-02-14", "20:50", "UTC", 500.99), 19 | Purchase(125, 247, "2015-01-13", "15:30", "PST", 300.22), 20 | Purchase(126, 254, "2015-05-16", "00:50", "EST", 122.19), 21 | Purchase(127, 250, "2016-09-17", "15:30", "PST", 300.22), 22 | Purchase(128, 251, "2018-08-15", "00:50", "EST", 122.19), 23 | Purchase(129, 299, "2019-02-19", "07:30", "UTC", 524.37)).toDF() 24 | 25 | import org.apache.spark.rdd.RDD 26 | import org.apache.spark.sql.Row 27 | 28 | val purchaseRDD: RDD[(Int, String)] = purchaseDF.rdd.map { 29 | case r: Row => (r.getAs[Int](0), r.getAs[String](2)) 30 | } 31 | purchaseRDD.saveAsSequenceFile("output/rdd_to_seq") 32 | 33 | //Loading sequenceFiles into an RDD in Spark 34 | 35 | val data: RDD[(Int, String)] = session.sparkContext.sequenceFile("output/rdd_to_seq") 36 | 37 | data.foreach(f => println(f)) 38 | 39 | } 40 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/fileformats/ToParquet.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.fileformats 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object ToParquet { 6 | case class Emp(empId: Int, emp_name: String, deptId: Int, deptName: String, location: String) 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val spark = SparkSession.builder().appName("Spark_ToAvro").master("local[1]").getOrCreate() 11 | spark.conf.set("spark.sql.parquet.compression.codec", "gzip") 12 | import spark.implicits._ 13 | 14 | val empDF = List( 15 | Emp(1, "Mike", 1, "Cloud", "BNGL"), 16 | Emp(2, "Shyam", 1, "Cloud", "HYD"), 17 | Emp(3, "Revanth", 2, "Bigdata", "BNGL"), 18 | Emp(4, "Raghu", 2, "Bigdata", "HYD"), 19 | Emp(6, "Apporva", 3, "Apac", "BNGL"), 20 | Emp(5, "Naga", 3, "Apac", "HYD")).toDF() 21 | 22 | empDF.coalesce(1).write.parquet("output/to_parquet") 23 | 24 | val parquetDF = spark.read.parquet("output/to_parquet") 25 | parquetDF.show 26 | 27 | } 28 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/fileformats/XMLParsing.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.fileformats 2 | import org.apache.spark.sql.SQLContext 3 | import com.databricks.spark.xml._ 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql 8 | object XMLParsing { 9 | //spark-submit --class 
com.spark2.fileformats.XMLParsing --master local[*] Spark-2.1-1.0.jar file:////home/centos/revanth/one.xml 10 | def main(args: Array[String]): Unit = { 11 | 12 | val xmlFilePath = args(0) 13 | val spark = SparkSession.builder().appName("XMLParsing").getOrCreate() 14 | spark.conf.set("spark.debug.maxToStringFields", "10000000") 15 | 16 | import spark.implicits._ 17 | 18 | val df = spark.read.format("com.databricks.spark.xml") 19 | .option("rowTag", "us-bibliographic-data-grant") 20 | .option("treatEmptyValuesAsNulls", true) 21 | .load(xmlFilePath) 22 | 23 | val q1 = df.withColumn("country", $"publication-reference.document-id.country".cast(sql.types.StringType)) 24 | .withColumn("document_number", $"publication-reference.document-id.doc-number".cast(sql.types.StringType)).select("country", "document_number") 25 | for (l <- q1) { 26 | val m1 = l.get(0) 27 | val m2 = l.get(1) 28 | println(m1, m2) 29 | } 30 | } 31 | 32 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/hive/Save_To_Hive.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.hive 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.SaveMode 5 | 6 | object Save_To_Hive { 7 | 8 | case class Purchase(customer_id: Int, purchase_id: Int, day: String, time: String, tz: String, amount: Double) 9 | 10 | def main(args: Array[String]) { 11 | 12 | // warehouseLocation points to the default location for managed databases and tables 13 | val warehouseLocation = "file:${system:user.dir}/spark-warehouse" 14 | 15 | val spark = SparkSession.builder. 16 | master("local[2]") 17 | .appName("Save_Spark_To_Hive-Example") 18 | .enableHiveSupport() 19 | .config("spark.sql.warehouse.dir", warehouseLocation) 20 | .config("hive.metastore.warehouse.dir", "/user/hive/warehouse") 21 | .getOrCreate() 22 | 23 | import spark.implicits._ 24 | import spark.sql 25 | 26 | val purchaseDF = List( 27 | Purchase(121, 234, "2017-04-19", "20:50", "UTC", 500.99), 28 | Purchase(122, 247, "2017-04-19", "15:30", "PST", 300.22), 29 | Purchase(123, 254, "2017-04-19", "00:50", "EST", 122.19), 30 | Purchase(124, 234, "2017-04-19", "20:50", "UTC", 500.99), 31 | Purchase(125, 247, "2017-04-19", "15:30", "PST", 300.22), 32 | Purchase(126, 254, "2017-04-19", "00:50", "EST", 122.19), 33 | Purchase(125, 250, "2017-04-19", "15:30", "PST", 300.22), 34 | Purchase(126, 251, "2017-04-19", "00:50", "EST", 122.19), 35 | Purchase(127, 299, "2017-04-19", "07:30", "UTC", 524.37)).toDF() 36 | 37 | //Storing into Hive internal/managed tables 38 | purchaseDF.coalesce(1).write.mode(SaveMode.Append).insertInto("sales") 39 | 40 | //loading the data from the table 41 | val salesDf = spark.read.table("sales") 42 | salesDf.show 43 | //or 44 | sql("SELECT * FROM sales").show() 45 | 46 | //Storing into Hive external tables 47 | purchaseDF.coalesce(1).write.mode(SaveMode.Append).insertInto("sales_ext") 48 | 49 | sql("SELECT * FROM sales_ext").show() 50 | 51 | } 52 | 53 | /* CREATE TABLE IF NOT EXISTS sales ( customer_id int, purchase_id int,day String, time String, tz String, amount double) 54 | COMMENT 'Sales Data' 55 | ROW FORMAT DELIMITED 56 | FIELDS TERMINATED BY ',' 57 | LINES TERMINATED BY '\n' 58 | STORED AS TEXTFILE LOCATION '/user/centos/hive/sale'; 59 | */ 60 | 61 | /* 62 | CREATE EXTERNAL TABLE IF NOT EXISTS sales_ext ( customer_id int, purchase_id int,day String, time String, tz String, amount double) 63 | COMMENT 'Sales Data' 64 | ROW
FORMAT DELIMITED 65 | FIELDS TERMINATED BY ',' 66 | LINES TERMINATED BY '\n' 67 | STORED AS TEXTFILE LOCATION '/user/centos/hive/sale_ext'; 68 | */ 69 | 70 | } 71 | -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/hive/Spark_CatalogAPI.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.hive 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object Spark_CatalogAPI { 6 | 7 | def main(args: Array[String]) { 8 | 9 | // warehouseLocation points to the default location for managed databases and tables 10 | val warehouseLocation = "file:${system:user.dir}/spark-warehouse" 11 | 12 | val spark = SparkSession.builder. 13 | master("local[2]") 14 | .appName("Spark-Catalog-Example") 15 | .enableHiveSupport() 16 | .config("spark.sql.warehouse.dir", warehouseLocation) 17 | .getOrCreate() 18 | 19 | //interacting with catalogue 20 | 21 | val catalog = spark.catalog 22 | 23 | //print the databases 24 | 25 | catalog.listDatabases().foreach { x => println(x) } 26 | catalog.setCurrentDatabase("default") 27 | catalog.listTables.show 28 | catalog.listColumns("employee").foreach { x => println(x) } 29 | 30 | import spark.implicits._ 31 | import spark.sql 32 | 33 | sql("SELECT * FROM employee").show() 34 | } 35 | 36 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/mangodb/Spark_To_MangoDB.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.mangodb 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql._ 5 | import com.mongodb.casbah.{ WriteConcern => MongodbWriteConcern } 6 | import com.stratio.datasource.mongodb._ 7 | import com.stratio.datasource.mongodb.config._ 8 | import com.stratio.datasource.mongodb.config.MongodbConfig._ 9 | 10 | object Spark_To_MangoDB { 11 | 12 | case class Student(name: String, age: Int, gender: String) 13 | 14 | def main(args: Array[String]) { 15 | 16 | val spark = SparkSession.builder().appName("Spark_To_MangoDB").master("local[1]").getOrCreate() 17 | 18 | //To save a DataFrame in MongoDB 19 | val saveConfig = MongodbConfigBuilder(Map(Host -> List("localhost:27017"), Database -> "dev", Collection -> "students", SamplingRatio -> 1.0, WriteConcern -> "normal", SplitSize -> 8, SplitKey -> "_id")) 20 | 21 | val df: DataFrame = spark.createDataFrame(spark.sparkContext.parallelize( 22 | List(Student("ravali", 27, "female"), Student("abc", 34, "male")))) 23 | 24 | df.saveToMongodb(saveConfig.build) 25 | 26 | //fromMongoDB() function to read from MongoDB and transform it to a DataFrame 27 | val builder = MongodbConfigBuilder(Map(Host -> List("localhost:27017"), Database -> "dev", Collection -> "students", SamplingRatio -> 1.0, WriteConcern -> "normal")) 28 | val readConfig = builder.build 29 | val mongoRDD = spark.sqlContext.fromMongoDB(readConfig) 30 | mongoRDD.createTempView("students") 31 | 32 | val dataFrame = spark.sql("SELECT name, age,gender FROM students") 33 | dataFrame.show 34 | 35 | //Using DataFrameWriter 36 | import org.apache.spark.sql._ 37 | val options = Map("host" -> "localhost:27017", "database" -> "dev", "collection" -> "students") 38 | val dfw: DataFrame = spark.createDataFrame(spark.sparkContext.parallelize(List(Student("ravi", 30, "female")))) 39 | dfw.write.format("com.stratio.datasource.mongodb").mode(SaveMode.Append).options(options).save() 40 | val resDF = 
spark.read.format("com.stratio.datasource.mongodb").options(options).load 41 | resDF.show 42 | 43 | } 44 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/parquet/FileCompression.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.parquet 2 | 3 | import org.apache.spark.sql.{ SQLContext, SparkSession } 4 | import org.apache.spark.{ SparkConf, SparkContext } 5 | object FileCompression { 6 | 7 | case class DataFrameSample(name: String, actor: String, episodeDebut: String) 8 | 9 | def main(args: Array[String]): Unit = { 10 | 11 | val spark = SparkSession 12 | .builder() 13 | .appName("Spark File Compression Handling") 14 | .master("local[1]") 15 | .getOrCreate() 16 | 17 | val df = spark.createDataFrame( 18 | DataFrameSample("Homer", "Dan Castellaneta", "Good Night") :: 19 | DataFrameSample("Marge", "Julie Kavner", "Good Night") :: 20 | DataFrameSample("Bart", "Nancy Cartwright", "Good Night") :: 21 | DataFrameSample("Lisa", "Yeardley Smith", "Good Night") :: 22 | DataFrameSample("Maggie", "Liz Georges and more", "Good Night") :: 23 | DataFrameSample("Sideshow Bob", "Kelsey Grammer", "The Telltale Head") :: 24 | Nil).toDF().cache() 25 | 26 | df.write.mode("overwrite").format("parquet").option("compression", "none").mode("overwrite").save("/tmp/file_no_compression_parq") 27 | df.write.mode("overwrite").format("parquet").option("compression", "gzip").mode("overwrite").save("/tmp/file_with_gzip_parq") 28 | df.write.mode("overwrite").format("parquet").option("compression", "snappy").mode("overwrite").save("/tmp/file_with_snappy_parq") 29 | //lzo - requires a different method in terms of implementation. 30 | 31 | df.write.mode("overwrite").format("orc").option("compression", "none").mode("overwrite").save("/tmp/file_no_compression_orc") 32 | df.write.mode("overwrite").format("orc").option("compression", "snappy").mode("overwrite").save("/tmp/file_with_snappy_orc") 33 | df.write.mode("overwrite").format("orc").option("compression", "zlib").mode("overwrite").save("/tmp/file_with_zlib_orc") 34 | } 35 | 36 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/parquet/ParquetCompactor.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.parquet 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object ParquetCompactor { 6 | case class Purchase(customer_id: Int, purchase_id: Int, date: String, time: String, tz: String, amount: Double) 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val session = SparkSession.builder().appName("ParquetCompactor").master("local[1]").getOrCreate() 11 | 12 | import session.implicits._ 13 | val purchaseDF = List( 14 | Purchase(121, 234, "2017-04-19", "20:50", "UTC", 500.99), 15 | Purchase(122, 247, "2017-04-19", "15:30", "PST", 300.22), 16 | Purchase(185, 254, "2017-04-19", "00:50", "EST", 122.19), 17 | Purchase(186, 299, "2017-04-19", "07:30", "UTC", 524.37)).toDF() 18 | 19 | purchaseDF.write.parquet("input/parqOut") 20 | 21 | val df = session.read.parquet("input/parqOut") 22 | 23 | df.show() 24 | print("count before dropping :" + df.count()) 25 | 26 | //dropping the duplicate rows based on customer_id 27 | val dropedDF = df.dropDuplicates("customer_id") 28 | 29 | println("count after dropping :" + dropedDF.count()) 30 | dropedDF.show() 31 | 32 | } 33 | } 
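// The ParquetCompactor above removes duplicate rows; the other common "compaction" task for
// Parquet data is merging many small part-files into a few larger ones. The object below is a
// minimal sketch of that pattern (the paths, object name and target file count are illustrative,
// not taken from the original example).
object ParquetFileMerger {
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder().appName("ParquetFileMerger").master("local[1]").getOrCreate()

    // Read every small part-file under the folder as a single DataFrame.
    val df = session.read.parquet("input/parqOut")

    // coalesce(4) cuts the number of output files without a full shuffle;
    // repartition(4) would shuffle, but produce evenly sized files.
    df.coalesce(4)
      .write.mode("overwrite")
      .parquet("input/parqOut_compacted") // write to a fresh folder, then swap it in for the old one
  }
}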
-------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/parquet/TestDataFrame.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.parquet 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.sql.DataFrame 6 | import org.apache.spark.sql.Row 7 | import org.apache.spark.sql.catalyst.InternalRow 8 | import org.apache.spark.unsafe.types.UTF8String 9 | import org.apache.spark.sql.catalyst.expressions.UnsafeRow 10 | import org.apache.spark.sql.types.StructType 11 | import org.apache.spark.sql.types.StructField 12 | import org.apache.spark.sql.types.IntegerType 13 | import org.apache.spark.sql.types.StringType 14 | import org.apache.spark.sql.SQLContext 15 | import org.apache.spark.sql.SparkSession 16 | import org.apache.spark.sql.Column 17 | import org.apache.spark.sql.Dataset 18 | 19 | case class Person(name: String, empId: Int) 20 | case class Employee(empId: Int, emp_name: String) 21 | 22 | object TestDataFrame { 23 | 24 | def main(args: Array[String]): Unit = { 25 | 26 | val session = SparkSession.builder().appName("test").master("local[1]").getOrCreate() 27 | 28 | val person = Array(Person("John", 1), Person("Mike", 2)) 29 | 30 | val employee = Array(Employee(1, "Aruba")) 31 | 32 | val personDf = session.createDataFrame(person) 33 | 34 | val employeeDf = session.createDataFrame(employee) 35 | 36 | val joinDf = personDf.join(employeeDf, Seq("empId"), "left") 37 | 38 | joinDf.write.partitionBy("name").parquet("output/test") 39 | 40 | joinDf.show() 41 | 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/problemstatement/ProblemStatement.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.problemstatements 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | object ProblemStatement { 6 | 7 | def main(args: Array[String]) { 8 | val spark = 9 | SparkSession.builder() 10 | .appName("DataFrame-ProblemStatement") 11 | .master("local[2]") 12 | .getOrCreate() 13 | import spark.implicits._ 14 | 15 | //What are the best-salary and the second best-salary of employees in every department? 16 | 17 | val dataRDD = spark.read.format("csv").option("header", "true").load("input/pbs.csv").rdd 18 | val filteredDF = dataRDD.map(x => (x(2).toString(), x(3).toString().replace("$", "").toDouble)).toDF("dept", "salary").dropDuplicates().toDF() 19 | 20 | val maxSalDF = filteredDF.groupBy("dept").agg(max(filteredDF.col("salary")).as("MaxSal")).sort("dept") 21 | maxSalDF.show 22 | 23 | val subDF = filteredDF.except(maxSalDF) 24 | 25 | val ScndMaxSalDF = subDF.groupBy("dept").agg(max(subDF.col("salary")).as("SecMaxSal")).sort("dept") 26 | ScndMaxSalDF.show 27 | 28 | val pboblem1ResDF = maxSalDF.join(ScndMaxSalDF, Seq("dept")).sort("dept").toDF() 29 | pboblem1ResDF.show 30 | pboblem1ResDF.coalesce(1).write.option("header", "true").csv("/Users/revanthreddy/Desktop/Docs/file1.csv") 31 | 32 | //What is the difference between the salary of each employee and the highest salary of employee in the same department? 
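// A window-function sketch of the same question (shown for illustration; the join-based
// solution below is the original one): max("salary") over a per-department window attaches
// the highest departmental salary to every row, so no separate aggregate-and-join step is needed.
// The names deptWindow, empSalaryDF and windowedDiffDF are introduced only for this sketch.
val deptWindow = org.apache.spark.sql.expressions.Window.partitionBy("dept")
val empSalaryDF = dataRDD
  .map(x => (x(0).toString(), x(2).toString(), x(3).toString().replace("$", "").toDouble))
  .toDF("name", "dept", "salary")
  .dropDuplicates()
val windowedDiffDF = empSalaryDF
  .withColumn("MaxSal", max($"salary").over(deptWindow)) // highest salary within the row's department
  .withColumn("diffSal", $"MaxSal" - $"salary") // gap between this employee and the department's best salary
windowedDiffDF.show()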
33 | 34 | val pboblem2DF = dataRDD.map(x => (x(0).toString(), x(2).toString(), x(3).toString().replace("$", "").toDouble)).toDF("name", "dept", "salary").dropDuplicates().toDF() 35 | 36 | val resDF = pboblem2DF.join(maxSalDF, Seq("dept")).sort("dept").toDF() 37 | 38 | val pboblem2ResDF = resDF.withColumn("diffSal", (resDF.col("MaxSal") - resDF.col("salary"))) 39 | pboblem2ResDF.coalesce(1).write.option("header", "true").csv("/Users/revanthreddy/Desktop/Docs/file2.csv") 40 | 41 | } 42 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/streaming/Spark_Kafka_Streaming.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.streaming 2 | 3 | import org.apache.spark.streaming.kafka.KafkaUtils 4 | import kafka.serializer.StringDecoder 5 | import org.apache.spark.streaming.StreamingContext 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.SparkContext 8 | import org.apache.spark.streaming.Seconds 9 | 10 | object Spark_Kafka_Streaming { 11 | 12 | def main(args: Array[String]) { 13 | val SLIDE_INTERVAL = 20 //in seconds 14 | val topics = "demo" 15 | val zkQuorum = "localhost:9092" 16 | val sc = new SparkContext(new SparkConf().setAppName("Spark-Kafka-Streaming").setMaster("local[2]")) 17 | val ssc = new StreamingContext(sc, Seconds(SLIDE_INTERVAL)) 18 | 19 | val topicsSet = topics.split(",").toSet 20 | println("Streaming topics : " + topicsSet) 21 | 22 | val kafkaParams = Map[String, String]("metadata.broker.list" -> zkQuorum) 23 | 24 | val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder]( 25 | ssc, kafkaParams, topicsSet).map(_._2) 26 | 27 | messages.foreachRDD( 28 | rdd => { 29 | if (!rdd.isEmpty()) { 30 | println("First record : " + rdd.first()) 31 | println("rdd count : " + rdd.count()) 32 | } else { 33 | println("Data is not yet recevied from the producer....") 34 | } 35 | }) 36 | 37 | ssc.start() 38 | ssc.awaitTermination() 39 | } 40 | } -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/window/functions/ApStats.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.window.functions 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | import org.apache.spark.sql.types._ 6 | 7 | object ApStats extends App { 8 | 9 | val spark = SparkSession.builder().appName("ApStats").master("local[1]").getOrCreate() 10 | 11 | import spark.implicits._ 12 | 13 | val input_switch_cpu = "input/iap_ap_info_records/date_month=2020-01/date_hour=2020-01-24-11/*" 14 | 15 | val df = spark.read.parquet(input_switch_cpu) 16 | 17 | df.printSchema() 18 | 19 | val stats = df.select($"tid", $"ap_name", $"ap_mac".as("apmac"), $"cpu_usage", $"memory_total", $"memory_free", $"ts") 20 | .withColumn( 21 | "memory_usage", 22 | lit(((col("memory_total") - col("memory_free")) / col("memory_total")) * 100)) 23 | .withColumn("temp_ts", col("ts").divide(1000).cast("timestamp")) 24 | .select("tid", "ap_name", "apmac", "cpu_usage", "memory_usage", "temp_ts") 25 | .withColumn("cpu_timeseries", struct( 26 | $"temp_ts".cast("long").as("ts"), 27 | $"cpu_usage".cast("float").as("avg"), 28 | $"cpu_usage".cast("float").as("max"))) 29 | .withColumn("memory_timeseries", struct( 30 | $"temp_ts".cast("long").as("ts"), 31 | $"memory_usage".cast("float").as("avg"), 32 | 
$"memory_usage".cast("float").as("max"))) 33 | .groupBy(col("tid"), col("apmac"), 34 | window(col("temp_ts"), "1 hour").alias("ts")). 35 | agg( 36 | avg("cpu_usage").as("cl_ap_system_stats_cpu_util"), 37 | avg("memory_usage").as("cl_ap_system_stats_mem_util"), 38 | collect_list($"cpu_timeseries").as("cpu_timeseries"), 39 | collect_list($"memory_timeseries").as("memory_timeseries")) 40 | .withColumn("ts_hr", hour($"ts.start")) 41 | 42 | 43 | stats.printSchema() 44 | stats.show(5, false) 45 | } 46 | -------------------------------------------------------------------------------- /Spark-2.1/src/main/scala/com/spark2/window/functions/CPUTidSiteRollup.scala: -------------------------------------------------------------------------------- 1 | package com.spark2.window.functions 2 | import org.apache.spark.sql.SparkSession 3 | import java.sql.Date 4 | import org.apache.spark.sql.expressions.Window 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.types._ 7 | 8 | object CPUTidSiteRollup extends App { 9 | 10 | val spark = SparkSession.builder().appName("SwitchCPU").master("local[1]").getOrCreate() 11 | import spark.implicits._ 12 | 13 | val input_switch_cpu = "input/iap_sw_cpu_mem_stats_rollup" 14 | 15 | val dfIntermed = spark.read.parquet(input_switch_cpu) 16 | dfIntermed.show(3, false) 17 | dfIntermed.printSchema() 18 | 19 | var dfRollup = dfIntermed 20 | .groupBy("tid") 21 | .agg(countDistinct("serial_number").cast(IntegerType).as("num_switches_impacted")) 22 | .withColumn("data_type", lit("iap_insight")) 23 | 24 | dfRollup.show(5, false) 25 | 26 | val dfMpdelRollUp = dfIntermed 27 | .groupBy("tid", "model") 28 | .agg(countDistinct("serial_number").alias("num_switches_impacted")) 29 | .withColumn("model_switch_count", struct( 30 | $"model".as("model"), 31 | $"num_switches_impacted".as("count"))) 32 | .groupBy("tid") 33 | .agg(collect_list("model_switch_count").alias("model_switch_count_list")) 34 | 35 | dfMpdelRollUp.show(5, false) 36 | 37 | val dfFirmwareRollup = dfIntermed 38 | .groupBy("tid", "firmware") 39 | .agg(countDistinct("serial_number").alias("num_switches_impacted")) 40 | .withColumn("firmware_switch_count", struct( 41 | $"firmware".as("firmware"), 42 | $"num_switches_impacted".as("count"))) 43 | .groupBy("tid") 44 | .agg(collect_list("firmware_switch_count").alias("firmware_switch_count_list")) 45 | 46 | dfRollup = dfRollup.join( 47 | dfMpdelRollUp, 48 | Seq("tid"), "left_outer") 49 | .join( 50 | dfFirmwareRollup, 51 | Seq("tid"), "left_outer") 52 | .withColumn("timeline_metric", $"num_switches_impacted".cast(FloatType)) 53 | 54 | dfRollup.show(5, false) 55 | } 56 | -------------------------------------------------------------------------------- /Spark-2.1/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=WARN, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %t %c{1}:%L - %m%n 9 | 10 | log4j.logger.com.rasa.cloud=DEBUG,stdout 11 | log4j.logger.com.rasa.cloud.nade=DEBUG,stdout 12 | log4j.additivity.com.rasa.cloud.nade=false -------------------------------------------------------------------------------- /Spark-2.1/src/test/scala/test/MetricsTest.scala: 
-------------------------------------------------------------------------------- 1 | package test 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import com.datadog.metrics.MetricsCollectorFactory 5 | 6 | object MetricsTest { 7 | def main(args: Array[String]) { 8 | 9 | val sparkSession = SparkSession.builder. 10 | master("local[1]") 11 | .appName("Spark_Accumulator_Metrics_To_DataDog") 12 | .getOrCreate() 13 | 14 | val sc = sparkSession.sparkContext 15 | 16 | val accum = sc.longAccumulator("total.charecters.counter") 17 | val words = sc.textFile("input/lines").flatMap(_.split(" ")) 18 | words.foreach(w => accum.add(w.length)) 19 | 20 | //setting the metrics tags 21 | var metricsTags = collection.mutable.Map[String, Any]() 22 | metricsTags.put("accum name", accum.name.get) 23 | metricsTags.put("accum value", accum.value) 24 | 25 | //initializing the metrics collector 26 | val metricsCollector = MetricsCollectorFactory.getDatadogCollector("947d12f46dead405bf019033434f0cba", "dev") 27 | 28 | //sending accumulator values as metrics to DataDog 29 | metricsCollector.sendMetrics(accum.name.get, accum.value, metricsTags) 30 | 31 | val badRecords = sc.longAccumulator("bad.records.counter") 32 | val baddata = sc.textFile("input/badrecords").map(v => v.split(",")) 33 | baddata.foreach(r => { try { r(2).toInt } catch { case e: NumberFormatException => badRecords.add(1) } }) 34 | 35 | //sending accumulator values as metrics to DataDog 36 | metricsCollector.sendMetrics(badRecords.name.get, badRecords.value, null) 37 | 38 | val acc = sc.longAccumulator("counter.test") 39 | val baddata1 = sc.textFile("input/badrecords").map(x => acc.add(1)) 40 | baddata1.collect() 41 | 42 | //setting the event tags 43 | var eventTags = collection.mutable.Map[String, Any]() 44 | eventTags.put("accum name", acc.name.get) 45 | eventTags.put("accum value", acc.value) 46 | 47 | //sending events to DataDog 48 | metricsCollector.sendEvents("DataDog Event Test", "Sending events", "normal", "info", eventTags) 49 | 50 | sc.stop() 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /Spark-Zeppelin/FirstSparkCassandraApp.git.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /Spark-Zeppelin/README.md: -------------------------------------------------------------------------------- 1 | # Setting up your Spark Cassandra Dev Environment 2 | 3 | A quick workshop on building your first Spark Cassandra Stand Alone Application. In this workshop we will 4 | walk through setting up our Spark and Cassandra Dev environment with IntelliJ. In addition we wil setup Zeppelin 5 | to use as a Spark Interpreter. 6 | 7 | 1. [Setup and Download Components](Setup.md) 8 | 2. [Zeppelin Tutorial](Zeppelin.md) 9 | 3. 
[Stand Alone App Tutorial](StandAloneApp.md) 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /Spark-Zeppelin/Setup.md: -------------------------------------------------------------------------------- 1 | # Setup for Workshop 2 | 3 | ## Downloading our components 4 | 5 | [Download Apache Spark 2.2.1](https://www.apache.org/dyn/closer.lua/spark/spark-2.2.1/spark-2.2.1-bin-hadoop2.7.tgz) 6 | 7 | [Download Apache Cassandra 3.0.15](http://www.apache.org/dyn/closer.lua/cassandra/3.0.15/apache-cassandra-3.0.15-bin.tar.gz) 8 | 9 | [Download Apache Zeppelin 0.7.3](http://mirrors.gigenet.com/apache/zeppelin/zeppelin-0.7.3/zeppelin-0.7.3-bin-netinst.tgz) 10 | 11 | 12 | ## Let's start by Setting up Cassandra 13 | 14 | ### Start up Cassandra 15 | 16 | tar -xvf apache-cassandra-3.0.15-bin.tar.gz 17 | cd apache-cassandra-3.0.15 18 | ./bin/cassandra 19 | 20 | ### Test out our Cassandra Connection 21 | 22 | ./bin/cqlsh 23 | 24 | CREATE KEYSPACE test WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1 }; 25 | USE test; 26 | CREATE TABLE tab ( k int, c int, v int, PRIMARY KEY (k,c)); 27 | INSERT INTO tab (k, c, v) VALUES (1, 1, 1); 28 | SELECT * FROM test.tab; 29 | 30 | k | c | v 31 | ---+---+--- 32 | 1 | 1 | 1 33 | 34 | ### What does our Cassandra Table look like? 35 | 36 | k :: partition key 37 | c :: clustering key 38 | v :: a value 39 | 40 | On disk this looks like: 41 | 42 | k1 -> (c1,v1) , (c2,v2), (c3,v3) 43 | k2 -> (c1,v1) , (c2,v2), (c3,v3) 44 | 45 | 46 | #### Important Cassandra Concepts 47 | For more on Cassandra and data layout, start with these concepts: 48 | * Tokens : Where data lives 49 | * Data Modeling : How data is laid out on disk 50 | * Replication : How many copies of the data are kept across the cluster 51 | * Consistency Level : How many acknowledgements the Client needs for success 52 | 53 | Study more later!
54 | [Datastax Academy](https://academy.datastax.com/) 55 | 56 | #### Setup Spark 57 | 58 | tar -xvf spark-2.2.1-bin-hadoop2.7.tgz 59 | 60 | 61 | #### Add Spark-Cassandra-Connector 62 | 63 | Edit our environment 64 | 65 | Add to conf/spark-defaults : 66 | 67 | cp spark-defaults.conf.template spark-defaults.conf 68 | 69 | edit spark-defaults.conf file 70 | 71 | spark.jars.packages datastax:spark-cassandra-connector:2.0.7-s_2.11 72 | 73 | [Spark Cassandra Connector at Spark Packages](https://spark-packages.org/package/datastax/spark-cassandra-connector) 74 | 75 | -------------------------------------------------------------------------------- /Spark-Zeppelin/Zeppelin.md: -------------------------------------------------------------------------------- 1 | ## Working with Zeppelin 2 | 3 | ### Set up Zeppelin 4 | 5 | tar -xvf zeppelin-0.7.3-bin-netinst.tgz 6 | 7 | Set SPARK_HOME for Zeppelin to our spark directory 8 | Edit zeppelin-0.7.3-bin-netinst/conf/zeppelin-env.sh 9 | 10 | export SPARK_HOME="/Users/russellspitzer/SparkInstalls/spark-2.2.1-bin-hadoop2.7" ## Replace this with your install directory 11 | 12 | 13 | #### Start Zeppelin 14 | 15 | ./zeppelin-0.7.3-bin-netinst/bin/zeppelin.sh 16 | 17 | ### Zeppelin Home Screen 18 | 19 | [Local Zeppelin](http://localhost:8080/#/) 20 | ![Screenshot of create note popup](images/makenote.png) 21 | 22 | * Name is just something personal to identify the notebook for us 23 | * Interpreter is the code execution engine used for snippets 24 | 25 | ### Setting up the Interpreter 26 | 27 | ![Go to the Interpreter Settings Menu](images/Interpreter.png) 28 | ![Image of all Spark Interpreter Options](images/SparkOptions.png) 29 | 30 | Important for us 31 | 32 | * master : The Spark Resource manager used for our Application 33 | 34 | ### Go over some basics with Dataframes 35 | 36 | ![Initial Spark Cassandra Note](images/SetupImplicits.png) 37 | 38 | 39 | [Dataframe Notebook](notebooks/Spark%20Cassandra%20Note.json) 40 | 41 | 42 | ### Zeppelin Challenges 43 | 44 | [Challenges Notebook](notebooks/Spark%20Cassandra%20Challenges.json) -------------------------------------------------------------------------------- /Spark-Zeppelin/images/Interpreter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-Zeppelin/images/Interpreter.png -------------------------------------------------------------------------------- /Spark-Zeppelin/images/SetupImplicits.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-Zeppelin/images/SetupImplicits.png -------------------------------------------------------------------------------- /Spark-Zeppelin/images/SparkOptions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-Zeppelin/images/SparkOptions.png -------------------------------------------------------------------------------- /Spark-Zeppelin/images/makenote.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spider-123-eng/Spark/eec07c2aeb6e9a9fd1b225b322a288c6fa214f0d/Spark-Zeppelin/images/makenote.png --------------------------------------------------------------------------------
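With the spark-cassandra-connector package from Setup.md on the classpath, the `test.tab` table created earlier can be read as a DataFrame from the Zeppelin Spark interpreter or spark-shell. A minimal sketch, assuming a local Cassandra node and the connector's `org.apache.spark.sql.cassandra` data source (the variable name is illustrative):

    // point the connector at the local Cassandra node started in Setup.md
    spark.conf.set("spark.cassandra.connection.host", "127.0.0.1")

    // load test.tab through the connector's data source
    val tabDF = spark.read
      .format("org.apache.spark.sql.cassandra")
      .options(Map("keyspace" -> "test", "table" -> "tab"))
      .load()

    tabDF.show() // should print the single row k=1, c=1, v=1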