├── .gitattributes ├── .gitignore ├── Flume ├── .gitignore ├── README.md ├── pom.xml └── src │ └── com │ └── vishnu │ ├── flume │ ├── config │ │ ├── flume-conf.properties │ │ └── flume-conf_spark.properties │ └── source │ │ └── CustomFlumeTCPSource.java │ └── tcp │ └── client │ └── TcpClient.java ├── KafkaStreams ├── .gitignore ├── README.md ├── build.sbt ├── project │ ├── assembly.sbt │ ├── build.properties │ └── plugins.sbt └── src │ └── main │ └── scala-2.11 │ └── com │ └── vishnuviswanath │ └── kafka │ └── streams │ ├── ClimateLogStream.scala │ └── HelloKafkaStreams.scala ├── README.md ├── datascience ├── kaggle │ ├── AnimalShelter │ │ └── AnimalShelterPreprocess.java │ ├── SanfranciscoCrime Classification │ │ ├── Sanfrancisco crime_LogisticRegression.ipynb │ │ ├── SanfranciscoCrime_KNN.ipynb │ │ └── SanfranciscoCrime_RandomForest.ipynb │ └── TitanicMachinLearnigFromDisaster │ │ ├── Kaggle Titanic Random Forest.ipynb │ │ └── Kaggle Titanic.ipynb ├── rnn │ ├── .gitignore │ └── Webtraffic_forecasting_LSTM.ipynb └── tensorflow │ ├── Basics.ipynb │ ├── cnn_cat_dog.py │ ├── kaggle_invasive_species │ ├── cnn_model_1.py │ ├── cnn_model_2.py │ ├── cnn_model_3.py │ ├── cnn_model_4.py │ ├── cnn_model_5.py │ ├── cnn_model_6.py │ ├── cnn_model_7.py │ ├── cnn_model_7_3.py │ ├── perpare_folder_structure.py │ ├── prepare_validset.py │ ├── sample_model.h5 │ ├── sample_model.py │ ├── submission.csv │ ├── submission2.csv │ ├── submission4.csv │ ├── submission4_2.csv │ └── train_labels.csv │ └── tensorflow_nn_model.ipynb ├── docker ├── spark-kafka-docker │ ├── Dockerfile │ ├── docker-compose.yml │ └── entrypoint.sh ├── spark-kafka-single-node-docker │ ├── Dockerfile │ └── entrypoint.sh └── spark-kafka-single-node-for-meetup │ ├── Dockerfile │ ├── data │ ├── site-device │ │ ├── csv │ │ │ ├── site-device-corrupted.csv │ │ │ └── site-device.csv │ │ └── parquet │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet.crc │ │ │ ├── .part-00001-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet.crc │ │ │ ├── _SUCCESS │ │ │ ├── part-00000-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet │ │ │ └── part-00001-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet │ └── site-views │ │ ├── csv │ │ └── site-views.csv │ │ ├── json │ │ └── site-views.jsonl │ │ └── xml │ │ └── site-views.xml │ └── entrypoint.sh ├── flink-examples-java ├── .gitignore ├── Dockerfile ├── pom.xml ├── src │ └── main │ │ ├── java │ │ └── com │ │ │ └── vishnuviswanath │ │ │ └── flink │ │ │ └── streaming │ │ │ ├── HelloStreamingJob.java │ │ │ └── sources │ │ │ ├── SocketStreamingJob.java │ │ │ └── TextSourceJob.java │ │ └── resources │ │ └── log4j.properties └── submit.sh ├── flink ├── .gitignore ├── README.md ├── build.sbt ├── project │ ├── build.properties │ └── plugins.sbt └── src │ └── main │ ├── java │ ├── FlinkMain.java │ └── utility │ │ └── Server.java │ ├── resources │ └── log4j.properties │ └── scala │ └── com │ └── vishnu │ └── flink │ ├── WordCount.scala │ ├── dataset │ └── WordCount.scala │ ├── streaming │ ├── CountTumblingWindow.scala │ ├── EventTimeProcessing.scala │ ├── FlinkStreamingWordCount.scala │ ├── ProcessingTimeWindowEvictor.scala │ ├── ProcessingTimeWindowWithTrigger.scala │ ├── StreamingWithRocksDb.scala │ ├── TimeSlidingWindow.scala │ ├── TimeTumblingWindow.scala │ ├── cep │ │ └── HelloCep.scala │ ├── queryablestate │ │ ├── QueryClient.scala │ │ └── QuerybleStateStream.scala │ ├── sessionwindows │ │ ├── SessionWindowExample.scala │ │ └── 
SessionWindowExampleDummyDataProd.scala │ └── windowtimer │ │ ├── ProccWindowFun.scala │ │ └── ProcessWindowExample.scala │ └── util │ ├── ParameterParser.scala │ └── RandomServerEventsKafkaProducer.scala ├── mapreduce ├── .gitignore ├── ArrayWritableExample.java ├── DistributeCache.java ├── FindMaximum.java ├── ImageReader.java ├── LetterWordMapper.java ├── MultiInputPath.java ├── README.md ├── SequenceFileTest.java ├── WordCount.java ├── chaining │ ├── ChainMapperExample.java │ ├── ChainingJobControl.java │ ├── ChainingSimple.java │ ├── LetterCount.java │ ├── ToUpperCase.java │ └── WordCount.java ├── customtypes │ ├── AreaCalculator.java │ ├── Comparator.java │ ├── CustomPartitioner.java │ ├── DollarInputFormat.java │ ├── DollarRecordReader.java │ ├── DollarStreamExample.java │ ├── IdentityReducerEx.java │ ├── Point2D.java │ ├── Rectangle.java │ ├── RectangleCount.java │ ├── RectangleInputFormat.java │ ├── RectangleKey.java │ ├── RectangleRecordReader.java │ ├── XmlOutputDriver.java │ └── XmlOutputFormat.java ├── datafu_example │ ├── .gitignore │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── incrementaljob │ │ ├── IncrementalAccumulator.java │ │ ├── IncrementalAggr.java │ │ └── IncrementalMapper.java ├── joins │ └── ReduceSideJoin.java ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── vishnu │ │ └── mapreduce │ │ ├── CustomMultiplOututFormat.java │ │ ├── CustomOutputFormatTest.java │ │ └── WordCount.java │ └── test │ └── java │ └── com │ └── vishnu │ └── mapreduce │ └── AppTest.java ├── spark ├── .gitignore ├── README.md ├── build.properties ├── build.sbt ├── project │ ├── assembly.sbt │ └── plugins.sbt ├── pyspark-files │ └── helloworld.py ├── src │ └── main │ │ └── scala │ │ └── com │ │ └── vishnu │ │ └── spark │ │ ├── Test.scala │ │ ├── bClassifier.scala │ │ ├── basics │ │ ├── ApiLearn.scala │ │ ├── AuctionApp.scala │ │ ├── CustomPartitioner.scala │ │ ├── DataframeExample.scala │ │ ├── SequenceFileTest.scala │ │ ├── dataframes.sc │ │ ├── pairrdd.sc │ │ ├── rdds.sc │ │ └── streams.sc │ │ ├── blog │ │ └── supportfiles │ │ │ └── spark_session_blog_commands │ │ ├── graph │ │ ├── PregelGraphExample.scala │ │ ├── PropertyGraphExample.scala │ │ └── res │ │ │ ├── airports.csv │ │ │ └── routes.csv │ │ ├── kaggle │ │ └── titanic │ │ │ ├── KaggleTitanic.scala │ │ │ ├── TitanicOverfit.scala │ │ │ ├── TitanicUnderfit.scala │ │ │ └── TitanicWithPipeline.scala │ │ ├── map_reduce_in_spark.scala │ │ ├── mllib │ │ ├── ALSRecommender.scala │ │ ├── ALSRecommender2.scala │ │ ├── FeatureTransformations.scala │ │ ├── LinearRegr.scala │ │ ├── LogisticRegr.scala │ │ ├── TFIDF.scala │ │ └── house_data.csv │ │ ├── sql │ │ ├── FromJson.scala │ │ ├── HiveTest.scala │ │ ├── StreamSQL.scala │ │ ├── ToMongoDB.scala │ │ └── res │ │ │ ├── twitter.avro │ │ │ └── twitter.avsc │ │ └── streaming │ │ ├── FlumeStreaming.scala │ │ ├── KafkaDirectStream.scala │ │ ├── KafkaStreaming.scala │ │ ├── README.md │ │ ├── SeqFileStreaming.scala │ │ ├── SocketStreaming.scala │ │ ├── StreamHbase.scala │ │ ├── StreamingFromCheckpoint.scala │ │ ├── StreamingJoins.scala │ │ ├── StreamingWindow.scala │ │ ├── StreamingWithCheckpoint.scala │ │ ├── UpdateStateByKey.scala │ │ ├── WindowedStream.scala │ │ ├── akka │ │ ├── SendToActor.scala │ │ └── SparkAkkaSource.scala │ │ └── customsource │ │ ├── ActivityReceiver.scala │ │ └── StreamingWithCustomSource.scala └── uberjar.md ├── spark_23 ├── MEETUP_NOTES.md ├── README.md ├── build.sbt ├── project │ ├── build.properties │ └── plugins.sbt └── src │ ├── main │ └── scala │ │ └── 
com │ │ └── vishnuviswanath │ │ └── spark │ │ ├── streaming │ │ ├── ContinuousKafkaStreaming.scala │ │ ├── CustomV2SourceExample.scala │ │ ├── HelloStructredStreaming.scala │ │ ├── KafkaSourceStreaming.scala │ │ ├── SocketSourceStreaming.scala │ │ ├── StreamingAggregations.scala │ │ └── sources │ │ │ └── netcat │ │ │ ├── NetcatContinuousReader.scala │ │ │ ├── NetcatOffset.scala │ │ │ ├── NetcatReader.scala │ │ │ └── NetcatSourceProvider.scala │ │ └── util │ │ ├── NetcatProducer.scala │ │ ├── RandomCarsKafkaProducer.scala │ │ ├── SimulateLateDateProducer.scala │ │ ├── ToFileProducer.scala │ │ └── WordsStream.scala │ └── test │ └── scala │ └── com │ └── vishnuviswanath │ └── spark │ └── streaming │ └── HelloStructuredStreamingSpec.scala └── stormkafka ├── .gitignore ├── README.md ├── pom.xml └── src └── main └── java ├── com └── vishnu │ └── storm │ ├── Keys.java │ ├── Topology.java │ ├── bolt │ ├── BoltBuilder.java │ ├── MongodbBolt.java │ ├── SinkTypeBolt.java │ └── SolrBolt.java │ └── spout │ └── SpoutBuilder.java └── default_config.properties /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-vendored 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ensime 2 | .ensime_cache/ 3 | .ipynb_checkpoints/ 4 | _site/ 5 | .jekyll-metadata 6 | .settings/ 7 | target/ 8 | .metadata 9 | bin/ 10 | tmp/ 11 | *.tmp 12 | *.bak 13 | *.swp 14 | *~.nib 15 | local.properties 16 | .settings/ 17 | .loadpath 18 | 19 | # Eclipse Core 20 | .project 21 | 22 | # External tool builders 23 | .externalToolBuilders/ 24 | 25 | # Locally stored "Eclipse launch configurations" 26 | *.launch 27 | 28 | # PyDev specific (Python IDE for Eclipse) 29 | *.pydevproject 30 | 31 | # CDT-specific (C/C++ Development Tooling) 32 | .cproject 33 | 34 | # JDT-specific (Eclipse Java Development Tools) 35 | .classpath 36 | 37 | # Java annotation processor (APT) 38 | .factorypath 39 | 40 | # PDT-specific (PHP Development Tools) 41 | .buildpath 42 | 43 | # sbteclipse plugin 44 | .target 45 | 46 | # Tern plugin 47 | .tern-project 48 | 49 | # TeXlipse plugin 50 | .texlipse 51 | 52 | # STS (Spring Tool Suite) 53 | .springBeans 54 | 55 | # Code Recommenders 56 | .recommenders 57 | /target 58 | 59 | -------------------------------------------------------------------------------- /Flume/.gitignore: -------------------------------------------------------------------------------- 1 | .classpath 2 | .project 3 | .settings 4 | target 5 | 6 | *.class 7 | 8 | # Mobile Tools for Java (J2ME) 9 | .mtj.tmp/ 10 | 11 | #intellij project file 12 | *.iml 13 | .idea/ 14 | .cache-main 15 | 16 | # Package Files # 17 | *.jar 18 | *.war 19 | *.ear 20 | 21 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 22 | hs_err_pid* 23 | -------------------------------------------------------------------------------- /Flume/README.md: -------------------------------------------------------------------------------- 1 | ### Flume Custom TCP Source 2 | 3 | CustomFlumeTCPSource.java is custom flume source which listens to a port and sends the content to the configured channel. The custom source adds the client information to the header of message before sending to the channel. 4 | It takes two configurations 5 | 6 | 1. port - the port to listen to 7 | 2. 
buffer - how often the events should be sent to the channel 8 | 9 | #### Sample configuration 10 | agent.sources.CustomTcpSource.type = com.vishnu.flume.source.CustomFlumeTCPSource 11 | agent.sources.CustomTcpSource.port = 4443 12 | agent.sources.CustomTcpSource.buffer = 1 13 | 14 | 15 | -------------------------------------------------------------------------------- /Flume/pom.xml: -------------------------------------------------------------------------------- 1 | <project xmlns="http://maven.apache.org/POM/4.0.0"> 2 | <modelVersion>4.0.0</modelVersion> 3 | <groupId>com.vishnu</groupId> 4 | <artifactId>Flume</artifactId> 5 | <version>0.0.1-SNAPSHOT</version> 6 | <build> 7 | <sourceDirectory>src</sourceDirectory> 8 | <plugins> 9 | <plugin> 10 | <artifactId>maven-compiler-plugin</artifactId> 11 | <version>3.3</version> 12 | <configuration> 13 | <source>1.8</source> 14 | <target>1.8</target> 15 | </configuration> 16 | </plugin> 17 | </plugins> 18 | </build> 19 | <dependencies> 20 | <dependency> 21 | <groupId>org.apache.flume</groupId> 22 | <artifactId>flume-ng-core</artifactId> 23 | <version>1.6.0</version> 24 | </dependency> 25 | </dependencies> 26 | </project> -------------------------------------------------------------------------------- /Flume/src/com/vishnu/flume/config/flume-conf.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | 19 | # The configuration file needs to define the sources, 20 | # the channels and the sinks. 21 | # Sources, channels and sinks are defined per agent, 22 | # in this case called 'agent' 23 | 24 | agent.sources = CustomTcpSource 25 | agent.channels = memoryChannel 26 | agent.sinks = loggerSink 27 | 28 | # For each one of the sources, the type is defined 29 | agent.sources.CustomTcpSource.type = com.vishnu.flume.source.CustomFlumeTCPSource 30 | agent.sources.CustomTcpSource.port = 4443 31 | agent.sources.CustomTcpSource.buffer = 1 32 | 33 | 34 | # The channel can be defined as follows. 35 | agent.sources.CustomTcpSource.channels = memoryChannel 36 | 37 | # Each sink's type must be defined 38 | agent.sinks.loggerSink.type = logger 39 | 40 | #Specify the channel the sink should use 41 | agent.sinks.loggerSink.channel = memoryChannel 42 | 43 | # Each channel's type is defined.
44 | agent.channels.memoryChannel.type = memory 45 | 46 | # Other config values specific to each type of channel(sink or source) 47 | # can be defined as well 48 | # In this case, it specifies the capacity of the memory channel 49 | agent.channels.memoryChannel.capacity = 100 50 | -------------------------------------------------------------------------------- /Flume/src/com/vishnu/flume/config/flume-conf_spark.properties: -------------------------------------------------------------------------------- 1 | # Flume configuration to listen to netcat host and port, 2 | # sink is of the type avro 3 | # Created for testing spark streaming from flume 4 | # @author vishnu viswanath 5 | 6 | agent.sources = Netcat 7 | agent.channels = memoryChannel 8 | agent.sinks = avroSink 9 | #agent.sinks = loggerSink 10 | 11 | # For each one of the sources, the type is defined 12 | agent.sources.Netcat.type = netcat 13 | agent.sources.Netcat.bind = localhost 14 | agent.sources.Netcat.port = 6666 15 | agent.sources.Netcat.channels = memoryChannel 16 | 17 | # avro sink for spark 18 | agent.sinks.avroSink.type = avro 19 | agent.sinks.avroSink.channel = memoryChannel 20 | agent.sinks.avroSink.hostname = localhost 21 | agent.sinks.avroSink.port = 4444 22 | 23 | #logger sink 24 | #agent.sinks.loggerSink.type = logger 25 | #agent.sinks.loggerSink.channel = memoryChannel 26 | 27 | # Each channel's type is defined. 28 | agent.channels.memoryChannel.type = memory 29 | 30 | # Other config values specific to each type of channel(sink or source) 31 | # can be defined as well 32 | # In this case, it specifies the capacity of the memory channel 33 | agent.channels.memoryChannel.capacity = 100 34 | -------------------------------------------------------------------------------- /Flume/src/com/vishnu/tcp/client/TcpClient.java: -------------------------------------------------------------------------------- 1 | package com.vishnu.tcp.client; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.DataOutputStream; 5 | import java.io.InputStreamReader; 6 | import java.net.Socket; 7 | 8 | public class TcpClient { 9 | 10 | public static void main(String[] args) throws Exception { 11 | String sentence; 12 | String modifiedSentence; 13 | BufferedReader inFromUser = new BufferedReader(new InputStreamReader(System.in)); 14 | Socket clientSocket = new Socket("localhost", 4443); 15 | DataOutputStream outToServer = new DataOutputStream(clientSocket.getOutputStream()); 16 | outToServer.writeBytes("test message" + '\n'); 17 | clientSocket.close(); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /KafkaStreams/.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .DS_Store 3 | -------------------------------------------------------------------------------- /KafkaStreams/README.md: -------------------------------------------------------------------------------- 1 | **KafkaStreams** is a stream processing library on top of Apache Kafka. 2 | 3 | This project contains basic examples of how to create a Kafka Stream application in Scala. For more detailed explaination visit the [blog post](http://vishnuviswanath.com/hello-kafka-streams.html). 
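A minimal way to try the HelloKafkaStreams example below (a sketch, assuming a local broker at localhost:9092 and that the `names` and `hellostream` topics exist; the topic names and bootstrap server come from the example source):

```sh
# run the example directly from sbt
sbt "runMain com.vishnuviswanath.kafka.streams.HelloKafkaStreams"

# in a second terminal, feed names into the input topic
kafka-console-producer.sh --broker-list localhost:9092 --topic names

# in a third terminal, watch the greetings appear on the output topic
kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic hellostream --from-beginning
```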
4 | -------------------------------------------------------------------------------- /KafkaStreams/build.sbt: -------------------------------------------------------------------------------- 1 | name := "KafkaStreams" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.11.8" 6 | 7 | organization := "com.vishnuviswanath" 8 | 9 | val kafkaStreamsVersion = "0.10.2.0" 10 | 11 | val kafkaDependencies = Seq( 12 | "org.apache.kafka" % "kafka-streams" % kafkaStreamsVersion) 13 | 14 | val otherDependencies = Seq( 15 | "com.esotericsoftware.kryo" % "kryo" % "2.24.0" 16 | ) 17 | 18 | val main = "com.vishnuviswanath.kafka.streams.KafkaStreamsExample" 19 | mainClass in (Compile, run) := Some(main) 20 | mainClass in (Compile, packageBin) := Some(main) 21 | 22 | lazy val root = (project in file(".")). 23 | settings( 24 | libraryDependencies ++= kafkaDependencies, 25 | libraryDependencies ++= otherDependencies 26 | ) 27 | 28 | 29 | -------------------------------------------------------------------------------- /KafkaStreams/project/assembly.sbt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soniclavier/bigdata-notebook/a708de834fd282ba576f15f87a46f3953695a9ad/KafkaStreams/project/assembly.sbt -------------------------------------------------------------------------------- /KafkaStreams/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 0.13.13 -------------------------------------------------------------------------------- /KafkaStreams/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | logLevel := Level.Warn 2 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.3") -------------------------------------------------------------------------------- /KafkaStreams/src/main/scala-2.11/com/vishnuviswanath/kafka/streams/HelloKafkaStreams.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.kafka.streams 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.kafka.clients.consumer.ConsumerConfig 6 | import org.apache.kafka.common.serialization.Serdes 7 | import org.apache.kafka.streams.{KafkaStreams, StreamsConfig} 8 | import org.apache.kafka.streams.kstream.{KStream, KStreamBuilder, ValueMapper} 9 | 10 | /** 11 | * Created by vviswanath on 4/22/17. 
12 | * 13 | * HelloKafkaStreams reads a list of names from a topic and 14 | * outputs "hello <name>" in the output topic 15 | */ 16 | object HelloKafkaStreams { 17 | 18 | def main(args: Array[String]): Unit = { 19 | val settings = new Properties 20 | settings.put(StreamsConfig.APPLICATION_ID_CONFIG, "hello-kafka-streams") 21 | settings.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092") 22 | settings.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest") 23 | settings.put(StreamsConfig.KEY_SERDE_CLASS_CONFIG, Serdes.serdeFrom(classOf[String]).getClass.getName) 24 | settings.put(StreamsConfig.VALUE_SERDE_CLASS_CONFIG, Serdes.serdeFrom(classOf[String]).getClass.getName) 25 | 26 | val kstreamBuilder = new KStreamBuilder 27 | val rawStream: KStream[String, String] = kstreamBuilder.stream("names") 28 | 29 | val helloStream: KStream[String, String] = rawStream.mapValues(new ValueMapper[String, String]{ 30 | override def apply(value: String): String = s"hello $value" 31 | }) 32 | 33 | helloStream.to(Serdes.String, Serdes.String, "hellostream") 34 | 35 | val streams = new KafkaStreams(kstreamBuilder, settings) 36 | streams.start 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hadoop and ML repository 2 | 3 | A repository to hold all my Hadoop and Machine Learning related code. 4 | 5 | Visit my blog at: www.vishnuviswanath.com 6 | 7 | ### Contents 8 | 9 | 1. Flink Streaming 10 | 2. Spark ML, Streaming, SQL and GraphX 11 | 3. Kafka Streams 12 | 4. StormKafka streaming application POC 13 | 5. Flume custom source and config files 14 | 6. Hadoop MapReduce old API joins, custom types, etc. 15 | 7. Solutions to Kaggle problems using numpy or graphlab 16 | -------------------------------------------------------------------------------- /datascience/kaggle/TitanicMachinLearnigFromDisaster/Kaggle Titanic Random Forest.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /datascience/rnn/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *.tsv 3 | *.DS_Store 4 | -------------------------------------------------------------------------------- /datascience/tensorflow/cnn_cat_dog.py: -------------------------------------------------------------------------------- 1 | #import the modules 2 | from keras.models import Sequential 3 | from keras.layers.convolutional import Conv2D 4 | from keras.layers.pooling import MaxPooling2D 5 | from keras.layers.core import Flatten 6 | from keras.layers.core import Dense 7 | from keras.preprocessing.image import ImageDataGenerator 8 | import os 9 | 10 | from pyspark.sql import SparkSession 11 | 12 | 13 | if __name__ == "__main__": 14 | spark = SparkSession.builder.master("spark://mm-mac-4797:7077").appName("CatsDogsCNN").getOrCreate() 15 | 16 | 17 | #os.getcwd() 18 | 19 | #change working directory if needed 20 | #os.chdir("path to your dataset folder") 21 | 22 | #initialize the classifier 23 | classifier = Sequential() 24 | 25 | #add layers 26 | classifier.add(Conv2D(32, (3, 3), input_shape=(64, 64, 3), activation = 'relu')) 27 | classifier.add(MaxPooling2D()) #default pool size is (2, 2) 28 | classifier.add(Flatten()) #flatten all layers into a single layer 29 |
classifier.add(Dense(128, activation = 'relu')) 30 | classifier.add(Dense(1, activation = 'sigmoid')) #2 = number of outputs 31 | 32 | classifier.compile(optimizer = 'adam', 33 | loss = 'binary_crossentropy', 34 | metrics = ['accuracy']) 35 | 36 | #image pre-processing 37 | train_datagen = ImageDataGenerator( 38 | rescale=1./255, 39 | shear_range=0.2, 40 | zoom_range=0.2, 41 | horizontal_flip=True) 42 | 43 | test_datagen = ImageDataGenerator(rescale=1./255) 44 | 45 | train_generator = train_datagen.flow_from_directory( 46 | 'dataset/training_set', 47 | target_size=(64, 64), 48 | batch_size=32, 49 | class_mode='binary') 50 | 51 | test_generator = test_datagen.flow_from_directory( 52 | 'dataset/test_set', 53 | target_size=(64, 64), 54 | batch_size=32, 55 | class_mode='binary') 56 | 57 | classifier.fit_generator( 58 | train_generator, 59 | steps_per_epoch=200, 60 | epochs=5, 61 | validation_data=test_generator, 62 | validation_steps=100) -------------------------------------------------------------------------------- /datascience/tensorflow/kaggle_invasive_species/cnn_model_1.py: -------------------------------------------------------------------------------- 1 | #nvidia-smi 2 | #~/.keras/keras.json 3 | 4 | #import keras 5 | #print keras.__version__ 6 | #1.2.2 7 | #https://faroit.github.io/keras-docs/1.2.2/ 8 | 9 | 10 | from keras.models import Sequential 11 | from keras.layers.pooling import MaxPooling2D 12 | from keras.layers.core import Dense 13 | from keras.layers.core import Flatten 14 | from keras.layers.core import Dropout 15 | from keras.layers.convolutional import Conv2D 16 | from keras.layers.pooling import MaxPooling2D 17 | from keras.models import model_from_json 18 | 19 | from keras.preprocessing.image import ImageDataGenerator 20 | 21 | #check 22 | #from tensorflow.python.client import device_lib 23 | #print(device_lib.list_local_devices()) 24 | 25 | classification = Sequential() 26 | classification.add(Conv2D(50, 3,3, input_shape=(128, 128, 3), activation = 'relu')) 27 | classification.add(MaxPooling2D()) 28 | 29 | classification.add(Conv2D(25, 3,3, activation = 'relu')) 30 | classification.add(MaxPooling2D()) 31 | 32 | classification.add(Flatten()) 33 | 34 | classification.add(Dense(200, activation = 'relu')) 35 | classification.add(Dropout(0.5)) 36 | 37 | classification.add(Dense(1, activation = 'sigmoid')) 38 | 39 | for layer in classification.layers: 40 | print(str(layer.name)+" "+str(layer.input_shape)+" -> "+str(layer.output_shape)) 41 | 42 | 43 | classification.compile(optimizer='Adam', loss='binary_crossentropy', metrics = ['accuracy']) 44 | 45 | train_data_gen = ImageDataGenerator(rescale=1./255, shear_range=0.2, zoom_range=0.2, horizontal_flip=True) 46 | valid_data_gen = ImageDataGenerator(rescale=1./255) 47 | 48 | train_gen = train_data_gen.flow_from_directory('training_set', target_size=(128, 128), batch_size=25, class_mode='binary') 49 | valid_gen = valid_data_gen.flow_from_directory('validation_set', target_size=(128, 128), batch_size=25, class_mode='binary') 50 | 51 | #classification.load_weights("classification_model.h5") 52 | classification.fit_generator(train_gen, samples_per_epoch=2145, validation_data=valid_gen, nb_epoch=25, nb_val_samples=150) 53 | 54 | classification_json = classification.to_json() 55 | with open("cnn_model_1.json", "w") as json_file: 56 | json_file.write(classification_json) 57 | classification.save_weights("cnn_model_1.h5") 58 | #91.3 val accuracy 59 | 60 | 61 | test_data_gen = ImageDataGenerator(rescale=1./255) 62 | test_gen = 
test_data_gen.flow_from_directory('test', target_size=(128, 128), batch_size=25, class_mode='binary') 63 | 64 | prediction = classification.predict_generator(test_gen, 1531) 65 | 66 | result = [] 67 | filenames = test_gen.filenames 68 | for i in range(len(filenames)): 69 | result.append((int(filenames[i].split("/")[1].split(".")[0]), prediction[i][0])) 70 | 71 | result.sort(key=lambda tup: tup[0]) 72 | 73 | with open("submission.csv", "w") as output: 74 | output.write("name,invasive\n") 75 | for i in range(0, len(result)): 76 | output.write(str(result[i][0])+","+str(result[i][1])+"\n") -------------------------------------------------------------------------------- /datascience/tensorflow/kaggle_invasive_species/perpare_folder_structure.py: -------------------------------------------------------------------------------- 1 | from shutil import copyfile 2 | import os 3 | 4 | with open("train_labels.csv") as labels: 5 | train_labels = labels.read().splitlines() 6 | 7 | def copy(base, img, dest, claz): 8 | dest_file = dest+"/"+claz+"/"+img 9 | source_file = base+"/"+img 10 | if not os.path.exists(os.path.dirname(dest_file)): 11 | os.makedirs(os.path.dirname(dest_file)) 12 | copyfile(source_file, dest_file) 13 | 14 | for i in train_labels[1:]: 15 | parts = i.split(",") 16 | img = parts[0]+".jpg" 17 | claz = parts[1] 18 | print("copying "+img+" to class "+claz) 19 | copy("train", img, "training_set", claz) 20 | 21 | 22 | 23 | #copy("train", "1.jpg", "train_new", "0") 24 | 25 | -------------------------------------------------------------------------------- /datascience/tensorflow/kaggle_invasive_species/prepare_validset.py: -------------------------------------------------------------------------------- 1 | import os 2 | from shutil import copyfile 3 | from shutil import move 4 | import random 5 | 6 | training_dir = 'training_set' 7 | validation_dir = 'validation_set' 8 | classes = ["0", "1"] 9 | 10 | for claz in classes: 11 | val_size = 150 12 | cdir = training_dir+"/"+claz 13 | print(cdir) 14 | vcdiir = validation_dir+"/"+claz 15 | imgs = os.listdir(cdir) 16 | random.shuffle(imgs) 17 | for file in imgs: 18 | if val_size <= 0: 19 | break 20 | val_size = val_size - 1 21 | source_file = cdir+"/"+file 22 | dest_file = vcdiir+"/"+file 23 | print("moving "+source_file+" to "+dest_file) 24 | if not os.path.exists(os.path.dirname(dest_file)): 25 | os.makedirs(os.path.dirname(dest_file)) 26 | move(source_file, dest_file) -------------------------------------------------------------------------------- /datascience/tensorflow/kaggle_invasive_species/sample_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soniclavier/bigdata-notebook/a708de834fd282ba576f15f87a46f3953695a9ad/datascience/tensorflow/kaggle_invasive_species/sample_model.h5 -------------------------------------------------------------------------------- /datascience/tensorflow/kaggle_invasive_species/sample_model.py: -------------------------------------------------------------------------------- 1 | from keras.models import Sequential 2 | from keras.layers.pooling import MaxPooling2D 3 | from keras.layers.core import Dense 4 | from keras.layers.core import Flatten 5 | from keras.layers.core import Dropout 6 | from keras.layers.convolutional import Conv2D 7 | from keras.layers.pooling import MaxPooling2D 8 | 9 | from keras.preprocessing.image import ImageDataGenerator 10 | classification = Sequential() 11 | classification.add(Conv2D(10, (3,3), input_shape=(64, 64, 3), 
activation = 'relu')) 12 | 13 | classification.add(Conv2D(10, (3,3), activation = 'relu')) 14 | classification.add(MaxPooling2D()) 15 | 16 | classification.add(Conv2D(5, (3,3), activation = 'relu')) 17 | classification.add(MaxPooling2D()) 18 | 19 | classification.add(Flatten()) 20 | 21 | classification.add(Dense(50, activation = 'relu')) 22 | classification.add(Dropout(0.5)) 23 | classification.add(Dense(1, activation = 'sigmoid')) 24 | 25 | for layer in classification.layers: 26 | print(str(layer.name)+" "+str(layer.input_shape)+" -> "+str(layer.output_shape)) 27 | 28 | 29 | classification.compile(optimizer='Adam', loss='binary_crossentropy', metrics = ['accuracy']) 30 | 31 | train_data_gen = ImageDataGenerator(rescale=1./255, shear_range=0.2, zoom_range=0.2, horizontal_flip=True) 32 | valid_data_gen = ImageDataGenerator(rescale=1./255) 33 | 34 | train_gen = train_data_gen.flow_from_directory('sample/training_set', target_size=(64, 64), batch_size=5, class_mode='binary') 35 | valid_gen = valid_data_gen.flow_from_directory('sample/validation_set', target_size=(64, 64), batch_size=5, class_mode='binary') 36 | 37 | #classification.load_weights("sample_model.h5") 38 | classification.fit_generator(train_gen, steps_per_epoch=36, validation_data=valid_gen, epochs=10, validation_steps=36) 39 | 40 | #save the weights 41 | classification.save_weights("sample_model.h5") 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /docker/spark-kafka-docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:8-jre 2 | MAINTAINER Vishnu Viswanath "vishnuviswanath.com" 3 | 4 | RUN apt-get update && apt-get install -y curl \ 5 | procps 6 | 7 | ENV APACHE_DOWNLOAD_URL https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename= 8 | 9 | #KAFKA 10 | ARG KAFKA_VERSION=1.0.0 11 | ARG SCALA_VERSION=2.11 12 | ENV KAFKA_PACKAGE kafka_${SCALA_VERSION}-${KAFKA_VERSION} 13 | ENV KAFKA_DOWNLOAD_URL ${APACHE_DOWNLOAD_URL}kafka/${KAFKA_VERSION}/${KAFKA_PACKAGE}.tgz 14 | ENV KAFKA_HOME /usr/share/${KAFKA_PACKAGE} 15 | ENV PATH $PATH:${KAFKA_HOME}/bin 16 | 17 | RUN curl -L \ 18 | "${KAFKA_DOWNLOAD_URL}" \ 19 | | gunzip \ 20 | | tar x -C /usr/share/ 21 | 22 | 23 | #SPARK 24 | ARG SPARK_VERSION=2.3.0 25 | ARG HADOOP_VERSION=2.7 26 | ENV SPARK_PACKAGE spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} 27 | ENV SPARK_DOWNLOAD_URL ${APACHE_DOWNLOAD_URL}spark/spark-${SPARK_VERSION}/${SPARK_PACKAGE}.tgz 28 | ENV SPARK_HOME /usr/share/${SPARK_PACKAGE} 29 | ENV PATH $PATH:${SPARK_HOME}/bin:${SPARK_HOME}/sbin 30 | 31 | RUN curl -L \ 32 | "${SPARK_DOWNLOAD_URL}" \ 33 | | gunzip \ 34 | | tar x -C /usr/share/ 35 | 36 | 37 | EXPOSE 8080 8081 6066 7077 4040 7001 7002 7003 7004 7005 7006 2181 9092 38 | 39 | ADD entrypoint.sh / 40 | 41 | ENTRYPOINT ["/entrypoint.sh"] -------------------------------------------------------------------------------- /docker/spark-kafka-docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | services: 3 | spark-master: 4 | image: soniclavier/spark-kafka:2.3.0_1.0.0 5 | command: spark-master 6 | expose: 7 | - 8080 8 | - 7077 9 | ports: 10 | - 8080:8080 11 | - 7077:7077 12 | 13 | spark-worker: 14 | image: soniclavier/spark-kafka:2.3.0_1.0.0 15 | command: spark-worker spark-master:7077 16 | depends_on: 17 | - spark-master 18 | expose: 19 | - 8081 20 | 21 | zookeeper: 22 | image: soniclavier/spark-kafka:2.3.0_1.0.0 23 | command: 
zookeeper 24 | expose: 25 | - 2181 26 | 27 | kafka-broker: 28 | image: soniclavier/spark-kafka:2.3.0_1.0.0 29 | command: kafka-broker zookeeper 30 | depends_on: 31 | - zookeeper 32 | expose: 33 | - 9092 34 | -------------------------------------------------------------------------------- /docker/spark-kafka-docker/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | case $1 in 4 | spark-master) exec $SPARK_HOME/bin/spark-class org.apache.spark.deploy.master.Master 5 | ;; 6 | 7 | spark-worker) exec $SPARK_HOME/bin/spark-class org.apache.spark.deploy.worker.Worker $2 8 | ;; 9 | 10 | zookeeper) exec $KAFKA_HOME/bin/zookeeper-server-start.sh $KAFKA_HOME/config/zookeeper.properties 11 | ;; 12 | 13 | kafka-broker) 14 | sed -r -i "s/(zookeeper.connect)=(.*)/\1=$2:2181/g" $KAFKA_HOME/config/server.properties 15 | exec $KAFKA_HOME/bin/kafka-server-start.sh $KAFKA_HOME/config/server.properties 16 | ;; 17 | 18 | *) echo "Unknown entrypoint $1, valid entry points are [spark-master, spark-worker , zookeeper, kafka-broker ]" 19 | ;; 20 | esac 21 | -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:8-jre 2 | MAINTAINER Vishnu Viswanath "vishnuviswanath.com" 3 | 4 | RUN apt-get update && apt-get install -y curl \ 5 | procps 6 | 7 | ENV APACHE_DOWNLOAD_URL https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename= 8 | 9 | #KAFKA 10 | ARG KAFKA_VERSION=1.0.0 11 | ARG SCALA_VERSION=2.11 12 | ENV KAFKA_PACKAGE kafka_${SCALA_VERSION}-${KAFKA_VERSION} 13 | ENV KAFKA_DOWNLOAD_URL ${APACHE_DOWNLOAD_URL}kafka/${KAFKA_VERSION}/${KAFKA_PACKAGE}.tgz 14 | ENV KAFKA_HOME /usr/share/${KAFKA_PACKAGE} 15 | ENV PATH $PATH:${KAFKA_HOME}/bin 16 | 17 | RUN curl -L \ 18 | "${KAFKA_DOWNLOAD_URL}" \ 19 | | gunzip \ 20 | | tar x -C /usr/share/ 21 | 22 | 23 | #SPARK 24 | ARG SPARK_VERSION=2.3.0 25 | ARG HADOOP_VERSION=2.7 26 | ENV SPARK_PACKAGE spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} 27 | ENV SPARK_DOWNLOAD_URL ${APACHE_DOWNLOAD_URL}spark/spark-${SPARK_VERSION}/${SPARK_PACKAGE}.tgz 28 | ENV SPARK_HOME /usr/share/${SPARK_PACKAGE} 29 | ENV PATH $PATH:${SPARK_HOME}/bin:${SPARK_HOME}/sbin 30 | 31 | RUN curl -L \ 32 | "${SPARK_DOWNLOAD_URL}" \ 33 | | gunzip \ 34 | | tar x -C /usr/share/ 35 | 36 | 37 | EXPOSE 8080 8081 6066 7077 4040 7001 7002 7003 7004 7005 7006 2181 9092 38 | 39 | 40 | ADD entrypoint.sh / 41 | 42 | ENTRYPOINT ["/entrypoint.sh"] -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-docker/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export HOSTNAME=$(hostname -i) 4 | exec $SPARK_HOME/bin/spark-class org.apache.spark.deploy.master.Master >/dev/null 2>&1 < /dev/null & 5 | echo "starting spark master.." 6 | exec $SPARK_HOME/bin/spark-class org.apache.spark.deploy.worker.Worker spark://${HOSTNAME}:7077 >/dev/null 2>&1 < /dev/null & 7 | echo "starting spark worker.." 8 | exec $KAFKA_HOME/bin/zookeeper-server-start.sh $KAFKA_HOME/config/zookeeper.properties >/dev/null 2>&1 < /dev/null & 9 | echo "starting zookeeper.." 10 | exec $KAFKA_HOME/bin/kafka-server-start.sh $KAFKA_HOME/config/server.properties >/dev/null 2>&1 < /dev/null & 11 | echo "starting kafka broker.." 
12 | 13 | #make container wait 14 | exec "$@"; 15 | -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-for-meetup/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:8-jre 2 | MAINTAINER Vishnu Viswanath "vishnuviswanath.com" 3 | 4 | RUN apt-get update && apt-get install -y curl \ 5 | procps \ 6 | netcat 7 | 8 | ENV APACHE_DOWNLOAD_URL https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename= 9 | 10 | #KAFKA 11 | ARG KAFKA_VERSION=1.0.0 12 | ARG SCALA_VERSION=2.11 13 | ENV KAFKA_PACKAGE kafka_${SCALA_VERSION}-${KAFKA_VERSION} 14 | ENV KAFKA_DOWNLOAD_URL ${APACHE_DOWNLOAD_URL}kafka/${KAFKA_VERSION}/${KAFKA_PACKAGE}.tgz 15 | ENV KAFKA_HOME /usr/share/${KAFKA_PACKAGE} 16 | ENV PATH $PATH:${KAFKA_HOME}/bin 17 | 18 | RUN curl -L \ 19 | "${KAFKA_DOWNLOAD_URL}" \ 20 | | gunzip \ 21 | | tar x -C /usr/share/ 22 | 23 | 24 | #SPARK 25 | ARG SPARK_VERSION=2.3.0 26 | ARG HADOOP_VERSION=2.7 27 | ENV SPARK_PACKAGE spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} 28 | ENV SPARK_DOWNLOAD_URL ${APACHE_DOWNLOAD_URL}spark/spark-${SPARK_VERSION}/${SPARK_PACKAGE}.tgz 29 | ENV SPARK_HOME /usr/share/${SPARK_PACKAGE} 30 | ENV PATH $PATH:${SPARK_HOME}/bin:${SPARK_HOME}/sbin 31 | 32 | RUN curl -L \ 33 | "${SPARK_DOWNLOAD_URL}" \ 34 | | gunzip \ 35 | | tar x -C /usr/share/ 36 | 37 | 38 | EXPOSE 8080 8081 6066 7077 4040 7001 7002 7003 7004 7005 7006 2181 9092 39 | 40 | 41 | RUN export COLUMNS=250 42 | 43 | ADD data /data 44 | ADD spark_23-assembly-1.0.jar /examples/ 45 | ADD entrypoint.sh / 46 | 47 | ENTRYPOINT ["/entrypoint.sh"] -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-for-meetup/data/site-device/csv/site-device-corrupted.csv: -------------------------------------------------------------------------------- 1 | Device Category,Page,Users,New Users,Sessions 2 | mobile,/spark_lr.html,83,77,101 3 | mobile,/,60,40,56 4 | tablet,,/spark_session.html,10,10,11 5 | desktop,/flink_queryable_state2.html,162,52,141 6 | desktop,/blog/page5/,19,0,0,extrafield 7 | desktop,/blog/page3/,120,0,1 8 | desktop,/spark_rdd.html,1071,946,1199 9 | desktop,/kaggle-titanic.html,&$^%*,43,54 -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-for-meetup/data/site-device/parquet/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-for-meetup/data/site-device/parquet/.part-00000-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soniclavier/bigdata-notebook/a708de834fd282ba576f15f87a46f3953695a9ad/docker/spark-kafka-single-node-for-meetup/data/site-device/parquet/.part-00000-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-for-meetup/data/site-device/parquet/.part-00001-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/soniclavier/bigdata-notebook/a708de834fd282ba576f15f87a46f3953695a9ad/docker/spark-kafka-single-node-for-meetup/data/site-device/parquet/.part-00001-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-for-meetup/data/site-device/parquet/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soniclavier/bigdata-notebook/a708de834fd282ba576f15f87a46f3953695a9ad/docker/spark-kafka-single-node-for-meetup/data/site-device/parquet/_SUCCESS -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-for-meetup/data/site-device/parquet/part-00000-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soniclavier/bigdata-notebook/a708de834fd282ba576f15f87a46f3953695a9ad/docker/spark-kafka-single-node-for-meetup/data/site-device/parquet/part-00000-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-for-meetup/data/site-device/parquet/part-00001-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soniclavier/bigdata-notebook/a708de834fd282ba576f15f87a46f3953695a9ad/docker/spark-kafka-single-node-for-meetup/data/site-device/parquet/part-00001-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-for-meetup/data/site-views/csv/site-views.csv: -------------------------------------------------------------------------------- 1 | Page,Country,Pageviews,Unique Pageviews,Avg. 
Time on Page /spark_session.html,United States,2584,2443,0:04:25 /spark_lr.html,United States,2004,1846,0:04:40 /,United States,1748,1516,0:00:47 /spark_rdd.html,India,1723,1563,0:06:01 /spark_rdd.html,United States,1542,1398,0:04:16 /spark_session.html,India,1480,1401,0:04:23 /spark_lr.html,India,1216,1099,0:05:30 /flink_eventtime.html,United States,1054,937,0:05:42 /realtime-storm-kafka3.html,India,958,827,0:03:54 /hello-kafka-streams.html,United States,885,789,0:02:45 /realtime-storm-kafka2.html,India,868,722,0:03:37 /spark-scala.html,United States,825,774,0:03:16 /spark_rdd_part2.html,India,791,715,0:04:32 /realtime-storm-kafka2.html,United States,766,594,0:02:29 /realtime-storm-kafka3.html,United States,754,624,0:03:08 /flink_streaming.html,United States,718,658,0:03:16 /spark_rdd_part2.html,United States,703,620,0:03:26 /,India,702,531,0:00:40 /realtime-storm-kafka1.html,India,687,577,0:04:01 /spark-scala.html,India,671,623,0:06:43 /kafka-streams-part2.html,United States,645,535,0:01:53 /realtime-storm-kafka1.html,United States,614,512,0:02:50 /search.html?query=spark,India,577,425,0:00:31 /flink_trigger_evictor.html,United States,501,437,0:03:21 /search.html?query=spark,United States,461,321,0:00:32 /flink_eventtime.html,India,448,401,0:06:13 /search.html?query=flink,United States,434,317,0:00:29 /hello-kafka-streams.html,India,415,375,0:03:57 /flink_streaming.html,India,325,305,0:03:45 -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-for-meetup/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export HOSTNAME=$(hostname -i) 4 | exec $SPARK_HOME/bin/spark-class org.apache.spark.deploy.master.Master >/dev/null 2>&1 < /dev/null & 5 | echo "starting spark master.." 6 | exec $SPARK_HOME/bin/spark-class org.apache.spark.deploy.worker.Worker spark://${HOSTNAME}:7077 >/dev/null 2>&1 < /dev/null & 7 | echo "starting spark worker.." 8 | exec $KAFKA_HOME/bin/zookeeper-server-start.sh $KAFKA_HOME/config/zookeeper.properties >/dev/null 2>&1 < /dev/null & 9 | echo "starting zookeeper.." 10 | exec $KAFKA_HOME/bin/kafka-server-start.sh $KAFKA_HOME/config/server.properties >/dev/null 2>&1 < /dev/null & 11 | echo "starting kafka broker.." 
12 | 13 | #make container wait 14 | exec "$@"; 15 | -------------------------------------------------------------------------------- /flink-examples-java/.gitignore: -------------------------------------------------------------------------------- 1 | #idea 2 | .idea/ 3 | *.iml 4 | 5 | #osx 6 | *.DS_Store 7 | 8 | #artifacts 9 | *.jar 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /flink-examples-java/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM flink 2 | 3 | MAINTAINER Vishnu Viswanath 4 | 5 | ADD submit.sh / 6 | 7 | ARG JAR_FILE 8 | ADD target/${JAR_FILE} /usr/share/flink-job.jar 9 | 10 | CMD ["/bin/bash", "/submit.sh"] -------------------------------------------------------------------------------- /flink-examples-java/src/main/java/com/vishnuviswanath/flink/streaming/HelloStreamingJob.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.flink.streaming; 2 | 3 | import org.apache.flink.streaming.api.datastream.DataStream; 4 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 5 | import org.apache.flink.streaming.api.functions.sink.PrintSinkFunction; 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | public class HelloStreamingJob { 11 | 12 | public static void main(String[] args) throws Exception { 13 | StreamExecutionEnvironment senv = StreamExecutionEnvironment.getExecutionEnvironment(); 14 | 15 | List sample = new ArrayList<>(); 16 | sample.add("test"); 17 | sample.add("data"); 18 | DataStream sampleStream = senv.fromCollection(sample); 19 | sampleStream.addSink(new PrintSinkFunction<>()); 20 | 21 | senv.execute("hellow data stream"); 22 | 23 | } 24 | } 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /flink-examples-java/src/main/java/com/vishnuviswanath/flink/streaming/sources/SocketStreamingJob.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.flink.streaming.sources; 2 | 3 | import org.apache.flink.streaming.api.datastream.DataStream; 4 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 5 | import org.apache.flink.streaming.api.functions.sink.PrintSinkFunction; 6 | 7 | public class SocketStreamingJob { 8 | 9 | public static void main(String[] args) throws Exception { 10 | StreamExecutionEnvironment senv = StreamExecutionEnvironment.getExecutionEnvironment(); 11 | 12 | DataStream socketDataStream = senv.socketTextStream("localhost", 9999); 13 | DataStream sensorDataStream = socketDataStream.map(x -> new SensorData(x)); 14 | sensorDataStream.addSink(new PrintSinkFunction<>()); 15 | 16 | senv.execute("Sensor data stream"); 17 | 18 | } 19 | } 20 | 21 | class SensorData { 22 | double reading; 23 | 24 | public SensorData(String reading) { 25 | this.reading = Double.parseDouble(reading); 26 | } 27 | 28 | @Override 29 | public String toString() { 30 | return String.format("{Temp : %10.4f}", reading); 31 | } 32 | } 33 | 34 | 35 | -------------------------------------------------------------------------------- /flink-examples-java/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more 
contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | ################################################################################ 18 | 19 | log4j.rootLogger=INFO, console 20 | 21 | log4j.appender.console=org.apache.log4j.ConsoleAppender 22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.console.layout.ConversionPattern=%d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n 24 | -------------------------------------------------------------------------------- /flink-examples-java/submit.sh: -------------------------------------------------------------------------------- 1 | #start job manager 2 | ./docker-entrypoint.sh jobmanager & 3 | 4 | sleep 10 5 | 6 | #submit job 7 | flink run /usr/share/flink-job.jar -------------------------------------------------------------------------------- /flink/.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | .cache-main 4 | 5 | #eclipse specific 6 | .classpath 7 | .project 8 | .settings/ 9 | .idea/ 10 | # sbt specific 11 | .cache 12 | .history 13 | .lib/ 14 | dist/* 15 | target/ 16 | lib_managed/ 17 | src_managed/ 18 | project/boot/ 19 | project/plugins/project/ 20 | 21 | # Scala-IDE specific 22 | .scala_dependencies 23 | .worksheet 24 | /bin/ 25 | -------------------------------------------------------------------------------- /flink/README.md: -------------------------------------------------------------------------------- 1 | ## Flink Streaming 2 | 3 |
Note: print() called on a DataStream object will not print to the console; the output goes to the job's .out file in the Flink log directory. This behaviour is configured in log4j.properties in the conf folder. Change the appender to ConsoleAppender to send the log output to the console.
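For reference, a ConsoleAppender configuration along these lines (mirroring the log4j.properties files kept elsewhere in this repository) achieves that:

```
log4j.rootLogger=INFO, stdout

log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target=System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
```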
5 | 6 | All the socket based streaming jobs in the examples listen to port 4444. To simulate messages coming through this port, 7 | run `nc -lk 4444` and send sample messages 8 | 9 | #### SocketStreming 10 | ``` 11 | flink run target/scala-2.10/flink-vishnu_2.10-1.0.jar -c com.vishnu.flink.streaming.FlinkStreamingWordCount 12 | ``` 13 | #### Tumbling window streaming (similar to batch) 14 | ``` 15 | flink run target/scala-2.10/flink-vishnu_2.10-1.0.jar -c com.vishnu.flink.streaming.TumblingWindowStreamiming 16 | ``` 17 | -------------------------------------------------------------------------------- /flink/build.sbt: -------------------------------------------------------------------------------- 1 | ThisBuild / resolvers ++= Seq("Apache Development Snapshot Repository" at "https://repository.apache.org/content/repositories/snapshots/", Resolver.mavenLocal) 2 | 3 | ThisBuild / scalaVersion := "2.11.7" 4 | 5 | val flinkVersion = "1.5.0" 6 | val kafkaVersion = "0.11.0.2" 7 | 8 | val flinkDependencies = Seq( 9 | "org.apache.flink" %% "flink-scala" % flinkVersion % "provided", 10 | "org.apache.flink" %% "flink-clients" % flinkVersion % "provided", 11 | "org.apache.flink" %% "flink-streaming-scala" %flinkVersion % "provided", 12 | "org.apache.flink" %% "flink-statebackend-rocksdb" % flinkVersion % "provided", 13 | "org.apache.flink" %% "flink-queryable-state-client-java" % flinkVersion % "provided", 14 | "org.apache.flink" %% "flink-queryable-state-runtime" % flinkVersion % "provided", 15 | "org.apache.flink" %% "flink-cep-scala" % flinkVersion, 16 | "org.apache.flink" %% "flink-connector-kafka-0.11" % flinkVersion 17 | ) 18 | 19 | val otherDependencies = Seq( 20 | "org.apache.kafka" % "kafka-clients" % kafkaVersion, 21 | "joda-time" % "joda-time" % "2.9.4", 22 | "org.slf4j" % "slf4j-log4j12" % "1.7.25", 23 | "log4j" % "log4j" % "1.2.17" 24 | ) 25 | 26 | val main = "com.vishnu.flink.streaming.queryablestate.QuerybleStateStream" 27 | 28 | Compile / run / mainClass := Some(main) 29 | 30 | assembly / mainClass := Some(main) 31 | 32 | Compile / run := Defaults.runTask(Compile / fullClasspath, 33 | Compile / run / mainClass, 34 | Compile / run / runner).evaluated 35 | 36 | lazy val commonSettings = Seq( 37 | organization := "com.vishnuviswanath", 38 | version := "1.0", 39 | name := "flink-examples" 40 | ) 41 | 42 | lazy val root = (project in file(".")). 43 | settings(commonSettings:_*). 
44 | settings( 45 | libraryDependencies ++= flinkDependencies, 46 | libraryDependencies ++= otherDependencies, 47 | retrieveManaged := true 48 | ) 49 | 50 | 51 | lazy val mainRunner = project.in(file("mainRunner")).dependsOn(RootProject(file("."))).settings( 52 | // we set all provided dependencies to none, so that they are included in the classpath of mainRunner 53 | libraryDependencies := (libraryDependencies in RootProject(file("."))).value.map{ 54 | module => module.configurations match { 55 | case Some("provided") => module.withConfigurations(None) 56 | case _ => module 57 | } 58 | } 59 | ) 60 | 61 | 62 | -------------------------------------------------------------------------------- /flink/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.1.1 2 | -------------------------------------------------------------------------------- /flink/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6") -------------------------------------------------------------------------------- /flink/src/main/java/FlinkMain.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by vishnu on 9/2/16. 3 | */ 4 | public class FlinkMain { 5 | } 6 | -------------------------------------------------------------------------------- /flink/src/main/java/utility/Server.java: -------------------------------------------------------------------------------- 1 | package utility; 2 | 3 | /** 4 | * Created by vishnu on 11/3/16. 5 | */ 6 | public class Server { 7 | } 8 | -------------------------------------------------------------------------------- /flink/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/WordCount.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink 2 | 3 | import org.apache.flink.api.scala._ 4 | 5 | object WordCount { 6 | def main(args: Array[String]) { 7 | 8 | val env = ExecutionEnvironment.getExecutionEnvironment 9 | val text = env.fromElements( 10 | "Who's there?", 11 | "I think I hear them. Stand, ho! Who's there?") 12 | 13 | val counts = text.flatMap { _.toLowerCase.split("\\W+") filter { _.nonEmpty } } 14 | .map { (_, 1) } 15 | .groupBy(0) 16 | .sum(1) 17 | 18 | counts.print() 19 | } 20 | } -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/dataset/WordCount.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.dataset 2 | 3 | import java.lang.Iterable 4 | 5 | import org.apache.flink.api.common.functions.{GroupReduceFunction, FlatMapFunction} 6 | import org.apache.flink.api.scala.ExecutionEnvironment 7 | import org.apache.flink.util.Collector 8 | import org.apache.flink.api.scala._ 9 | 10 | /** 11 | * Created by vishnu on 3/12/16. 
12 | * Scala equivalent for WordCount program in http://dataartisans.github.io/flink-training/dataSetBasics/slides.html 13 | * Reads from hdfs file, mapper emits 1 for each word and Reducer aggregates 14 | * 15 | */ 16 | object WordCount { 17 | 18 | def main(args:Array[String]): Unit = { 19 | val env = ExecutionEnvironment.getExecutionEnvironment 20 | val data = env.readTextFile("hdfs://localhost:9000/states") 21 | 22 | val counts = data.flatMap(new Tokenizer()) 23 | .groupBy(0) 24 | .reduceGroup(new SumWords()) 25 | 26 | 27 | counts.print() 28 | } 29 | 30 | 31 | } 32 | 33 | 34 | class Tokenizer extends FlatMapFunction[String,(String,Int)] { 35 | override def flatMap(value: String, out: Collector[(String,Int)]): Unit = { 36 | val tokens = value.split("\\W+") 37 | for (token <- tokens if token.length>0) out.collect(token,1) 38 | } 39 | } 40 | 41 | class SumWords extends GroupReduceFunction[(String,Int),(String,Int)] { 42 | override def reduce(words: Iterable[(String,Int)], out: Collector[(String,Int)]): Unit = { 43 | var count = 0 44 | var prev: (String, Int) = null 45 | val it = words.iterator() 46 | while(it.hasNext) { 47 | prev = it.next() 48 | count = 1 + prev._2 49 | } 50 | out.collect(prev._1,count) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/CountTumblingWindow.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.streaming 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.api.scala._ 5 | import org.apache.flink.streaming.api.windowing.time.Time 6 | 7 | 8 | /** 9 | * A tumbling window based on count 10 | */ 11 | object CountTumblingWindow { 12 | def main(args: Array[String]) { 13 | val sev = StreamExecutionEnvironment.getExecutionEnvironment 14 | val socTextStream = sev.socketTextStream("localhost",4444) 15 | 16 | //the following window is triggered for every 5 items 17 | //since we are doing keyby 18 | //each window will be containing only words of the same group 19 | //e.g., 20 | //if stream is : one two one two one two one two one 21 | //window1 = {one,one,one,one,one} 22 | //window2 = {two,two,two,two} 23 | //window1 will triggered but not window 2, it need one more 'two' to make it 5 24 | val counts = socTextStream.flatMap{_.split("\\s")} 25 | .map { (_, 1) } 26 | .keyBy(0) 27 | .countWindow(5) 28 | .sum(1).setParallelism(4); 29 | 30 | counts.print() 31 | sev.execute() 32 | } 33 | } -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/EventTimeProcessing.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.streaming 2 | 3 | import org.apache.flink.api.common.functions.FlatMapFunction 4 | import org.apache.flink.api.scala._ 5 | import org.apache.flink.streaming.api.TimeCharacteristic 6 | import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks 7 | import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment} 8 | import org.apache.flink.streaming.api.watermark.Watermark 9 | import org.apache.flink.util.Collector 10 | import org.joda.time.format.DateTimeFormat 11 | 12 | object EventTimeWindowWithTrigger { 13 | def main(args: Array[String]) { 14 | val sev = StreamExecutionEnvironment.getExecutionEnvironment 15 | sev.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); 16 | 
val rawMessages: DataStream[String] = sev.socketTextStream("localhost",4444) 17 | 18 | val coloredMessagesStream: DataStream[ColoredMessage] = rawMessages.flatMap(new FlatMapFunction[String,ColoredMessage] { 19 | override def flatMap(value: String, out: Collector[ColoredMessage]): Unit = { 20 | out.collect(ColoredMessage(value.split(","))) 21 | } 22 | }) 23 | 24 | //note: coloredMessagesStream is not yet assigned timestamps/watermarks or windowed; see the wiring sketch after ProcessingTimeWindowEvictor.scala below 25 | 26 | 27 | sev.execute() 28 | } 29 | } 30 | case class ColoredMessage(eventTime: Long, color: String) 31 | 32 | object ColoredMessage { 33 | def apply(parts: Array[String]): ColoredMessage = { 34 | ColoredMessage( 35 | eventTime = getDate(parts(0)), 36 | color = parts(1)) 37 | } 38 | def getDate(date: String): Long = { 39 | val formatter = DateTimeFormat.forPattern("HH:mm:ss") 40 | val dt = formatter.parseDateTime(date) 41 | dt.getMillis 42 | } 43 | } 44 | 45 | class TimestampAndWatermarkGen extends AssignerWithPeriodicWatermarks[ColoredMessage] { 46 | val maxDelay = 1*60*1000 //1 minute 47 | var maxTime = 0L 48 | override def getCurrentWatermark: Watermark = { 49 | new Watermark(maxTime - maxDelay) 50 | } 51 | override def extractTimestamp(element: ColoredMessage, previousElementTimestamp: Long): Long = { 52 | maxTime = Math.max(element.eventTime, maxTime) 53 | element.eventTime 54 | } 55 | } 56 | 57 | -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/FlinkStreamingWordCount.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.streaming 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.api.scala._ 5 | 6 | 7 | 8 | object FlinkStreamingWordCount { 9 | 10 | def main(args: Array[String]) { 11 | val sev = StreamExecutionEnvironment.getExecutionEnvironment 12 | val socTxtStream = sev.socketTextStream("localhost",4444) 13 | 14 | val counts = socTxtStream.flatMap{_.toLowerCase.split(" ") filter { _.nonEmpty } } 15 | .map { (_, 1) } 16 | .keyBy(0) 17 | .sum(1) 18 | counts.print() 19 | sev.execute() 20 | 21 | } 22 | 23 | } -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/ProcessingTimeWindowEvictor.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.streaming 2 | 3 | import org.apache.flink.api.scala._ 4 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 5 | import org.apache.flink.streaming.api.windowing.assigners.SlidingProcessingTimeWindows 6 | import org.apache.flink.streaming.api.windowing.evictors.CountEvictor 7 | import org.apache.flink.streaming.api.windowing.time.Time 8 | import org.apache.flink.streaming.api.windowing.triggers.CountTrigger 9 | 10 | object ProcessingTimeWindowEvictor { 11 | def main(args: Array[String]) { 12 | val sev = StreamExecutionEnvironment.getExecutionEnvironment 13 | val socTextStream = sev.socketTextStream("localhost",4444) 14 | 15 | //a sliding window of size 15 seconds is created, sliding every 10 seconds 16 | //the window fires once it holds 5 elements (CountTrigger), and the evictor then keeps only the last 3 elements for the sum 17 | val counts = socTextStream.flatMap{_.split("\\s")} 18 | .map { (_, 1) } 19 | .keyBy(0) 20 | .window(SlidingProcessingTimeWindows.of(Time.seconds(15),Time.seconds(10))) 21 | .trigger(CountTrigger.of(5)) 22 | .evictor(CountEvictor.of(3)) 23 | .sum(1).setParallelism(4); 24 | 25 | counts.print() 26 | sev.execute() 27 | } 28 | } 29 | 
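EventTimeProcessing.scala above builds coloredMessagesStream and defines TimestampAndWatermarkGen, but never attaches the assigner or applies an event-time window before sev.execute(). A minimal sketch of the missing wiring is shown below; keying by color and the 10-second tumbling window are assumptions for illustration, not taken from the source file (it also needs import org.apache.flink.streaming.api.windowing.time.Time):

    //sketch only: attach the periodic watermark assigner defined in EventTimeProcessing.scala,
    //then count messages per color in 10-second event-time tumbling windows (window size assumed)
    val withTimestamps: DataStream[ColoredMessage] =
      coloredMessagesStream.assignTimestampsAndWatermarks(new TimestampAndWatermarkGen)

    val countsPerColor = withTimestamps
      .map(m => (m.color, 1))
      .keyBy(0)
      .timeWindow(Time.seconds(10)) //event-time windows, because of TimeCharacteristic.EventTime
      .sum(1)

    countsPerColor.print()
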
-------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/ProcessingTimeWindowWithTrigger.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.streaming 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.api.scala._ 5 | import org.apache.flink.streaming.api.windowing.time.Time 6 | import org.apache.flink.streaming.api.windowing.assigners.{SlidingProcessingTimeWindows} 7 | import org.apache.flink.streaming.api.windowing.evictors.{CountEvictor} 8 | 9 | import org.apache.flink.streaming.api.windowing.triggers.CountTrigger 10 | 11 | object ProcessingTimeWindowWithTrigger { 12 | def main(args: Array[String]) { 13 | val sev = StreamExecutionEnvironment.getExecutionEnvironment 14 | val socTextStream = sev.socketTextStream("localhost",4444) 15 | 16 | //a sliding window of size 15 seconds is created, sliding every 10 seconds 17 | //the CountTrigger makes the window fire each time it accumulates 5 elements, instead of firing on processing time 18 | val counts = socTextStream.flatMap{_.split("\\s")} 19 | .map { (_, 1) } 20 | .keyBy(0) 21 | .window(SlidingProcessingTimeWindows.of(Time.seconds(15),Time.seconds(10))) 22 | .trigger(CountTrigger.of(5)) 23 | .sum(1).setParallelism(4); 24 | 25 | counts.print() 26 | sev.execute() 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/StreamingWithRocksDb.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.streaming 2 | 3 | /** 4 | * Created by vviswanath on 3/20/17. 5 | */ 6 | object StreamingWithRocksDb { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/TimeSlidingWindow.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.streaming 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.api.scala._ 5 | import org.apache.flink.streaming.api.windowing.time.Time 6 | 7 | /** 8 | * A sliding window based on time. In contrast to a tumbling window, a sliding window can overlap with the previous one. 9 | */ 10 | object TimeSlidingWindow { 11 | def main(args: Array[String]) { 12 | val sev = StreamExecutionEnvironment.getExecutionEnvironment 13 | val socTextStream = sev.socketTextStream("localhost",4444) 14 | 15 | //the following window is triggered every 10 seconds, for the last 15 seconds of data 16 | //therefore there is an overlap between the data processed in one evaluation and the previous one. 
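//an illustrative timeline (not produced by this job): with size 15s and slide 10s,
//the evaluation at t=30s covers data from t=15s to t=30s and the one at t=40s covers t=25s to t=40s,
//so records arriving between t=25s and t=30s are counted in both evaluations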
17 | val counts = socTextStream.flatMap{_.split("\\s")} 18 | .map { (_, 1) } 19 | .keyBy(0) 20 | .timeWindow(Time.seconds(15),Time.seconds(10)) 21 | .sum(1).setParallelism(4); 22 | 23 | counts.print() 24 | sev.execute() 25 | } 26 | } -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/TimeTumblingWindow.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.streaming 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.api.scala._ 5 | import org.apache.flink.streaming.api.windowing.time.Time 6 | 7 | 8 | /** 9 | * A tumbling window based on time 10 | */ 11 | 12 | object TimeTumblingWindow { 13 | def main(args: Array[String]) { 14 | val sev = StreamExecutionEnvironment.getExecutionEnvironment 15 | val socTextStream = sev.socketTextStream("localhost",4444) 16 | 17 | //the following window is triggered every 15 seconds. 18 | val counts = socTextStream.flatMap{_.split("\\s")} 19 | .map { (_, 1) } 20 | .keyBy(0) 21 | .timeWindow(Time.seconds(15)) 22 | .sum(1).setParallelism(4); 23 | 24 | counts.print() 25 | sev.execute() 26 | } 27 | } -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/queryablestate/QueryClient.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.streaming.queryablestate 2 | 3 | import com.vishnu.flink.streaming.queryablestate.QuerybleStateStream.ClimateLog 4 | import org.apache.flink.api.common.functions.ReduceFunction 5 | import org.apache.flink.api.common.state.{ReducingState, ReducingStateDescriptor} 6 | import org.apache.flink.api.common.typeinfo.{BasicTypeInfo, TypeHint, TypeInformation} 7 | import org.apache.flink.api.common.{ExecutionConfig, JobID} 8 | import org.apache.flink.api.java.utils.ParameterTool 9 | import org.apache.flink.queryablestate.client.QueryableStateClient 10 | 11 | import scala.compat.java8.FutureConverters.toScala 12 | import scala.concurrent.{Await, ExecutionContext, duration} 13 | import scala.util.{Failure, Success} 14 | 15 | /** 16 | * Created by vviswanath on 3/13/17. 
17 | */ 18 | object QueryClient { 19 | 20 | def main(args: Array[String]) { 21 | 22 | val parameterTool = ParameterTool.fromArgs(args) 23 | val jobId = JobID.fromHexString(parameterTool.get("jobId")) 24 | val key = parameterTool.get("stateKey") 25 | 26 | val client = new QueryableStateClient("10.0.0.189", 9067) 27 | 28 | val reduceFunction = new ReduceFunction[ClimateLog] { 29 | override def reduce(c1: ClimateLog, c2: ClimateLog): ClimateLog = { 30 | c1.copy( 31 | temperature = c1.temperature + c2.temperature, 32 | humidity = c1.humidity + c2.humidity) 33 | } 34 | } 35 | 36 | val climateLogStateDesc = new ReducingStateDescriptor[ClimateLog]( 37 | "climate-record-state", 38 | reduceFunction, 39 | TypeInformation.of(new TypeHint[ClimateLog]() {}).createSerializer(new ExecutionConfig())) 40 | 41 | implicit val ec = ExecutionContext.global 42 | val resultFuture = toScala(client.getKvState (jobId, "queryable-climatelog-stream", key, new TypeHint[String]{}.getTypeInfo, climateLogStateDesc)) 43 | 44 | while(!resultFuture.isCompleted) { 45 | println("waiting...") 46 | Thread.sleep(1000) 47 | } 48 | 49 | resultFuture.onComplete(r ⇒ println(r.get)) 50 | resultFuture.onFailure(PartialFunction(println)) 51 | 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/queryablestate/QuerybleStateStream.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.streaming.queryablestate 2 | 3 | import org.apache.flink.api.common.functions.ReduceFunction 4 | import org.apache.flink.api.common.state.ReducingStateDescriptor 5 | import org.apache.flink.api.common.typeinfo.{TypeHint, TypeInformation} 6 | import org.apache.flink.api.java.tuple.Tuple 7 | import org.apache.flink.api.scala._ 8 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 9 | import org.apache.flink.streaming.api.windowing.time.Time 10 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow 11 | import org.apache.flink.util.Collector 12 | import org.apache.log4j.Logger 13 | 14 | /** 15 | * Created by vviswanath on 3/12/17. 
16 | */ 17 | 18 | object QuerybleStateStream { 19 | 20 | val logger = Logger.getLogger("QueryableStateStream") 21 | 22 | case class ClimateLog(country: String, state: String, temperature: Float, humidity: Float) 23 | object ClimateLog { 24 | def apply(line: String): Option[ClimateLog] = { 25 | val parts = line.split(",") 26 | try{ 27 | Some(ClimateLog(parts(0), parts(1), parts(2).toFloat, parts(3).toFloat)) 28 | } catch { 29 | case e: Exception => { 30 | logger.warn(s"Unable to parse line $line") 31 | None 32 | } 33 | } 34 | } 35 | } 36 | 37 | def main(args: Array[String]): Unit = { 38 | val senv = StreamExecutionEnvironment.getExecutionEnvironment 39 | senv.setParallelism(1) 40 | 41 | //this is required if org.apache.flink.api.scala._ is not imported 42 | //implicit val typeInfo = TypeInformation.of(classOf[ClimateLog]) 43 | 44 | val climateLogStream = senv.socketTextStream("localhost", 2222) 45 | .flatMap(ClimateLog(_)) 46 | 47 | val climateLogAgg = climateLogStream 48 | .name("climate-log-agg") 49 | .keyBy("country", "state") 50 | .timeWindow(Time.seconds(10)) 51 | .reduce(reduceFunction) 52 | 53 | val climateLogStateDesc = new ReducingStateDescriptor[ClimateLog]( 54 | "climate-record-state", 55 | reduceFunction, 56 | TypeInformation.of(new TypeHint[ClimateLog]() {})) 57 | 58 | 59 | val queryableStream = climateLogAgg 60 | .name("queryable-state") 61 | .keyBy("country") 62 | .asQueryableState("queryable-climatelog-stream", climateLogStateDesc) 63 | 64 | climateLogAgg.print() 65 | 66 | senv.execute("Queryablestate example streaming job") 67 | } 68 | 69 | val reduceFunction = new ReduceFunction[ClimateLog] { 70 | override def reduce(c1: ClimateLog, c2: ClimateLog): ClimateLog = { 71 | c1.copy( 72 | temperature = c1.temperature + c2.temperature, 73 | humidity=c1.humidity + c2.humidity) 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/sessionwindows/SessionWindowExampleDummyDataProd.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.streaming.sessionwindows 2 | 3 | import java.io._ 4 | import java.net.ServerSocket 5 | 6 | /** 7 | * Created by vviswanath on 6/8/17. 
8 | * 9 | * Data producer for ttesting SessionWindowExample.scala 10 | */ 11 | object SessionWindowExampleDummyDataProd { 12 | 13 | def main(args: Array[String]): Unit = { 14 | 15 | val serverSocket = new ServerSocket(4444) 16 | val clientSocket = serverSocket.accept 17 | val out = new PrintWriter(clientSocket.getOutputStream, true) 18 | 19 | /* 20 | //0th second 21 | out.write(s"${System.currentTimeMillis},user2,recommendation\n") 22 | out.flush() 23 | Thread.sleep(1000)//1st second 24 | out.write(s"${System.currentTimeMillis},user1,recommendation\n") 25 | out.flush() 26 | Thread.sleep(1000) //2nd second 27 | out.write(s"${System.currentTimeMillis},user1,ad\n") 28 | out.flush() 29 | Thread.sleep(4000) //6th second 30 | out.write(s"${System.currentTimeMillis - 5000},user2,ad\n") //event time 3rd second 31 | out.flush() 32 | Thread.sleep(1000) //7th second 33 | out.write(s"${System.currentTimeMillis},user2,recommendation\n") 34 | out.flush() 35 | Thread.sleep(2000) //9th second 36 | out.write(s"${System.currentTimeMillis},user1,recommendation\n") 37 | out.flush() 38 | Thread.sleep(4000) 39 | out.close() 40 | */ 41 | 42 | //0th second 43 | out.write(s"${System.currentTimeMillis},user1,recommendation\n") 44 | out.flush() 45 | Thread.sleep(1000)//1st second 46 | out.write(s"${System.currentTimeMillis},user1,recommendation\n") 47 | out.flush() 48 | Thread.sleep(2001)//2nd second 49 | //this message is sent just to advance watermark, to show how AllowedLateness can cause a Window to be evaluated multiple times 50 | out.write(s"${System.currentTimeMillis},user2,recommendation\n") 51 | out.flush() 52 | Thread.sleep(2500) //4.5th second 53 | out.write(s"${System.currentTimeMillis - 3500},user1,ad\n") //event time 3rd second 54 | out.flush() 55 | Thread.sleep(2500) //7th second 56 | out.write(s"${System.currentTimeMillis},user1,recommendation\n") 57 | out.flush() 58 | Thread.sleep(4000) 59 | out.close() 60 | } 61 | 62 | 63 | } 64 | -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/windowtimer/ProccWindowFun.scala: -------------------------------------------------------------------------------- 1 | //package com.vishnu.flink.streaming.windowtimer 2 | // 3 | //import java.lang.Iterable 4 | // 5 | //import org.apache.flink.api.java.tuple.Tuple 6 | //import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction 7 | //import org.apache.flink.streaming.api.windowing.windows.{TimeWindow, Window} 8 | //import org.apache.flink.util.Collector 9 | // 10 | ///** 11 | // * Created by vviswanath on 4/16/17. 12 | // */ 13 | //class ProcWindowFun[IN, OUT, K, W <: Window] extends ProcessWindowFunction[String, String, Tuple, TimeWindow] { 14 | // override def process(key: Tuple, context: ProcessWindowFunction[String, String, Tuple, TimeWindow]#Context, iterable: Iterable[String], collector: Collector[String]): Unit = { 15 | // context.registerEventTimeTimer(100) 16 | // } 17 | // 18 | // override def onTimer(t: Long, context: ProcessWindowFunction[String, String, Tuple, TimeWindow]#OnTimerContext, out: Collector[String]): Unit = { 19 | // println("Timer triggered") 20 | // } 21 | //} 22 | -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/util/ParameterParser.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.util 2 | 3 | /** 4 | * Created by vviswanath on 3/2/18. 
5 | * A very simple arg parser. Expects args to be in the format Array("--key1", "value1", "--key2", "value2") 6 | * everything else is dropped. 7 | * e.g, key in ("key", "value") is dropped since key doesn't have "--" as prefix 8 | * everything after key1 in ("--key1", "value1", "--key2", "--key3", "value3") is dropped since key2 doesn't have a corresponding value2. 9 | * Returns a map of key → value 10 | */ 11 | object ParameterParser { 12 | 13 | def parse(args: Array[String]): Map[String, String] = { 14 | val Param = "--(.+)".r 15 | args.grouped(2).flatMap(l ⇒ 16 | if (l.length == 2) (l(0), l(1)) match { 17 | case (Param(key), value) ⇒ Some(key → value) 18 | case _ ⇒ None 19 | } 20 | else None 21 | ).toMap 22 | } 23 | } -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/util/RandomServerEventsKafkaProducer.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.util 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 6 | 7 | import scala.annotation.tailrec 8 | import scala.util.Random 9 | 10 | /** 11 | * Created by vviswanath on 1/15/18. 12 | */ 13 | object RandomServerEventsKafkaProducer { 14 | 15 | def eventType: String = { 16 | Random.nextInt(3) match { 17 | case 0 ⇒ "cpu-usage" 18 | case 1 ⇒ "mem-usage" 19 | case 2 ⇒ "disk-usage" 20 | } 21 | } 22 | 23 | def serverIp: String = { 24 | s"192.168.23.${Random.nextInt(10)}" 25 | } 26 | 27 | def value: Double = { 28 | Random.nextDouble * 100 29 | } 30 | 31 | def now(possibleDelay: Boolean, maxDelay: Long): Long = { 32 | val now = System.currentTimeMillis() 33 | if (possibleDelay && Random.nextBoolean()) now - Random.nextLong % maxDelay 34 | else now 35 | } 36 | 37 | //returns a key,value 38 | def nextServerEvent: (String, String) = { 39 | 40 | val event = (serverIp, s"${now(possibleDelay = true, 10000)},$eventType,$serverIp,$value") 41 | print(s"Produced event $event\n") 42 | event 43 | } 44 | 45 | def main(args: Array[String]): Unit = { 46 | 47 | val parameters = ParameterParser.parse(args) 48 | 49 | val props = new Properties() 50 | props.put("bootstrap.servers", parameters.getOrElse("kafka-bootstrap-server", "localhost:9092")) 51 | props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer") 52 | props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer") 53 | 54 | val producer = new KafkaProducer[String, String](props) 55 | val interval = 10 56 | val topic = parameters("topic") 57 | val numRecsToProduce: Option[Int] = None //None = infinite 58 | 59 | 60 | @tailrec 61 | def produceRecord(numRecToProduce: Option[Int]): Unit = { 62 | def generateRecord(topic: String, f: ⇒ (String, String)): ProducerRecord[String, String] = { 63 | val event = f 64 | new ProducerRecord[String, String](topic, event._1, event._2) 65 | } 66 | 67 | numRecToProduce match { 68 | case Some(x) if x > 0 ⇒ 69 | producer.send(generateRecord(topic, nextServerEvent)) 70 | Thread.sleep(interval) 71 | produceRecord(Some(x - 1)) 72 | 73 | case None ⇒ 74 | producer.send(generateRecord(topic, nextServerEvent)) 75 | Thread.sleep(interval) 76 | produceRecord(None) 77 | 78 | case _ ⇒ 79 | } 80 | } 81 | 82 | produceRecord(numRecsToProduce) 83 | 84 | 85 | } 86 | 87 | } 88 | -------------------------------------------------------------------------------- /mapreduce/.gitignore: 
-------------------------------------------------------------------------------- 1 | /target/ 2 | -------------------------------------------------------------------------------- /mapreduce/ArrayWritableExample.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.ArrayWritable; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.LongWritable; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapred.FileInputFormat; 11 | import org.apache.hadoop.mapred.FileOutputFormat; 12 | import org.apache.hadoop.mapred.JobClient; 13 | import org.apache.hadoop.mapred.JobConf; 14 | import org.apache.hadoop.mapred.MapReduceBase; 15 | import org.apache.hadoop.mapred.Mapper; 16 | import org.apache.hadoop.mapred.OutputCollector; 17 | import org.apache.hadoop.mapred.Reporter; 18 | import org.apache.hadoop.mapred.TextInputFormat; 19 | import org.apache.hadoop.mapred.TextOutputFormat; 20 | import org.apache.hadoop.mapred.lib.IdentityReducer; 21 | 22 | /** 23 | * An example implementing ArrayWritable 24 | * @author vishnu 25 | * 26 | */ 27 | 28 | public class ArrayWritableExample { 29 | 30 | private static class MyMapper extends MapReduceBase implements 31 | Mapper { 32 | @Override 33 | public void map(LongWritable dummKey, Text value, 34 | OutputCollector output, Reporter reporter) 35 | throws IOException { 36 | String line = value.toString(); 37 | String[] parts = line.split(" "); 38 | IntArrayWritable arr = new IntArrayWritable(); 39 | IntWritable[] intArr = new IntWritable[parts.length - 1]; 40 | if (parts.length >= 2) { 41 | Text key = new Text(parts[0]); 42 | for (int i = 1; i < parts.length; i++) { 43 | IntWritable val = new IntWritable( 44 | Integer.parseInt(parts[i])); 45 | intArr[i - 1] = val; 46 | } 47 | arr.set(intArr); 48 | System.out.println("key "+key.toString()+" arr"+arr.toString()); 49 | output.collect(key, arr); 50 | } 51 | } 52 | } 53 | 54 | private static class IntArrayWritable extends ArrayWritable { 55 | 56 | public IntArrayWritable() { 57 | super(IntWritable.class); 58 | } 59 | 60 | @Override 61 | public String toString() { 62 | String[] arr = super.toStrings(); 63 | String result = ""; 64 | for ( String str:arr) { 65 | result+=str+" "; 66 | } 67 | return result; 68 | } 69 | 70 | } 71 | 72 | public static void main(String[] args) throws IOException { 73 | 74 | JobConf conf = new JobConf(ArrayWritableExample.class); 75 | conf.setJobName("array writable"); 76 | 77 | conf.setOutputKeyClass(Text.class); 78 | conf.setOutputValueClass(IntArrayWritable.class); 79 | 80 | conf.setMapperClass(MyMapper.class); 81 | conf.setReducerClass(IdentityReducer.class); 82 | 83 | conf.setInputFormat(TextInputFormat.class); 84 | conf.setOutputFormat(TextOutputFormat.class); 85 | 86 | FileInputFormat.setInputPaths(conf, new Path(args[0])); 87 | FileOutputFormat.setOutputPath(conf, new Path(args[1])); 88 | 89 | JobClient.runJob(conf); 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /mapreduce/DistributeCache.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.filecache.DistributedCache; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import 
org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.LongWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapred.FileInputFormat; 12 | import org.apache.hadoop.mapred.FileOutputFormat; 13 | import org.apache.hadoop.mapred.JobClient; 14 | import org.apache.hadoop.mapred.JobConf; 15 | import org.apache.hadoop.mapred.MapReduceBase; 16 | import org.apache.hadoop.mapred.Mapper; 17 | import org.apache.hadoop.mapred.OutputCollector; 18 | import org.apache.hadoop.mapred.Reporter; 19 | import org.apache.hadoop.mapred.TextInputFormat; 20 | import org.apache.hadoop.mapred.TextOutputFormat; 21 | 22 | /** 23 | * Shows how a file can be loaded to dist cache. and how it can be used in mapper. 24 | * @author vishnu 25 | * 26 | */ 27 | 28 | public class DistributeCache { 29 | 30 | private static class MyMapper extends MapReduceBase implements Mapper { 31 | 32 | private Path[] localFiles; 33 | 34 | @Override 35 | public void map(LongWritable key, Text value, 36 | OutputCollector output, Reporter reporter) 37 | throws IOException { 38 | for(Path path : localFiles) { 39 | output.collect(new Text(path.getName()), new IntWritable(1)); 40 | } 41 | 42 | } 43 | 44 | @Override 45 | public void configure(JobConf conf) { 46 | try { 47 | localFiles = DistributedCache.getLocalCacheFiles(conf); 48 | } catch (IOException e) { 49 | e.printStackTrace(); 50 | } 51 | } 52 | 53 | } 54 | 55 | public static void main(String[] args) throws IOException { 56 | JobConf conf = new JobConf(DistributeCache.class); 57 | conf.setJobName("distcache"); 58 | 59 | conf.setOutputKeyClass(Text.class); 60 | conf.setOutputValueClass(IntWritable.class); 61 | 62 | conf.setMapperClass(MyMapper.class); 63 | /* by default identity reducer will be called 64 | * conf.setReducerClass(MyReducer.class);*/ 65 | conf.setInputFormat(TextInputFormat.class); 66 | conf.setOutputFormat(TextOutputFormat.class); 67 | FileSystem fs = FileSystem.get(conf); 68 | DistributedCache.addFileToClassPath(new Path(args[2]), conf, fs); 69 | FileInputFormat.setInputPaths(conf, new Path(args[0])); 70 | FileOutputFormat.setOutputPath(conf, new Path(args[1])); 71 | 72 | JobClient.runJob(conf); 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /mapreduce/ImageReader.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples; 2 | 3 | import java.io.ByteArrayOutputStream; 4 | import java.io.IOException; 5 | import java.net.URI; 6 | 7 | import org.apache.hadoop.fs.FSDataInputStream; 8 | import org.apache.hadoop.fs.FileSystem; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.BytesWritable; 11 | import org.apache.hadoop.io.IOUtils; 12 | import org.apache.hadoop.io.IntWritable; 13 | import org.apache.hadoop.io.LongWritable; 14 | import org.apache.hadoop.io.Text; 15 | import org.apache.hadoop.mapred.FileInputFormat; 16 | import org.apache.hadoop.mapred.FileOutputFormat; 17 | import org.apache.hadoop.mapred.JobClient; 18 | import org.apache.hadoop.mapred.JobConf; 19 | import org.apache.hadoop.mapred.MapReduceBase; 20 | import org.apache.hadoop.mapred.Mapper; 21 | import org.apache.hadoop.mapred.OutputCollector; 22 | import org.apache.hadoop.mapred.Reporter; 23 | import org.apache.hadoop.mapred.TextInputFormat; 24 | import org.apache.hadoop.mapred.TextOutputFormat; 25 | 26 | /** 27 | * Reads an image file in the hdfs and converts it to bytes and output 28 | * The input should contain the 
image_name,image_path. 29 | * @author vishnu 30 | * 31 | */ 32 | public class ImageReader { 33 | 34 | private static class ImageMapper extends MapReduceBase implements 35 | Mapper { 36 | 37 | private JobConf localconf; 38 | 39 | @Override 40 | public void map(LongWritable offset, Text value, 41 | OutputCollector output, Reporter reporter) 42 | throws IOException { 43 | 44 | String line = value.toString(); 45 | String[] parts = line.split(" "); 46 | Text key = new Text(parts[0]); 47 | String path = parts[1]; 48 | FileSystem fs = FileSystem.get(URI.create(path), localconf); 49 | FSDataInputStream fsin = null; 50 | 51 | try { 52 | fsin = fs.open(new Path(path)); 53 | ByteArrayOutputStream bout = new ByteArrayOutputStream(); 54 | byte[] buffer = new byte[1024 * 1024]; 55 | 56 | while (fsin.read(buffer, 0, buffer.length) >= 0) { 57 | bout.write(buffer); 58 | } 59 | output.collect(key, new BytesWritable(bout.toByteArray())); 60 | } finally { 61 | IOUtils.closeStream(fsin); 62 | } 63 | 64 | } 65 | 66 | @Override 67 | public void configure(JobConf conf) { 68 | localconf = conf; 69 | } 70 | 71 | } 72 | 73 | public static void main(String[] args) throws IOException { 74 | JobConf conf = new JobConf(ImageReader.class); 75 | conf.setJobName("imagereader"); 76 | 77 | conf.setOutputKeyClass(Text.class); 78 | conf.setOutputValueClass(BytesWritable.class); 79 | 80 | conf.setMapperClass(ImageMapper.class); 81 | 82 | conf.setInputFormat(TextInputFormat.class); 83 | conf.setOutputFormat(TextOutputFormat.class); 84 | 85 | FileInputFormat.setInputPaths(conf, new Path(args[0])); 86 | FileOutputFormat.setOutputPath(conf, new Path(args[1])); 87 | 88 | JobClient.runJob(conf); 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /mapreduce/LetterWordMapper.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | import java.util.StringTokenizer; 6 | 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.*; 9 | import org.apache.hadoop.mapred.*; 10 | 11 | 12 | public class LetterWordMapper { 13 | 14 | private static class MyMapper extends MapReduceBase implements 15 | Mapper { 16 | private Text word = new Text(); 17 | 18 | public void map(LongWritable key, Text value, 19 | OutputCollector output, Reporter reporter) 20 | throws IOException { 21 | String line = value.toString(); 22 | StringTokenizer tokenizer = new StringTokenizer(line); 23 | while (tokenizer.hasMoreTokens()) { 24 | String strWord = tokenizer.nextToken(); 25 | word.set(strWord.charAt(0)+""); 26 | if (!strWord.trim().equals("")) { 27 | System.out.println("emitting word "+strWord); 28 | output.collect(word, new Text(strWord)); 29 | } 30 | } 31 | } 32 | } 33 | 34 | private static class MyReducer extends MapReduceBase implements 35 | Reducer { 36 | public void reduce(Text key, Iterator values, 37 | OutputCollector output, Reporter reporter) 38 | throws IOException { 39 | System.out.println("key is "+key.toString()); 40 | String result = ""; 41 | while(values.hasNext()) { 42 | String next = values.next().toString(); 43 | System.out.println("next value is "+next); 44 | result+=next+","; 45 | } 46 | result = result.substring(0,result.length()-1); 47 | System.out.println("result is "+result); 48 | output.collect(key, new Text(result)); 49 | } 50 | } 51 | 52 | public static void main(String[] args) throws Exception { 53 | JobConf conf = new 
JobConf(WordCount.class); 54 | conf.setJobName("lettemapper"); 55 | 56 | conf.setOutputKeyClass(Text.class); 57 | conf.setOutputValueClass(Text.class); 58 | 59 | conf.setMapperClass(MyMapper.class); 60 | conf.setReducerClass(MyReducer.class); 61 | 62 | conf.setInputFormat(TextInputFormat.class); 63 | conf.setOutputFormat(TextOutputFormat.class); 64 | 65 | FileInputFormat.setInputPaths(conf, new Path(args[0])); 66 | FileOutputFormat.setOutputPath(conf, new Path(args[1])); 67 | 68 | JobClient.runJob(conf); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /mapreduce/MultiInputPath.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapred.FileOutputFormat; 10 | import org.apache.hadoop.mapred.JobClient; 11 | import org.apache.hadoop.mapred.JobConf; 12 | import org.apache.hadoop.mapred.MapReduceBase; 13 | import org.apache.hadoop.mapred.Mapper; 14 | import org.apache.hadoop.mapred.OutputCollector; 15 | import org.apache.hadoop.mapred.Reporter; 16 | import org.apache.hadoop.mapred.TextInputFormat; 17 | import org.apache.hadoop.mapred.TextOutputFormat; 18 | import org.apache.hadoop.mapred.lib.IdentityReducer; 19 | import org.apache.hadoop.mapred.lib.MultipleInputs; 20 | 21 | /** 22 | * Move to new api... 23 | * @author vishnu 24 | * 25 | */ 26 | 27 | public class MultiInputPath { 28 | 29 | private static class MyMapper extends MapReduceBase implements Mapper { 30 | 31 | @Override 32 | public void map(LongWritable key, Text value, 33 | OutputCollector output, Reporter reporter) 34 | throws IOException { 35 | System.out.println("In mapper 1"); 36 | output.collect(key, new IntWritable(value.toString().length())); 37 | } 38 | 39 | } 40 | 41 | 42 | private static class MyMapper2 extends MapReduceBase implements Mapper { 43 | 44 | @Override 45 | public void map(LongWritable key, Text value, 46 | OutputCollector output, Reporter reporter) 47 | throws IOException { 48 | System.out.println("In mapper 2"); 49 | output.collect(key, new IntWritable(value.toString().length())); 50 | } 51 | 52 | } 53 | public static void main(String[] args) throws IOException { 54 | 55 | JobConf conf = new JobConf(MultiInputPath.class); 56 | conf.setJobName("multi"); 57 | conf.setMapperClass(MyMapper.class); 58 | conf.setReducerClass(IdentityReducer.class); 59 | conf.setInputFormat(TextInputFormat.class); 60 | conf.setOutputFormat(TextOutputFormat.class); 61 | conf.setOutputKeyClass(LongWritable.class); 62 | conf.setOutputValueClass(IntWritable.class); 63 | MultipleInputs.addInputPath(conf, new Path(args[0]), TextInputFormat.class,MyMapper.class); 64 | MultipleInputs.addInputPath(conf,new Path(args[1]),TextInputFormat.class,MyMapper2.class); 65 | FileOutputFormat.setOutputPath(conf,new Path(args[2])); 66 | 67 | JobClient.runJob(conf); 68 | 69 | 70 | } 71 | } 72 | 73 | 74 | -------------------------------------------------------------------------------- /mapreduce/README.md: -------------------------------------------------------------------------------- 1 | This folder has examples for 2 | 3 | 1. basic mapreduce programs 4 | 2. custom types 5 | 3. mapreduce chaining examples 6 | 4. joins using mapreduce 7 | 5. 
incremental aggergation using datafu 8 | -------------------------------------------------------------------------------- /mapreduce/SequenceFileTest.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.SequenceFile; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapred.JobConf; 11 | 12 | public class SequenceFileTest { 13 | 14 | public static void main(String[] args) throws IOException { 15 | 16 | JobConf conf = new JobConf(); 17 | Path sqFile = new Path(args[0]); 18 | FileSystem fs = sqFile.getFileSystem(conf); 19 | /*for local files 20 | * FileSystem fs = FileSystem.getLocal(conf); 21 | Path sqFile = new Path(args[0]);*/ 22 | 23 | 24 | SequenceFile.Writer sqWriter = SequenceFile.createWriter(fs,conf,sqFile, 25 | Text.class, 26 | LongWritable.class); 27 | sqWriter.append(new Text("key1"),new LongWritable(1)); 28 | sqWriter.close(); 29 | SequenceFile.Reader sqReader = new SequenceFile.Reader(fs,sqFile,conf); 30 | 31 | Text key = new Text(); 32 | LongWritable value = new LongWritable(); 33 | sqReader.next(key,value); 34 | 35 | System.out.println(key.toString()+" - "+value.toString()); 36 | 37 | sqReader.close(); 38 | 39 | 40 | 41 | 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /mapreduce/WordCount.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples; 2 | /** 3 | * Simple word count program 4 | */ 5 | import java.io.IOException; 6 | import java.util.*; 7 | 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.*; 10 | import org.apache.hadoop.mapred.*; 11 | 12 | public class WordCount { 13 | 14 | private static class MyMapper extends MapReduceBase implements Mapper { 15 | private final static IntWritable one = new IntWritable(1); 16 | private Text word = new Text(); 17 | 18 | public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException { 19 | String line = value.toString(); 20 | StringTokenizer tokenizer = new StringTokenizer(line); 21 | while(tokenizer.hasMoreTokens()){ 22 | word.set(tokenizer.nextToken()); 23 | output.collect(word, one); 24 | } 25 | } 26 | } 27 | 28 | private static class MyReducer extends MapReduceBase implements Reducer { 29 | public void reduce(Text key, Iterator values, OutputCollector output, Reporter reporter) throws IOException { 30 | int sum = 0; 31 | while (values.hasNext()) { 32 | sum += values.next().get(); 33 | } 34 | output.collect(key, new IntWritable(sum)); 35 | } 36 | } 37 | 38 | public static void main(String[] args) throws Exception { 39 | JobConf conf = new JobConf(WordCount.class); 40 | conf.setJobName("wordcount"); 41 | 42 | conf.setOutputKeyClass(Text.class); 43 | conf.setOutputValueClass(IntWritable.class); 44 | 45 | conf.setMapperClass(MyMapper.class); 46 | conf.setCombinerClass(MyReducer.class); 47 | conf.setReducerClass(MyReducer.class); 48 | 49 | conf.setInputFormat(TextInputFormat.class); 50 | conf.setOutputFormat(TextOutputFormat.class); 51 | 52 | FileInputFormat.setInputPaths(conf, new Path(args[0])); 53 | FileOutputFormat.setOutputPath(conf, new Path(args[1])); 54 | 55 | JobClient.runJob(conf); 56 | } 57 | } 
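Several files in this folder (e.g. MultiInputPath.java) carry a "Move to new api..." note, while WordCount.java above uses the old org.apache.hadoop.mapred API. A minimal sketch of the same word count on the newer org.apache.hadoop.mapreduce API follows; the class name NewApiWordCount is illustrative and not part of this repository:

    // Sketch only: word count using the newer org.apache.hadoop.mapreduce API.
    import java.io.IOException;
    import java.util.StringTokenizer;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class NewApiWordCount {

      public static class TokenMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
          // emit (word, 1) for every token in the line
          StringTokenizer tokenizer = new StringTokenizer(value.toString());
          while (tokenizer.hasMoreTokens()) {
            word.set(tokenizer.nextToken());
            context.write(word, ONE);
          }
        }
      }

      public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
          // sum the counts for each word
          int sum = 0;
          for (IntWritable value : values) {
            sum += value.get();
          }
          context.write(key, new IntWritable(sum));
        }
      }

      public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "wordcount-new-api");
        job.setJarByClass(NewApiWordCount.class);
        job.setMapperClass(TokenMapper.class);
        job.setCombinerClass(SumReducer.class);
        job.setReducerClass(SumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
      }
    }

The main differences from the old-API version are Job.getInstance in place of JobConf/JobClient and the Context object replacing OutputCollector and Reporter.
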
-------------------------------------------------------------------------------- /mapreduce/chaining/ChainMapperExample.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.chaining; 2 | 3 | /** 4 | * This example demonstrated how ChainMapper class can be used to chain mappers 5 | * Each mapper in the chain will be called using the output of the prev one. 6 | * Output key, Output value of the first mapper must mach with the Input key 7 | * and Input value of the second mapper. 8 | * 9 | * Output of a mapper will be used as the input of the next mapper. And again another 10 | * output file/folder will be created. So its better to delete the existing output 11 | * from the driver. 12 | * 13 | * Reducer will be called after all the mappers are called. 14 | * 15 | * TO-DO: check if reducers can be chained. 16 | */ 17 | import java.io.IOException; 18 | 19 | import org.apache.hadoop.fs.FileSystem; 20 | import org.apache.hadoop.fs.Path; 21 | import org.apache.hadoop.io.IntWritable; 22 | import org.apache.hadoop.io.LongWritable; 23 | import org.apache.hadoop.io.Text; 24 | import org.apache.hadoop.mapred.FileInputFormat; 25 | import org.apache.hadoop.mapred.FileOutputFormat; 26 | import org.apache.hadoop.mapred.JobClient; 27 | import org.apache.hadoop.mapred.JobConf; 28 | import org.apache.hadoop.mapred.TextInputFormat; 29 | import org.apache.hadoop.mapred.TextOutputFormat; 30 | import org.apache.hadoop.mapred.lib.ChainMapper; 31 | import org.apache.hadoop.mapred.lib.ChainReducer; 32 | 33 | public class ChainMapperExample { 34 | public static void main(String[] args) throws IOException { 35 | 36 | /* 37 | * This conf is used as a ref. Set only the input fileformat and output fileformat 38 | * 39 | */ 40 | JobConf conf1 = new JobConf(WordCount.class); 41 | conf1.setJobName("wordcount"); 42 | 43 | conf1.setInputFormat(TextInputFormat.class); 44 | conf1.setOutputFormat(TextOutputFormat.class); 45 | 46 | FileInputFormat.setInputPaths(conf1,new Path(args[0])); 47 | Path output = new Path(args[1]); 48 | FileSystem fileSystem = FileSystem.get(conf1); 49 | fileSystem.delete(output,true); 50 | FileOutputFormat.setOutputPath(conf1,output); 51 | 52 | 53 | /* 54 | * Local job conf files 55 | */ 56 | JobConf mapConf = new JobConf(false); 57 | JobConf reduceConf = new JobConf(false); 58 | 59 | /* 60 | * First argument is the global conf file we already created 61 | * Second is the Mapper/Reducer class we gona use 62 | * Third,fourth,fifth and sixth arguments are mapper/reducer inputkey inputvalue,outputkey and outputvalue respectively 63 | */ 64 | ChainMapper.addMapper(conf1,WordCount.WordCountMapper.class,LongWritable.class,Text.class,Text.class,IntWritable.class,true,mapConf); 65 | ChainMapper.addMapper(conf1,ToUpperCase.class,Text.class,IntWritable.class,Text.class,IntWritable.class,true,mapConf); 66 | ChainReducer.setReducer(conf1,WordCount.WordCountReducer.class,Text.class,IntWritable.class,Text.class,IntWritable.class,true,reduceConf); 67 | 68 | JobClient.runJob(conf1); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /mapreduce/chaining/ChainingJobControl.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.chaining; 2 | 3 | /** 4 | * Chains jobs using JobControl. 5 | * Steps: 6 | * 1.Create jobs 7 | * 2.Create a job control. 8 | * 3.Add jobs to the job control. 
9 | * 4.Add dependecy of jobs 10 | * 5.Start the job using JobControl.run() 11 | * 12 | */ 13 | 14 | import java.io.IOException; 15 | 16 | import org.apache.hadoop.fs.Path; 17 | import org.apache.hadoop.io.IntWritable; 18 | import org.apache.hadoop.io.Text; 19 | import org.apache.hadoop.mapred.FileInputFormat; 20 | import org.apache.hadoop.mapred.FileOutputFormat; 21 | import org.apache.hadoop.mapred.JobConf; 22 | import org.apache.hadoop.mapred.TextInputFormat; 23 | import org.apache.hadoop.mapred.TextOutputFormat; 24 | import org.apache.hadoop.mapred.jobcontrol.Job; 25 | import org.apache.hadoop.mapred.jobcontrol.JobControl; 26 | 27 | public class ChainingJobControl { 28 | 29 | public static void main(String[] args) throws IOException { 30 | JobConf conf1 = new JobConf(WordCount.class); 31 | conf1.setJobName("wordcount"); 32 | conf1.setOutputKeyClass(Text.class); 33 | conf1.setOutputValueClass(IntWritable.class); 34 | conf1.setMapperClass(WordCount.WordCountMapper.class); 35 | conf1.setCombinerClass(WordCount.WordCountReducer.class); 36 | conf1.setReducerClass(WordCount.WordCountReducer.class); 37 | conf1.setInputFormat(TextInputFormat.class); 38 | conf1.setOutputFormat(TextOutputFormat.class); 39 | FileInputFormat.setInputPaths(conf1,new Path(args[0])); 40 | Path intermediate = new Path("intermediate"); 41 | FileOutputFormat.setOutputPath(conf1,intermediate); 42 | Job job1 = new Job(conf1); 43 | System.out.println("job 1 conf created"); 44 | 45 | JobConf conf2 = new JobConf(WordCount.class); 46 | conf2.setOutputKeyClass(Text.class); 47 | conf2.setOutputValueClass(IntWritable.class); 48 | conf2.setMapperClass(LetterCount.LetterCountMapper.class); 49 | conf2.setCombinerClass(LetterCount.LetterCountReducer.class); 50 | conf2.setReducerClass(LetterCount.LetterCountReducer.class); 51 | conf2.setInputFormat(TextInputFormat.class); 52 | conf2.setOutputFormat(TextOutputFormat.class); 53 | FileInputFormat.setInputPaths(conf2,intermediate); 54 | FileOutputFormat.setOutputPath(conf2,new Path(args[1])); 55 | Job job2 = new Job(conf2); 56 | System.out.println("job 2 conf created"); 57 | 58 | JobControl jbCntrol = new JobControl("cntroller"); 59 | jbCntrol.addJob(job1); 60 | jbCntrol.addJob(job2); 61 | job2.addDependingJob(job1); 62 | System.out.println("dependency added"); 63 | jbCntrol.run(); 64 | System.out.println("Done"); 65 | } 66 | 67 | } 68 | -------------------------------------------------------------------------------- /mapreduce/chaining/ChainingSimple.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.chaining; 2 | 3 | /** 4 | * Simple job chaining example using more than one JobConf. 
5 | * Chains jobs by calling JobClient.runJob(conf) in the order required 6 | */ 7 | import java.io.IOException; 8 | 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.IntWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapred.FileInputFormat; 13 | import org.apache.hadoop.mapred.FileOutputFormat; 14 | import org.apache.hadoop.mapred.JobClient; 15 | import org.apache.hadoop.mapred.JobConf; 16 | import org.apache.hadoop.mapred.TextInputFormat; 17 | import org.apache.hadoop.mapred.TextOutputFormat; 18 | 19 | public class ChainingSimple { 20 | 21 | public static void main(String[] args) throws IOException { 22 | 23 | JobConf conf1 = new JobConf(WordCount.class); 24 | conf1.setJobName("wordcount"); 25 | 26 | conf1.setOutputKeyClass(Text.class); 27 | conf1.setOutputValueClass(IntWritable.class); 28 | conf1.setMapperClass(WordCount.WordCountMapper.class); 29 | conf1.setCombinerClass(WordCount.WordCountReducer.class); 30 | conf1.setReducerClass(WordCount.WordCountReducer.class); 31 | conf1.setInputFormat(TextInputFormat.class); 32 | conf1.setOutputFormat(TextOutputFormat.class); 33 | FileInputFormat.setInputPaths(conf1,new Path(args[0])); 34 | Path intermediate = new Path("intermediate"); 35 | FileOutputFormat.setOutputPath(conf1,intermediate); 36 | JobClient.runJob(conf1); 37 | 38 | 39 | JobConf conf2 = new JobConf(WordCount.class); 40 | conf2.setOutputKeyClass(Text.class); 41 | conf2.setOutputValueClass(IntWritable.class); 42 | conf2.setMapperClass(LetterCount.LetterCountMapper.class); 43 | conf2.setCombinerClass(LetterCount.LetterCountReducer.class); 44 | conf2.setReducerClass(LetterCount.LetterCountReducer.class); 45 | conf2.setInputFormat(TextInputFormat.class); 46 | conf2.setOutputFormat(TextOutputFormat.class); 47 | FileInputFormat.setInputPaths(conf2,intermediate); 48 | FileOutputFormat.setOutputPath(conf2,new Path(args[1])); 49 | JobClient.runJob(conf2); 50 | 51 | System.out.println("Done"); 52 | 53 | 54 | 55 | } 56 | } 57 | 58 | 59 | -------------------------------------------------------------------------------- /mapreduce/chaining/LetterCount.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.chaining; 2 | /** 3 | * Take a word and count as input 4 | * Emit the count for each letter in the word 5 | * In effect generates a value for each letter which = number of times the letter occurs * number of words in which the word occur 6 | */ 7 | import java.io.IOException; 8 | import java.util.Arrays; 9 | import java.util.HashMap; 10 | import java.util.Iterator; 11 | import java.util.Map; 12 | import java.util.StringTokenizer; 13 | 14 | import org.apache.hadoop.io.IntWritable; 15 | import org.apache.hadoop.io.LongWritable; 16 | import org.apache.hadoop.io.Text; 17 | import org.apache.hadoop.mapred.MapReduceBase; 18 | import org.apache.hadoop.mapred.Mapper; 19 | import org.apache.hadoop.mapred.OutputCollector; 20 | import org.apache.hadoop.mapred.Reducer; 21 | import org.apache.hadoop.mapred.Reporter; 22 | 23 | 24 | public class LetterCount { 25 | 26 | 27 | public static class LetterCountMapper extends MapReduceBase implements Mapper { 28 | 29 | @Override 30 | public void map(LongWritable key, Text value, 31 | OutputCollector output, Reporter reporter) 32 | throws IOException { 33 | 34 | String line = value.toString(); 35 | System.out.println("line is "+line); 36 | StringTokenizer tokenizer = new StringTokenizer(line); 37 | boolean first = true; 38 | String word = ""; 
39 | int sum = 0; 40 | while(tokenizer.hasMoreTokens()) { 41 | String next = tokenizer.nextToken(); 42 | if (first) { 43 | first = false; 44 | word = next; 45 | }else { 46 | sum += Integer.parseInt(next); 47 | } 48 | } 49 | System.out.println("word is "+word); 50 | System.out.println("sum is "+sum); 51 | 52 | for(char ch : word.toCharArray()) { 53 | output.collect(new Text(ch+""),new IntWritable(sum)); 54 | } 55 | } 56 | } 57 | 58 | public static class LetterCountReducer extends MapReduceBase implements Reducer { 59 | 60 | @Override 61 | public void reduce(Text key, Iterator values, 62 | OutputCollector output, Reporter reorter) 63 | throws IOException { 64 | 65 | System.out.println("In reducer of letter count"); 66 | int sum = 0; 67 | while(values.hasNext()) { 68 | int value = values.next().get(); 69 | System.out.println(value); 70 | sum += value; 71 | } 72 | output.collect(key,new IntWritable(sum)); 73 | } 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /mapreduce/chaining/ToUpperCase.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.chaining; 2 | 3 | 4 | /** 5 | * A mapper which converts the key to upper case 6 | */ 7 | import java.io.IOException; 8 | 9 | import org.apache.hadoop.io.IntWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapred.MapReduceBase; 12 | import org.apache.hadoop.mapred.Mapper; 13 | import org.apache.hadoop.mapred.OutputCollector; 14 | import org.apache.hadoop.mapred.Reporter; 15 | 16 | public class ToUpperCase extends MapReduceBase implements Mapper{ 17 | 18 | @Override 19 | public void map(Text key, IntWritable value, 20 | OutputCollector output, Reporter reporter) 21 | throws IOException { 22 | String keyText = key.toString().toUpperCase(); 23 | output.collect(new Text(keyText),value); 24 | 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /mapreduce/chaining/WordCount.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.chaining; 2 | /** 3 | * Word count program. 
4 | */ 5 | import java.io.IOException; 6 | import java.util.Iterator; 7 | import java.util.StringTokenizer; 8 | 9 | import org.apache.hadoop.io.IntWritable; 10 | import org.apache.hadoop.io.LongWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapred.MapReduceBase; 13 | import org.apache.hadoop.mapred.Mapper; 14 | import org.apache.hadoop.mapred.OutputCollector; 15 | import org.apache.hadoop.mapred.Reducer; 16 | import org.apache.hadoop.mapred.Reporter; 17 | 18 | public class WordCount { 19 | 20 | public static class WordCountMapper extends MapReduceBase implements Mapper { 21 | 22 | @Override 23 | public void map(LongWritable key, Text value, 24 | OutputCollector output, Reporter reporter) 25 | throws IOException { 26 | String line = value.toString(); 27 | StringTokenizer tokenizer = new StringTokenizer(line); 28 | while(tokenizer.hasMoreTokens()) { 29 | Text word = new Text(); 30 | word.set(tokenizer.nextToken()); 31 | IntWritable one = new IntWritable(1); 32 | output.collect(word,one); 33 | } 34 | } 35 | 36 | 37 | } 38 | 39 | public static class WordCountReducer extends MapReduceBase implements Reducer { 40 | 41 | @Override 42 | public void reduce(Text key, Iterator values, 43 | OutputCollector output, Reporter reporter) 44 | throws IOException { 45 | int sum = 0; 46 | while(values.hasNext()) { 47 | int value = values.next().get(); 48 | sum += value; 49 | } 50 | output.collect(key,new IntWritable(sum)); 51 | } 52 | } 53 | 54 | } -------------------------------------------------------------------------------- /mapreduce/customtypes/Comparator.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.customtypes; 2 | 3 | /** 4 | * A custom comparator for comparing Text. 
5 | * source : http://developer.yahoo.com/hadoop/tutorial/module5.html#types 6 | */ 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.io.WritableComparator; 9 | import org.apache.hadoop.io.WritableUtils; 10 | 11 | 12 | public class Comparator extends WritableComparator { 13 | 14 | protected Comparator() { 15 | super(Text.class); 16 | } 17 | 18 | public int compare(byte[] b1, int s1, int l1, 19 | byte[] b2, int s2, int l2) { 20 | System.out.println("Using custom comparator"); 21 | int n1 = WritableUtils.decodeVIntSize(b1[s1]); 22 | int n2 = WritableUtils.decodeVIntSize(b2[s2]); 23 | return compareBytes(b1, s1+n1, l1-n1, b2, s2+n2, l2-n2); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /mapreduce/customtypes/CustomPartitioner.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.customtypes; 2 | 3 | import org.apache.hadoop.io.Text; 4 | import org.apache.hadoop.mapred.JobConf; 5 | import org.apache.hadoop.mapred.Partitioner; 6 | 7 | /** 8 | * A custom partitioner to partition keys into reducers 9 | * @author vishnu 10 | * 11 | */ 12 | public class CustomPartitioner implements Partitioner{ 13 | 14 | @Override 15 | public int getPartition(Text key, Text value, int numReducers) { 16 | // TODO Auto-generated method stub 17 | System.out.println("number of reducers is "+numReducers); 18 | if (key.toString().equals("reduce")) { 19 | System.out.println("in custom partioner is returning 0"); 20 | return 0%numReducers; 21 | } else { 22 | System.out.println("in custom partioner is returning 1"); 23 | return 1%numReducers; 24 | } 25 | } 26 | 27 | 28 | /** 29 | * provides an example of bit masking which will convert negative results to positive 30 | */ 31 | /*@Override 32 | public int getPartition(Text key, IntWritable value, int numPartitions) { 33 | return (key.toString().hashCode() % numPartitions) & 0x7FFFFFFF; 34 | }*/ 35 | 36 | @Override 37 | public void configure(JobConf conf) { 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /mapreduce/customtypes/DollarInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.customtypes; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.InputSplit; 8 | import org.apache.hadoop.mapreduce.RecordReader; 9 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | 12 | /** 13 | * A Custom InputFormat which will split the lines based on $ and will ignore \n 14 | * @author vishnu 15 | * 16 | */ 17 | public class DollarInputFormat extends FileInputFormat{ 18 | 19 | 20 | @Override 21 | public RecordReader createRecordReader(InputSplit split, 22 | TaskAttemptContext context) throws IOException, InterruptedException { 23 | return new DollarRecordReader(); 24 | } 25 | 26 | 27 | 28 | } 29 | -------------------------------------------------------------------------------- /mapreduce/customtypes/DollarStreamExample.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.customtypes; 2 | /** 3 | * Simple program to test dollar($) as file delimiter 4 | */ 5 | import java.io.IOException; 6 | import java.util.*; 7 | 8 | import org.apache.hadoop.fs.Path; 9 | 
import org.apache.hadoop.io.*; 10 | import org.apache.hadoop.mapred.FileInputFormat; 11 | import org.apache.hadoop.mapred.FileOutputFormat; 12 | import org.apache.hadoop.mapred.JobConf; 13 | import org.apache.hadoop.mapred.MapReduceBase; 14 | import org.apache.hadoop.mapred.Mapper; 15 | import org.apache.hadoop.mapred.OutputCollector; 16 | import org.apache.hadoop.mapred.Reducer; 17 | import org.apache.hadoop.mapred.Reporter; 18 | import org.apache.hadoop.mapreduce.Job; 19 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 20 | 21 | 22 | /**\ 23 | * have to do it in the new api 24 | * @author vishnu 25 | * 26 | */ 27 | public class DollarStreamExample { 28 | 29 | public static class MyMapper extends MapReduceBase implements Mapper { 30 | 31 | public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException { 32 | String line = value.toString(); 33 | System.out.println("received in mapper"+line); 34 | output.collect(key, value); 35 | } 36 | } 37 | 38 | public static class MyReducer extends MapReduceBase implements Reducer { 39 | public void reduce(LongWritable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException { 40 | String temp = ""; 41 | while(values.hasNext()) { 42 | temp += values.next().toString(); 43 | } 44 | System.out.println("In reducer "+temp); 45 | output.collect(key,new Text(temp)); 46 | } 47 | } 48 | 49 | public static void main(String[] args) throws Exception { 50 | JobConf conf = new JobConf(DollarStreamExample.class); 51 | conf.setJobName("wordcount"); 52 | 53 | 54 | conf.setMapperClass(MyMapper.class); 55 | conf.setReducerClass(MyReducer.class); 56 | conf.setOutputKeyClass(LongWritable.class); 57 | conf.setOutputValueClass(Text.class); 58 | 59 | conf.setMapperClass(MyMapper.class); 60 | conf.setReducerClass(MyReducer.class); 61 | Job job = new Job(conf,"wordcount"); 62 | job.setOutputKeyClass(LongWritable.class); 63 | job.setOutputValueClass(Text.class); 64 | job.setInputFormatClass(DollarInputFormat.class); 65 | job.setOutputFormatClass(TextOutputFormat.class); 66 | 67 | FileInputFormat.setInputPaths(conf, new Path(args[0])); 68 | FileOutputFormat.setOutputPath(conf, new Path(args[1])); 69 | job.waitForCompletion(true); 70 | } 71 | } -------------------------------------------------------------------------------- /mapreduce/customtypes/IdentityReducerEx.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.customtypes; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapred.FileInputFormat; 10 | import org.apache.hadoop.mapred.FileOutputFormat; 11 | import org.apache.hadoop.mapred.JobClient; 12 | import org.apache.hadoop.mapred.JobConf; 13 | import org.apache.hadoop.mapred.MapReduceBase; 14 | import org.apache.hadoop.mapred.Mapper; 15 | import org.apache.hadoop.mapred.OutputCollector; 16 | import org.apache.hadoop.mapred.Reporter; 17 | import org.apache.hadoop.mapred.TextInputFormat; 18 | import org.apache.hadoop.mapred.TextOutputFormat; 19 | import org.apache.hadoop.mapred.lib.IdentityReducer; 20 | 21 | public class IdentityReducerEx { 22 | 23 | public static class MyMapper extends MapReduceBase implements Mapper{ 24 | 25 | @Override 26 | public void map(LongWritable key, Text value, 27 | OutputCollector output, Reporter reporter) 28 | 
throws IOException { 29 | if (value.toString().contains("vishnu")) 30 | output.collect(new Text("vishnu"),value); 31 | else 32 | output.collect(new Text("reduce"),value); 33 | } 34 | 35 | } 36 | 37 | 38 | public static void main(String[] args) throws IOException { 39 | 40 | JobConf conf1 = new JobConf(IdentityReducerEx.class); 41 | conf1.setJobName("partition_identity"); 42 | 43 | conf1.setMapperClass(MyMapper.class); 44 | conf1.setReducerClass(IdentityReducer.class); 45 | 46 | conf1.setPartitionerClass(CustomPartitioner.class); 47 | 48 | conf1.setOutputKeyClass(Text.class); 49 | conf1.setOutputValueClass(Text.class); 50 | 51 | conf1.setInputFormat(TextInputFormat.class); 52 | conf1.setOutputFormat(TextOutputFormat.class); 53 | 54 | FileInputFormat.setInputPaths(conf1,new Path(args[0])); 55 | FileOutputFormat.setOutputPath(conf1,new Path(args[1])); 56 | 57 | JobClient.runJob(conf1); 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | } 66 | 67 | } 68 | -------------------------------------------------------------------------------- /mapreduce/customtypes/Point2D.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.customtypes; 2 | 3 | /** 4 | * Custom type representing a 2D point 5 | */ 6 | import java.io.DataInput; 7 | import java.io.DataOutput; 8 | import java.io.IOException; 9 | 10 | import org.apache.hadoop.io.Writable; 11 | 12 | public class Point2D implements Writable { 13 | 14 | public float x; 15 | public float y; 16 | 17 | public Point2D(float x,float y) { 18 | this.x = x; 19 | this.y = y; 20 | } 21 | 22 | @Override 23 | public void readFields(DataInput in) throws IOException { 24 | x = in.readFloat(); 25 | y = in.readFloat(); 26 | 27 | } 28 | 29 | @Override 30 | public void write(DataOutput out) throws IOException { 31 | out.writeFloat(x); 32 | out.writeFloat(y); 33 | } 34 | 35 | @Override 36 | public String toString() { 37 | return "("+x+","+y+")"; 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /mapreduce/customtypes/RectangleCount.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.customtypes; 2 | /** 3 | * Simple word count program with custom counters and custom comparator(commented) 4 | */ 5 | import java.io.IOException; 6 | import java.util.*; 7 | 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.*; 10 | import org.apache.hadoop.mapred.*; 11 | 12 | public class RectangleCount { 13 | 14 | // A custom counter named CUSTOM_COUNT 15 | static enum CustomCounter{CUSTOM_COUNT}; 16 | private static class MyMapper extends MapReduceBase implements Mapper { 17 | 18 | public void map(Text key, RectangleKey value, OutputCollector output, Reporter reporter) throws IOException { 19 | String line = value.toString(); 20 | System.out.println("Received "+line); 21 | reporter.incrCounter(CustomCounter.CUSTOM_COUNT,1); 22 | output.collect(value,new IntWritable(1)); 23 | } 24 | } 25 | 26 | private static class MyReducer extends MapReduceBase implements Reducer { 27 | public void reduce(RectangleKey key, Iterator values, OutputCollector output, Reporter reporter) throws IOException { 28 | int sum = 0; 29 | while (values.hasNext()) { 30 | sum += values.next().get(); 31 | } 32 | output.collect(new Text(key.toString()), new IntWritable(sum)); 33 | } 34 | } 35 | 36 | public static void main(String[] args) throws Exception { 37 | JobConf conf = new JobConf(RectangleCount.class); 38 | 
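        // RectangleKey is the map output key below, so it must implement
        // WritableComparable (a plain Writable such as Point2D works as a value
        // type, but keys have to be comparable for the sort/shuffle phase).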
conf.setJobName("rectanglecount"); 39 | 40 | conf.setOutputKeyClass(RectangleKey.class); 41 | conf.setOutputValueClass(IntWritable.class); 42 | 43 | /* 44 | * Add the custom comparator for the key output class 45 | * It didnt work out by adding the comparator using WritableComparator.define() in the static block 46 | * 47 | * Add this to add the custom compartor. This comparator expects a Text key class. 48 | * conf.setOutputKeyComparatorClass(Comparator.class); 49 | */ 50 | 51 | 52 | conf.setMapperClass(MyMapper.class); 53 | conf.setReducerClass(MyReducer.class); 54 | 55 | conf.setInputFormat(RectangleInputFormat.class); 56 | conf.setOutputFormat(TextOutputFormat.class); 57 | 58 | FileInputFormat.setInputPaths(conf, new Path(args[0])); 59 | FileOutputFormat.setOutputPath(conf, new Path(args[1])); 60 | 61 | JobClient.runJob(conf); 62 | } 63 | } -------------------------------------------------------------------------------- /mapreduce/customtypes/RectangleInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.customtypes; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapred.FileInputFormat; 7 | import org.apache.hadoop.mapred.FileSplit; 8 | import org.apache.hadoop.mapred.InputSplit; 9 | import org.apache.hadoop.mapred.JobConf; 10 | import org.apache.hadoop.mapred.RecordReader; 11 | import org.apache.hadoop.mapred.Reporter; 12 | 13 | /** 14 | * A custom input format for reading a rectangle info from a file 15 | * @author vishnu 16 | * 17 | */ 18 | 19 | public class RectangleInputFormat extends FileInputFormat{ 20 | 21 | @Override 22 | public RecordReader getRecordReader(InputSplit input, 23 | JobConf conf, Reporter reporter) throws IOException { 24 | reporter.setStatus(input.toString()); 25 | return new RectangleRecordReader(conf,(FileSplit)input); 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /mapreduce/customtypes/XmlOutputDriver.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.customtypes; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapred.FileInputFormat; 10 | import org.apache.hadoop.mapred.FileOutputFormat; 11 | import org.apache.hadoop.mapred.JobClient; 12 | import org.apache.hadoop.mapred.JobConf; 13 | import org.apache.hadoop.mapred.MapReduceBase; 14 | import org.apache.hadoop.mapred.Mapper; 15 | import org.apache.hadoop.mapred.OutputCollector; 16 | import org.apache.hadoop.mapred.Reducer; 17 | import org.apache.hadoop.mapred.Reporter; 18 | import org.apache.hadoop.mapred.TextInputFormat; 19 | 20 | 21 | public class XmlOutputDriver { 22 | 23 | private static class XmlMapper extends MapReduceBase implements Mapper { 24 | 25 | @Override 26 | public void map(LongWritable offset, Text value, 27 | OutputCollector output, Reporter reporter) 28 | throws IOException { 29 | String line = value.toString(); 30 | String[] parts = line.split(" "); 31 | Text key = new Text(parts[0]); 32 | for(int i=1;i { 39 | 40 | @Override 41 | public void reduce(Text key, Iterator values, 42 | OutputCollector output, Reporter reporter) 43 | throws IOException { 44 | while(values.hasNext()) { 45 | output.collect(key, values.next()); 46 | } 47 | } 48 | 49 | 
} 50 | 51 | public static void main(String[] args) throws IOException { 52 | JobConf conf = new JobConf(XmlOutputDriver.class); 53 | conf.setJobName("xmlwriter"); 54 | 55 | conf.setOutputKeyClass(Text.class); 56 | conf.setOutputValueClass(Text.class); 57 | 58 | conf.setMapperClass(XmlMapper.class); 59 | conf.setReducerClass(XmlReducer.class); 60 | 61 | conf.setInputFormat(TextInputFormat.class); 62 | conf.setOutputFormat(XmlOutputFormat.class); 63 | 64 | FileInputFormat.setInputPaths(conf, new Path(args[0])); 65 | FileOutputFormat.setOutputPath(conf, new Path(args[1])); 66 | 67 | JobClient.runJob(conf); 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /mapreduce/datafu_example/.gitignore: -------------------------------------------------------------------------------- 1 | incremental-hourglass/ 2 | -------------------------------------------------------------------------------- /mapreduce/datafu_example/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | incrementaljob 6 | incremental-hourglass 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | incremental-hourglass 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | org.apache.hadoop 26 | hadoop-core 27 | 1.2.1 28 | 29 | 30 | com.linkedin.datafu 31 | datafu-hourglass 32 | 0.1.3 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /mapreduce/datafu_example/src/main/java/incrementaljob/IncrementalAccumulator.java: -------------------------------------------------------------------------------- 1 | package incrementaljob; 2 | 3 | import org.apache.avro.Schema; 4 | import org.apache.avro.generic.GenericData; 5 | import org.apache.avro.generic.GenericRecord; 6 | 7 | import datafu.hourglass.model.Accumulator; 8 | 9 | public class IncrementalAccumulator implements 10 | Accumulator { 11 | private transient long sum; 12 | private transient Schema oSchema; 13 | private String outputSchemaString; 14 | 15 | public IncrementalAccumulator(String outputSchemaString) { 16 | this.outputSchemaString = outputSchemaString; 17 | } 18 | 19 | @Override 20 | public void accumulate(GenericRecord value) { 21 | this.sum += (Long) value.get("score"); 22 | } 23 | 24 | @Override 25 | public GenericRecord getFinal() { 26 | if (oSchema == null) { 27 | oSchema = new Schema.Parser().parse(outputSchemaString); 28 | } 29 | GenericRecord output = new GenericData.Record(oSchema); 30 | output.put("score", sum); 31 | return output; 32 | } 33 | 34 | @Override 35 | public void cleanup() { 36 | this.sum = 0; 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /mapreduce/datafu_example/src/main/java/incrementaljob/IncrementalAggr.java: -------------------------------------------------------------------------------- 1 | package incrementaljob; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.Arrays; 6 | import java.util.List; 7 | 8 | import org.apache.avro.Schema; 9 | import org.apache.avro.Schema.Field; 10 | import org.apache.avro.Schema.Type; 11 | import org.apache.avro.generic.GenericData; 12 | import org.apache.avro.generic.GenericRecord; 13 | import org.apache.hadoop.fs.Path; 14 | 15 | import datafu.hourglass.jobs.PartitionCollapsingIncrementalJob; 16 | import datafu.hourglass.model.Accumulator; 17 | import datafu.hourglass.model.KeyValueCollector; 18 | import 
datafu.hourglass.model.Mapper; 19 | 20 | public class IncrementalAggr { 21 | 22 | 23 | 24 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { 25 | PartitionCollapsingIncrementalJob job = new PartitionCollapsingIncrementalJob(IncrementalAggr.class); 26 | final String namespace = "incrementaljob.datafu"; 27 | 28 | 29 | //create schema for site and load it into memory. 30 | final Schema keySchema = Schema.createRecord("Key", null, namespace,false); 31 | List keyFields = new ArrayList(); 32 | Field id = new Field("exam",Schema.create(Type.STRING),null,null); 33 | keyFields.add(id); 34 | keySchema.setFields(keyFields); 35 | final String keySchemaString = keySchema.toString(true); 36 | 37 | final Schema valueSchema = Schema.createRecord("Value", null, namespace,false); 38 | List valueFields = new ArrayList(); 39 | Field value = new Field("score",Schema.create(Type.LONG),null,null); 40 | valueFields.add(value); 41 | valueSchema.setFields(valueFields); 42 | final String valueSchemaString = valueSchema.toString(true); 43 | 44 | 45 | final Schema outputSchema = Schema.createRecord("Output", null, namespace,false); 46 | List outputFields = new ArrayList(); 47 | Field result = new Field("result",Schema.create(Type.LONG),null,null); 48 | outputFields.add(result); 49 | outputSchema.setFields(outputFields); 50 | final String outputSchemaString = outputSchema.toString(true); 51 | 52 | job.setKeySchema(keySchema); 53 | job.setIntermediateValueSchema(valueSchema); 54 | job.setOutputValueSchema(valueSchema); 55 | job.setInputPaths(Arrays.asList(new Path("datafu/data/input"))); 56 | job.setOutputPath(new Path("datafu/data/output")); 57 | job.setReusePreviousOutput(true); 58 | 59 | Mapper mapper = new IncrementalMapper(keySchemaString, valueSchemaString); 60 | job.setMapper(mapper); 61 | Accumulator accumulator = new IncrementalAccumulator(outputSchemaString); 62 | job.setReducerAccumulator(accumulator); 63 | // job.setCombinerAccumulator(job.getReducerAccumulator()); 64 | // job.setUseCombiner(true); 65 | job.run(); 66 | 67 | } 68 | 69 | 70 | } 71 | -------------------------------------------------------------------------------- /mapreduce/datafu_example/src/main/java/incrementaljob/IncrementalMapper.java: -------------------------------------------------------------------------------- 1 | package incrementaljob; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.avro.Schema; 6 | import org.apache.avro.generic.GenericData; 7 | import org.apache.avro.generic.GenericRecord; 8 | 9 | import datafu.hourglass.model.KeyValueCollector; 10 | import datafu.hourglass.model.Mapper; 11 | 12 | /** 13 | * An example of incremental mapreduce using datafu 14 | * @author vishnu 15 | * 16 | */ 17 | public class IncrementalMapper implements Mapper 18 | { 19 | 20 | private transient Schema kSchema; 21 | private transient Schema vSchema; 22 | private String keySchemaString; 23 | private String valueSchemaString; 24 | 25 | public IncrementalMapper(String keySchemaString,String valueSchemaString) { 26 | this.keySchemaString = keySchemaString; 27 | this.valueSchemaString = valueSchemaString; 28 | } 29 | 30 | 31 | @Override 32 | public void map(GenericRecord input, 33 | KeyValueCollector collector) 34 | throws IOException, InterruptedException { 35 | if (kSchema == null) kSchema = new Schema.Parser().parse(keySchemaString); 36 | if (vSchema == null) vSchema = new Schema.Parser().parse(valueSchemaString); 37 | GenericRecord key = new GenericData.Record(kSchema); 38 | 
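    // NOTE: the key schema created in IncrementalAggr defines a single field named "exam";
    // putting "name" below will fail at runtime with Avro's "Not a valid schema field"
    // error, so the field names need to be kept in sync.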
key.put("name", input.get("name")); 39 | GenericRecord value = new GenericData.Record(vSchema); 40 | value.put("score",input.get("score")); 41 | collector.collect(key,value); 42 | } 43 | } 44 | 45 | 46 | -------------------------------------------------------------------------------- /mapreduce/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.vishnu 6 | mapreduce 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | mapreduce 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | org.apache.hadoop 26 | hadoop-core 27 | 1.2.1 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /mapreduce/src/main/java/com/vishnu/mapreduce/CustomMultiplOututFormat.java: -------------------------------------------------------------------------------- 1 | package com.vishnu.mapreduce; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.mapred.JobConf; 7 | import org.apache.hadoop.mapred.RecordWriter; 8 | import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat; 9 | 10 | import org.apache.hadoop.util.Progressable; 11 | 12 | public class CustomMultiplOututFormat extends MultipleTextOutputFormat{ 13 | 14 | @Override 15 | public RecordWriter getRecordWriter(FileSystem fs, JobConf job, 16 | String name, Progressable arg3) throws IOException { 17 | String newName = name.substring(0,name.indexOf("-")); 18 | System.out.println(name); 19 | System.out.println(newName); 20 | return super.getRecordWriter(fs, job, newName, arg3); 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /mapreduce/src/main/java/com/vishnu/mapreduce/CustomOutputFormatTest.java: -------------------------------------------------------------------------------- 1 | package com.vishnu.mapreduce; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Job; 9 | import org.apache.hadoop.mapreduce.Mapper; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 14 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 15 | 16 | public class CustomOutputFormatTest { 17 | 18 | public static class ParserMapper extends Mapper { 19 | 20 | Configuration conf = null; 21 | MultipleOutputs mout; 22 | 23 | 24 | 25 | public void map(Object key, Text value, Context context) 26 | throws IOException, InterruptedException { 27 | String val = value.toString(); 28 | mout.write("filename",key,new Text(val)); 29 | } 30 | } 31 | 32 | public static void main(String[] args) throws Exception { 33 | Configuration conf = new Configuration(); 34 | Job job = Job.getInstance(conf, "CustomMultiplOutput"); 35 | job.setJarByClass(CustomOutputFormatTest.class); 36 | job.setMapperClass(ParserMapper.class); 37 | job.setOutputKeyClass(Text.class); 38 | job.setOutputValueClass(Text.class); 39 | Path source = new Path(args[0]); 40 | FileInputFormat.addInputPath(job,source); 41 | CustomMultiplOututFormat.set 42 | //MultipleOutputs.addNamedOutput(job, BLUECOAT, TextOutputFormat.class, Text.class, Text.class); 43 | 
//MultipleOutputs.addNamedOutput(job, BTDIAMOND, TextOutputFormat.class, Text.class, Text.class); 44 | LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); 45 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 46 | boolean success = job.waitForCompletion(true); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /mapreduce/src/main/java/com/vishnu/mapreduce/WordCount.java: -------------------------------------------------------------------------------- 1 | package com.vishnu.mapreduce; 2 | 3 | import java.io.IOException; 4 | import java.util.StringTokenizer; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.Mapper; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | 16 | public class WordCount { 17 | 18 | public static class TokenizerMapper 19 | extends Mapper{ 20 | 21 | private Text word = new Text(); 22 | float one; 23 | float two; 24 | public void map(Object key, Text value, Context context 25 | ) throws IOException, InterruptedException { 26 | StringTokenizer itr = new StringTokenizer(value.toString()); 27 | while (itr.hasMoreTokens()) { 28 | word.set(itr.nextToken()); 29 | context.write(word,new IntWritable(1)); 30 | } 31 | } 32 | 33 | @Override 34 | protected void setup(Context context) throws IOException, 35 | InterruptedException { 36 | Configuration conf = context.getConfiguration(); 37 | one = conf.getFloat("one",0f); 38 | two = conf.getFloat("two",0f); 39 | } 40 | } 41 | 42 | public static class IntSumReducer 43 | extends Reducer { 44 | private IntWritable result = new IntWritable(); 45 | 46 | public void reduce(Text key, Iterable values, 47 | Context context 48 | ) throws IOException, InterruptedException { 49 | int sum = 0; 50 | for (IntWritable val : values) { 51 | sum += val.get(); 52 | } 53 | result.set(sum); 54 | context.write(key, result); 55 | } 56 | } 57 | 58 | public static void main(String[] args) throws Exception { 59 | Configuration conf = new Configuration(); 60 | Job job = Job.getInstance(conf, "word count"); 61 | job.setJarByClass(WordCount.class); 62 | job.setMapperClass(TokenizerMapper.class); 63 | job.setCombinerClass(IntSumReducer.class); 64 | job.setReducerClass(IntSumReducer.class); 65 | job.setOutputKeyClass(Text.class); 66 | job.setOutputValueClass(IntWritable.class); 67 | FileInputFormat.addInputPath(job, new Path(args[0])); 68 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 69 | System.exit(job.waitForCompletion(true) ? 0 : 1); 70 | } 71 | } -------------------------------------------------------------------------------- /mapreduce/src/test/java/com/vishnu/mapreduce/AppTest.java: -------------------------------------------------------------------------------- 1 | package com.vishnu.mapreduce; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 
9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Rigourous Test :-) 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /spark/.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | .cache-main 4 | 5 | #eclipse specific 6 | .classpath 7 | .project 8 | .settings/ 9 | .idea/ 10 | # sbt specific 11 | .cache 12 | .history 13 | .lib/ 14 | dist/* 15 | target/ 16 | lib_managed/ 17 | src_managed/ 18 | project/boot/ 19 | project/plugins/project/ 20 | 21 | # Scala-IDE specific 22 | .scala_dependencies 23 | .worksheet 24 | /bin/ 25 | metastore_db/ 26 | -------------------------------------------------------------------------------- /spark/README.md: -------------------------------------------------------------------------------- 1 | **Update:** This project uses spark 1.6, for Spark 2.3 examples click [here](https://github.com/soniclavier/bigdata-notebook/tree/master/spark_23) 2 | 3 | 4 | Edit your .bash_profile or .profile file in OSX/Unix or Edit your Environment variables in Windows. 5 | 6 | **OSX/Unix:** 7 | ```b 8 | vi ~/.bash_profile 9 | export SPARK_HOME=/Users/vishnu/spark-1.6.0-bin-hadoop2.6 10 | export PATH=$PATH/:$SPARK_HOME/sbin 11 | export PATH=$PATH/:$SPARK_HOME/bin 12 | ``` 13 | **To submit the application:** 14 | ```scala 15 | //start spark master 16 | $SPARK_HOME/sbin/start-master.sh 17 | 18 | //start worker 19 | //Get the spark the master url from http://localhost:8080/ 20 | $SPARK_HOME/sbin/start-slaves.sh spark://Vishnus-MacBook-Pro.local:7077 21 | 22 | spark-submit --class "package.name.Object" --master spark://your_master_server:7077 target/path/to/your/jar_file.jar 23 | ``` 24 | 25 | E.g., 26 | 27 | For running Titanic ML example 28 | ``` 29 | spark-submit --class "com.vishnu.spark.kaggle.titanic.TitanicWithPipeline" --master spark://Vishnus-MacBook-Pro.local:7077 --packages com.databricks:spark-csv_2.11:1.3.0 target/scala-2.10/spark-vishnu-assemlby-1.0.jar 30 | ``` 31 | 32 | For running Streaming Example 33 | ``` 34 | spark-submit --class "com.vishnu.spark.streaming.SocketStreaming" --master spark://Vishnus-MacBook-Pro.local:7077 target/scala-2.10/spark-vishnu-assemlby-1.0.jar 35 | ``` 36 | -------------------------------------------------------------------------------- /spark/build.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soniclavier/bigdata-notebook/a708de834fd282ba576f15f87a46f3953695a9ad/spark/build.properties -------------------------------------------------------------------------------- /spark/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.2") 2 | -------------------------------------------------------------------------------- /spark/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "4.0.0") 2 | 
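The spark module's build.sbt itself is not reproduced in this listing, so the following is only a minimal sketch of the sbt-assembly configuration that the README's submit commands appear to assume. The jar name and the scala-2.10 output path come from the README; the dependency list, versions, and merge strategy are illustrative assumptions, not the repository's actual settings.

```scala
// Hypothetical spark/build.sbt sketch; the repository's real file may differ.
name := "spark-vishnu"
version := "1.0"
scalaVersion := "2.10.6"  // README submits target/scala-2.10/spark-vishnu-assemlby-1.0.jar

// Spark is marked "provided": the standalone cluster supplies it at runtime.
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"      % "1.6.0" % "provided",
  "org.apache.spark" %% "spark-sql"       % "1.6.0" % "provided",
  "org.apache.spark" %% "spark-mllib"     % "1.6.0" % "provided",
  "org.apache.spark" %% "spark-streaming" % "1.6.0" % "provided"
)

// sbt-assembly (added in project/assembly.sbt above) builds the fat jar via `sbt assembly`,
// producing target/scala-2.10/spark-vishnu-assemlby-1.0.jar
assemblyJarName in assembly := "spark-vishnu-assemlby-1.0.jar"
assemblyMergeStrategy in assembly := {
  case PathList("META-INF", xs @ _*) => MergeStrategy.discard
  case _                             => MergeStrategy.first
}
```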
-------------------------------------------------------------------------------- /spark/pyspark-files/helloworld.py: -------------------------------------------------------------------------------- 1 | #refs : https://www.youtube.com/watch?v=08mrnJxcIWw 2 | # https://github.com/databricks/tensorframes 3 | #Spark version 2.1.1 4 | #bin/pyspark --master spark://vishnu-macbook-pro:7077 --packages databricks:tensorframes:0.2.8-s_2.11 5 | 6 | import tensorflow as tf 7 | import tensorframes as tfs 8 | 9 | df = spark.createDataFrame(zip(range(0,10), range(1,11))).toDF("x","y") 10 | df.show(10) 11 | 12 | x = tfs.row(df, "x") 13 | y = tfs.row(df, "y") 14 | 15 | output = tf.add(x, y, name="out") 16 | 17 | df2 = tfs.map_rows(output, df) 18 | 19 | df2.show() -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/Test.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark 2 | 3 | object Test { 4 | 5 | def main(args: Array[String]) { 6 | 7 | } 8 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/bClassifier.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark 2 | 3 | import org.apache.spark.mllib.classification.{ SVMModel, SVMWithSGD } 4 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics 5 | import org.apache.spark.mllib.util.MLUtils 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.SparkConf 8 | 9 | /** 10 | * An example from Spark site 11 | */ 12 | object bClassifier { 13 | 14 | def main(args: Array[String]) { 15 | val conf = new SparkConf().setAppName("bClassiier").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 16 | val sc = new SparkContext(conf) 17 | val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") 18 | 19 | // Split data into training (60%) and test (40%). 20 | val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) 21 | val training = splits(0).cache() 22 | val test = splits(1) 23 | 24 | // Run training algorithm to build the model 25 | val numIterations = 100 26 | val model = SVMWithSGD.train(training, numIterations) 27 | 28 | // Clear the default threshold. 29 | model.clearThreshold() 30 | 31 | // Compute raw scores on the test set. 32 | val scoreAndLabels = test.map { point => 33 | val score = model.predict(point.features) 34 | (score, point.label) 35 | } 36 | 37 | // Get evaluation metrics. 
38 | val metrics = new BinaryClassificationMetrics(scoreAndLabels) 39 | val auROC = metrics.areaUnderROC() 40 | 41 | println("Area under ROC = " + auROC) 42 | 43 | // Save and load model 44 | model.save(sc, "myModelPath") 45 | val sameModel = SVMModel.load(sc, "myModelPath") 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/basics/AuctionApp.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.basics 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.sql.SQLContext 6 | 7 | object AuctionApp { 8 | 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setAppName("AuctionsApp") 11 | val sc = new SparkContext(conf) 12 | 13 | val aucFile = "/user/vishnu/mapr/dev360/auctiondata.csv" 14 | val auctionRDD = sc.textFile(aucFile).map(_.split(",")).cache() 15 | } 16 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/basics/CustomPartitioner.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.basics 2 | 3 | import org.apache.spark.Partitioner 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | 7 | 8 | class MyPartitioner extends Partitioner{ 9 | def numPartitions = 10 10 | 11 | def getPartition(key: Any): Int = { 12 | key match { 13 | case s: String => s.length()%numPartitions 14 | } 15 | } 16 | } 17 | 18 | object CustomPartitioner { 19 | 20 | val conf = new SparkConf().setAppName("CustomPartitioner") 21 | val sc = new SparkContext(conf) 22 | 23 | val rdd = sc.parallelize(List("word","stream","sql","dataframe","auction","averylongword","anotherveryverylongword")) 24 | val myPart = new MyPartitioner 25 | 26 | val pairRdd = rdd.map(word=>(word,1)) 27 | val partitionedRdd = pairRdd.partitionBy(myPart) 28 | 29 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/basics/SequenceFileTest.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.basics 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.hadoop.io.Text 6 | import org.apache.hadoop.io.IntWritable 7 | 8 | object SequenceFileTest { 9 | 10 | def main(args: Array[String]): Unit = { 11 | val conf = new SparkConf().setAppName("SeqFileTest") 12 | val sc = new SparkContext(conf) 13 | 14 | 15 | //create a sequence file 16 | val data = sc.parallelize(List(("key1",1), ("key2",2))) 17 | data.saveAsSequenceFile("/usr/vishnu/spark_temp/seqfile_sample") 18 | 19 | //read from sequence file 20 | val dataLoaded = sc.sequenceFile("/usr/vishnu/spark_temp/seqfile_sample/part-00003", classOf[Text], classOf[IntWritable]) 21 | dataLoaded.foreach(println) 22 | 23 | } 24 | 25 | 26 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/basics/pairrdd.sc: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.basics 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.sql.SQLContext 6 | 7 | object pairrdd { 8 | println("Welcome to the Scala worksheet") 9 | 10 | val IncidntNum = 0 11 | val Category = 1 12 | val 
Descript = 2 13 | val DayOfWeek = 3 14 | val Date = 4 15 | val Time = 5 16 | val PdDistrict = 6 17 | val Resolution = 7 18 | val Address = 8 19 | val X = 9 20 | val Y = 10 21 | val PdId = 11 22 | 23 | val conf = new SparkConf().setAppName("pairrdd-test").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 24 | val sc = new SparkContext(conf) 25 | 26 | val sfpd = sc.textFile("/user/vishnu/mapr/dev361/sfpd.csv").map(_.split(",")) 27 | val totincs = sfpd.count() 28 | val cat = sfpd.map(x=>x(Category)).distinct.collect() 29 | 30 | val bayviewRDD = sfpd.filter(incident=>incident.contains("BAYVIEW")) 31 | 32 | val incByCat = sfpd.map(x=>(x(Category),1)) 33 | 34 | sfpd.map(x=>(x(PdDistrict),1)).reduceByKey(_+_).map(x=>(x._2,x._1)).sortByKey(false).take(4) 35 | 36 | val pdDists = sfpd.map(x=>(x(PdDistrict),x(Address))) 37 | val catRes = sfpd.map(x=>(x(PdDistrict),(x(Category),x(Resolution)))) 38 | val incCatRes = sfpd.map(x=>(x(PdDistrict),x(Address))) 39 | 40 | pdDists.join(catRes) 41 | 42 | // only if dataset can fit in memory 43 | val num_inc_dist = sfpd.map(x=>(x(PdDistrict),1)).countByKey() 44 | 45 | val catAdd = sc.textFile("/user/vishnu/mapr/dev361/J_AddCat.csv").map(x=>x.split(",")).map(x=>(x(1),x(0))) 46 | val distAdd = sc.textFile("/user/vishnu/mapr/dev361/J_AddDist.csv").map(x=>x.split(",")).map(x=>(x(1),x(0))) 47 | 48 | 49 | val incByDists = sfpd.map(x=>(x(PdDistrict),1)).reduceByKey(_+_) 50 | val inc_map = incByDists.map(x=>((x._2,x._1))) 51 | val inc_sort = incByDists.map(x=>(x._2,x._1)).sortByKey(false) 52 | val inc_group = sfpd.map(x=>(x(PdDistrict),1)).groupByKey() 53 | 54 | val incByDists2 = sfpd.map(x=>(x(PdDistrict),1)).reduceByKey(_+_,10) 55 | 56 | 57 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/basics/rdds.sc: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.basics 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | 6 | 7 | object rdds { 8 | 9 | 10 | println("Welcome to the Scala worksheet") 11 | 12 | val conf = new SparkConf().setAppName("rdd-test").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 13 | val sc = new SparkContext(conf) 14 | //define RDD 15 | val auctionRDD = sc.textFile("/user/vishnu/mapr/dev360/auctiondata.csv").map(_.split(",")) 16 | 17 | //filter transformation, applying anonymous function 18 | val xboxRDD = auctionRDD.filter(line => line.contains("xbox")) 19 | 20 | val auctionid = 0 21 | val bid = 1 22 | val bidtime = 2 23 | val bidder = 3 24 | val bidderrate = 4 25 | val openbid = 5 26 | val price = 6 27 | val itemtype = 7 28 | val daystolive = 8 29 | 30 | //how many items where sold 31 | val items_sold = auctionRDD.map(entry=>entry(auctionid)) 32 | .distinct 33 | .count 34 | 35 | //how many bids per item type 36 | val bidAuctionRDD = auctionRDD.map(entry=>(entry(itemtype),1)).reduceByKey((x,y)=>x+y) 37 | 38 | //cache 39 | bidAuctionRDD.cache 40 | 41 | bidAuctionRDD.collect 42 | 43 | 44 | 45 | 46 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/basics/streams.sc: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.basics 2 | 3 | import org.apache.spark.{ SparkConf, SparkContext } 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.streaming._ 6 | import org.apache.spark.streaming.StreamingContext._ 7 | 8 | object streams { 9 | 
println("Welcome to the Scala worksheet") 10 | 11 | val sparkConf = new SparkConf().setAppName("SensorStream") 12 | val sc = new SparkContext(sparkConf) 13 | case class Sensor(resid: String, date: String, time: String, hz: Double, disp: Double, flo: Double, sedPPM: Double, psi: Double, chlPPM: Double) extends Serializable 14 | 15 | val ssc = new StreamingContext(sc, Seconds(2)) 16 | val linesDStream = ssc.textFileStream("/user/user01/stream") 17 | linesDStream.print() 18 | linesDStream.foreachRDD(rdd => { 19 | val srdd = rdd.map(_.split(",")).map(p => Sensor(p(0), p(1), p(2), p(3).toDouble, p(4).toDouble, p(5).toDouble, p(6).toDouble, p(7).toDouble, p(8).toDouble)) 20 | val alertRDD = srdd.filter(sensor=>sensor.psi < 5.0) 21 | srdd.take(2).foreach(println) 22 | alertRDD.take(2).foreach(println) 23 | }) 24 | 25 | 26 | ssc.start() 27 | ssc.awaitTermination() 28 | 29 | 30 | } 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/blog/supportfiles/spark_session_blog_commands: -------------------------------------------------------------------------------- 1 | //LOAD 2 | val df = spark.read.json("/spark_learning/pandainfo.json") 3 | df.show 4 | df.registerTempTable("pandas") 5 | df.createOrReplaceTempView("pandas") 6 | 7 | //TABLE AND SQL 8 | spark.table("pandas") 9 | spark.sql("select name from pandas").show 10 | 11 | //UDF 12 | spark.udf.register("addone",(x:Int)=>x+1) 13 | 14 | //CREATE DATASET 15 | val ds = spark.createDataset(List(1,2,3)) 16 | val rdd = sc.parallelize(List(1,2,3)) 17 | val ds = spark.createDataset(rdd) 18 | 19 | 20 | 21 | //CREATE DATAFRAME 22 | case class Num(x:Int) 23 | val rdd = sc.parallelize(List(Num(1),Num(2),Num(3))) 24 | spark.createDataFrame(rdd).show 25 | 26 | import org.apache.spark.sql.types.{StructType,StructField,IntegerType}; 27 | import org.apache.spark.sql.Row 28 | val rowRDD = rdd.map(x=>Row(x)) 29 | val schema = StructType(Array(StructField("num", IntegerType, true))) 30 | spark.createDataFrame(rowRDD,schema).show 31 | 32 | 33 | //CATALOG 34 | spark.catalog.cacheTable("pandas") // caches the table into memory, throws Table or view not found in database exeception if not found. 
35 | spark.catalog.uncacheTable("pandas") // to remove table from memory 36 | spark.catalog.currentDatabase 37 | spark.catalog.isCached("pandas") 38 | spark.catalog.clearCache 39 | spark.catalog.listDatabases.take(1) 40 | spark.catalog.listTables("default").take(1) 41 | spark.catalog.dropTempView("pandas") //drops the table -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/graph/PropertyGraphExample.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.graph 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.graphx._ 5 | import org.apache.spark.rdd.RDD 6 | 7 | object PropertyGraphExample { 8 | 9 | def main(args: Array[String]): Unit = { 10 | //confs 11 | val conf = new SparkConf().setAppName("AirportGraph") 12 | val sc = new SparkContext(conf) 13 | 14 | //load data 15 | val airports = sc.textFile("/mapr_lab_data/data/airports.csv").map(parseAirport) 16 | val vertices = airports.map(airport => (airport.id.toLong,airport)) //note id.toLong -> we need that for creating Graph, because Graph()'s first arg takes an RDD of tuples with _0 that has a Long 17 | 18 | val routes = sc.textFile("/mapr_lab_data/data/routes.csv").map(parseRoute) 19 | val edges = routes.map(route => Edge(route.src, route.dest, route)) 20 | 21 | //create defualt vertex 22 | val defaultVertex = Airport(0,"default") 23 | 24 | //create graph 25 | val graph = Graph(vertices, edges, defaultVertex) 26 | 27 | graph.vertices.collect.foreach(println) 28 | 29 | graph.triplets.collect.foreach(println) 30 | println(graph.inDegrees) 31 | println(graph.vertices.count()) 32 | println(graph.edges.count()) 33 | 34 | graph.edges.filter{case Edge(src,dest,route) => route.dist > 1000}.count 35 | graph.edges.filter{case Edge(src,dest,route) => route.dist > 1000}.collect.foreach(println) 36 | 37 | graph.triplets.sortBy(_.attr,ascending=false).collect.foreach(println) 38 | 39 | 40 | //page rank 41 | val ranks = graph.pageRank(0.1).vertices 42 | ranks.take(3) 43 | 44 | ranks.join(vertices).sortBy(_._2._1,false).map(_._2._2).collect.foreach(println) 45 | } 46 | 47 | case class Route(src:Int, dest:Int, dist: Int) 48 | object Route{ 49 | 50 | implicit def orderingByDist[A <: Route]: Ordering[A] = 51 | Ordering.by(r => (r.dist)) 52 | } 53 | case class Airport(id:Int, name:String) 54 | 55 | def parseRoute(str:String): Route = { 56 | val p = str.split(",") 57 | new Route(p(0).toInt, p(1).toInt, p(2).toInt) 58 | } 59 | def parseAirport(str:String): Airport = { 60 | val p = str.split(",") 61 | Airport(p(0).toInt, p(1)) 62 | } 63 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/graph/res/airports.csv: -------------------------------------------------------------------------------- 1 | 1,SFO 2 | 2,ORD 3 | 3,DFW -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/graph/res/routes.csv: -------------------------------------------------------------------------------- 1 | 1,2,1800 2 | 2,3,800 3 | 3,1,1400 -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/mllib/ALSRecommender.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.mllib 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import 
org.apache.spark.mllib.recommendation.{ALS,MatrixFactorizationModel,Rating} 6 | import org.apache.spark.sql.SQLContext 7 | import org.apache.spark.sql._ 8 | 9 | object ALSRecommender { 10 | 11 | def main(args: Array[String]): Unit = { 12 | //conf 13 | val conf = new SparkConf().setAppName("MovieRecommender") 14 | val sc = new SparkContext(conf) 15 | 16 | //load data 17 | val ratingText = sc.textFile("/mapr_lab_data/data/ratings.dat") 18 | val ratingsRDD = ratingText.map(parseRating).cache() 19 | 20 | //split into training and testing set 21 | val splits = ratingsRDD.randomSplit(Array(0.8,0.2),0L) 22 | val trainingRatingsRDD = splits(0).cache 23 | val testRatingsRDD = splits(1).cache 24 | 25 | //buid ALS model 26 | val model = (new ALS().setRank(20).setIterations(10).run(trainingRatingsRDD)) 27 | 28 | val testUserProductRDD = testRatingsRDD.map{ case Rating(user,product,rating) => (user,product)} 29 | 30 | val predictionsRDD = model.predict(testUserProductRDD) 31 | 32 | val predictionsKeyed = predictionsRDD.map{case Rating(user,prod,pred) => ((user,prod),pred)} 33 | val testUserKeyed = testRatingsRDD.map{case Rating(user,prod,rating) => ((user,prod),rating)} 34 | 35 | val testAndPred = testUserKeyed.join(predictionsKeyed) 36 | 37 | //find false positive, if predicted high (>4) and actual was low (<1) 38 | val falsePositives = testAndPred.filter{case ((user,prod),(rating,pred)) => rating <= 1 && pred >= 4} 39 | 40 | //MAE (mean absolute error) 41 | val absoluteError = testAndPred.map{case ((user,prod),(rating,pred)) => Math.abs(pred-rating)} 42 | val mean = absoluteError.mean() 43 | 44 | //prediction for new user 45 | val newRatingsRDD = sc.parallelize(Array(Rating(0,260,4), Rating(0,1,3))) 46 | val unionRatingsRDD = ratingsRDD.union(newRatingsRDD) 47 | val newModel = (new ALS().setRank(20).setIterations(10).run(unionRatingsRDD)) 48 | 49 | //recommend 50 | val topRecForUser = newModel.recommendProducts(0,5) 51 | } 52 | 53 | def parseRating(str: String): Rating = { 54 | val p = str.split("::") 55 | Rating(p(0).toInt,p(1).toInt,p(2).toDouble) 56 | } 57 | 58 | //case class Rating(user:Int, movie: Int, rating: Double) no need of this since spark ml lib package is having Rating class 59 | 60 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/mllib/ALSRecommender2.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.mllib 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.mllib.recommendation.{ALS,MatrixFactorizationModel,Rating} 6 | import org.apache.spark.sql.SQLContext 7 | import org.apache.spark.sql._ 8 | 9 | object ALSRecommender2 { 10 | 11 | def main(args: Array[String]): Unit = { 12 | //conf 13 | val conf = new SparkConf().setAppName("MovieRecommender") 14 | val sc = new SparkContext(conf) 15 | val sqlContext = new SQLContext(sc) 16 | import sqlContext.implicits._ 17 | 18 | 19 | //load data 20 | val ratingText = sc.textFile("/mapr_lab_data/data/ratings.dat") 21 | val ratingsRDD = ratingText.map(parseRating).cache() 22 | 23 | val moviesDF= sc.textFile("/mapr_lab_data/data/movies.dat").map(parseMovie).toDF() 24 | val usersDF = sc.textFile("/mapr_lab_data/data/users.dat").map(parseUser).toDF() 25 | val ratingsDF = ratingsRDD.toDF() 26 | 27 | ratingsDF.registerTempTable("ratings") 28 | usersDF.registerTempTable("users") 29 | moviesDF.registerTempTable("movies") 30 | 31 | //TODO 32 | } 33 | 34 | def 
parseRating(str: String): Rating = { 35 | val p = str.split("::") 36 | Rating(p(0).toInt,p(1).toInt,p(2).toDouble) 37 | } 38 | 39 | def parseUser(str: String): User = { 40 | val fields = str.split("::") 41 | assert(fields.size == 5) 42 | User(fields(0).toInt, fields(1).toString, fields(2).toInt, fields(3).toInt, fields(4).toString) 43 | } 44 | 45 | def parseMovie(str: String): Movie = { 46 | val fields = str.split("::") 47 | assert(fields.size == 3) 48 | Movie(fields(0).toInt, fields(1)) 49 | } 50 | case class Movie(movieId: Int, title: String) 51 | case class User(userId: Int, gender: String, age: Int, occupation: Int, zip: String) 52 | //case class Rating(user:Int, movie: Int, rating: Double) no need of this since spark ml lib package is having Rating class 53 | 54 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/mllib/FeatureTransformations.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.mllib 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.feature.StandardScaler 7 | import org.apache.spark.mllib.feature.Normalizer 8 | 9 | object FeatureTransformations { 10 | 11 | def main(args: Array[String]): Unit = { 12 | val conf = new SparkConf().setAppName("FeatureTransfomrations") 13 | val sc = new SparkContext(conf) 14 | 15 | val vectors = List(Vectors.dense(Array(-2.0,5.0,1.0)),Vectors.dense(Array(2.0,0.0,1.0))) 16 | val dataset = sc.parallelize(vectors) 17 | 18 | //with mean = true, with std = true 19 | val scaler = new StandardScaler(true,true) 20 | val scalerModel = scaler.fit(dataset) 21 | scalerModel.transform(dataset).collect.foreach(println) 22 | 23 | val normalizer = new Normalizer() 24 | normalizer.transform(dataset).collect.foreach(println) 25 | 26 | } 27 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/mllib/LinearRegr.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.mllib 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.regression.LabeledPoint 6 | import org.apache.spark.mllib.feature.HashingTF 7 | import org.apache.spark.mllib.regression.LinearRegressionWithSGD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.mllib.linalg.Vectors 10 | 11 | 12 | object LinearRegr { 13 | 14 | def main(args: Array[String]): Unit = { 15 | 16 | val conf = new SparkConf().setAppName("LinearRegression") 17 | val sc = new SparkContext(conf) 18 | val sqlContext = new SQLContext(sc) 19 | 20 | val features = Array("price","numBeds","year","sqft") 21 | val path = "/spark_learning/house_data.csv" 22 | val housePrice = sc.textFile(path).map(line => Vectors.dense(line.split(",").map(_.toDouble))) 23 | 24 | val houseFeaturesLP = housePrice.map(house => LabeledPoint(house(0).toLong,house)) 25 | 26 | 27 | val lrModel = LinearRegressionWithSGD.train(houseFeaturesLP,10) 28 | 29 | println(lrModel.intercept+" "+lrModel.weights) 30 | 31 | val entry = "0,5,2016,4000" 32 | val newEntry = LabeledPoint(0,Vectors.dense(entry.split(",").map(_.toDouble))) 33 | println(lrModel.predict(newEntry.features)) 34 | 35 | 36 | 37 | } 38 | } -------------------------------------------------------------------------------- 
/spark/src/main/scala/com/vishnu/spark/mllib/LogisticRegr.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.mllib 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.feature.HashingTF 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | import org.apache.spark.mllib.classification.LogisticRegressionWithSGD 8 | 9 | object LogisticRegr { 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | val conf = new SparkConf().setAppName("LogisticRegression") 14 | val sc = new SparkContext(conf) 15 | 16 | val tf = new HashingTF(10000) 17 | 18 | val spam = sc.textFile("/spark_learning/spam.txt") 19 | val normal = sc.textFile("/spark_learning/normal.txt") 20 | 21 | val spamFeatures = spam.map(email=> tf.transform(email.split(" "))) 22 | val normalFeatures = normal.map(email=> tf.transform(email.split(" "))) 23 | 24 | val positiveLP = spamFeatures.map(features => LabeledPoint(1,features)) 25 | val negativeLP = normalFeatures.map(features => LabeledPoint(0,features)) 26 | 27 | val trainingData = positiveLP.union(negativeLP) 28 | trainingData.cache() 29 | 30 | val model = new LogisticRegressionWithSGD().run(trainingData) 31 | 32 | 33 | val newMail = tf.transform("You have won 100000$ free".split(" ")) 34 | model.predict(newMail) 35 | 36 | 37 | 38 | } 39 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/mllib/TFIDF.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.mllib 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.feature.HashingTF 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | import org.apache.spark.mllib.classification.LogisticRegressionWithSGD 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.mllib.feature.IDF 10 | 11 | object TFIDF { 12 | 13 | def main(args: Array[String]): Unit = { 14 | 15 | val conf = new SparkConf().setAppName("TFIDF") 16 | val sc = new SparkContext(conf) 17 | 18 | val tf = new HashingTF() 19 | 20 | val docs = sc.wholeTextFiles("/spark_learning") 21 | val wordsRDD: RDD[Seq[String]] = docs.map{case (name,content)=> content.split(" ")} 22 | println(wordsRDD.take(3)) 23 | val tfVectors = tf.transform(wordsRDD).cache 24 | 25 | val idf = new IDF() 26 | val idfModel = idf.fit(tfVectors) 27 | val tfIdfVectors = idfModel.transform(tfVectors) 28 | 29 | 30 | } 31 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/mllib/house_data.csv: -------------------------------------------------------------------------------- 1 | 1000000,2,2010,1500 2 | 5000000,3,2015,2000 3 | 25000,1,200,900 4 | -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/sql/FromJson.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.sql 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.sql.SQLContext 6 | 7 | object FromJson { 8 | 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setAppName("SparkSQLBasics") 11 | val sc = new SparkContext(conf) 12 | val sqlContext = new SQLContext(sc) 13 | 14 | val input = 
sqlContext.read.json("/spark_learning/testweet.json") 15 | 16 | input.registerTempTable("tweets") 17 | val texts = sqlContext.sql("select text from tweets") 18 | 19 | 20 | //udf register 21 | sqlContext.udf.register("strLen",(x:String)=>{findLength(x)}) 22 | texts.foreach(println) 23 | } 24 | 25 | def findLength(x:String) = { 26 | x.length 27 | } 28 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/sql/HiveTest.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.sql 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.sql.hive.HiveContext 6 | 7 | object HiveTest { 8 | 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setAppName("SparkSQLBasics") 11 | val sc = new SparkContext(conf) 12 | 13 | val sqlContext = new HiveContext(sc) 14 | 15 | val input = sqlContext.read.json("/spark_learning/testweet.json") 16 | input.registerTempTable("tweets") 17 | val texts = sqlContext.sql("select text from tweets") 18 | texts.saveAsTable("texts") 19 | } 20 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/sql/ToMongoDB.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.sql 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.sql.SQLContext 6 | import com.stratio.datasource._ 7 | import com.stratio.datasource.mongodb._ 8 | import com.stratio.datasource.mongodb.schema._ 9 | import com.stratio.datasource.mongodb.writer._ 10 | import com.stratio.datasource.mongodb.config.MongodbConfig._ 11 | import org.apache.spark.sql.SQLContext 12 | import com.stratio.datasource.util.Config._ 13 | import com.stratio.datasource.mongodb.config.MongodbConfigBuilder 14 | 15 | /** 16 | * Using https://github.com/Stratio/Spark-MongoDB 17 | */ 18 | object ToMongoDB { 19 | 20 | def main(args: Array[String]): Unit = { 21 | val conf = new SparkConf().setAppName("ToMongoDB") 22 | val sc = new SparkContext(conf) 23 | val sqlContext = new SQLContext(sc) 24 | 25 | val input = sqlContext.read.json("/spark_learning/testweet.json") 26 | val avroInput = sqlContext.read.format("com.databricks.spark.avro").load("/spark_learning/avro/") 27 | 28 | input.registerTempTable("tweets") 29 | val targetData = sqlContext.sql("Select * from tweets") 30 | 31 | 32 | val targetOutputBuilder = MongodbConfigBuilder( 33 | Map(Host -> List("localhost:27017"), 34 | Database -> "test", 35 | Collection -> "target", 36 | SamplingRatio -> 1.0, 37 | WriteConcern -> "normal", 38 | SplitKey -> "_id", 39 | SplitSize -> 8)) 40 | 41 | val writeConfig = targetOutputBuilder.build() 42 | 43 | // Writing data into the mongoDb table 44 | //targetData.saveToMongodb(writeConfig) 45 | //write avro data to mongodb dable 46 | avroInput.saveToMongodb(writeConfig) 47 | } 48 | 49 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/sql/res/twitter.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soniclavier/bigdata-notebook/a708de834fd282ba576f15f87a46f3953695a9ad/spark/src/main/scala/com/vishnu/spark/sql/res/twitter.avro -------------------------------------------------------------------------------- 
/spark/src/main/scala/com/vishnu/spark/sql/res/twitter.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type" : "record", 3 | "name" : "twitter_schema", 4 | "namespace" : "com.miguno.avro", 5 | "fields" : [ { 6 | "name" : "username", 7 | "type" : "string", 8 | "doc" : "Name of the user account on Twitter.com" 9 | }, { 10 | "name" : "tweet", 11 | "type" : "string", 12 | "doc" : "The content of the user's Twitter message" 13 | }, { 14 | "name" : "timestamp", 15 | "type" : "long", 16 | "doc" : "Unix epoch time in seconds" 17 | } ], 18 | "doc:" : "A basic schema for storing Twitter messages" 19 | } 20 | -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/FlumeStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.streaming._ 5 | import org.apache.spark.streaming.StreamingContext._ 6 | import org.apache.spark.streaming.flume._ 7 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 8 | 9 | 10 | object FlumeStreaming { 11 | def main(args: Array[String]) { 12 | 13 | val host = "localhost" 14 | val port = 4444 15 | val conf = new SparkConf().setAppName("FlumeStreaming").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 16 | val ssc = new StreamingContext(conf, Seconds(1)) 17 | 18 | val stream = FlumeUtils.createStream(ssc, host, port) 19 | val words = stream.flatMap(_.event.toString().split(" ")) 20 | val pairs = words.map(word => (word,1)) 21 | val wordCounts = pairs.reduceByKey(_+_) 22 | 23 | wordCounts.print() 24 | 25 | ssc.start() 26 | ssc.awaitTermination() 27 | } 28 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/KafkaDirectStream.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming 2 | 3 | 4 | import org.apache.spark._ 5 | import org.apache.spark.streaming.StreamingContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.kafka.KafkaUtils 8 | import kafka.serializer.StringDecoder 9 | 10 | 11 | 12 | /** 13 | * DirectStream approach periodically queries the kafka topic for new offset and takes in data 14 | * from previous offset to new offset as an RDD 15 | * 16 | * 1. creates as many RDD partitions as there are kafka partitions 17 | * 2. no need of write ahead log to ensure no data loss 18 | * 3. no zookeeper hence hence exactly-once guarantee can be maintained. In the case of zookeeper 19 | * there might some miss communication b/w spark and zookeeper during failures and chances are 20 | * there that some data may be read twice. 
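 * Note also that "exactly-once" here refers to reading each record exactly once; end-to-end
 * exactly-once output still requires tracking or checkpointing the offsets and writing the
 * results idempotently or transactionally.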
21 | * 22 | */ 23 | object KafkaDirectStream { 24 | 25 | def main(args :Array[String]) { 26 | val conf = new SparkConf().setAppName("KafkaStreaming").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 27 | val ssc = new StreamingContext(conf,Seconds(1)) 28 | val topics = "spark_streaming" 29 | val topicsSet = topics.split(",").toSet 30 | val brokers = "localhost:9092" 31 | val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers) 32 | val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet) 33 | val lines = messages.map(_._2) 34 | 35 | val words = lines.flatMap(_.split(" ")) 36 | val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _) 37 | wordCounts.print() 38 | 39 | // Start the computation 40 | ssc.start() 41 | ssc.awaitTermination() 42 | 43 | } 44 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/KafkaStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming 2 | 3 | import org.apache.spark.streaming.kafka._ 4 | import org.apache.spark._ 5 | import org.apache.spark.streaming._ 6 | import org.apache.spark.streaming.StreamingContext._ 7 | 8 | /** 9 | * @author : vishnu viswanath 10 | * Receiver based approach, i.e., used kafka consumer api to implement receiver 11 | * Drawback : possible loss of data incase of failures 12 | * 13 | * Solution : use write-ahead logs and Reliable receivers. 14 | * Spark provides a built in ReliableKafkaReceiver class which is not used by default. 15 | * To use this receiver, set spark.streaming.receiver.writeAheadLog.enable to true 16 | * 17 | * 18 | */ 19 | object KafkaStreaming { 20 | 21 | def main(args: Array[String]) { 22 | val conf = new SparkConf().setAppName("KafkaStreaming").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 23 | val ssc = new StreamingContext(conf, Seconds(1)) 24 | 25 | //default zookeeper quorum is localhost in single node setup 26 | val zqQuorum = "localhost" 27 | val groupId = "spark" 28 | val topics = "spark_streaming" 29 | val topicMap = topics.split(",").map((_, 1)).toMap 30 | val lines = KafkaUtils.createStream(ssc,zqQuorum,groupId,topicMap) 31 | val words = lines.map(_._2).flatMap(_.split(" ")) 32 | val pairs = words.map(word => (word,1)) 33 | val wordCounts = pairs.reduceByKey(_+_) 34 | wordCounts.print() 35 | 36 | ssc.start() 37 | ssc.awaitTermination() 38 | } 39 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/SeqFileStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.streaming._ 5 | import org.apache.spark.streaming.StreamingContext._ 6 | import org.apache.hadoop.io.Text 7 | import org.apache.hadoop.io.IntWritable 8 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat 9 | 10 | 11 | /** 12 | * An example of how to stream from sequence file 13 | */ 14 | object SeqFileStreaming { 15 | def main(args: Array[String]) { 16 | 17 | val conf = new SparkConf().setAppName("SeqFileStreaming").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 18 | val ssc = new StreamingContext(conf, Seconds(10)) 19 | val inputDir = "/usr/vishnu/spark_temp/seqfile_sample/" 20 | val keyValue = ssc.fileStream[Text,IntWritable, 
SequenceFileInputFormat[Text,IntWritable]](inputDir).map { 21 | 22 | //x.toString is needed because Text by itself is not serializ able and it will throw an error 23 | case (x,y) => (x.toString,y.get()) 24 | } 25 | keyValue.print() 26 | 27 | ssc.start() 28 | ssc.awaitTermination() 29 | } 30 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/SocketStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.streaming._ 5 | import org.apache.spark.streaming.StreamingContext._ 6 | 7 | 8 | /** 9 | * Example from spark programming guide 10 | * https://spark.apache.org/docs/1.4.1/streaming-programming-guide.html 11 | */ 12 | object SocketStreaming { 13 | def main(args: Array[String]) { 14 | 15 | val conf = new SparkConf().setAppName("BasicStreaming").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 16 | val ssc = new StreamingContext(conf, Seconds(1)) 17 | 18 | val lines = ssc.socketTextStream("localhost", 9999) 19 | val words = lines.flatMap(_.split(" ")) 20 | val pairs = words.map(word => (word,1)) 21 | val wordCounts = pairs.reduceByKey(_+_) 22 | 23 | wordCounts.print() 24 | 25 | ssc.start() 26 | ssc.awaitTermination() 27 | } 28 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/StreamingFromCheckpoint.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.streaming.StreamingContext 5 | import org.apache.spark.streaming.Seconds 6 | 7 | 8 | object StreamingFromCheckpoint { 9 | 10 | val checkpoint_dir = "/user/vishnu/spark_checkpoint" 11 | var dataDir = "" 12 | def main(args: Array[String]): Unit = { 13 | dataDir = args(0) 14 | val ssc = StreamingContext.getOrCreate(checkpoint_dir,createStreamingContext _) 15 | 16 | ssc.start() 17 | ssc.awaitTermination() 18 | } 19 | 20 | def createStreamingContext() = { 21 | println("creating new stream") 22 | val conf = new SparkConf().setAppName("StreamingRecoverFromCheckpoint") 23 | val ssc = new StreamingContext(conf,Seconds(10)) 24 | ssc.checkpoint(checkpoint_dir) 25 | val dataDirDStream = ssc.textFileStream(dataDir) 26 | dataDirDStream.print() 27 | ssc 28 | } 29 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/StreamingJoins.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.streaming._ 5 | import org.apache.spark.streaming.StreamingContext._ 6 | 7 | 8 | /** 9 | * An example with multiple receivers and stream joins 10 | * This is also an example of Multiple DStream, => this created multiple receivers 11 | */ 12 | object StreamingJoins { 13 | def main(args: Array[String]) { 14 | 15 | val conf = new SparkConf().setAppName("StreamingJoins").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 16 | val ssc = new StreamingContext(conf, Seconds(10)) 17 | 18 | val stream1 = ssc.socketTextStream("localhost", 9999) 19 | val stream2 = ssc.socketTextStream("localhost", 8888) 20 | 21 | 22 | val words1 = stream1.map(processLine) 23 | val words2 = stream2.map(processLine) 24 | val joined = words1.join(words2) 
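//join is computed per micro-batch: records from the two socket streams are paired
//when their key (the first word of the line, extracted in processLine) matches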
25 | joined.print() 26 | 27 | ssc.start() 28 | ssc.awaitTermination() 29 | } 30 | 31 | def processLine(line:String) = { 32 | val words = line.split(" ") 33 | (words(0),line) 34 | } 35 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/StreamingWindow.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.streaming.StreamingContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming._ 8 | import org.apache.spark.SparkContext._ 9 | import org.apache.spark.streaming.api._ 10 | import org.apache.spark.streaming.dstream.DStream 11 | import org.apache.spark.streaming.StreamingContext._ 12 | import org.apache.spark.SparkContext._ 13 | import org.apache.spark.sql.SQLContext 14 | 15 | object StreamingWindow { 16 | 17 | def main(args: Array[String]): Unit = { 18 | //confs 19 | val conf = new SparkConf().setAppName("StreamingWindow") 20 | val sc = new SparkContext(conf) 21 | val ssc = new StreamingContext(sc, Seconds(1)) 22 | 23 | //stream from text file 24 | val linesDStream = ssc.textFileStream("/user/vishnu/mapr/dev362"); 25 | val sensorDStream = linesDStream.map(parseSensor) 26 | 27 | //count of events by resid 28 | val counts = sensorDStream.map(sensor=>(sensor.resid,1)).reduceByKeyAndWindow((a:Int,b:Int)=>(a+b), Seconds(6), Seconds(2)) 29 | counts.print() 30 | 31 | //6 seconds data, 2 seconds window 32 | sensorDStream.window(Seconds(6),Seconds(2)).foreachRDD { 33 | rdd => 34 | if (!rdd.partitions.isEmpty) { 35 | val sqlContext = SQLContext.getOrCreate(rdd.sparkContext) 36 | import sqlContext.implicits._ 37 | import org.apache.spark.sql.functions._ 38 | 39 | val sensorDF = rdd.toDF() 40 | sensorDF.registerTempTable("sensor") 41 | 42 | val res = sqlContext.sql("SELECT resid, date, count(resid) as total FROM sensor GROUP BY resid, date") 43 | println("sensor count ") 44 | res.show 45 | val res2 = sqlContext.sql("SELECT resid, date, MAX(psi) as maxpsi, min(psi) as minpsi, avg(psi) as avgpsi FROM sensor GROUP BY resid,date") 46 | println("sensor max, min, averages ") 47 | res2.show 48 | } 49 | } 50 | 51 | 52 | println("Starting streaming") 53 | ssc.start() 54 | ssc.awaitTermination() 55 | 56 | 57 | } 58 | 59 | case class Sensor(resid: String, date: String, time: String, hz: Double, disp: Double, flo: Double, sedPPM: Double, psi: Double, chlPPM: Double) extends Serializable 60 | 61 | def parseSensor(str: String): Sensor = { 62 | val p = str.split(",") 63 | Sensor(p(0), p(1), p(2), p(3).toDouble, p(4).toDouble, p(5).toDouble, p(6).toDouble, p(7).toDouble, p(8).toDouble) 64 | } 65 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/StreamingWithCheckpoint.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.streaming._ 5 | import org.apache.spark.streaming.StreamingContext._ 6 | 7 | 8 | /** 9 | * An example of Streaming with checkpointing 10 | */ 11 | object StreamingWithCheckpointing { 12 | def main(args: Array[String]) { 13 | 14 | val conf = new SparkConf().setAppName("StreamingWithCheckpointing").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 15 | val ssc = new StreamingContext(conf, 
Seconds(1)) 16 | ssc.checkpoint("hdfs:///user/vishnu/spark_checkpoint") 17 | 18 | val linesDStream = ssc.socketTextStream("localhost", 9999) 19 | 20 | val lines = linesDStream.window(Seconds(5),Seconds(10)) 21 | val words = lines.flatMap(_.split(" ")) 22 | val pairs = words.map(word => (word,1)) 23 | 24 | pairs.checkpoint(Seconds(10)); 25 | val wordCounts = pairs.reduceByKey(_+_) 26 | 27 | wordCounts.print() 28 | 29 | ssc.start() 30 | ssc.awaitTermination() 31 | } 32 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/UpdateStateByKey.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.streaming._ 5 | import org.apache.spark.streaming.StreamingContext._ 6 | 7 | 8 | /** 9 | * An example of update state by key 10 | * Each log entry contains 11 | * based on the action, state of the userid is udpated 12 | */ 13 | object UpdateStateByKey { 14 | def main(args: Array[String]) { 15 | 16 | 17 | val conf = new SparkConf().setAppName("UpdateStateByKey").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 18 | val ssc = new StreamingContext(conf, Seconds(1)) 19 | ssc.checkpoint("hdfs:///user/vishnu/spark_checkpoint") 20 | 21 | 22 | val linesDStream = ssc.socketTextStream("localhost", 9999) 23 | 24 | //input is expected to be of the format 25 | val userActionPair = linesDStream.map(line => { 26 | val parts = line.split(" ") 27 | (parts(0),parts(1)) 28 | }) 29 | 30 | val userStates = userActionPair.updateStateByKey(updateUserState) 31 | userStates.print() 32 | 33 | ssc.start() 34 | ssc.awaitTermination() 35 | } 36 | 37 | def updateUserState(values: Seq[String], state:Option[String]) = { 38 | val currState = state.getOrElse("Unknown") 39 | var newState = Option(currState) 40 | if (!currState.equals(values.lastOption)) { 41 | if (values.lastOption != None) { 42 | newState = values.lastOption 43 | } 44 | } 45 | newState 46 | } 47 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/WindowedStream.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.streaming._ 5 | import org.apache.spark.streaming.StreamingContext._ 6 | 7 | 8 | /** 9 | * Streaming with sliding window 10 | */ 11 | object WindowedStream { 12 | def main(args: Array[String]) { 13 | 14 | val conf = new SparkConf().setAppName("StreamingWithCheckpointing").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 15 | val ssc = new StreamingContext(conf, Seconds(1)) 16 | ssc.checkpoint("hdfs:///user/vishnu/spark_checkpoint") 17 | 18 | 19 | val linesDStream = ssc.socketTextStream("localhost", 9999) 20 | val lines = linesDStream.window(Seconds(10),Seconds(5)) 21 | val words = lines.flatMap(_.split(" ")) 22 | val pairs = words.map(word => (word,1)) 23 | pairs.checkpoint(Seconds(10)); 24 | val wordCounts = pairs.reduceByKey(_+_) 25 | //wordCounts.print() 26 | 27 | 28 | //reduce by key and window, will do reduce by key and use the first function to do the aggregation 29 | //and second function to do the inverse aggregation 30 | val windowedWordCount = pairs.reduceByKeyAndWindow({(x,y)=>x+y},{(x,y)=>x-y}, Seconds(10),Seconds(5)) 31 | //windowedWordCount.print() 32 | 33 | 34 | 35 | //expected input 36 | //e.g, 10.90.123.42 some 
long log content 37 | val logsDStream = ssc.socketTextStream("localhost", 8888) 38 | val ipAddress = logsDStream.map(line => line.split(" ")(0)) 39 | val count1 = ipAddress.countByValueAndWindow(Seconds(10),Seconds(5)); 40 | val count2 = ipAddress.countByWindow(Seconds(10),Seconds(5)); 41 | 42 | count1.print() 43 | count2.print() 44 | 45 | 46 | 47 | 48 | 49 | 50 | ssc.start() 51 | ssc.awaitTermination() 52 | } 53 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/akka/SendToActor.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming.akka 2 | 3 | import org.apache.spark._ 4 | import akka.actor.ActorSystem 5 | 6 | /** 7 | * This object is used to send messages to the HelloSpark Akka actor 8 | */ 9 | object SendToActor { 10 | 11 | def main(args: Array[String]) : Unit = { 12 | val actorSystem = ActorSystem("sparkMaster") 13 | 14 | val url = s"akka.tcp://sparkDriver@${SparkAkkaSource.driverHost}:${SparkAkkaSource.driverPort}/user/Supervisor0/${SparkAkkaSource.actorName}" 15 | val helloer = actorSystem.actorSelection(url) 16 | 17 | var ok = true 18 | while (ok) { 19 | val ln = readLine() 20 | ok = ln != null 21 | if (ok) { 22 | helloer ! ln 23 | } 24 | } 25 | } 26 | 27 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/akka/SparkAkkaSource.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming.akka 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.streaming._ 5 | import akka.actor.Props 6 | import org.apache.spark.streaming.receiver.ActorHelper 7 | import akka.actor.Actor 8 | 9 | /** 10 | * Example from http://www.lightbend.com/activator/template/spark-streaming-scala-akka#code/src/main/scala/StreamingApp.scala 11 | */ 12 | 13 | //INCOMPLETE 14 | 15 | class HelloSpark extends Actor with ActorHelper { 16 | 17 | override def preStart() = { 18 | println("") 19 | println("Starting HelloSpark Actor") 20 | println("") 21 | } 22 | 23 | def receive = { 24 | case s => store(s) 25 | } 26 | } 27 | 28 | object SparkAkkaSource { 29 | 30 | //fix the driver port; it is random by default, but we need to know it to build the actor URL 31 | val driverPort = 7777 32 | val driverHost = "localhost" 33 | val actorName = "HelloSparkActor" 34 | 35 | def main(args: Array[String]): Unit = { 36 | 37 | val conf = new SparkConf(false) 38 | .setMaster("local[*]") 39 | .setAppName("Spark Streaming from Akka") 40 | .set("spark.logConf", "true") 41 | .set("spark.driver.port", driverPort.toString) 42 | .set("spark.driver.host", driverHost) 43 | .set("spark.akka.logLifeCycleEvents", "true") 44 | 45 | val ssc = new StreamingContext(conf, Seconds(1)) 46 | 47 | val actorStream = ssc.actorStream[String](Props[HelloSpark], actorName) 48 | actorStream.print() 49 | 50 | ssc.start() 51 | java.util.concurrent.TimeUnit.SECONDS.sleep(3) 52 | 53 | val actorSystem = SparkEnv.get.actorSystem 54 | 55 | val url = s"akka.tcp://sparkDriver@$driverHost:$driverPort/user/vishnu/$actorName" 56 | val helloer = actorSystem.actorSelection(url) 57 | helloer ! "Hello" 58 | helloer ! "from" 59 | helloer ! "Spark Streaming" 60 | helloer ! "with" 61 | helloer ! "Scala" 62 | helloer ! "and" 63 | helloer !
"Akka" 64 | 65 | val ln = readLine() 66 | ssc.stop(stopSparkContext = true, stopGracefully = true) 67 | 68 | } 69 | 70 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/customsource/ActivityReceiver.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming.customsource 2 | 3 | import org.apache.spark.streaming.receiver.Receiver 4 | import org.apache.spark.storage.StorageLevel 5 | import scala.util.control.Breaks._ 6 | import java.net.Socket 7 | import java.io.BufferedReader 8 | import java.io.InputStreamReader 9 | 10 | 11 | case class Activity(user: String,action:String) 12 | 13 | /** 14 | * Based on https://www.mapr.com/blog/how-integrate-custom-data-sources-apache-spark?platform=hootsuite 15 | */ 16 | class ActivityReceiver(port:Int) extends Receiver[Activity] (StorageLevel.MEMORY_ONLY){ 17 | 18 | override def onStart(): Unit = { 19 | println("Activity Receiver starting") 20 | val thread = new Thread("ActivityReceiverThread") { 21 | override def run() { 22 | val socket = new Socket("localhost",port) 23 | val reader = new BufferedReader(new InputStreamReader (socket.getInputStream(), "UTF-8")) 24 | var line = "" 25 | while(!isStopped()) { 26 | var line = reader.readLine() 27 | if (line == null) break 28 | else { 29 | val parts = line.split(" ") 30 | val activity = Activity(parts(0),parts(1)) 31 | store(activity) 32 | } 33 | } 34 | } 35 | } 36 | thread.start() 37 | } 38 | 39 | override def onStop(): Unit = { 40 | stop("Activity receiver stopping") 41 | } 42 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/customsource/StreamingWithCustomSource.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming.customsource 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.streaming.StreamingContext 5 | import org.apache.spark.streaming.Seconds 6 | 7 | object StreamingWithCustomSource { 8 | 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setAppName("StreamingWithCustomSource") 11 | val ssc = new StreamingContext(conf,Seconds(5)) 12 | 13 | val activityDStream = ssc.receiverStream(new ActivityReceiver(9999)) 14 | activityDStream.print() 15 | 16 | ssc.start() 17 | ssc.awaitTermination() 18 | 19 | } 20 | } -------------------------------------------------------------------------------- /spark/uberjar.md: -------------------------------------------------------------------------------- 1 | ## How to create Uber/Fat jar using sbt 2 | 3 | 1. create assembly.sbt file inside your project folder. 4 | e.g., if your eclipse project root folder is spark_learn, then you should have spark_learn/project/assembly.sbt 5 | 2. add below lines to assembly.sbt 6 | ``` 7 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.2") 8 | ``` 9 | 3. To create the jar, from your sbt console type `assembly`. Note that you won't get a fat jar if you use the command `package` 10 | 4. All dependecies would now be packaged in your jar file. to exclude a jar file and its dependencies, mention it as provided. 11 | e.g., `val spark_streaming = "org.apache.spark" % "spark-streaming_2.10" % "1.6.0" % "provided"` 12 | 5. 
You might need to add some merge strategies since multiple dependencies can depend on the same dependency (e.g., A can depend on C and B can depend on another version of C). 13 | 14 | This is how my merge strategy is defined; it has worked so far for Spark: 15 | 16 | ``` 17 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => 18 | { 19 | case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first 20 | case PathList(ps @ _*) if ps.last endsWith ".html" => MergeStrategy.first 21 | case x if x.contains("unused") => MergeStrategy.last 22 | case "application.conf" => MergeStrategy.concat 23 | case "unwanted.txt" => MergeStrategy.discard 24 | case x => old(x) 25 | } 26 | } 27 | ``` 28 | -------------------------------------------------------------------------------- /spark_23/build.sbt: -------------------------------------------------------------------------------- 1 | name := "spark_23" 2 | version := "1.0" 3 | scalaVersion := "2.11.9" 4 | 5 | val sparkVersion = "2.4.0-SNAPSHOT" 6 | val kafkaVersion = "0.10.2.1" 7 | 8 | resolvers += Resolver.mavenLocal 9 | 10 | libraryDependencies += "org.apache.spark" %% "spark-sql-kafka-0-10" % sparkVersion 11 | libraryDependencies += "org.apache.spark" %% "spark-core" % sparkVersion 12 | libraryDependencies += "org.apache.spark" %% "spark-sql" % sparkVersion 13 | libraryDependencies += "org.apache.kafka" % "kafka-clients" % kafkaVersion 14 | 15 | libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.2" % Test 16 | 17 | -------------------------------------------------------------------------------- /spark_23/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 0.13.16 -------------------------------------------------------------------------------- /spark_23/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | logLevel := Level.Warn -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/ContinuousKafkaStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.streaming 2 | 3 | import java.sql.Timestamp 4 | 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.streaming.Trigger 7 | 8 | 9 | //parse raw kafka records into typed data 10 | case class CarEvent(carId: String, speed: Option[Int], acceleration: Option[Double], timestamp: Timestamp) 11 | object CarEvent { 12 | def apply(rawStr: String): CarEvent = { 13 | val parts = rawStr.split(",") 14 | CarEvent(parts(0), Some(Integer.parseInt(parts(1))), Some(java.lang.Double.parseDouble(parts(2))), new Timestamp(parts(3).toLong)) 15 | } 16 | } 17 | 18 | /** 19 | * Created by vviswanath on 2/18/18.
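 *
 * Reads car events from the "cars" Kafka topic, keeps only the cars with speed above 70,
 * and writes them to the console and to the "fastcars" Kafka topic using continuous
 * processing (Trigger.Continuous) instead of micro-batching.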
20 | */ 21 | object ContinuousKafkaStreaming { 22 | 23 | def main(args: Array[String]): Unit = { 24 | val spark = SparkSession.builder() 25 | .appName("ContinuousStreaming Kafka example") 26 | .master("local[*]") 27 | .getOrCreate() 28 | 29 | import spark.implicits._ 30 | 31 | val raw = spark 32 | .readStream 33 | .format("kafka") 34 | .option("kafka.bootstrap.servers", "localhost:9092") 35 | .option("subscribe", "cars") 36 | .load() 37 | 38 | //supported operations in Continuous Processing includes - Map, Filter, Project 39 | val fastCars = raw 40 | .selectExpr("CAST(value as STRING)") //project 41 | .map(r ⇒ CarEvent(r.getString(0))) //map 42 | .filter("speed > 70") //filter 43 | //.filter(c ⇒ c.speed.getOrElse(0) > 70) //TypedFilter not supported in continuous processing, 44 | 45 | 46 | val consoleQuery = fastCars 47 | .writeStream 48 | .format("console") 49 | .outputMode("append") 50 | //.outputMode("update") 51 | //.outputMode("complete") not supported since it requires an agg, and Continuous processing does not support aggregations. 52 | .trigger(Trigger.Continuous("1 second")) 53 | .start() 54 | 55 | 56 | val kafkaSinkQuery = fastCars 57 | .selectExpr("CAST(carId as STRING) as value") //kafka needs a value field 58 | .writeStream 59 | .format("kafka") 60 | .outputMode("update") 61 | .option("kafka.bootstrap.servers", "localhost:9092") 62 | .option("topic", "fastcars") 63 | .option("checkpointLocation", "/tmp/spark/continuousCheckpoint") 64 | .outputMode("update") 65 | .trigger(Trigger.Continuous("10 seconds")) //how often to checkpoint the offsets, 66 | .start() 67 | 68 | spark.streams.awaitAnyTermination() 69 | 70 | } 71 | 72 | } 73 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/CustomV2SourceExample.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.streaming 2 | import com.vishnuviswanath.spark.streaming.sources.netcat.NetcatSourceProvider 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.Trigger 5 | 6 | /** 7 | * Created by vviswanath on 2/20/18. 8 | * 9 | * An example that uses CustomV2 source {@link NetcatSourceProvider} 10 | */ 11 | object CustomV2SourceExample { 12 | 13 | def main(args: Array[String]): Unit = { 14 | val spark = SparkSession.builder() 15 | .appName("CustomV2 source") 16 | .master("local[*]") 17 | .getOrCreate() 18 | 19 | spark.sparkContext.setLogLevel("ERROR") 20 | 21 | import spark.implicits._ 22 | val raw = spark 23 | .readStream 24 | .format(classOf[NetcatSourceProvider].getName) 25 | .option("port", 9999) 26 | .option("host", "localhost") 27 | .option("buffSize", 100) 28 | .load() 29 | 30 | val consoleQuery = raw 31 | .selectExpr("cast(value as STRING)") 32 | .writeStream 33 | .queryName("console-query") 34 | .format("console") 35 | .outputMode("update") 36 | //.outputMode("update") 37 | //.outputMode("complete") not supported since it requires an agg, and Continuous processing does not support aggregations. 
38 | .trigger(Trigger.Continuous("3 second")) //how often to checkpoint 39 | .start() 40 | 41 | consoleQuery.awaitTermination() 42 | 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/HelloStructredStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.streaming 2 | 3 | import org.apache.spark.sql.{Dataset, SparkSession} 4 | 5 | /** 6 | * Created by vviswanath on 1/9/18. 7 | * 8 | * Word count program to get started with Spark Structured Streaming 9 | */ 10 | object HelloStructredStreaming { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | //create a spark session, and run it on local mode 15 | val spark = SparkSession.builder() 16 | .appName("HelloStructuredStreaming") 17 | .master("local[*]") 18 | .getOrCreate() 19 | 20 | import spark.implicits._ 21 | 22 | //read from a directory as text stream 23 | val readme: Dataset[String] = spark 24 | .readStream 25 | .textFile("/Users/vviswanath/Downloads/streaming_input_dir/cars/") 26 | 27 | //do word count 28 | val words = readme.flatMap(_.split(" ")) 29 | val wordCounts = words.groupBy("value").count() 30 | 31 | //run the wordCount query and write to console 32 | val query = wordCounts 33 | .writeStream 34 | .queryName("WordCount") 35 | .outputMode("complete") 36 | .format("console") 37 | .start() 38 | 39 | //wait till query.stop() is called 40 | query.awaitTermination() 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/SocketSourceStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.streaming 2 | 3 | import org.apache.spark.sql.streaming.Trigger 4 | import org.apache.spark.sql.{Dataset, SparkSession} 5 | 6 | /** 7 | * Created by vviswanath on 1/9/18. 8 | * 9 | * Wordcount from socket streams. 10 | * 11 | * nc -lk 9999 12 | */ 13 | object SocketSourceStreaming { 14 | 15 | def main(args: Array[String]): Unit = { 16 | 17 | //create a spark session, and run it on local mode 18 | val spark = SparkSession.builder() 19 | .appName("NetcatSourceStreaming") 20 | .master("local[*]") 21 | .getOrCreate() 22 | 23 | spark.sparkContext.setLogLevel("ERROR") 24 | 25 | import spark.implicits._ 26 | 27 | //read from a directory as text stream 28 | val socketData = spark 29 | .readStream 30 | .format("socket") 31 | .option("host", "localhost") 32 | .option("port", 9999) 33 | .load() 34 | 35 | //do word count 36 | val words = socketData.as[String].flatMap(_.split(" ")) 37 | val wordCounts = words.groupBy("value").count() 38 | 39 | //run the wordCount query and write to console 40 | val query = wordCounts 41 | .writeStream 42 | .queryName("WordCount") 43 | .outputMode("update") //output only the counts that changed 44 | //.outputMode("complete") //output all the counts seen till now 45 | .format("console") 46 | //.trigger(Trigger.ProcessingTime(5000)) //triggers the query every "interval" if any new element was received. 
47 | .start() 48 | 49 | 50 | 51 | //wait till query.stop() is called 52 | query.awaitTermination() 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/StreamingAggregations.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.streaming 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.types._ 5 | import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} 6 | 7 | /** 8 | * Created by vviswanath on 1/10/18. 9 | * 10 | * Explore Spark streaming aggregations. 11 | */ 12 | object StreamingAggregations { 13 | 14 | //convert aggregates into typed data 15 | case class CarEvent(car: String, speed: Option[Int], acceleration: Option[Double]) 16 | 17 | def main(args: Array[String]): Unit = { 18 | 19 | //create a spark session, and run it on local mode 20 | val spark = SparkSession.builder() 21 | .appName("StreaminAggregations") 22 | .master("local[*]") 23 | .getOrCreate() 24 | 25 | //spark.sparkContext.setLogLevel("WARN") 26 | 27 | import spark.implicits._ 28 | 29 | //define the schema 30 | val schema = StructType( 31 | StructField("car", StringType) :: 32 | StructField("speed", IntegerType) :: 33 | StructField("acceleration", DoubleType) :: Nil) 34 | 35 | //read the source 36 | val cars: DataFrame = spark 37 | .readStream 38 | .schema(schema) 39 | .csv("/Users/vviswanath/Downloads/streaming_input_dir/cars/") 40 | 41 | //do aggregates 42 | val aggregates = cars 43 | .groupBy("car") 44 | .agg( 45 | "speed" → "max", 46 | "acceleration" → "avg") 47 | .withColumnRenamed("max(speed)", "speed") 48 | .withColumnRenamed("avg(acceleration)", "acceleration") 49 | 50 | aggregates.printSchema() 51 | aggregates.explain() 52 | 53 | val typedAggregates = aggregates.as[CarEvent] 54 | val filtered = typedAggregates 55 | .filter(_.speed.exists(_ > 70)) 56 | .where("acceleration > 10") 57 | .repartition(10) 58 | 59 | val query = filtered 60 | .writeStream 61 | .queryName("fastVehicles") 62 | .partitionBy("car") 63 | .outputMode("complete") 64 | .format("console") 65 | .start() 66 | 67 | query.awaitTermination() 68 | 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/sources/netcat/NetcatContinuousReader.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.streaming.sources.netcat 2 | 3 | import java.util 4 | import java.util.Optional 5 | 6 | import org.apache.spark.sql.Row 7 | import org.apache.spark.sql.sources.v2.reader.{DataReader, DataReaderFactory} 8 | import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousReader, Offset, PartitionOffset} 9 | import org.apache.spark.sql.types.StructType 10 | 11 | /** 12 | * Created by vviswanath on 2/21/18. 
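 *
 * A ContinuousReader for the custom netcat source: it exposes a single partition whose
 * DataReader connects to the host, port and buffer size passed in through the source
 * options (localhost:9999 by default).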
13 | */ 14 | class NetcatContinuousReader(schema: StructType, 15 | sourceOptions: Map[String, String]) extends ContinuousReader { 16 | 17 | 18 | val numPartitions = 1 19 | 20 | private var offset: Offset = _ 21 | 22 | override def getStartOffset: Offset = offset 23 | 24 | override def mergeOffsets(offsets: Array[PartitionOffset]): Offset = new NetcatOffset 25 | 26 | override def setStartOffset(start: Optional[Offset]): Unit = {} 27 | 28 | override def deserializeOffset(json: String): Offset = new NetcatOffset 29 | 30 | override def commit(end: Offset): Unit = {} 31 | 32 | /** 33 | * Create a reader factory with just 1 reader. 34 | * @return 35 | */ 36 | override def createDataReaderFactories(): util.List[DataReaderFactory[Row]] = { 37 | java.util.Arrays.asList(new DataReaderFactory[Row] { 38 | val port = sourceOptions.getOrElse("port", "9999").toInt 39 | val host = sourceOptions.getOrElse("host", "localhost") 40 | val buffSize = sourceOptions.getOrElse("buffSize", "100").toInt 41 | override def createDataReader(): DataReader[Row] = new NetcatReader(port, host, buffSize) 42 | }) 43 | } 44 | 45 | override def readSchema(): StructType = schema 46 | 47 | override def stop(): Unit = {} 48 | } 49 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/sources/netcat/NetcatOffset.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.streaming.sources.netcat 2 | 3 | import org.apache.spark.sql.sources.v2 4 | import org.json4s.jackson.Serialization 5 | 6 | /** 7 | * Created by vviswanath on 2/21/18. 8 | */ 9 | class NetcatOffset extends v2.reader.streaming.Offset { 10 | 11 | override def json(): String = "{}" 12 | } 13 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/sources/netcat/NetcatReader.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.streaming.sources.netcat 2 | 3 | import java.io.{BufferedReader, InputStreamReader} 4 | import java.net.Socket 5 | 6 | import org.apache.spark.sql.Row 7 | import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousDataReader, PartitionOffset} 8 | 9 | /** 10 | * Created by vviswanath on 2/21/18. 
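 *
 * A ContinuousDataReader that opens a socket to the given host and port and emits each
 * line it reads as a single-column Row; next() blocks on readLine and returns false when
 * the stream ends.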
11 | */ 12 | class NetcatReader(port: Int, host: String, buffSize: Int) extends ContinuousDataReader[Row] { 13 | 14 | val conn = new Socket(host, port) 15 | val inReader = new BufferedReader(new InputStreamReader(conn.getInputStream)) 16 | 17 | var line: String = _ 18 | 19 | 20 | override def next(): Boolean = { 21 | line = inReader.readLine() 22 | line != null 23 | } 24 | 25 | override def get(): Row = { 26 | //print(s"read value $line") 27 | Row(line) 28 | } 29 | 30 | override def close(): Unit = { 31 | conn.close() 32 | } 33 | 34 | override def getOffset: PartitionOffset = NetcatPartitionOffset(0) 35 | } 36 | 37 | case class NetcatPartitionOffset(offset: Long) extends PartitionOffset 38 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/sources/netcat/NetcatSourceProvider.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.streaming.sources.netcat 2 | 3 | import java.util.Optional 4 | 5 | import org.apache.spark.sql.sources.DataSourceRegister 6 | import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceOptions} 7 | import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousReader 8 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 9 | 10 | import scala.collection.JavaConverters._ 11 | 12 | /** 13 | * Created by vviswanath on 2/21/18. 14 | */ 15 | class NetcatSourceProvider extends ContinuousReadSupport 16 | with DataSourceRegister { 17 | 18 | override def shortName(): String = "netcat" 19 | 20 | val netcatSchema: StructType = StructType(Seq(StructField("value", StringType))) 21 | 22 | override def createContinuousReader(schema: Optional[StructType], checkpointLocation: String, options: DataSourceOptions): ContinuousReader = { 23 | new NetcatContinuousReader(netcatSchema, options.asMap().asScala.toMap) 24 | } 25 | } -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/util/NetcatProducer.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.util 2 | 3 | import java.io.{BufferedWriter, OutputStreamWriter, PrintWriter} 4 | import java.net.ServerSocket 5 | 6 | /** 7 | * Created by vviswanath on 2/21/18. 
8 | * 9 | * A util for writing to a socket} 10 | */ 11 | object NetcatProducer { 12 | 13 | def main(args: Array[String]): Unit = { 14 | val port = 9999 15 | val server = new ServerSocket(port) 16 | val sleepInterval = 100 17 | 18 | 19 | val socket = server.accept() 20 | val outputStream = socket.getOutputStream 21 | 22 | val writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(outputStream))) 23 | 24 | val contentStream = WordsStream.stream 25 | 26 | for { 27 | word ← contentStream 28 | } { 29 | print(s"writing word $word\n") 30 | writer.println(word) 31 | writer.flush() 32 | Thread.sleep(sleepInterval) 33 | } 34 | 35 | print("close the producer?") 36 | System.in.read() 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/util/RandomCarsKafkaProducer.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.util 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 6 | 7 | import scala.annotation.tailrec 8 | import scala.util.{Random ⇒ r} 9 | /** 10 | * Created by vviswanath on 1/15/18. 11 | */ 12 | object RandomCarsKafkaProducer { 13 | 14 | def main(args: Array[String]): Unit = { 15 | val props = new Properties() 16 | props.put("bootstrap.servers", "localhost:9092") 17 | props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer") 18 | props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer") 19 | 20 | val producer = new KafkaProducer[String, String](props) 21 | val interval = 1000 22 | val topic = "cars" 23 | val numRecsToProduce: Option[Int] = None //None = infinite 24 | 25 | 26 | @tailrec 27 | def produceRecord(numRecToProduce: Option[Int]): Unit = { 28 | def generateCarRecord(topic: String): ProducerRecord[String, String] = { 29 | val carName = s"car${r.nextInt(10)}" 30 | val speed = r.nextInt(150) 31 | val acc = r.nextFloat * 100 32 | 33 | val value = s"$carName,$speed,$acc,${System.currentTimeMillis()}" 34 | print(s"Writing $value\n") 35 | val d = r.nextFloat() * 100 36 | if (d < 2) { 37 | //induce random delay 38 | println("Argh! some network dealy") 39 | Thread.sleep((d*100).toLong) 40 | } 41 | new ProducerRecord[String, String](topic,"key", value) 42 | } 43 | 44 | numRecToProduce match { 45 | case Some(x) if x > 0 ⇒ 46 | producer.send(generateCarRecord(topic)) 47 | Thread.sleep(interval) 48 | produceRecord(Some(x - 1)) 49 | 50 | case None ⇒ 51 | producer.send(generateCarRecord(topic)) 52 | Thread.sleep(interval) 53 | produceRecord(None) 54 | 55 | case _ ⇒ 56 | } 57 | } 58 | 59 | produceRecord(numRecsToProduce) 60 | 61 | 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/util/SimulateLateDateProducer.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.util 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util.{Calendar, Properties} 5 | 6 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 7 | 8 | import scala.annotation.tailrec 9 | import scala.util.{Random => r} 10 | 11 | /** 12 | * Created by vviswanath on 1/15/18. 
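 *
 * Sends a short scripted sequence of car events to the "cars" topic, including one whose
 * event timestamp is shifted back by the lateby parameter, to show how watermarking
 * handles late-arriving data.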
13 | */ 14 | object SimulateLateDateProducer { 15 | 16 | def main(args: Array[String]): Unit = { 17 | val props = new Properties() 18 | props.put("bootstrap.servers", "localhost:9092") 19 | props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer") 20 | props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer") 21 | 22 | val producer = new KafkaProducer[String, String](props) 23 | val topic = "cars" 24 | var mCount = 1 25 | 26 | def generateCarRecord(carName: String, speed: Int = r.nextInt(150), topic: String = topic, lateby: Long = 0): ProducerRecord[String, String] = { 27 | val acc = r.nextFloat * 100 28 | val nowTs = System.currentTimeMillis() 29 | val ts = nowTs - lateby 30 | val value = s"$carName,$speed,$acc,$ts" 31 | val format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") 32 | 33 | val cal = Calendar.getInstance() 34 | cal.setTimeInMillis(ts) 35 | 36 | val now = Calendar.getInstance() 37 | now.setTimeInMillis(nowTs) 38 | print(s"[$mCount] Writing $value at ${format.format(now.getTime)} with Event time = ${format.format(cal.getTime)}\n") 39 | mCount += 1 40 | new ProducerRecord[String, String](topic,"key", value) 41 | } 42 | 43 | producer.send(generateCarRecord("car1", speed = 75)) 44 | Thread.sleep(1000) 45 | producer.send(generateCarRecord("car2", speed = 20)) 46 | Thread.sleep(1000) 47 | producer.send(generateCarRecord("car2", speed = 20)) 48 | Thread.sleep(8000) 49 | producer.send(generateCarRecord("car2", speed = 20)) //this message has a hidden importance, it increments the event time 50 | Thread.sleep(3000) 51 | producer.send(generateCarRecord("car1", speed = 50, lateby = 12000)) 52 | 53 | /* 54 | this will not throw away the state for the last message even though its past the watermark, since the eventtime never got updated in between 55 | producer.send(generateCarRecord("car1", speed = 75)) 56 | Thread.sleep(1000) 57 | producer.send(generateCarRecord("car2", speed = 20)) 58 | Thread.sleep(1000) 59 | producer.send(generateCarRecord("car2", speed = 20)) 60 | Thread.sleep(1000) 61 | producer.send(generateCarRecord("car2", speed = 20)) 62 | Thread.sleep(8000) 63 | producer.send(generateCarRecord("car1", speed = 50, lateby = 12000))*/ 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/util/ToFileProducer.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.util 2 | 3 | import java.io._ 4 | 5 | /** 6 | * Created by vviswanath on 2/22/18. 7 | * 8 | * Writes to a file, rolls over every rollOverCount, stops when maxFiles #files are created. 
9 | * Sources from WordsStream.scala 10 | */ 11 | object ToFileProducer { 12 | 13 | def main(args: Array[String]): Unit = { 14 | 15 | val wordsStream = WordsStream.stream 16 | val rollOverCount = 1000 17 | var fileIndex = 0 18 | val maxFiles: Option[Int] = Some(50) 19 | 20 | val path = if (args.length > 0) args(0) else "/tmp/spark_file_stream" 21 | 22 | 23 | val filePrefix = "words_set" 24 | 25 | val stream = WordsStream.stream 26 | 27 | def rollingWriter(path: String, filePrefix: String)(index: Int, previousWriter: Option[PrintWriter]): PrintWriter = { 28 | previousWriter.foreach(w ⇒ { 29 | w.flush() 30 | w.close() 31 | }) 32 | val file = new File(s"$path/${filePrefix}_$index") 33 | print(s"new file created ${file.getAbsolutePath}\n") 34 | file.getParentFile.mkdirs() 35 | file.createNewFile() 36 | new PrintWriter(file) 37 | } 38 | 39 | val writerGen: (Int, Option[PrintWriter]) => PrintWriter = rollingWriter(path, filePrefix) 40 | 41 | var writer = writerGen(0, None) 42 | 43 | var wordsWritten = 0 44 | 45 | for { 46 | word ← stream 47 | } { 48 | if (wordsWritten == rollOverCount) { 49 | wordsWritten = 0 50 | if (maxFiles.isDefined && fileIndex + 1 > maxFiles.get) { 51 | System.exit(0) 52 | } 53 | fileIndex += 1 54 | writer = writerGen(fileIndex, Some(writer)) 55 | } 56 | writer.write(word+" ") 57 | wordsWritten += 1 58 | } 59 | 60 | writer.close() 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/util/WordsStream.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.util 2 | 3 | /** 4 | * Created by vviswanath on 2/22/18. 5 | */ 6 | object WordsStream { 7 | 8 | val content = "Structured Streaming is a scalable and fault-tolerant stream processing engine built on the Spark SQL engine. You can express your streaming computation the same way you would express a batch computation on static data. The Spark SQL engine will take care of running it incrementally and continuously and updating the final result as streaming data continues to arrive. You can use the Dataset/DataFrame API in Scala, Java, Python or R to express streaming aggregations, event-time windows, stream-to-batch joins, etc. The computation is executed on the same optimized Spark SQL engine. Finally, the system ensures end-to-end exactly-once fault-tolerance guarantees through checkpointing and Write Ahead Logs. In short, Structured Streaming provides fast, scalable, fault-tolerant, end-to-end exactly-once stream processing without the user having to reason about streaming.".split(" ") 9 | 10 | def infiniteWordsStream(content: Array[String], index: Int): Stream[String] = { 11 | val nextIndex = if (index == content.length - 1) 0 else index + 1 12 | content(index) #:: infiniteWordsStream(content, nextIndex) 13 | } 14 | 15 | val stream = infiniteWordsStream(content, 0) 16 | } 17 | -------------------------------------------------------------------------------- /spark_23/src/test/scala/com/vishnuviswanath/spark/streaming/HelloStructuredStreamingSpec.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.streaming 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.scalatest.{BeforeAndAfterEach, FunSuite} 5 | 6 | /** 7 | * Created by vviswanath on 1/10/18. 
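 *
 * ScalaTest FunSuite skeleton that creates a local SparkSession before each test and
 * stops it after each test.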
8 | */ 9 | class HelloStructuredStreamingSpec extends FunSuite with BeforeAndAfterEach { 10 | 11 | var spark: SparkSession = _ 12 | 13 | override def beforeEach(): Unit = { 14 | spark = SparkSession.builder() 15 | .appName("unitTest") 16 | .master("local") 17 | .getOrCreate() 18 | } 19 | 20 | override def afterEach(): Unit = { 21 | spark.stop() 22 | } 23 | 24 | test("Hello structured streaming") { 25 | 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /stormkafka/.gitignore: -------------------------------------------------------------------------------- 1 | .classpath 2 | .project 3 | .settings 4 | target 5 | 6 | *.class 7 | 8 | # Mobile Tools for Java (J2ME) 9 | .mtj.tmp/ 10 | 11 | # Package Files # 12 | *.jar 13 | *.war 14 | *.ear 15 | 16 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 17 | hs_err_pid* -------------------------------------------------------------------------------- /stormkafka/README.md: -------------------------------------------------------------------------------- 1 | This is a POC on how to build a near Realtime Processing system using **Apache Storm** and **Kafka** in **Java**.
2 | 3 | Messages come into a Kafka topic, Storm picks up these messages using Kafka Spout and gives it to a Bolt, 4 | which parses and identifies the message type based on the header. 5 | 6 | Once the message type is identified, the content of the message is extracted and is sent to different bolts for 7 | persistence - SOLR bolt, MongoDB bolt or HDFS bolt. 8 | 9 | [view the blog](http://vishnuviswanath.com/realtime-storm-kafka1.html) 10 | -------------------------------------------------------------------------------- /stormkafka/src/main/java/com/vishnu/storm/Keys.java: -------------------------------------------------------------------------------- 1 | package com.vishnu.storm; 2 | 3 | /** 4 | * @author vishnu viswanath 5 | * This is an utility class. It contains the keys that should be present in the input config-file 6 | */ 7 | public class Keys { 8 | 9 | 10 | public static final String TOPOLOGY_NAME = "topology"; 11 | 12 | //kafka spout 13 | public static final String KAFKA_SPOUT_ID = "kafka-spout"; 14 | public static final String KAFKA_ZOOKEEPER = "kafka.zookeeper"; 15 | public static final String KAFKA_TOPIC = "kafa.topic"; 16 | public static final String KAFKA_ZKROOT = "kafka.zkRoot"; 17 | public static final String KAFKA_CONSUMERGROUP = "kafka.consumer.group"; 18 | public static final String KAFKA_SPOUT_COUNT = "kafkaspout.count"; 19 | 20 | //sink bolt 21 | public static final String SINK_TYPE_BOLT_ID = "sink-type-bolt"; 22 | public static final String SINK_BOLT_COUNT = "sinkbolt.count"; 23 | 24 | //solr bolt 25 | public static final String SOLR_BOLT_ID = "solr-bolt"; 26 | public static final String SOLR_BOLT_COUNT = "solrbolt.count"; 27 | public static final String SOLR_COLLECTION = "solr.collection"; 28 | public static final String SOLR_SERVER = "solr.url"; 29 | public static final String SOLR_ZOOKEEPER_HOSTS = "solr.zookeeper.hosts"; 30 | 31 | //hdfs bolt 32 | public static final String HDFS_BOLT_ID = "hdfs-bolt"; 33 | public static final String HDFS_BOLT_COUNT = "hdfsbolt.count"; 34 | public static final String HDFS_FOLDER = "hdfs.folder"; 35 | public static final String HDFS_PORT = "hdfs.port"; 36 | public static final String HDFS_HOST = "hdfs.host"; 37 | 38 | //mongodb bolt 39 | public static final String MONGO_BOLT_ID = "mongodb.bolt.id"; 40 | public static final String MONGO_HOST = "mongodb.host"; 41 | public static final String MONGO_PORT = "mongodb.port"; 42 | public static final String MONGO_DATABASE = "mongodb.database"; 43 | public static final String MONGO_COLLECTION = "mongodb.collection"; 44 | public static final String MONGO_BOLT_COUNT = "mongodbbolt.count"; 45 | 46 | 47 | 48 | 49 | } 50 | -------------------------------------------------------------------------------- /stormkafka/src/main/java/com/vishnu/storm/bolt/BoltBuilder.java: -------------------------------------------------------------------------------- 1 | package com.vishnu.storm.bolt; 2 | 3 | import java.util.Properties; 4 | 5 | import org.apache.storm.hdfs.bolt.HdfsBolt; 6 | import org.apache.storm.hdfs.bolt.format.DefaultFileNameFormat; 7 | import org.apache.storm.hdfs.bolt.format.DelimitedRecordFormat; 8 | import org.apache.storm.hdfs.bolt.format.FileNameFormat; 9 | import org.apache.storm.hdfs.bolt.format.RecordFormat; 10 | import org.apache.storm.hdfs.bolt.rotation.FileRotationPolicy; 11 | import org.apache.storm.hdfs.bolt.rotation.FileSizeRotationPolicy; 12 | import org.apache.storm.hdfs.bolt.rotation.FileSizeRotationPolicy.Units; 13 | import org.apache.storm.hdfs.bolt.sync.CountSyncPolicy; 14 
| import org.apache.storm.hdfs.bolt.sync.SyncPolicy; 15 | 16 | import com.vishnu.storm.Keys; 17 | 18 | /** 19 | * @author vishnu viswanath 20 | * This class is used for building bolts 21 | */ 22 | public class BoltBuilder { 23 | 24 | public Properties configs = null; 25 | 26 | public BoltBuilder(Properties configs) { 27 | this.configs = configs; 28 | } 29 | 30 | public SinkTypeBolt buildSinkTypeBolt() { 31 | return new SinkTypeBolt(); 32 | } 33 | 34 | public MongodbBolt buildMongodbBolt() { 35 | String host = configs.getProperty(Keys.MONGO_HOST); 36 | int port = Integer.parseInt(configs.getProperty(Keys.MONGO_PORT)); 37 | String db = configs.getProperty(Keys.MONGO_DATABASE); 38 | String collection = configs.getProperty(Keys.MONGO_COLLECTION); 39 | return new MongodbBolt(host, port, db, collection); 40 | } 41 | 42 | public SolrBolt buildSolrBolt() { 43 | String solrServerUlr = configs.getProperty(Keys.SOLR_SERVER); 44 | String collection = configs.getProperty(Keys.SOLR_COLLECTION); 45 | SolrBolt solrBolt = new SolrBolt(solrServerUlr+collection); 46 | return solrBolt; 47 | } 48 | 49 | public HdfsBolt buildHdfsBolt() { 50 | RecordFormat format = new DelimitedRecordFormat().withFieldDelimiter("|"); 51 | SyncPolicy syncPolicy = new CountSyncPolicy(1); 52 | FileRotationPolicy rotationPolicy = new FileSizeRotationPolicy(5.0f, Units.MB); 53 | FileNameFormat fileNameFormat = new DefaultFileNameFormat().withPath(configs.getProperty(Keys.HDFS_FOLDER)); 54 | String port = configs.getProperty((Keys.HDFS_PORT)); 55 | String host = configs.getProperty((Keys.HDFS_HOST)); 56 | HdfsBolt bolt = new HdfsBolt() 57 | .withFsUrl("hdfs://"+host+":"+port) 58 | .withFileNameFormat(fileNameFormat) 59 | .withRecordFormat(format) 60 | .withRotationPolicy(rotationPolicy) 61 | .withSyncPolicy(syncPolicy); 62 | return bolt; 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /stormkafka/src/main/java/com/vishnu/storm/bolt/MongodbBolt.java: -------------------------------------------------------------------------------- 1 | package com.vishnu.storm.bolt; 2 | 3 | import java.util.Map; 4 | 5 | import org.bson.Document; 6 | 7 | import com.mongodb.MongoClient; 8 | import com.mongodb.client.MongoDatabase; 9 | 10 | import backtype.storm.task.OutputCollector; 11 | import backtype.storm.task.TopologyContext; 12 | import backtype.storm.topology.OutputFieldsDeclarer; 13 | import backtype.storm.topology.base.BaseRichBolt; 14 | import backtype.storm.tuple.Tuple; 15 | 16 | 17 | public class MongodbBolt extends BaseRichBolt { 18 | /** 19 | * 20 | */ 21 | private static final long serialVersionUID = 1L; 22 | private OutputCollector collector; 23 | private MongoDatabase mongoDB; 24 | private MongoClient mongoClient; 25 | private String collection; 26 | 27 | public String host; 28 | public int port ; 29 | public String db; 30 | 31 | protected MongodbBolt(String host, int port, String db,String collection) { 32 | this.host = host; 33 | this.port = port; 34 | this.db = db; 35 | this.collection = collection; 36 | } 37 | 38 | 39 | public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) { 40 | this.collector = collector; 41 | this.mongoClient = new MongoClient(host,port); 42 | this.mongoDB = mongoClient.getDatabase(db); 43 | } 44 | 45 | 46 | public void execute(Tuple input) { 47 | 48 | Document mongoDoc = getMongoDocForInput(input); 49 | try{ 50 | mongoDB.getCollection(collection).insertOne(mongoDoc); 51 | collector.ack(input); 52 | }catch(Exception e) { 53 | 
e.printStackTrace(); 54 | collector.fail(input); 55 | } 56 | } 57 | 58 | 59 | @Override 60 | public void cleanup() { 61 | this.mongoClient.close(); 62 | } 63 | 64 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 65 | // TODO Auto-generated method stub 66 | } 67 | 68 | public Document getMongoDocForInput(Tuple input) { 69 | Document doc = new Document(); 70 | String content = (String) input.getValueByField("content"); 71 | String[] parts = content.trim().split(" "); 72 | System.out.println("Received in MongoDB bolt "+content); 73 | try { 74 | for(String part : parts) { 75 | String[] subParts = part.split(":"); 76 | String fieldName = subParts[0]; 77 | String value = subParts[1]; 78 | doc.append(fieldName, value); 79 | } 80 | } catch(Exception e) { 81 | 82 | } 83 | return doc; 84 | } 85 | 86 | 87 | 88 | } -------------------------------------------------------------------------------- /stormkafka/src/main/java/com/vishnu/storm/bolt/SinkTypeBolt.java: -------------------------------------------------------------------------------- 1 | package com.vishnu.storm.bolt; 2 | 3 | import java.util.Map; 4 | 5 | import com.vishnu.storm.Topology; 6 | 7 | import backtype.storm.task.OutputCollector; 8 | import backtype.storm.task.TopologyContext; 9 | import backtype.storm.topology.OutputFieldsDeclarer; 10 | import backtype.storm.topology.base.BaseRichBolt; 11 | import backtype.storm.tuple.Fields; 12 | import backtype.storm.tuple.Tuple; 13 | import backtype.storm.tuple.Values; 14 | 15 | /** 16 | * @author vishnu viswanath 17 | * This class parses the incoming messages and decided which bolt the message has to be passed on to 18 | * There are two cases in this example, first if of solr type and second is of hdfs type. 19 | */ 20 | public class SinkTypeBolt extends BaseRichBolt { 21 | 22 | 23 | private static final long serialVersionUID = 1L; 24 | private OutputCollector collector; 25 | 26 | 27 | public void execute(Tuple tuple) { 28 | String value = tuple.getString(0); 29 | System.out.println("Received in SinkType bolt : "+value); 30 | int index = value.indexOf(" "); 31 | if (index == -1) 32 | return; 33 | String type = value.substring(0,index); 34 | System.out.println("Type : "+type); 35 | value = value.substring(index); 36 | if(type.equals("solr")) { 37 | collector.emit(Topology.SOLR_STREAM,new Values(type,value)); 38 | System.out.println("Emitted : "+value); 39 | } else if (type.equals("hdfs")) { 40 | collector.emit(Topology.HDFS_STREAM,new Values(type,value)); 41 | System.out.println("Emitted : "+value); 42 | } else if (type.equals("mongo")) { 43 | collector.emit(Topology.MONGODB_STREAM,new Values(type,value)); 44 | System.out.println("Emitted : "+value); 45 | } 46 | collector.ack(tuple); 47 | } 48 | 49 | 50 | public void prepare(Map conf, TopologyContext context, OutputCollector collector) { 51 | this.collector = collector; 52 | 53 | } 54 | 55 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 56 | declarer.declareStream(Topology.SOLR_STREAM, new Fields( "sinkType","content" )); 57 | declarer.declareStream(Topology.HDFS_STREAM, new Fields( "sinkType","content" )); 58 | declarer.declareStream(Topology.MONGODB_STREAM, new Fields( "sinkType","content" )); 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /stormkafka/src/main/java/com/vishnu/storm/bolt/SolrBolt.java: -------------------------------------------------------------------------------- 1 | package com.vishnu.storm.bolt; 2 | 3 | import java.util.Map; 4 | 5 | 
--------------------------------------------------------------------------------
/stormkafka/src/main/java/com/vishnu/storm/bolt/SolrBolt.java:
--------------------------------------------------------------------------------
package com.vishnu.storm.bolt;

import java.util.Map;

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.common.SolrInputDocument;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Tuple;

/**
 * @author vishnu viswanath
 * This class is used for ingesting data into SOLR.
 */
public class SolrBolt extends BaseRichBolt {

    private static final long serialVersionUID = 1L;
    private OutputCollector collector;
    SolrClient solrClient;
    String solrAddress;

    /**
     * @param solrAddress url that is used to connect to solr,
     *                    e.g., http://localhost:8983/solr/collection1
     */
    public SolrBolt(String solrAddress) {
        this.solrAddress = solrAddress;
    }

    public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
        this.solrClient = new HttpSolrClient(solrAddress);
    }

    public void execute(Tuple input) {
        SolrInputDocument document = getSolrInputDocumentForInput(input);
        try {
            solrClient.add(document);
            solrClient.commit();
            collector.ack(input);
        } catch (Exception e) {
            // indexing failed; log the error and fail the tuple so it can be replayed
            e.printStackTrace();
            collector.fail(input);
        }
    }

    /**
     * Converts the tuple into a SOLR document.
     * The input will have the content in the field named "content" (this is set by the SinkTypeBolt).
     * It is assumed that the content will be of the format fieldName1:value1 fieldName2:value2 ...
     * @param input tuple emitted by the SinkTypeBolt
     * @return the SOLR document to be indexed
     */
    public SolrInputDocument getSolrInputDocumentForInput(Tuple input) {
        String content = (String) input.getValueByField("content");
        String[] parts = content.trim().split(" ");
        System.out.println("Received in SOLR bolt " + content);
        SolrInputDocument document = new SolrInputDocument();
        try {
            for (String part : parts) {
                String[] subParts = part.split(":");
                String fieldName = subParts[0];
                String value = subParts[1];
                document.addField(fieldName, value);
            }
        } catch (Exception e) {
            // malformed part; keep whatever fields were parsed so far
            e.printStackTrace();
        }
        return document;
    }

    @Override
    public void cleanup() {
    }

    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // this bolt is a sink; it does not emit any further streams
    }

}
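While testing the topology it is handy to confirm that documents are actually reaching Solr. The snippet below is a quick standalone check using the same SolrJ client, assuming Solr runs at the example address from the javadoc above; it is not part of the repository.

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.QueryResponse;

/** Standalone sanity check: counts the documents currently in the collection. */
public class SolrCountCheck {
    public static void main(String[] args) throws Exception {
        // assumption: same collection URL as in the SolrBolt javadoc example
        SolrClient client = new HttpSolrClient("http://localhost:8983/solr/collection1");
        QueryResponse response = client.query(new SolrQuery("*:*"));
        System.out.println("documents in collection: " + response.getResults().getNumFound());
        client.close();
    }
}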
--------------------------------------------------------------------------------
/stormkafka/src/main/java/com/vishnu/storm/spout/SpoutBuilder.java:
--------------------------------------------------------------------------------
package com.vishnu.storm.spout;

import storm.kafka.BrokerHosts;
import storm.kafka.KafkaSpout;
import storm.kafka.SpoutConfig;
import storm.kafka.StringScheme;
import storm.kafka.ZkHosts;

import java.util.Properties;

import com.vishnu.storm.Keys;

import backtype.storm.spout.SchemeAsMultiScheme;

/**
 * @author vishnu viswanath
 * Builds the Kafka spout that feeds the topology, using the property names defined in Keys.
 */
public class SpoutBuilder {

    public Properties configs = null;

    public SpoutBuilder(Properties configs) {
        this.configs = configs;
    }

    public KafkaSpout buildKafkaSpout() {
        BrokerHosts hosts = new ZkHosts(configs.getProperty(Keys.KAFKA_ZOOKEEPER));
        String topic = configs.getProperty(Keys.KAFKA_TOPIC);
        String zkRoot = configs.getProperty(Keys.KAFKA_ZKROOT);
        String groupId = configs.getProperty(Keys.KAFKA_CONSUMERGROUP);
        SpoutConfig spoutConfig = new SpoutConfig(hosts, topic, zkRoot, groupId);
        // read the Kafka payload as plain strings
        spoutConfig.scheme = new SchemeAsMultiScheme(new StringScheme());
        KafkaSpout kafkaSpout = new KafkaSpout(spoutConfig);
        return kafkaSpout;
    }
}
--------------------------------------------------------------------------------
/stormkafka/src/main/java/default_config.properties:
--------------------------------------------------------------------------------
topology=storm-kafka-topology

kafka-spout=kafka-spout
kafka.zookeeper=localhost:2181
kafa.topic=incoming
kafka.zkRoot=/kafka
kafka.consumer.group=sample_group
kafkaspout.count=1

sink-type-bolt=sink-type
sinkbolt.count=1

solr-bolt=solr-bolt
solrbolt.count=1
solr.collection=collection1
solr.url=http://localhost:8983/solr/
solr.zookeeper.hosts=localhost:2181

hdfs-bolt=hdfs-bolt
hdfsbolt.count=1
hdfs.folder=/from_storm
hdfs.port=9000
hdfs.host=localhost

mongodb.host=localhost
mongodb.port=27017
mongodb.database=storm
mongodb.collection=collection1
mongodb.bolt.id=mongodb-bolt
mongodbbolt.count=1
--------------------------------------------------------------------------------
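The Topology main class and the Keys constants that connect this properties file to SpoutBuilder are not part of this excerpt. As a sketch only, a local test run could look roughly like the following; the resource path, component ids and the use of LocalCluster are assumptions, not the repository's actual entry point.

import java.io.InputStream;
import java.util.Properties;

import com.vishnu.storm.bolt.SinkTypeBolt;
import com.vishnu.storm.spout.SpoutBuilder;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.topology.TopologyBuilder;
import storm.kafka.KafkaSpout;

/** Hypothetical local runner; ids and the property-loading path are assumptions. */
public class LocalRunnerSketch {

    public static void main(String[] args) throws Exception {
        // load the defaults shown above (assumes the file is on the classpath root)
        Properties configs = new Properties();
        try (InputStream in = LocalRunnerSketch.class.getResourceAsStream("/default_config.properties")) {
            configs.load(in);
        }

        // build the Kafka spout from the zookeeper/topic/zkRoot/consumer-group properties
        KafkaSpout kafkaSpout = new SpoutBuilder(configs).buildKafkaSpout();

        TopologyBuilder builder = new TopologyBuilder();
        builder.setSpout("kafka-spout", kafkaSpout, 1);
        builder.setBolt("sink-type", new SinkTypeBolt(), 1).shuffleGrouping("kafka-spout");
        // ...attach the Solr, HDFS and MongoDB bolts to their streams as sketched earlier...

        // run everything in-process for a quick test
        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology(configs.getProperty("topology"), new Config(), builder.createTopology());
    }
}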