├── .gitattributes ├── .gitignore ├── Flume ├── .gitignore ├── README.md ├── pom.xml └── src │ └── com │ └── vishnu │ ├── flume │ ├── config │ │ ├── flume-conf.properties │ │ └── flume-conf_spark.properties │ └── source │ │ └── CustomFlumeTCPSource.java │ └── tcp │ └── client │ └── TcpClient.java ├── KafkaStreams ├── .gitignore ├── README.md ├── build.sbt ├── project │ ├── assembly.sbt │ ├── build.properties │ └── plugins.sbt └── src │ └── main │ └── scala-2.11 │ └── com │ └── vishnuviswanath │ └── kafka │ └── streams │ ├── ClimateLogStream.scala │ └── HelloKafkaStreams.scala ├── README.md ├── datascience ├── kaggle │ ├── AnimalShelter │ │ └── AnimalShelterPreprocess.java │ ├── SanfranciscoCrime Classification │ │ ├── Sanfrancisco crime_LogisticRegression.ipynb │ │ ├── SanfranciscoCrime_KNN.ipynb │ │ └── SanfranciscoCrime_RandomForest.ipynb │ └── TitanicMachinLearnigFromDisaster │ │ ├── Kaggle Titanic Random Forest.ipynb │ │ └── Kaggle Titanic.ipynb ├── rnn │ ├── .gitignore │ └── Webtraffic_forecasting_LSTM.ipynb └── tensorflow │ ├── Basics.ipynb │ ├── cnn_cat_dog.py │ ├── kaggle_invasive_species │ ├── cnn_model_1.py │ ├── cnn_model_2.py │ ├── cnn_model_3.py │ ├── cnn_model_4.py │ ├── cnn_model_5.py │ ├── cnn_model_6.py │ ├── cnn_model_7.py │ ├── cnn_model_7_3.py │ ├── perpare_folder_structure.py │ ├── prepare_validset.py │ ├── sample_model.h5 │ ├── sample_model.py │ ├── submission.csv │ ├── submission2.csv │ ├── submission4.csv │ ├── submission4_2.csv │ └── train_labels.csv │ └── tensorflow_nn_model.ipynb ├── docker ├── spark-kafka-docker │ ├── Dockerfile │ ├── docker-compose.yml │ └── entrypoint.sh ├── spark-kafka-single-node-docker │ ├── Dockerfile │ └── entrypoint.sh └── spark-kafka-single-node-for-meetup │ ├── Dockerfile │ ├── data │ ├── site-device │ │ ├── csv │ │ │ ├── site-device-corrupted.csv │ │ │ └── site-device.csv │ │ └── parquet │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet.crc │ │ │ ├── .part-00001-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet.crc │ │ │ ├── _SUCCESS │ │ │ ├── part-00000-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet │ │ │ └── part-00001-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet │ └── site-views │ │ ├── csv │ │ └── site-views.csv │ │ ├── json │ │ └── site-views.jsonl │ │ └── xml │ │ └── site-views.xml │ └── entrypoint.sh ├── flink-examples-java ├── .gitignore ├── Dockerfile ├── pom.xml ├── src │ └── main │ │ ├── java │ │ └── com │ │ │ └── vishnuviswanath │ │ │ └── flink │ │ │ └── streaming │ │ │ ├── HelloStreamingJob.java │ │ │ └── sources │ │ │ ├── SocketStreamingJob.java │ │ │ └── TextSourceJob.java │ │ └── resources │ │ └── log4j.properties └── submit.sh ├── flink ├── .gitignore ├── README.md ├── build.sbt ├── project │ ├── build.properties │ └── plugins.sbt └── src │ └── main │ ├── java │ ├── FlinkMain.java │ └── utility │ │ └── Server.java │ ├── resources │ └── log4j.properties │ └── scala │ └── com │ └── vishnu │ └── flink │ ├── WordCount.scala │ ├── dataset │ └── WordCount.scala │ ├── streaming │ ├── CountTumblingWindow.scala │ ├── EventTimeProcessing.scala │ ├── FlinkStreamingWordCount.scala │ ├── ProcessingTimeWindowEvictor.scala │ ├── ProcessingTimeWindowWithTrigger.scala │ ├── StreamingWithRocksDb.scala │ ├── TimeSlidingWindow.scala │ ├── TimeTumblingWindow.scala │ ├── cep │ │ └── HelloCep.scala │ ├── queryablestate │ │ ├── QueryClient.scala │ │ └── QuerybleStateStream.scala │ ├── sessionwindows │ │ ├── SessionWindowExample.scala │ │ └── 
SessionWindowExampleDummyDataProd.scala │ └── windowtimer │ │ ├── ProccWindowFun.scala │ │ └── ProcessWindowExample.scala │ └── util │ ├── ParameterParser.scala │ └── RandomServerEventsKafkaProducer.scala ├── mapreduce ├── .gitignore ├── ArrayWritableExample.java ├── DistributeCache.java ├── FindMaximum.java ├── ImageReader.java ├── LetterWordMapper.java ├── MultiInputPath.java ├── README.md ├── SequenceFileTest.java ├── WordCount.java ├── chaining │ ├── ChainMapperExample.java │ ├── ChainingJobControl.java │ ├── ChainingSimple.java │ ├── LetterCount.java │ ├── ToUpperCase.java │ └── WordCount.java ├── customtypes │ ├── AreaCalculator.java │ ├── Comparator.java │ ├── CustomPartitioner.java │ ├── DollarInputFormat.java │ ├── DollarRecordReader.java │ ├── DollarStreamExample.java │ ├── IdentityReducerEx.java │ ├── Point2D.java │ ├── Rectangle.java │ ├── RectangleCount.java │ ├── RectangleInputFormat.java │ ├── RectangleKey.java │ ├── RectangleRecordReader.java │ ├── XmlOutputDriver.java │ └── XmlOutputFormat.java ├── datafu_example │ ├── .gitignore │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── incrementaljob │ │ ├── IncrementalAccumulator.java │ │ ├── IncrementalAggr.java │ │ └── IncrementalMapper.java ├── joins │ └── ReduceSideJoin.java ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── vishnu │ │ └── mapreduce │ │ ├── CustomMultiplOututFormat.java │ │ ├── CustomOutputFormatTest.java │ │ └── WordCount.java │ └── test │ └── java │ └── com │ └── vishnu │ └── mapreduce │ └── AppTest.java ├── spark ├── .gitignore ├── README.md ├── build.properties ├── build.sbt ├── project │ ├── assembly.sbt │ └── plugins.sbt ├── pyspark-files │ └── helloworld.py ├── src │ └── main │ │ └── scala │ │ └── com │ │ └── vishnu │ │ └── spark │ │ ├── Test.scala │ │ ├── bClassifier.scala │ │ ├── basics │ │ ├── ApiLearn.scala │ │ ├── AuctionApp.scala │ │ ├── CustomPartitioner.scala │ │ ├── DataframeExample.scala │ │ ├── SequenceFileTest.scala │ │ ├── dataframes.sc │ │ ├── pairrdd.sc │ │ ├── rdds.sc │ │ └── streams.sc │ │ ├── blog │ │ └── supportfiles │ │ │ └── spark_session_blog_commands │ │ ├── graph │ │ ├── PregelGraphExample.scala │ │ ├── PropertyGraphExample.scala │ │ └── res │ │ │ ├── airports.csv │ │ │ └── routes.csv │ │ ├── kaggle │ │ └── titanic │ │ │ ├── KaggleTitanic.scala │ │ │ ├── TitanicOverfit.scala │ │ │ ├── TitanicUnderfit.scala │ │ │ └── TitanicWithPipeline.scala │ │ ├── map_reduce_in_spark.scala │ │ ├── mllib │ │ ├── ALSRecommender.scala │ │ ├── ALSRecommender2.scala │ │ ├── FeatureTransformations.scala │ │ ├── LinearRegr.scala │ │ ├── LogisticRegr.scala │ │ ├── TFIDF.scala │ │ └── house_data.csv │ │ ├── sql │ │ ├── FromJson.scala │ │ ├── HiveTest.scala │ │ ├── StreamSQL.scala │ │ ├── ToMongoDB.scala │ │ └── res │ │ │ ├── twitter.avro │ │ │ └── twitter.avsc │ │ └── streaming │ │ ├── FlumeStreaming.scala │ │ ├── KafkaDirectStream.scala │ │ ├── KafkaStreaming.scala │ │ ├── README.md │ │ ├── SeqFileStreaming.scala │ │ ├── SocketStreaming.scala │ │ ├── StreamHbase.scala │ │ ├── StreamingFromCheckpoint.scala │ │ ├── StreamingJoins.scala │ │ ├── StreamingWindow.scala │ │ ├── StreamingWithCheckpoint.scala │ │ ├── UpdateStateByKey.scala │ │ ├── WindowedStream.scala │ │ ├── akka │ │ ├── SendToActor.scala │ │ └── SparkAkkaSource.scala │ │ └── customsource │ │ ├── ActivityReceiver.scala │ │ └── StreamingWithCustomSource.scala └── uberjar.md ├── spark_23 ├── MEETUP_NOTES.md ├── README.md ├── build.sbt ├── project │ ├── build.properties │ └── plugins.sbt └── src │ ├── main │ └── scala │ │ └── 
com │ │ └── vishnuviswanath │ │ └── spark │ │ ├── streaming │ │ ├── ContinuousKafkaStreaming.scala │ │ ├── CustomV2SourceExample.scala │ │ ├── HelloStructredStreaming.scala │ │ ├── KafkaSourceStreaming.scala │ │ ├── SocketSourceStreaming.scala │ │ ├── StreamingAggregations.scala │ │ └── sources │ │ │ └── netcat │ │ │ ├── NetcatContinuousReader.scala │ │ │ ├── NetcatOffset.scala │ │ │ ├── NetcatReader.scala │ │ │ └── NetcatSourceProvider.scala │ │ └── util │ │ ├── NetcatProducer.scala │ │ ├── RandomCarsKafkaProducer.scala │ │ ├── SimulateLateDateProducer.scala │ │ ├── ToFileProducer.scala │ │ └── WordsStream.scala │ └── test │ └── scala │ └── com │ └── vishnuviswanath │ └── spark │ └── streaming │ └── HelloStructuredStreamingSpec.scala └── stormkafka ├── .gitignore ├── README.md ├── pom.xml └── src └── main └── java ├── com └── vishnu │ └── storm │ ├── Keys.java │ ├── Topology.java │ ├── bolt │ ├── BoltBuilder.java │ ├── MongodbBolt.java │ ├── SinkTypeBolt.java │ └── SolrBolt.java │ └── spout │ └── SpoutBuilder.java └── default_config.properties /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-vendored 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ensime 2 | .ensime_cache/ 3 | .ipynb_checkpoints/ 4 | _site/ 5 | .jekyll-metadata 6 | .settings/ 7 | target/ 8 | .metadata 9 | bin/ 10 | tmp/ 11 | *.tmp 12 | *.bak 13 | *.swp 14 | *~.nib 15 | local.properties 16 | .settings/ 17 | .loadpath 18 | 19 | # Eclipse Core 20 | .project 21 | 22 | # External tool builders 23 | .externalToolBuilders/ 24 | 25 | # Locally stored "Eclipse launch configurations" 26 | *.launch 27 | 28 | # PyDev specific (Python IDE for Eclipse) 29 | *.pydevproject 30 | 31 | # CDT-specific (C/C++ Development Tooling) 32 | .cproject 33 | 34 | # JDT-specific (Eclipse Java Development Tools) 35 | .classpath 36 | 37 | # Java annotation processor (APT) 38 | .factorypath 39 | 40 | # PDT-specific (PHP Development Tools) 41 | .buildpath 42 | 43 | # sbteclipse plugin 44 | .target 45 | 46 | # Tern plugin 47 | .tern-project 48 | 49 | # TeXlipse plugin 50 | .texlipse 51 | 52 | # STS (Spring Tool Suite) 53 | .springBeans 54 | 55 | # Code Recommenders 56 | .recommenders 57 | /target 58 | 59 | -------------------------------------------------------------------------------- /Flume/.gitignore: -------------------------------------------------------------------------------- 1 | .classpath 2 | .project 3 | .settings 4 | target 5 | 6 | *.class 7 | 8 | # Mobile Tools for Java (J2ME) 9 | .mtj.tmp/ 10 | 11 | #intellij project file 12 | *.iml 13 | .idea/ 14 | .cache-main 15 | 16 | # Package Files # 17 | *.jar 18 | *.war 19 | *.ear 20 | 21 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 22 | hs_err_pid* 23 | -------------------------------------------------------------------------------- /Flume/README.md: -------------------------------------------------------------------------------- 1 | ### Flume Custom TCP Source 2 | 3 | CustomFlumeTCPSource.java is custom flume source which listens to a port and sends the content to the configured channel. The custom source adds the client information to the header of message before sending to the channel. 4 | It takes two configurations 5 | 6 | 1. port - the port to listen to 7 | 2. 
buffer - how often the events should be sent to the channel 8 | 9 | #### Sample configuration 10 | agent.sources.CustomTcpSource.type = com.vishnu.flume.source.CustomFlumeTCPSource 11 | agent.sources.CustomTcpSource.port = 4443 12 | agent.sources.CustomTcpSource.buffer = 1 13 | 14 | 15 | -------------------------------------------------------------------------------- /Flume/pom.xml: -------------------------------------------------------------------------------- 1 | <project xmlns="http://maven.apache.org/POM/4.0.0"> 2 | <modelVersion>4.0.0</modelVersion> 3 | <groupId>com.vishnu</groupId> 4 | <artifactId>Flume</artifactId> 5 | <version>0.0.1-SNAPSHOT</version> 6 | <build> 7 | <sourceDirectory>src</sourceDirectory> 8 | <plugins> 9 | <plugin> 10 | <artifactId>maven-compiler-plugin</artifactId> 11 | <version>3.3</version> 12 | <configuration> 13 | <source>1.8</source> 14 | <target>1.8</target> 15 | </configuration> 16 | </plugin> 17 | </plugins> 18 | </build> 19 | <dependencies> 20 | <dependency> 21 | <groupId>org.apache.flume</groupId> 22 | <artifactId>flume-ng-core</artifactId> 23 | <version>1.6.0</version> 24 | </dependency> 25 | </dependencies> 26 | </project> -------------------------------------------------------------------------------- /Flume/src/com/vishnu/flume/config/flume-conf.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | 19 | # The configuration file needs to define the sources, 20 | # the channels and the sinks. 21 | # Sources, channels and sinks are defined per agent, 22 | # in this case called 'agent' 23 | 24 | agent.sources = CustomTcpSource 25 | agent.channels = memoryChannel 26 | agent.sinks = loggerSink 27 | 28 | # For each one of the sources, the type is defined 29 | agent.sources.CustomTcpSource.type = com.vishnu.flume.source.CustomFlumeTCPSource 30 | agent.sources.CustomTcpSource.port = 4443 31 | agent.sources.CustomTcpSource.buffer = 1 32 | 33 | 34 | # The channel can be defined as follows. 35 | agent.sources.CustomTcpSource.channels = memoryChannel 36 | 37 | # Each sink's type must be defined 38 | agent.sinks.loggerSink.type = logger 39 | 40 | #Specify the channel the sink should use 41 | agent.sinks.loggerSink.channel = memoryChannel 42 | 43 | # Each channel's type is defined.
44 | agent.channels.memoryChannel.type = memory 45 | 46 | # Other config values specific to each type of channel(sink or source) 47 | # can be defined as well 48 | # In this case, it specifies the capacity of the memory channel 49 | agent.channels.memoryChannel.capacity = 100 50 | -------------------------------------------------------------------------------- /Flume/src/com/vishnu/flume/config/flume-conf_spark.properties: -------------------------------------------------------------------------------- 1 | # Flume configuration to listen to netcat host and port, 2 | # sink is of the type avro 3 | # Created for testing spark streaming from flume 4 | # @author vishnu viswanath 5 | 6 | agent.sources = Netcat 7 | agent.channels = memoryChannel 8 | agent.sinks = avroSink 9 | #agent.sinks = loggerSink 10 | 11 | # For each one of the sources, the type is defined 12 | agent.sources.Netcat.type = netcat 13 | agent.sources.Netcat.bind = localhost 14 | agent.sources.Netcat.port = 6666 15 | agent.sources.Netcat.channels = memoryChannel 16 | 17 | # avro sink for spark 18 | agent.sinks.avroSink.type = avro 19 | agent.sinks.avroSink.channel = memoryChannel 20 | agent.sinks.avroSink.hostname = localhost 21 | agent.sinks.avroSink.port = 4444 22 | 23 | #logger sink 24 | #agent.sinks.loggerSink.type = logger 25 | #agent.sinks.loggerSink.channel = memoryChannel 26 | 27 | # Each channel's type is defined. 28 | agent.channels.memoryChannel.type = memory 29 | 30 | # Other config values specific to each type of channel(sink or source) 31 | # can be defined as well 32 | # In this case, it specifies the capacity of the memory channel 33 | agent.channels.memoryChannel.capacity = 100 34 | -------------------------------------------------------------------------------- /Flume/src/com/vishnu/tcp/client/TcpClient.java: -------------------------------------------------------------------------------- 1 | package com.vishnu.tcp.client; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.DataOutputStream; 5 | import java.io.InputStreamReader; 6 | import java.net.Socket; 7 | 8 | public class TcpClient { 9 | 10 | public static void main(String[] args) throws Exception { 11 | String sentence; 12 | String modifiedSentence; 13 | BufferedReader inFromUser = new BufferedReader(new InputStreamReader(System.in)); 14 | Socket clientSocket = new Socket("localhost", 4443); 15 | DataOutputStream outToServer = new DataOutputStream(clientSocket.getOutputStream()); 16 | outToServer.writeBytes("test message" + '\n'); 17 | clientSocket.close(); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /KafkaStreams/.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .DS_Store 3 | -------------------------------------------------------------------------------- /KafkaStreams/README.md: -------------------------------------------------------------------------------- 1 | **KafkaStreams** is a stream processing library on top of Apache Kafka. 2 | 3 | This project contains basic examples of how to create a Kafka Stream application in Scala. For more detailed explaination visit the [blog post](http://vishnuviswanath.com/hello-kafka-streams.html). 
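A minimal way to try the HelloKafkaStreams example below (a sketch, assuming a local broker at localhost:9092 and that the `names` and `hellostream` topics exist; the topic names and bootstrap server come from the example source):

```sh
# run the example directly from sbt
sbt "runMain com.vishnuviswanath.kafka.streams.HelloKafkaStreams"

# in a second terminal, feed names into the input topic
kafka-console-producer.sh --broker-list localhost:9092 --topic names

# in a third terminal, watch the greetings appear on the output topic
kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic hellostream --from-beginning
```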
4 | -------------------------------------------------------------------------------- /KafkaStreams/build.sbt: -------------------------------------------------------------------------------- 1 | name := "KafkaStreams" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.11.8" 6 | 7 | organization := "com.vishnuviswanath" 8 | 9 | val kafkaStreamsVersion = "0.10.2.0" 10 | 11 | val kafkaDependencies = Seq( 12 | "org.apache.kafka" % "kafka-streams" % kafkaStreamsVersion) 13 | 14 | val otherDependencies = Seq( 15 | "com.esotericsoftware.kryo" % "kryo" % "2.24.0" 16 | ) 17 | 18 | val main = "com.vishnuviswanath.kafka.streams.KafkaStreamsExample" 19 | mainClass in (Compile, run) := Some(main) 20 | mainClass in (Compile, packageBin) := Some(main) 21 | 22 | lazy val root = (project in file(".")). 23 | settings( 24 | libraryDependencies ++= kafkaDependencies, 25 | libraryDependencies ++= otherDependencies 26 | ) 27 | 28 | 29 | -------------------------------------------------------------------------------- /KafkaStreams/project/assembly.sbt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soniclavier/bigdata-notebook/a708de834fd282ba576f15f87a46f3953695a9ad/KafkaStreams/project/assembly.sbt -------------------------------------------------------------------------------- /KafkaStreams/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 0.13.13 -------------------------------------------------------------------------------- /KafkaStreams/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | logLevel := Level.Warn 2 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.3") -------------------------------------------------------------------------------- /KafkaStreams/src/main/scala-2.11/com/vishnuviswanath/kafka/streams/HelloKafkaStreams.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.kafka.streams 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.kafka.clients.consumer.ConsumerConfig 6 | import org.apache.kafka.common.serialization.Serdes 7 | import org.apache.kafka.streams.{KafkaStreams, StreamsConfig} 8 | import org.apache.kafka.streams.kstream.{KStream, KStreamBuilder, ValueMapper} 9 | 10 | /** 11 | * Created by vviswanath on 4/22/17. 
12 | * 13 | * HelloKafkaStreams reads a list of names from a topic and 14 | * outputs "hello <name>" in the output topic 15 | */ 16 | object HelloKafkaStreams { 17 | 18 | def main(args: Array[String]): Unit = { 19 | val settings = new Properties 20 | settings.put(StreamsConfig.APPLICATION_ID_CONFIG, "hello-kafka-streams") 21 | settings.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092") 22 | settings.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest") 23 | settings.put(StreamsConfig.KEY_SERDE_CLASS_CONFIG, Serdes.serdeFrom(classOf[String]).getClass.getName) 24 | settings.put(StreamsConfig.VALUE_SERDE_CLASS_CONFIG, Serdes.serdeFrom(classOf[String]).getClass.getName) 25 | 26 | val kstreamBuilder = new KStreamBuilder 27 | val rawStream: KStream[String, String] = kstreamBuilder.stream("names") 28 | 29 | val helloStream: KStream[String, String] = rawStream.mapValues(new ValueMapper[String, String]{ 30 | override def apply(value: String): String = s"hello $value" 31 | }) 32 | 33 | helloStream.to(Serdes.String, Serdes.String, "hellostream") 34 | 35 | val streams = new KafkaStreams(kstreamBuilder, settings) 36 | streams.start 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hadoop and ML repository 2 | 3 | A repository to hold all my Hadoop and Machine Learning related code. 4 | 5 | Visit my blog at: www.vishnuviswanath.com 6 | 7 | ### Contents 8 | 9 | 1. Flink Streaming 10 | 2. Spark ML, Streaming, SQL and GraphX 11 | 3. Kafka Streams 12 | 4. StormKafka streaming application POC 13 | 5. Flume custom source and config files 14 | 6. Hadoop MapReduce old API joins, custom types, etc. 15 | 7. Solutions to Kaggle problems using numpy or graphlab 16 | -------------------------------------------------------------------------------- /datascience/kaggle/TitanicMachinLearnigFromDisaster/Kaggle Titanic Random Forest.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /datascience/rnn/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *.tsv 3 | *.DS_Store 4 | -------------------------------------------------------------------------------- /datascience/tensorflow/cnn_cat_dog.py: -------------------------------------------------------------------------------- 1 | #import the modules 2 | from keras.models import Sequential 3 | from keras.layers.convolutional import Conv2D 4 | from keras.layers.pooling import MaxPooling2D 5 | from keras.layers.core import Flatten 6 | from keras.layers.core import Dense 7 | from keras.preprocessing.image import ImageDataGenerator 8 | import os 9 | 10 | from pyspark.sql import SparkSession 11 | 12 | 13 | if __name__ == "__main__": 14 | spark = SparkSession.builder.master("spark://mm-mac-4797:7077").appName("CatsDogsCNN").getOrCreate() 15 | 16 | 17 | #os.getcwd() 18 | 19 | #change working directory if needed 20 | #os.chdir("path to your dataset folder") 21 | 22 | #initialize the classifier 23 | classifier = Sequential() 24 | 25 | #add layers 26 | classifier.add(Conv2D(32, (3, 3), input_shape=(64, 64, 3), activation = 'relu')) 27 | classifier.add(MaxPooling2D()) #default pool size is (2, 2) 28 | classifier.add(Flatten()) #flatten all layers into a single layer 29 |
classifier.add(Dense(128, activation = 'relu')) 30 | classifier.add(Dense(1, activation = 'sigmoid')) #2 = number of outputs 31 | 32 | classifier.compile(optimizer = 'adam', 33 | loss = 'binary_crossentropy', 34 | metrics = ['accuracy']) 35 | 36 | #image pre-processing 37 | train_datagen = ImageDataGenerator( 38 | rescale=1./255, 39 | shear_range=0.2, 40 | zoom_range=0.2, 41 | horizontal_flip=True) 42 | 43 | test_datagen = ImageDataGenerator(rescale=1./255) 44 | 45 | train_generator = train_datagen.flow_from_directory( 46 | 'dataset/training_set', 47 | target_size=(64, 64), 48 | batch_size=32, 49 | class_mode='binary') 50 | 51 | test_generator = test_datagen.flow_from_directory( 52 | 'dataset/test_set', 53 | target_size=(64, 64), 54 | batch_size=32, 55 | class_mode='binary') 56 | 57 | classifier.fit_generator( 58 | train_generator, 59 | steps_per_epoch=200, 60 | epochs=5, 61 | validation_data=test_generator, 62 | validation_steps=100) -------------------------------------------------------------------------------- /datascience/tensorflow/kaggle_invasive_species/cnn_model_1.py: -------------------------------------------------------------------------------- 1 | #nvidia-smi 2 | #~/.keras/keras.json 3 | 4 | #import keras 5 | #print keras.__version__ 6 | #1.2.2 7 | #https://faroit.github.io/keras-docs/1.2.2/ 8 | 9 | 10 | from keras.models import Sequential 11 | from keras.layers.pooling import MaxPooling2D 12 | from keras.layers.core import Dense 13 | from keras.layers.core import Flatten 14 | from keras.layers.core import Dropout 15 | from keras.layers.convolutional import Conv2D 16 | from keras.layers.pooling import MaxPooling2D 17 | from keras.models import model_from_json 18 | 19 | from keras.preprocessing.image import ImageDataGenerator 20 | 21 | #check 22 | #from tensorflow.python.client import device_lib 23 | #print(device_lib.list_local_devices()) 24 | 25 | classification = Sequential() 26 | classification.add(Conv2D(50, 3,3, input_shape=(128, 128, 3), activation = 'relu')) 27 | classification.add(MaxPooling2D()) 28 | 29 | classification.add(Conv2D(25, 3,3, activation = 'relu')) 30 | classification.add(MaxPooling2D()) 31 | 32 | classification.add(Flatten()) 33 | 34 | classification.add(Dense(200, activation = 'relu')) 35 | classification.add(Dropout(0.5)) 36 | 37 | classification.add(Dense(1, activation = 'sigmoid')) 38 | 39 | for layer in classification.layers: 40 | print(str(layer.name)+" "+str(layer.input_shape)+" -> "+str(layer.output_shape)) 41 | 42 | 43 | classification.compile(optimizer='Adam', loss='binary_crossentropy', metrics = ['accuracy']) 44 | 45 | train_data_gen = ImageDataGenerator(rescale=1./255, shear_range=0.2, zoom_range=0.2, horizontal_flip=True) 46 | valid_data_gen = ImageDataGenerator(rescale=1./255) 47 | 48 | train_gen = train_data_gen.flow_from_directory('training_set', target_size=(128, 128), batch_size=25, class_mode='binary') 49 | valid_gen = valid_data_gen.flow_from_directory('validation_set', target_size=(128, 128), batch_size=25, class_mode='binary') 50 | 51 | #classification.load_weights("classification_model.h5") 52 | classification.fit_generator(train_gen, samples_per_epoch=2145, validation_data=valid_gen, nb_epoch=25, nb_val_samples=150) 53 | 54 | classification_json = classification.to_json() 55 | with open("cnn_model_1.json", "w") as json_file: 56 | json_file.write(classification_json) 57 | classification.save_weights("cnn_model_1.h5") 58 | #91.3 val accuracy 59 | 60 | 61 | test_data_gen = ImageDataGenerator(rescale=1./255) 62 | test_gen = 
test_data_gen.flow_from_directory('test', target_size=(128, 128), batch_size=25, class_mode='binary') 63 | 64 | prediction = classification.predict_generator(test_gen, 1531) 65 | 66 | result = [] 67 | filenames = test_gen.filenames 68 | for i in range(len(filenames)): 69 | result.append((int(filenames[i].split("/")[1].split(".")[0]), prediction[i][0])) 70 | 71 | result.sort(key=lambda tup: tup[0]) 72 | 73 | with open("submission.csv", "w") as output: 74 | output.write("name,invasive\n") 75 | for i in range(0, len(result)): 76 | output.write(str(result[i][0])+","+str(result[i][1])+"\n") -------------------------------------------------------------------------------- /datascience/tensorflow/kaggle_invasive_species/perpare_folder_structure.py: -------------------------------------------------------------------------------- 1 | from shutil import copyfile 2 | import os 3 | 4 | with open("train_labels.csv") as labels: 5 | train_labels = labels.read().splitlines() 6 | 7 | def copy(base, img, dest, claz): 8 | dest_file = dest+"/"+claz+"/"+img 9 | source_file = base+"/"+img 10 | if not os.path.exists(os.path.dirname(dest_file)): 11 | os.makedirs(os.path.dirname(dest_file)) 12 | copyfile(source_file, dest_file) 13 | 14 | for i in train_labels[1:]: 15 | parts = i.split(",") 16 | img = parts[0]+".jpg" 17 | claz = parts[1] 18 | print("copying "+img+" to class "+claz) 19 | copy("train", img, "training_set", claz) 20 | 21 | 22 | 23 | #copy("train", "1.jpg", "train_new", "0") 24 | 25 | -------------------------------------------------------------------------------- /datascience/tensorflow/kaggle_invasive_species/prepare_validset.py: -------------------------------------------------------------------------------- 1 | import os 2 | from shutil import copyfile 3 | from shutil import move 4 | import random 5 | 6 | training_dir = 'training_set' 7 | validation_dir = 'validation_set' 8 | classes = ["0", "1"] 9 | 10 | for claz in classes: 11 | val_size = 150 12 | cdir = training_dir+"/"+claz 13 | print(cdir) 14 | vcdiir = validation_dir+"/"+claz 15 | imgs = os.listdir(cdir) 16 | random.shuffle(imgs) 17 | for file in imgs: 18 | if val_size <= 0: 19 | break 20 | val_size = val_size - 1 21 | source_file = cdir+"/"+file 22 | dest_file = vcdiir+"/"+file 23 | print("moving "+source_file+" to "+dest_file) 24 | if not os.path.exists(os.path.dirname(dest_file)): 25 | os.makedirs(os.path.dirname(dest_file)) 26 | move(source_file, dest_file) -------------------------------------------------------------------------------- /datascience/tensorflow/kaggle_invasive_species/sample_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soniclavier/bigdata-notebook/a708de834fd282ba576f15f87a46f3953695a9ad/datascience/tensorflow/kaggle_invasive_species/sample_model.h5 -------------------------------------------------------------------------------- /datascience/tensorflow/kaggle_invasive_species/sample_model.py: -------------------------------------------------------------------------------- 1 | from keras.models import Sequential 2 | from keras.layers.pooling import MaxPooling2D 3 | from keras.layers.core import Dense 4 | from keras.layers.core import Flatten 5 | from keras.layers.core import Dropout 6 | from keras.layers.convolutional import Conv2D 7 | from keras.layers.pooling import MaxPooling2D 8 | 9 | from keras.preprocessing.image import ImageDataGenerator 10 | classification = Sequential() 11 | classification.add(Conv2D(10, (3,3), input_shape=(64, 64, 3), 
activation = 'relu')) 12 | 13 | classification.add(Conv2D(10, (3,3), activation = 'relu')) 14 | classification.add(MaxPooling2D()) 15 | 16 | classification.add(Conv2D(5, (3,3), activation = 'relu')) 17 | classification.add(MaxPooling2D()) 18 | 19 | classification.add(Flatten()) 20 | 21 | classification.add(Dense(50, activation = 'relu')) 22 | classification.add(Dropout(0.5)) 23 | classification.add(Dense(1, activation = 'sigmoid')) 24 | 25 | for layer in classification.layers: 26 | print(str(layer.name)+" "+str(layer.input_shape)+" -> "+str(layer.output_shape)) 27 | 28 | 29 | classification.compile(optimizer='Adam', loss='binary_crossentropy', metrics = ['accuracy']) 30 | 31 | train_data_gen = ImageDataGenerator(rescale=1./255, shear_range=0.2, zoom_range=0.2, horizontal_flip=True) 32 | valid_data_gen = ImageDataGenerator(rescale=1./255) 33 | 34 | train_gen = train_data_gen.flow_from_directory('sample/training_set', target_size=(64, 64), batch_size=5, class_mode='binary') 35 | valid_gen = valid_data_gen.flow_from_directory('sample/validation_set', target_size=(64, 64), batch_size=5, class_mode='binary') 36 | 37 | #classification.load_weights("sample_model.h5") 38 | classification.fit_generator(train_gen, steps_per_epoch=36, validation_data=valid_gen, epochs=10, validation_steps=36) 39 | 40 | #save the weights 41 | classification.save_weights("sample_model.h5") 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /docker/spark-kafka-docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:8-jre 2 | MAINTAINER Vishnu Viswanath "vishnuviswanath.com" 3 | 4 | RUN apt-get update && apt-get install -y curl \ 5 | procps 6 | 7 | ENV APACHE_DOWNLOAD_URL https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename= 8 | 9 | #KAFKA 10 | ARG KAFKA_VERSION=1.0.0 11 | ARG SCALA_VERSION=2.11 12 | ENV KAFKA_PACKAGE kafka_${SCALA_VERSION}-${KAFKA_VERSION} 13 | ENV KAFKA_DOWNLOAD_URL ${APACHE_DOWNLOAD_URL}kafka/${KAFKA_VERSION}/${KAFKA_PACKAGE}.tgz 14 | ENV KAFKA_HOME /usr/share/${KAFKA_PACKAGE} 15 | ENV PATH $PATH:${KAFKA_HOME}/bin 16 | 17 | RUN curl -L \ 18 | "${KAFKA_DOWNLOAD_URL}" \ 19 | | gunzip \ 20 | | tar x -C /usr/share/ 21 | 22 | 23 | #SPARK 24 | ARG SPARK_VERSION=2.3.0 25 | ARG HADOOP_VERSION=2.7 26 | ENV SPARK_PACKAGE spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} 27 | ENV SPARK_DOWNLOAD_URL ${APACHE_DOWNLOAD_URL}spark/spark-${SPARK_VERSION}/${SPARK_PACKAGE}.tgz 28 | ENV SPARK_HOME /usr/share/${SPARK_PACKAGE} 29 | ENV PATH $PATH:${SPARK_HOME}/bin:${SPARK_HOME}/sbin 30 | 31 | RUN curl -L \ 32 | "${SPARK_DOWNLOAD_URL}" \ 33 | | gunzip \ 34 | | tar x -C /usr/share/ 35 | 36 | 37 | EXPOSE 8080 8081 6066 7077 4040 7001 7002 7003 7004 7005 7006 2181 9092 38 | 39 | ADD entrypoint.sh / 40 | 41 | ENTRYPOINT ["/entrypoint.sh"] -------------------------------------------------------------------------------- /docker/spark-kafka-docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | services: 3 | spark-master: 4 | image: soniclavier/spark-kafka:2.3.0_1.0.0 5 | command: spark-master 6 | expose: 7 | - 8080 8 | - 7077 9 | ports: 10 | - 8080:8080 11 | - 7077:7077 12 | 13 | spark-worker: 14 | image: soniclavier/spark-kafka:2.3.0_1.0.0 15 | command: spark-worker spark-master:7077 16 | depends_on: 17 | - spark-master 18 | expose: 19 | - 8081 20 | 21 | zookeeper: 22 | image: soniclavier/spark-kafka:2.3.0_1.0.0 23 | command: 
zookeeper 24 | expose: 25 | - 2181 26 | 27 | kafka-broker: 28 | image: soniclavier/spark-kafka:2.3.0_1.0.0 29 | command: kafka-broker zookeeper 30 | depends_on: 31 | - zookeeper 32 | expose: 33 | - 9092 34 | -------------------------------------------------------------------------------- /docker/spark-kafka-docker/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | case $1 in 4 | spark-master) exec $SPARK_HOME/bin/spark-class org.apache.spark.deploy.master.Master 5 | ;; 6 | 7 | spark-worker) exec $SPARK_HOME/bin/spark-class org.apache.spark.deploy.worker.Worker $2 8 | ;; 9 | 10 | zookeeper) exec $KAFKA_HOME/bin/zookeeper-server-start.sh $KAFKA_HOME/config/zookeeper.properties 11 | ;; 12 | 13 | kafka-broker) 14 | sed -r -i "s/(zookeeper.connect)=(.*)/\1=$2:2181/g" $KAFKA_HOME/config/server.properties 15 | exec $KAFKA_HOME/bin/kafka-server-start.sh $KAFKA_HOME/config/server.properties 16 | ;; 17 | 18 | *) echo "Unknown entrypoint $1, valid entry points are [spark-master, spark-worker , zookeeper, kafka-broker ]" 19 | ;; 20 | esac 21 | -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:8-jre 2 | MAINTAINER Vishnu Viswanath "vishnuviswanath.com" 3 | 4 | RUN apt-get update && apt-get install -y curl \ 5 | procps 6 | 7 | ENV APACHE_DOWNLOAD_URL https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename= 8 | 9 | #KAFKA 10 | ARG KAFKA_VERSION=1.0.0 11 | ARG SCALA_VERSION=2.11 12 | ENV KAFKA_PACKAGE kafka_${SCALA_VERSION}-${KAFKA_VERSION} 13 | ENV KAFKA_DOWNLOAD_URL ${APACHE_DOWNLOAD_URL}kafka/${KAFKA_VERSION}/${KAFKA_PACKAGE}.tgz 14 | ENV KAFKA_HOME /usr/share/${KAFKA_PACKAGE} 15 | ENV PATH $PATH:${KAFKA_HOME}/bin 16 | 17 | RUN curl -L \ 18 | "${KAFKA_DOWNLOAD_URL}" \ 19 | | gunzip \ 20 | | tar x -C /usr/share/ 21 | 22 | 23 | #SPARK 24 | ARG SPARK_VERSION=2.3.0 25 | ARG HADOOP_VERSION=2.7 26 | ENV SPARK_PACKAGE spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} 27 | ENV SPARK_DOWNLOAD_URL ${APACHE_DOWNLOAD_URL}spark/spark-${SPARK_VERSION}/${SPARK_PACKAGE}.tgz 28 | ENV SPARK_HOME /usr/share/${SPARK_PACKAGE} 29 | ENV PATH $PATH:${SPARK_HOME}/bin:${SPARK_HOME}/sbin 30 | 31 | RUN curl -L \ 32 | "${SPARK_DOWNLOAD_URL}" \ 33 | | gunzip \ 34 | | tar x -C /usr/share/ 35 | 36 | 37 | EXPOSE 8080 8081 6066 7077 4040 7001 7002 7003 7004 7005 7006 2181 9092 38 | 39 | 40 | ADD entrypoint.sh / 41 | 42 | ENTRYPOINT ["/entrypoint.sh"] -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-docker/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export HOSTNAME=$(hostname -i) 4 | exec $SPARK_HOME/bin/spark-class org.apache.spark.deploy.master.Master >/dev/null 2>&1 < /dev/null & 5 | echo "starting spark master.." 6 | exec $SPARK_HOME/bin/spark-class org.apache.spark.deploy.worker.Worker spark://${HOSTNAME}:7077 >/dev/null 2>&1 < /dev/null & 7 | echo "starting spark worker.." 8 | exec $KAFKA_HOME/bin/zookeeper-server-start.sh $KAFKA_HOME/config/zookeeper.properties >/dev/null 2>&1 < /dev/null & 9 | echo "starting zookeeper.." 10 | exec $KAFKA_HOME/bin/kafka-server-start.sh $KAFKA_HOME/config/server.properties >/dev/null 2>&1 < /dev/null & 11 | echo "starting kafka broker.." 
12 | 13 | #make container wait 14 | exec "$@"; 15 | -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-for-meetup/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:8-jre 2 | MAINTAINER Vishnu Viswanath "vishnuviswanath.com" 3 | 4 | RUN apt-get update && apt-get install -y curl \ 5 | procps \ 6 | netcat 7 | 8 | ENV APACHE_DOWNLOAD_URL https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename= 9 | 10 | #KAFKA 11 | ARG KAFKA_VERSION=1.0.0 12 | ARG SCALA_VERSION=2.11 13 | ENV KAFKA_PACKAGE kafka_${SCALA_VERSION}-${KAFKA_VERSION} 14 | ENV KAFKA_DOWNLOAD_URL ${APACHE_DOWNLOAD_URL}kafka/${KAFKA_VERSION}/${KAFKA_PACKAGE}.tgz 15 | ENV KAFKA_HOME /usr/share/${KAFKA_PACKAGE} 16 | ENV PATH $PATH:${KAFKA_HOME}/bin 17 | 18 | RUN curl -L \ 19 | "${KAFKA_DOWNLOAD_URL}" \ 20 | | gunzip \ 21 | | tar x -C /usr/share/ 22 | 23 | 24 | #SPARK 25 | ARG SPARK_VERSION=2.3.0 26 | ARG HADOOP_VERSION=2.7 27 | ENV SPARK_PACKAGE spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} 28 | ENV SPARK_DOWNLOAD_URL ${APACHE_DOWNLOAD_URL}spark/spark-${SPARK_VERSION}/${SPARK_PACKAGE}.tgz 29 | ENV SPARK_HOME /usr/share/${SPARK_PACKAGE} 30 | ENV PATH $PATH:${SPARK_HOME}/bin:${SPARK_HOME}/sbin 31 | 32 | RUN curl -L \ 33 | "${SPARK_DOWNLOAD_URL}" \ 34 | | gunzip \ 35 | | tar x -C /usr/share/ 36 | 37 | 38 | EXPOSE 8080 8081 6066 7077 4040 7001 7002 7003 7004 7005 7006 2181 9092 39 | 40 | 41 | RUN export COLUMNS=250 42 | 43 | ADD data /data 44 | ADD spark_23-assembly-1.0.jar /examples/ 45 | ADD entrypoint.sh / 46 | 47 | ENTRYPOINT ["/entrypoint.sh"] -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-for-meetup/data/site-device/csv/site-device-corrupted.csv: -------------------------------------------------------------------------------- 1 | Device Category,Page,Users,New Users,Sessions 2 | mobile,/spark_lr.html,83,77,101 3 | mobile,/,60,40,56 4 | tablet,,/spark_session.html,10,10,11 5 | desktop,/flink_queryable_state2.html,162,52,141 6 | desktop,/blog/page5/,19,0,0,extrafield 7 | desktop,/blog/page3/,120,0,1 8 | desktop,/spark_rdd.html,1071,946,1199 9 | desktop,/kaggle-titanic.html,&$^%*,43,54 -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-for-meetup/data/site-device/parquet/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-for-meetup/data/site-device/parquet/.part-00000-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soniclavier/bigdata-notebook/a708de834fd282ba576f15f87a46f3953695a9ad/docker/spark-kafka-single-node-for-meetup/data/site-device/parquet/.part-00000-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-for-meetup/data/site-device/parquet/.part-00001-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/soniclavier/bigdata-notebook/a708de834fd282ba576f15f87a46f3953695a9ad/docker/spark-kafka-single-node-for-meetup/data/site-device/parquet/.part-00001-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-for-meetup/data/site-device/parquet/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soniclavier/bigdata-notebook/a708de834fd282ba576f15f87a46f3953695a9ad/docker/spark-kafka-single-node-for-meetup/data/site-device/parquet/_SUCCESS -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-for-meetup/data/site-device/parquet/part-00000-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soniclavier/bigdata-notebook/a708de834fd282ba576f15f87a46f3953695a9ad/docker/spark-kafka-single-node-for-meetup/data/site-device/parquet/part-00000-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-for-meetup/data/site-device/parquet/part-00001-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soniclavier/bigdata-notebook/a708de834fd282ba576f15f87a46f3953695a9ad/docker/spark-kafka-single-node-for-meetup/data/site-device/parquet/part-00001-fcfa0c31-8f29-48a3-a768-ae5574d4948b-c000.snappy.parquet -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-for-meetup/data/site-views/csv/site-views.csv: -------------------------------------------------------------------------------- 1 | Page,Country,Pageviews,Unique Pageviews,Avg. 
Time on Page /spark_session.html,United States,2584,2443,0:04:25 /spark_lr.html,United States,2004,1846,0:04:40 /,United States,1748,1516,0:00:47 /spark_rdd.html,India,1723,1563,0:06:01 /spark_rdd.html,United States,1542,1398,0:04:16 /spark_session.html,India,1480,1401,0:04:23 /spark_lr.html,India,1216,1099,0:05:30 /flink_eventtime.html,United States,1054,937,0:05:42 /realtime-storm-kafka3.html,India,958,827,0:03:54 /hello-kafka-streams.html,United States,885,789,0:02:45 /realtime-storm-kafka2.html,India,868,722,0:03:37 /spark-scala.html,United States,825,774,0:03:16 /spark_rdd_part2.html,India,791,715,0:04:32 /realtime-storm-kafka2.html,United States,766,594,0:02:29 /realtime-storm-kafka3.html,United States,754,624,0:03:08 /flink_streaming.html,United States,718,658,0:03:16 /spark_rdd_part2.html,United States,703,620,0:03:26 /,India,702,531,0:00:40 /realtime-storm-kafka1.html,India,687,577,0:04:01 /spark-scala.html,India,671,623,0:06:43 /kafka-streams-part2.html,United States,645,535,0:01:53 /realtime-storm-kafka1.html,United States,614,512,0:02:50 /search.html?query=spark,India,577,425,0:00:31 /flink_trigger_evictor.html,United States,501,437,0:03:21 /search.html?query=spark,United States,461,321,0:00:32 /flink_eventtime.html,India,448,401,0:06:13 /search.html?query=flink,United States,434,317,0:00:29 /hello-kafka-streams.html,India,415,375,0:03:57 /flink_streaming.html,India,325,305,0:03:45 -------------------------------------------------------------------------------- /docker/spark-kafka-single-node-for-meetup/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export HOSTNAME=$(hostname -i) 4 | exec $SPARK_HOME/bin/spark-class org.apache.spark.deploy.master.Master >/dev/null 2>&1 < /dev/null & 5 | echo "starting spark master.." 6 | exec $SPARK_HOME/bin/spark-class org.apache.spark.deploy.worker.Worker spark://${HOSTNAME}:7077 >/dev/null 2>&1 < /dev/null & 7 | echo "starting spark worker.." 8 | exec $KAFKA_HOME/bin/zookeeper-server-start.sh $KAFKA_HOME/config/zookeeper.properties >/dev/null 2>&1 < /dev/null & 9 | echo "starting zookeeper.." 10 | exec $KAFKA_HOME/bin/kafka-server-start.sh $KAFKA_HOME/config/server.properties >/dev/null 2>&1 < /dev/null & 11 | echo "starting kafka broker.." 
12 | 13 | #make container wait 14 | exec "$@"; 15 | -------------------------------------------------------------------------------- /flink-examples-java/.gitignore: -------------------------------------------------------------------------------- 1 | #idea 2 | .idea/ 3 | *.iml 4 | 5 | #osx 6 | *.DS_Store 7 | 8 | #artifacts 9 | *.jar 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /flink-examples-java/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM flink 2 | 3 | MAINTAINER Vishnu Viswanath 4 | 5 | ADD submit.sh / 6 | 7 | ARG JAR_FILE 8 | ADD target/${JAR_FILE} /usr/share/flink-job.jar 9 | 10 | CMD ["/bin/bash", "/submit.sh"] -------------------------------------------------------------------------------- /flink-examples-java/src/main/java/com/vishnuviswanath/flink/streaming/HelloStreamingJob.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.flink.streaming; 2 | 3 | import org.apache.flink.streaming.api.datastream.DataStream; 4 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 5 | import org.apache.flink.streaming.api.functions.sink.PrintSinkFunction; 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | public class HelloStreamingJob { 11 | 12 | public static void main(String[] args) throws Exception { 13 | StreamExecutionEnvironment senv = StreamExecutionEnvironment.getExecutionEnvironment(); 14 | 15 | List sample = new ArrayList<>(); 16 | sample.add("test"); 17 | sample.add("data"); 18 | DataStream sampleStream = senv.fromCollection(sample); 19 | sampleStream.addSink(new PrintSinkFunction<>()); 20 | 21 | senv.execute("hellow data stream"); 22 | 23 | } 24 | } 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /flink-examples-java/src/main/java/com/vishnuviswanath/flink/streaming/sources/SocketStreamingJob.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.flink.streaming.sources; 2 | 3 | import org.apache.flink.streaming.api.datastream.DataStream; 4 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 5 | import org.apache.flink.streaming.api.functions.sink.PrintSinkFunction; 6 | 7 | public class SocketStreamingJob { 8 | 9 | public static void main(String[] args) throws Exception { 10 | StreamExecutionEnvironment senv = StreamExecutionEnvironment.getExecutionEnvironment(); 11 | 12 | DataStream socketDataStream = senv.socketTextStream("localhost", 9999); 13 | DataStream sensorDataStream = socketDataStream.map(x -> new SensorData(x)); 14 | sensorDataStream.addSink(new PrintSinkFunction<>()); 15 | 16 | senv.execute("Sensor data stream"); 17 | 18 | } 19 | } 20 | 21 | class SensorData { 22 | double reading; 23 | 24 | public SensorData(String reading) { 25 | this.reading = Double.parseDouble(reading); 26 | } 27 | 28 | @Override 29 | public String toString() { 30 | return String.format("{Temp : %10.4f}", reading); 31 | } 32 | } 33 | 34 | 35 | -------------------------------------------------------------------------------- /flink-examples-java/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more 
contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | ################################################################################ 18 | 19 | log4j.rootLogger=INFO, console 20 | 21 | log4j.appender.console=org.apache.log4j.ConsoleAppender 22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.console.layout.ConversionPattern=%d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n 24 | -------------------------------------------------------------------------------- /flink-examples-java/submit.sh: -------------------------------------------------------------------------------- 1 | #start job manager 2 | ./docker-entrypoint.sh jobmanager & 3 | 4 | sleep 10 5 | 6 | #submit job 7 | flink run /usr/share/flink-job.jar -------------------------------------------------------------------------------- /flink/.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | .cache-main 4 | 5 | #eclipse specific 6 | .classpath 7 | .project 8 | .settings/ 9 | .idea/ 10 | # sbt specific 11 | .cache 12 | .history 13 | .lib/ 14 | dist/* 15 | target/ 16 | lib_managed/ 17 | src_managed/ 18 | project/boot/ 19 | project/plugins/project/ 20 | 21 | # Scala-IDE specific 22 | .scala_dependencies 23 | .worksheet 24 | /bin/ 25 | -------------------------------------------------------------------------------- /flink/README.md: -------------------------------------------------------------------------------- 1 | ## Flink Streaming 2 | 3 |
Note: print() called on a DataStream object will not print to the console; the output goes to the job's .out file in the Flink log directory. This behaviour is configured in log4j.properties in the conf folder. Change the appender to ConsoleAppender to send the log output to the console.
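For reference, a ConsoleAppender configuration along these lines (mirroring the log4j.properties files kept elsewhere in this repository) achieves that:

```
log4j.rootLogger=INFO, stdout

log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target=System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
```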
5 | 6 | All the socket based streaming jobs in the examples listen to port 4444. To simulate messages coming through this port, 7 | run `nc -lk 4444` and send sample messages 8 | 9 | #### SocketStreming 10 | ``` 11 | flink run target/scala-2.10/flink-vishnu_2.10-1.0.jar -c com.vishnu.flink.streaming.FlinkStreamingWordCount 12 | ``` 13 | #### Tumbling window streaming (similar to batch) 14 | ``` 15 | flink run target/scala-2.10/flink-vishnu_2.10-1.0.jar -c com.vishnu.flink.streaming.TumblingWindowStreamiming 16 | ``` 17 | -------------------------------------------------------------------------------- /flink/build.sbt: -------------------------------------------------------------------------------- 1 | ThisBuild / resolvers ++= Seq("Apache Development Snapshot Repository" at "https://repository.apache.org/content/repositories/snapshots/", Resolver.mavenLocal) 2 | 3 | ThisBuild / scalaVersion := "2.11.7" 4 | 5 | val flinkVersion = "1.5.0" 6 | val kafkaVersion = "0.11.0.2" 7 | 8 | val flinkDependencies = Seq( 9 | "org.apache.flink" %% "flink-scala" % flinkVersion % "provided", 10 | "org.apache.flink" %% "flink-clients" % flinkVersion % "provided", 11 | "org.apache.flink" %% "flink-streaming-scala" %flinkVersion % "provided", 12 | "org.apache.flink" %% "flink-statebackend-rocksdb" % flinkVersion % "provided", 13 | "org.apache.flink" %% "flink-queryable-state-client-java" % flinkVersion % "provided", 14 | "org.apache.flink" %% "flink-queryable-state-runtime" % flinkVersion % "provided", 15 | "org.apache.flink" %% "flink-cep-scala" % flinkVersion, 16 | "org.apache.flink" %% "flink-connector-kafka-0.11" % flinkVersion 17 | ) 18 | 19 | val otherDependencies = Seq( 20 | "org.apache.kafka" % "kafka-clients" % kafkaVersion, 21 | "joda-time" % "joda-time" % "2.9.4", 22 | "org.slf4j" % "slf4j-log4j12" % "1.7.25", 23 | "log4j" % "log4j" % "1.2.17" 24 | ) 25 | 26 | val main = "com.vishnu.flink.streaming.queryablestate.QuerybleStateStream" 27 | 28 | Compile / run / mainClass := Some(main) 29 | 30 | assembly / mainClass := Some(main) 31 | 32 | Compile / run := Defaults.runTask(Compile / fullClasspath, 33 | Compile / run / mainClass, 34 | Compile / run / runner).evaluated 35 | 36 | lazy val commonSettings = Seq( 37 | organization := "com.vishnuviswanath", 38 | version := "1.0", 39 | name := "flink-examples" 40 | ) 41 | 42 | lazy val root = (project in file(".")). 43 | settings(commonSettings:_*). 
44 | settings( 45 | libraryDependencies ++= flinkDependencies, 46 | libraryDependencies ++= otherDependencies, 47 | retrieveManaged := true 48 | ) 49 | 50 | 51 | lazy val mainRunner = project.in(file("mainRunner")).dependsOn(RootProject(file("."))).settings( 52 | // we set all provided dependencies to none, so that they are included in the classpath of mainRunner 53 | libraryDependencies := (libraryDependencies in RootProject(file("."))).value.map{ 54 | module => module.configurations match { 55 | case Some("provided") => module.withConfigurations(None) 56 | case _ => module 57 | } 58 | } 59 | ) 60 | 61 | 62 | -------------------------------------------------------------------------------- /flink/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.1.1 2 | -------------------------------------------------------------------------------- /flink/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6") -------------------------------------------------------------------------------- /flink/src/main/java/FlinkMain.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by vishnu on 9/2/16. 3 | */ 4 | public class FlinkMain { 5 | } 6 | -------------------------------------------------------------------------------- /flink/src/main/java/utility/Server.java: -------------------------------------------------------------------------------- 1 | package utility; 2 | 3 | /** 4 | * Created by vishnu on 11/3/16. 5 | */ 6 | public class Server { 7 | } 8 | -------------------------------------------------------------------------------- /flink/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/WordCount.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink 2 | 3 | import org.apache.flink.api.scala._ 4 | 5 | object WordCount { 6 | def main(args: Array[String]) { 7 | 8 | val env = ExecutionEnvironment.getExecutionEnvironment 9 | val text = env.fromElements( 10 | "Who's there?", 11 | "I think I hear them. Stand, ho! Who's there?") 12 | 13 | val counts = text.flatMap { _.toLowerCase.split("\\W+") filter { _.nonEmpty } } 14 | .map { (_, 1) } 15 | .groupBy(0) 16 | .sum(1) 17 | 18 | counts.print() 19 | } 20 | } -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/dataset/WordCount.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.dataset 2 | 3 | import java.lang.Iterable 4 | 5 | import org.apache.flink.api.common.functions.{GroupReduceFunction, FlatMapFunction} 6 | import org.apache.flink.api.scala.ExecutionEnvironment 7 | import org.apache.flink.util.Collector 8 | import org.apache.flink.api.scala._ 9 | 10 | /** 11 | * Created by vishnu on 3/12/16. 
12 | * Scala equivalent for WordCount program in http://dataartisans.github.io/flink-training/dataSetBasics/slides.html 13 | * Reads from hdfs file, mapper emits 1 for each word and Reducer aggregates 14 | * 15 | */ 16 | object WordCount { 17 | 18 | def main(args:Array[String]): Unit = { 19 | val env = ExecutionEnvironment.getExecutionEnvironment 20 | val data = env.readTextFile("hdfs://localhost:9000/states") 21 | 22 | val counts = data.flatMap(new Tokenizer()) 23 | .groupBy(0) 24 | .reduceGroup(new SumWords()) 25 | 26 | 27 | counts.print() 28 | } 29 | 30 | 31 | } 32 | 33 | 34 | class Tokenizer extends FlatMapFunction[String,(String,Int)] { 35 | override def flatMap(value: String, out: Collector[(String,Int)]): Unit = { 36 | val tokens = value.split("\\W+") 37 | for (token <- tokens if token.length>0) out.collect(token,1) 38 | } 39 | } 40 | 41 | class SumWords extends GroupReduceFunction[(String,Int),(String,Int)] { 42 | override def reduce(words: Iterable[(String,Int)], out: Collector[(String,Int)]): Unit = { 43 | var count = 0 44 | var prev: (String, Int) = null 45 | val it = words.iterator() 46 | while(it.hasNext) { 47 | prev = it.next() 48 | count = 1 + prev._2 49 | } 50 | out.collect(prev._1,count) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/CountTumblingWindow.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.streaming 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.api.scala._ 5 | import org.apache.flink.streaming.api.windowing.time.Time 6 | 7 | 8 | /** 9 | * A tumbling window based on count 10 | */ 11 | object CountTumblingWindow { 12 | def main(args: Array[String]) { 13 | val sev = StreamExecutionEnvironment.getExecutionEnvironment 14 | val socTextStream = sev.socketTextStream("localhost",4444) 15 | 16 | //the following window is triggered for every 5 items 17 | //since we are doing keyby 18 | //each window will be containing only words of the same group 19 | //e.g., 20 | //if stream is : one two one two one two one two one 21 | //window1 = {one,one,one,one,one} 22 | //window2 = {two,two,two,two} 23 | //window1 will triggered but not window 2, it need one more 'two' to make it 5 24 | val counts = socTextStream.flatMap{_.split("\\s")} 25 | .map { (_, 1) } 26 | .keyBy(0) 27 | .countWindow(5) 28 | .sum(1).setParallelism(4); 29 | 30 | counts.print() 31 | sev.execute() 32 | } 33 | } -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/EventTimeProcessing.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.streaming 2 | 3 | import org.apache.flink.api.common.functions.FlatMapFunction 4 | import org.apache.flink.api.scala._ 5 | import org.apache.flink.streaming.api.TimeCharacteristic 6 | import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks 7 | import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment} 8 | import org.apache.flink.streaming.api.watermark.Watermark 9 | import org.apache.flink.util.Collector 10 | import org.joda.time.format.DateTimeFormat 11 | 12 | object EventTimeWindowWithTrigger { 13 | def main(args: Array[String]) { 14 | val sev = StreamExecutionEnvironment.getExecutionEnvironment 15 | sev.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); 16 | 
val rawMessages: DataStream[String] = sev.socketTextStream("localhost",4444) 17 | 18 | val coloredMessagesStream: DataStream[ColoredMessage] = rawMessages.flatMap(new FlatMapFunction[String,ColoredMessage] { 19 | override def flatMap(value: String, out: Collector[ColoredMessage]): Unit = { 20 | out.collect(ColoredMessage(value.split(","))) 21 | } 22 | }) 23 | 24 | //note: coloredMessagesStream is not yet assigned timestamps/watermarks or windowed; see the wiring sketch after ProcessingTimeWindowEvictor.scala below 25 | 26 | 27 | sev.execute() 28 | } 29 | } 30 | case class ColoredMessage(eventTime: Long, color: String) 31 | 32 | object ColoredMessage { 33 | def apply(parts: Array[String]): ColoredMessage = { 34 | ColoredMessage( 35 | eventTime = getDate(parts(0)), 36 | color = parts(1)) 37 | } 38 | def getDate(date: String): Long = { 39 | val formatter = DateTimeFormat.forPattern("HH:mm:ss") 40 | val dt = formatter.parseDateTime(date) 41 | dt.getMillis 42 | } 43 | } 44 | 45 | class TimestampAndWatermarkGen extends AssignerWithPeriodicWatermarks[ColoredMessage] { 46 | val maxDelay = 1*60*1000 //1 minute 47 | var maxTime = 0L 48 | override def getCurrentWatermark: Watermark = { 49 | new Watermark(maxTime - maxDelay) 50 | } 51 | override def extractTimestamp(element: ColoredMessage, previousElementTimestamp: Long): Long = { 52 | maxTime = Math.max(element.eventTime, maxTime) 53 | element.eventTime 54 | } 55 | } 56 | 57 | -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/FlinkStreamingWordCount.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.streaming 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.api.scala._ 5 | 6 | 7 | 8 | object FlinkStreamingWordCount { 9 | 10 | def main(args: Array[String]) { 11 | val sev = StreamExecutionEnvironment.getExecutionEnvironment 12 | val socTxtStream = sev.socketTextStream("localhost",4444) 13 | 14 | val counts = socTxtStream.flatMap{_.toLowerCase.split(" ") filter { _.nonEmpty } } 15 | .map { (_, 1) } 16 | .keyBy(0) 17 | .sum(1) 18 | counts.print() 19 | sev.execute() 20 | 21 | } 22 | 23 | } -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/ProcessingTimeWindowEvictor.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.streaming 2 | 3 | import org.apache.flink.api.scala._ 4 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 5 | import org.apache.flink.streaming.api.windowing.assigners.SlidingProcessingTimeWindows 6 | import org.apache.flink.streaming.api.windowing.evictors.CountEvictor 7 | import org.apache.flink.streaming.api.windowing.time.Time 8 | import org.apache.flink.streaming.api.windowing.triggers.CountTrigger 9 | 10 | object ProcessingTimeWindowEvictor { 11 | def main(args: Array[String]) { 12 | val sev = StreamExecutionEnvironment.getExecutionEnvironment 13 | val socTextStream = sev.socketTextStream("localhost",4444) 14 | 15 | //a sliding window of size 15 seconds is created, sliding every 10 seconds 16 | //the window fires once it holds 5 elements (CountTrigger), and the evictor then keeps only the last 3 elements for the sum 17 | val counts = socTextStream.flatMap{_.split("\\s")} 18 | .map { (_, 1) } 19 | .keyBy(0) 20 | .window(SlidingProcessingTimeWindows.of(Time.seconds(15),Time.seconds(10))) 21 | .trigger(CountTrigger.of(5)) 22 | .evictor(CountEvictor.of(3)) 23 | .sum(1).setParallelism(4); 24 | 25 | counts.print() 26 | sev.execute() 27 | } 28 | } 29 | 
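EventTimeProcessing.scala above builds coloredMessagesStream and defines TimestampAndWatermarkGen, but never attaches the assigner or applies an event-time window before sev.execute(). A minimal sketch of the missing wiring is shown below; keying by color and the 10-second tumbling window are assumptions for illustration, not taken from the source file (it also needs import org.apache.flink.streaming.api.windowing.time.Time):

    //sketch only: attach the periodic watermark assigner defined in EventTimeProcessing.scala,
    //then count messages per color in 10-second event-time tumbling windows (window size assumed)
    val withTimestamps: DataStream[ColoredMessage] =
      coloredMessagesStream.assignTimestampsAndWatermarks(new TimestampAndWatermarkGen)

    val countsPerColor = withTimestamps
      .map(m => (m.color, 1))
      .keyBy(0)
      .timeWindow(Time.seconds(10)) //event-time windows, because of TimeCharacteristic.EventTime
      .sum(1)

    countsPerColor.print()
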
-------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/ProcessingTimeWindowWithTrigger.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.streaming 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.api.scala._ 5 | import org.apache.flink.streaming.api.windowing.time.Time 6 | import org.apache.flink.streaming.api.windowing.assigners.{SlidingProcessingTimeWindows} 7 | import org.apache.flink.streaming.api.windowing.evictors.{CountEvictor} 8 | 9 | import org.apache.flink.streaming.api.windowing.triggers.CountTrigger 10 | 11 | object ProcessingTimeWindowWithTrigger { 12 | def main(args: Array[String]) { 13 | val sev = StreamExecutionEnvironment.getExecutionEnvironment 14 | val socTextStream = sev.socketTextStream("localhost",4444) 15 | 16 | //a sliding window of size 15 seconds is created, sliding every 10 seconds 17 | //the CountTrigger makes the window fire each time it accumulates 5 elements, instead of firing on processing time 18 | val counts = socTextStream.flatMap{_.split("\\s")} 19 | .map { (_, 1) } 20 | .keyBy(0) 21 | .window(SlidingProcessingTimeWindows.of(Time.seconds(15),Time.seconds(10))) 22 | .trigger(CountTrigger.of(5)) 23 | .sum(1).setParallelism(4); 24 | 25 | counts.print() 26 | sev.execute() 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/StreamingWithRocksDb.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.streaming 2 | 3 | /** 4 | * Created by vviswanath on 3/20/17. 5 | */ 6 | object StreamingWithRocksDb { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/TimeSlidingWindow.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.streaming 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.api.scala._ 5 | import org.apache.flink.streaming.api.windowing.time.Time 6 | 7 | /** 8 | * A sliding window based on time. In contrast to a tumbling window, a sliding window can overlap with the previous one. 9 | */ 10 | object TimeSlidingWindow { 11 | def main(args: Array[String]) { 12 | val sev = StreamExecutionEnvironment.getExecutionEnvironment 13 | val socTextStream = sev.socketTextStream("localhost",4444) 14 | 15 | //the following window is triggered every 10 seconds, for the last 15 seconds of data 16 | //therefore there is an overlap between the data processed in one evaluation and the previous one. 
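//an illustrative timeline (not produced by this job): with size 15s and slide 10s,
//the evaluation at t=30s covers data from t=15s to t=30s and the one at t=40s covers t=25s to t=40s,
//so records arriving between t=25s and t=30s are counted in both evaluations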
17 | val counts = socTextStream.flatMap{_.split("\\s")} 18 | .map { (_, 1) } 19 | .keyBy(0) 20 | .timeWindow(Time.seconds(15),Time.seconds(10)) 21 | .sum(1).setParallelism(4); 22 | 23 | counts.print() 24 | sev.execute() 25 | } 26 | } -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/TimeTumblingWindow.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.streaming 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.api.scala._ 5 | import org.apache.flink.streaming.api.windowing.time.Time 6 | 7 | 8 | /** 9 | * A tumbling window based on time 10 | */ 11 | 12 | object TimeTumblingWindow { 13 | def main(args: Array[String]) { 14 | val sev = StreamExecutionEnvironment.getExecutionEnvironment 15 | val socTextStream = sev.socketTextStream("localhost",4444) 16 | 17 | //the following window is triggered every 15 seconds. 18 | val counts = socTextStream.flatMap{_.split("\\s")} 19 | .map { (_, 1) } 20 | .keyBy(0) 21 | .timeWindow(Time.seconds(15)) 22 | .sum(1).setParallelism(4); 23 | 24 | counts.print() 25 | sev.execute() 26 | } 27 | } -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/queryablestate/QueryClient.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.streaming.queryablestate 2 | 3 | import com.vishnu.flink.streaming.queryablestate.QuerybleStateStream.ClimateLog 4 | import org.apache.flink.api.common.functions.ReduceFunction 5 | import org.apache.flink.api.common.state.{ReducingState, ReducingStateDescriptor} 6 | import org.apache.flink.api.common.typeinfo.{BasicTypeInfo, TypeHint, TypeInformation} 7 | import org.apache.flink.api.common.{ExecutionConfig, JobID} 8 | import org.apache.flink.api.java.utils.ParameterTool 9 | import org.apache.flink.queryablestate.client.QueryableStateClient 10 | 11 | import scala.compat.java8.FutureConverters.toScala 12 | import scala.concurrent.{Await, ExecutionContext, duration} 13 | import scala.util.{Failure, Success} 14 | 15 | /** 16 | * Created by vviswanath on 3/13/17. 
17 | */ 18 | object QueryClient { 19 | 20 | def main(args: Array[String]) { 21 | 22 | val parameterTool = ParameterTool.fromArgs(args) 23 | val jobId = JobID.fromHexString(parameterTool.get("jobId")) 24 | val key = parameterTool.get("stateKey") 25 | 26 | val client = new QueryableStateClient("10.0.0.189", 9067) 27 | 28 | val reduceFunction = new ReduceFunction[ClimateLog] { 29 | override def reduce(c1: ClimateLog, c2: ClimateLog): ClimateLog = { 30 | c1.copy( 31 | temperature = c1.temperature + c2.temperature, 32 | humidity = c1.humidity + c2.humidity) 33 | } 34 | } 35 | 36 | val climateLogStateDesc = new ReducingStateDescriptor[ClimateLog]( 37 | "climate-record-state", 38 | reduceFunction, 39 | TypeInformation.of(new TypeHint[ClimateLog]() {}).createSerializer(new ExecutionConfig())) 40 | 41 | implicit val ec = ExecutionContext.global 42 | val resultFuture = toScala(client.getKvState (jobId, "queryable-climatelog-stream", key, new TypeHint[String]{}.getTypeInfo, climateLogStateDesc)) 43 | 44 | while(!resultFuture.isCompleted) { 45 | println("waiting...") 46 | Thread.sleep(1000) 47 | } 48 | 49 | resultFuture.onComplete(r ⇒ println(r.get)) 50 | resultFuture.onFailure(PartialFunction(println)) 51 | 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/queryablestate/QuerybleStateStream.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.streaming.queryablestate 2 | 3 | import org.apache.flink.api.common.functions.ReduceFunction 4 | import org.apache.flink.api.common.state.ReducingStateDescriptor 5 | import org.apache.flink.api.common.typeinfo.{TypeHint, TypeInformation} 6 | import org.apache.flink.api.java.tuple.Tuple 7 | import org.apache.flink.api.scala._ 8 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 9 | import org.apache.flink.streaming.api.windowing.time.Time 10 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow 11 | import org.apache.flink.util.Collector 12 | import org.apache.log4j.Logger 13 | 14 | /** 15 | * Created by vviswanath on 3/12/17. 
16 | */ 17 | 18 | object QuerybleStateStream { 19 | 20 | val logger = Logger.getLogger("QueryableStateStream") 21 | 22 | case class ClimateLog(country: String, state: String, temperature: Float, humidity: Float) 23 | object ClimateLog { 24 | def apply(line: String): Option[ClimateLog] = { 25 | val parts = line.split(",") 26 | try{ 27 | Some(ClimateLog(parts(0), parts(1), parts(2).toFloat, parts(3).toFloat)) 28 | } catch { 29 | case e: Exception => { 30 | logger.warn(s"Unable to parse line $line") 31 | None 32 | } 33 | } 34 | } 35 | } 36 | 37 | def main(args: Array[String]): Unit = { 38 | val senv = StreamExecutionEnvironment.getExecutionEnvironment 39 | senv.setParallelism(1) 40 | 41 | //this is required if org.apache.flink.api.scala._ is not imported 42 | //implicit val typeInfo = TypeInformation.of(classOf[ClimateLog]) 43 | 44 | val climateLogStream = senv.socketTextStream("localhost", 2222) 45 | .flatMap(ClimateLog(_)) 46 | 47 | val climateLogAgg = climateLogStream 48 | .name("climate-log-agg") 49 | .keyBy("country", "state") 50 | .timeWindow(Time.seconds(10)) 51 | .reduce(reduceFunction) 52 | 53 | val climateLogStateDesc = new ReducingStateDescriptor[ClimateLog]( 54 | "climate-record-state", 55 | reduceFunction, 56 | TypeInformation.of(new TypeHint[ClimateLog]() {})) 57 | 58 | 59 | val queryableStream = climateLogAgg 60 | .name("queryable-state") 61 | .keyBy("country") 62 | .asQueryableState("queryable-climatelog-stream", climateLogStateDesc) 63 | 64 | climateLogAgg.print() 65 | 66 | senv.execute("Queryablestate example streaming job") 67 | } 68 | 69 | val reduceFunction = new ReduceFunction[ClimateLog] { 70 | override def reduce(c1: ClimateLog, c2: ClimateLog): ClimateLog = { 71 | c1.copy( 72 | temperature = c1.temperature + c2.temperature, 73 | humidity=c1.humidity + c2.humidity) 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/sessionwindows/SessionWindowExampleDummyDataProd.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.streaming.sessionwindows 2 | 3 | import java.io._ 4 | import java.net.ServerSocket 5 | 6 | /** 7 | * Created by vviswanath on 6/8/17. 
8 | * 9 | * Data producer for ttesting SessionWindowExample.scala 10 | */ 11 | object SessionWindowExampleDummyDataProd { 12 | 13 | def main(args: Array[String]): Unit = { 14 | 15 | val serverSocket = new ServerSocket(4444) 16 | val clientSocket = serverSocket.accept 17 | val out = new PrintWriter(clientSocket.getOutputStream, true) 18 | 19 | /* 20 | //0th second 21 | out.write(s"${System.currentTimeMillis},user2,recommendation\n") 22 | out.flush() 23 | Thread.sleep(1000)//1st second 24 | out.write(s"${System.currentTimeMillis},user1,recommendation\n") 25 | out.flush() 26 | Thread.sleep(1000) //2nd second 27 | out.write(s"${System.currentTimeMillis},user1,ad\n") 28 | out.flush() 29 | Thread.sleep(4000) //6th second 30 | out.write(s"${System.currentTimeMillis - 5000},user2,ad\n") //event time 3rd second 31 | out.flush() 32 | Thread.sleep(1000) //7th second 33 | out.write(s"${System.currentTimeMillis},user2,recommendation\n") 34 | out.flush() 35 | Thread.sleep(2000) //9th second 36 | out.write(s"${System.currentTimeMillis},user1,recommendation\n") 37 | out.flush() 38 | Thread.sleep(4000) 39 | out.close() 40 | */ 41 | 42 | //0th second 43 | out.write(s"${System.currentTimeMillis},user1,recommendation\n") 44 | out.flush() 45 | Thread.sleep(1000)//1st second 46 | out.write(s"${System.currentTimeMillis},user1,recommendation\n") 47 | out.flush() 48 | Thread.sleep(2001)//2nd second 49 | //this message is sent just to advance watermark, to show how AllowedLateness can cause a Window to be evaluated multiple times 50 | out.write(s"${System.currentTimeMillis},user2,recommendation\n") 51 | out.flush() 52 | Thread.sleep(2500) //4.5th second 53 | out.write(s"${System.currentTimeMillis - 3500},user1,ad\n") //event time 3rd second 54 | out.flush() 55 | Thread.sleep(2500) //7th second 56 | out.write(s"${System.currentTimeMillis},user1,recommendation\n") 57 | out.flush() 58 | Thread.sleep(4000) 59 | out.close() 60 | } 61 | 62 | 63 | } 64 | -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/streaming/windowtimer/ProccWindowFun.scala: -------------------------------------------------------------------------------- 1 | //package com.vishnu.flink.streaming.windowtimer 2 | // 3 | //import java.lang.Iterable 4 | // 5 | //import org.apache.flink.api.java.tuple.Tuple 6 | //import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction 7 | //import org.apache.flink.streaming.api.windowing.windows.{TimeWindow, Window} 8 | //import org.apache.flink.util.Collector 9 | // 10 | ///** 11 | // * Created by vviswanath on 4/16/17. 12 | // */ 13 | //class ProcWindowFun[IN, OUT, K, W <: Window] extends ProcessWindowFunction[String, String, Tuple, TimeWindow] { 14 | // override def process(key: Tuple, context: ProcessWindowFunction[String, String, Tuple, TimeWindow]#Context, iterable: Iterable[String], collector: Collector[String]): Unit = { 15 | // context.registerEventTimeTimer(100) 16 | // } 17 | // 18 | // override def onTimer(t: Long, context: ProcessWindowFunction[String, String, Tuple, TimeWindow]#OnTimerContext, out: Collector[String]): Unit = { 19 | // println("Timer triggered") 20 | // } 21 | //} 22 | -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/util/ParameterParser.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.util 2 | 3 | /** 4 | * Created by vviswanath on 3/2/18. 
5 | * A very simple arg parser. Expects args to be in the format Array("--key1", "value1", "--key2", "value2") 6 | * everything else is dropped. 7 | * e.g, key in ("key", "value") is dropped since key doesn't have "--" as prefix 8 | * everything after key1 in ("--key1", "value1", "--key2", "--key3", "value3") is dropped since key2 doesn't have a corresponding value2. 9 | * Returns a map of key → value 10 | */ 11 | object ParameterParser { 12 | 13 | def parse(args: Array[String]): Map[String, String] = { 14 | val Param = "--(.+)".r 15 | args.grouped(2).flatMap(l ⇒ 16 | if (l.length == 2) (l(0), l(1)) match { 17 | case (Param(key), value) ⇒ Some(key → value) 18 | case _ ⇒ None 19 | } 20 | else None 21 | ).toMap 22 | } 23 | } -------------------------------------------------------------------------------- /flink/src/main/scala/com/vishnu/flink/util/RandomServerEventsKafkaProducer.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.flink.util 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 6 | 7 | import scala.annotation.tailrec 8 | import scala.util.Random 9 | 10 | /** 11 | * Created by vviswanath on 1/15/18. 12 | */ 13 | object RandomServerEventsKafkaProducer { 14 | 15 | def eventType: String = { 16 | Random.nextInt(3) match { 17 | case 0 ⇒ "cpu-usage" 18 | case 1 ⇒ "mem-usage" 19 | case 2 ⇒ "disk-usage" 20 | } 21 | } 22 | 23 | def serverIp: String = { 24 | s"192.168.23.${Random.nextInt(10)}" 25 | } 26 | 27 | def value: Double = { 28 | Random.nextDouble * 100 29 | } 30 | 31 | def now(possibleDelay: Boolean, maxDelay: Long): Long = { 32 | val now = System.currentTimeMillis() 33 | if (possibleDelay && Random.nextBoolean()) now - Random.nextLong % maxDelay 34 | else now 35 | } 36 | 37 | //returns a key,value 38 | def nextServerEvent: (String, String) = { 39 | 40 | val event = (serverIp, s"${now(possibleDelay = true, 10000)},$eventType,$serverIp,$value") 41 | print(s"Produced event $event\n") 42 | event 43 | } 44 | 45 | def main(args: Array[String]): Unit = { 46 | 47 | val parameters = ParameterParser.parse(args) 48 | 49 | val props = new Properties() 50 | props.put("bootstrap.servers", parameters.getOrElse("kafka-bootstrap-server", "localhost:9092")) 51 | props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer") 52 | props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer") 53 | 54 | val producer = new KafkaProducer[String, String](props) 55 | val interval = 10 56 | val topic = parameters("topic") 57 | val numRecsToProduce: Option[Int] = None //None = infinite 58 | 59 | 60 | @tailrec 61 | def produceRecord(numRecToProduce: Option[Int]): Unit = { 62 | def generateRecord(topic: String, f: ⇒ (String, String)): ProducerRecord[String, String] = { 63 | val event = f 64 | new ProducerRecord[String, String](topic, event._1, event._2) 65 | } 66 | 67 | numRecToProduce match { 68 | case Some(x) if x > 0 ⇒ 69 | producer.send(generateRecord(topic, nextServerEvent)) 70 | Thread.sleep(interval) 71 | produceRecord(Some(x - 1)) 72 | 73 | case None ⇒ 74 | producer.send(generateRecord(topic, nextServerEvent)) 75 | Thread.sleep(interval) 76 | produceRecord(None) 77 | 78 | case _ ⇒ 79 | } 80 | } 81 | 82 | produceRecord(numRecsToProduce) 83 | 84 | 85 | } 86 | 87 | } 88 | -------------------------------------------------------------------------------- /mapreduce/.gitignore: 
-------------------------------------------------------------------------------- 1 | /target/ 2 | -------------------------------------------------------------------------------- /mapreduce/ArrayWritableExample.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.ArrayWritable; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.LongWritable; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapred.FileInputFormat; 11 | import org.apache.hadoop.mapred.FileOutputFormat; 12 | import org.apache.hadoop.mapred.JobClient; 13 | import org.apache.hadoop.mapred.JobConf; 14 | import org.apache.hadoop.mapred.MapReduceBase; 15 | import org.apache.hadoop.mapred.Mapper; 16 | import org.apache.hadoop.mapred.OutputCollector; 17 | import org.apache.hadoop.mapred.Reporter; 18 | import org.apache.hadoop.mapred.TextInputFormat; 19 | import org.apache.hadoop.mapred.TextOutputFormat; 20 | import org.apache.hadoop.mapred.lib.IdentityReducer; 21 | 22 | /** 23 | * An example implementing ArrayWritable 24 | * @author vishnu 25 | * 26 | */ 27 | 28 | public class ArrayWritableExample { 29 | 30 | private static class MyMapper extends MapReduceBase implements 31 | Mapper { 32 | @Override 33 | public void map(LongWritable dummKey, Text value, 34 | OutputCollector output, Reporter reporter) 35 | throws IOException { 36 | String line = value.toString(); 37 | String[] parts = line.split(" "); 38 | IntArrayWritable arr = new IntArrayWritable(); 39 | IntWritable[] intArr = new IntWritable[parts.length - 1]; 40 | if (parts.length >= 2) { 41 | Text key = new Text(parts[0]); 42 | for (int i = 1; i < parts.length; i++) { 43 | IntWritable val = new IntWritable( 44 | Integer.parseInt(parts[i])); 45 | intArr[i - 1] = val; 46 | } 47 | arr.set(intArr); 48 | System.out.println("key "+key.toString()+" arr"+arr.toString()); 49 | output.collect(key, arr); 50 | } 51 | } 52 | } 53 | 54 | private static class IntArrayWritable extends ArrayWritable { 55 | 56 | public IntArrayWritable() { 57 | super(IntWritable.class); 58 | } 59 | 60 | @Override 61 | public String toString() { 62 | String[] arr = super.toStrings(); 63 | String result = ""; 64 | for ( String str:arr) { 65 | result+=str+" "; 66 | } 67 | return result; 68 | } 69 | 70 | } 71 | 72 | public static void main(String[] args) throws IOException { 73 | 74 | JobConf conf = new JobConf(ArrayWritableExample.class); 75 | conf.setJobName("array writable"); 76 | 77 | conf.setOutputKeyClass(Text.class); 78 | conf.setOutputValueClass(IntArrayWritable.class); 79 | 80 | conf.setMapperClass(MyMapper.class); 81 | conf.setReducerClass(IdentityReducer.class); 82 | 83 | conf.setInputFormat(TextInputFormat.class); 84 | conf.setOutputFormat(TextOutputFormat.class); 85 | 86 | FileInputFormat.setInputPaths(conf, new Path(args[0])); 87 | FileOutputFormat.setOutputPath(conf, new Path(args[1])); 88 | 89 | JobClient.runJob(conf); 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /mapreduce/DistributeCache.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.filecache.DistributedCache; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import 
org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.LongWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapred.FileInputFormat; 12 | import org.apache.hadoop.mapred.FileOutputFormat; 13 | import org.apache.hadoop.mapred.JobClient; 14 | import org.apache.hadoop.mapred.JobConf; 15 | import org.apache.hadoop.mapred.MapReduceBase; 16 | import org.apache.hadoop.mapred.Mapper; 17 | import org.apache.hadoop.mapred.OutputCollector; 18 | import org.apache.hadoop.mapred.Reporter; 19 | import org.apache.hadoop.mapred.TextInputFormat; 20 | import org.apache.hadoop.mapred.TextOutputFormat; 21 | 22 | /** 23 | * Shows how a file can be loaded to dist cache. and how it can be used in mapper. 24 | * @author vishnu 25 | * 26 | */ 27 | 28 | public class DistributeCache { 29 | 30 | private static class MyMapper extends MapReduceBase implements Mapper { 31 | 32 | private Path[] localFiles; 33 | 34 | @Override 35 | public void map(LongWritable key, Text value, 36 | OutputCollector output, Reporter reporter) 37 | throws IOException { 38 | for(Path path : localFiles) { 39 | output.collect(new Text(path.getName()), new IntWritable(1)); 40 | } 41 | 42 | } 43 | 44 | @Override 45 | public void configure(JobConf conf) { 46 | try { 47 | localFiles = DistributedCache.getLocalCacheFiles(conf); 48 | } catch (IOException e) { 49 | e.printStackTrace(); 50 | } 51 | } 52 | 53 | } 54 | 55 | public static void main(String[] args) throws IOException { 56 | JobConf conf = new JobConf(DistributeCache.class); 57 | conf.setJobName("distcache"); 58 | 59 | conf.setOutputKeyClass(Text.class); 60 | conf.setOutputValueClass(IntWritable.class); 61 | 62 | conf.setMapperClass(MyMapper.class); 63 | /* by default identity reducer will be called 64 | * conf.setReducerClass(MyReducer.class);*/ 65 | conf.setInputFormat(TextInputFormat.class); 66 | conf.setOutputFormat(TextOutputFormat.class); 67 | FileSystem fs = FileSystem.get(conf); 68 | DistributedCache.addFileToClassPath(new Path(args[2]), conf, fs); 69 | FileInputFormat.setInputPaths(conf, new Path(args[0])); 70 | FileOutputFormat.setOutputPath(conf, new Path(args[1])); 71 | 72 | JobClient.runJob(conf); 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /mapreduce/ImageReader.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples; 2 | 3 | import java.io.ByteArrayOutputStream; 4 | import java.io.IOException; 5 | import java.net.URI; 6 | 7 | import org.apache.hadoop.fs.FSDataInputStream; 8 | import org.apache.hadoop.fs.FileSystem; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.BytesWritable; 11 | import org.apache.hadoop.io.IOUtils; 12 | import org.apache.hadoop.io.IntWritable; 13 | import org.apache.hadoop.io.LongWritable; 14 | import org.apache.hadoop.io.Text; 15 | import org.apache.hadoop.mapred.FileInputFormat; 16 | import org.apache.hadoop.mapred.FileOutputFormat; 17 | import org.apache.hadoop.mapred.JobClient; 18 | import org.apache.hadoop.mapred.JobConf; 19 | import org.apache.hadoop.mapred.MapReduceBase; 20 | import org.apache.hadoop.mapred.Mapper; 21 | import org.apache.hadoop.mapred.OutputCollector; 22 | import org.apache.hadoop.mapred.Reporter; 23 | import org.apache.hadoop.mapred.TextInputFormat; 24 | import org.apache.hadoop.mapred.TextOutputFormat; 25 | 26 | /** 27 | * Reads an image file in the hdfs and converts it to bytes and output 28 | * The input should contain the 
image_name,image_path. 29 | * @author vishnu 30 | * 31 | */ 32 | public class ImageReader { 33 | 34 | private static class ImageMapper extends MapReduceBase implements 35 | Mapper { 36 | 37 | private JobConf localconf; 38 | 39 | @Override 40 | public void map(LongWritable offset, Text value, 41 | OutputCollector output, Reporter reporter) 42 | throws IOException { 43 | 44 | String line = value.toString(); 45 | String[] parts = line.split(" "); 46 | Text key = new Text(parts[0]); 47 | String path = parts[1]; 48 | FileSystem fs = FileSystem.get(URI.create(path), localconf); 49 | FSDataInputStream fsin = null; 50 | 51 | try { 52 | fsin = fs.open(new Path(path)); 53 | ByteArrayOutputStream bout = new ByteArrayOutputStream(); 54 | byte[] buffer = new byte[1024 * 1024]; 55 | 56 | while (fsin.read(buffer, 0, buffer.length) >= 0) { 57 | bout.write(buffer); 58 | } 59 | output.collect(key, new BytesWritable(bout.toByteArray())); 60 | } finally { 61 | IOUtils.closeStream(fsin); 62 | } 63 | 64 | } 65 | 66 | @Override 67 | public void configure(JobConf conf) { 68 | localconf = conf; 69 | } 70 | 71 | } 72 | 73 | public static void main(String[] args) throws IOException { 74 | JobConf conf = new JobConf(ImageReader.class); 75 | conf.setJobName("imagereader"); 76 | 77 | conf.setOutputKeyClass(Text.class); 78 | conf.setOutputValueClass(BytesWritable.class); 79 | 80 | conf.setMapperClass(ImageMapper.class); 81 | 82 | conf.setInputFormat(TextInputFormat.class); 83 | conf.setOutputFormat(TextOutputFormat.class); 84 | 85 | FileInputFormat.setInputPaths(conf, new Path(args[0])); 86 | FileOutputFormat.setOutputPath(conf, new Path(args[1])); 87 | 88 | JobClient.runJob(conf); 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /mapreduce/LetterWordMapper.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | import java.util.StringTokenizer; 6 | 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.*; 9 | import org.apache.hadoop.mapred.*; 10 | 11 | 12 | public class LetterWordMapper { 13 | 14 | private static class MyMapper extends MapReduceBase implements 15 | Mapper { 16 | private Text word = new Text(); 17 | 18 | public void map(LongWritable key, Text value, 19 | OutputCollector output, Reporter reporter) 20 | throws IOException { 21 | String line = value.toString(); 22 | StringTokenizer tokenizer = new StringTokenizer(line); 23 | while (tokenizer.hasMoreTokens()) { 24 | String strWord = tokenizer.nextToken(); 25 | word.set(strWord.charAt(0)+""); 26 | if (!strWord.trim().equals("")) { 27 | System.out.println("emitting word "+strWord); 28 | output.collect(word, new Text(strWord)); 29 | } 30 | } 31 | } 32 | } 33 | 34 | private static class MyReducer extends MapReduceBase implements 35 | Reducer { 36 | public void reduce(Text key, Iterator values, 37 | OutputCollector output, Reporter reporter) 38 | throws IOException { 39 | System.out.println("key is "+key.toString()); 40 | String result = ""; 41 | while(values.hasNext()) { 42 | String next = values.next().toString(); 43 | System.out.println("next value is "+next); 44 | result+=next+","; 45 | } 46 | result = result.substring(0,result.length()-1); 47 | System.out.println("result is "+result); 48 | output.collect(key, new Text(result)); 49 | } 50 | } 51 | 52 | public static void main(String[] args) throws Exception { 53 | JobConf conf = new 
JobConf(WordCount.class); 54 | conf.setJobName("lettemapper"); 55 | 56 | conf.setOutputKeyClass(Text.class); 57 | conf.setOutputValueClass(Text.class); 58 | 59 | conf.setMapperClass(MyMapper.class); 60 | conf.setReducerClass(MyReducer.class); 61 | 62 | conf.setInputFormat(TextInputFormat.class); 63 | conf.setOutputFormat(TextOutputFormat.class); 64 | 65 | FileInputFormat.setInputPaths(conf, new Path(args[0])); 66 | FileOutputFormat.setOutputPath(conf, new Path(args[1])); 67 | 68 | JobClient.runJob(conf); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /mapreduce/MultiInputPath.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapred.FileOutputFormat; 10 | import org.apache.hadoop.mapred.JobClient; 11 | import org.apache.hadoop.mapred.JobConf; 12 | import org.apache.hadoop.mapred.MapReduceBase; 13 | import org.apache.hadoop.mapred.Mapper; 14 | import org.apache.hadoop.mapred.OutputCollector; 15 | import org.apache.hadoop.mapred.Reporter; 16 | import org.apache.hadoop.mapred.TextInputFormat; 17 | import org.apache.hadoop.mapred.TextOutputFormat; 18 | import org.apache.hadoop.mapred.lib.IdentityReducer; 19 | import org.apache.hadoop.mapred.lib.MultipleInputs; 20 | 21 | /** 22 | * Move to new api... 23 | * @author vishnu 24 | * 25 | */ 26 | 27 | public class MultiInputPath { 28 | 29 | private static class MyMapper extends MapReduceBase implements Mapper { 30 | 31 | @Override 32 | public void map(LongWritable key, Text value, 33 | OutputCollector output, Reporter reporter) 34 | throws IOException { 35 | System.out.println("In mapper 1"); 36 | output.collect(key, new IntWritable(value.toString().length())); 37 | } 38 | 39 | } 40 | 41 | 42 | private static class MyMapper2 extends MapReduceBase implements Mapper { 43 | 44 | @Override 45 | public void map(LongWritable key, Text value, 46 | OutputCollector output, Reporter reporter) 47 | throws IOException { 48 | System.out.println("In mapper 2"); 49 | output.collect(key, new IntWritable(value.toString().length())); 50 | } 51 | 52 | } 53 | public static void main(String[] args) throws IOException { 54 | 55 | JobConf conf = new JobConf(MultiInputPath.class); 56 | conf.setJobName("multi"); 57 | conf.setMapperClass(MyMapper.class); 58 | conf.setReducerClass(IdentityReducer.class); 59 | conf.setInputFormat(TextInputFormat.class); 60 | conf.setOutputFormat(TextOutputFormat.class); 61 | conf.setOutputKeyClass(LongWritable.class); 62 | conf.setOutputValueClass(IntWritable.class); 63 | MultipleInputs.addInputPath(conf, new Path(args[0]), TextInputFormat.class,MyMapper.class); 64 | MultipleInputs.addInputPath(conf,new Path(args[1]),TextInputFormat.class,MyMapper2.class); 65 | FileOutputFormat.setOutputPath(conf,new Path(args[2])); 66 | 67 | JobClient.runJob(conf); 68 | 69 | 70 | } 71 | } 72 | 73 | 74 | -------------------------------------------------------------------------------- /mapreduce/README.md: -------------------------------------------------------------------------------- 1 | This folder has examples for 2 | 3 | 1. basic mapreduce programs 4 | 2. custom types 5 | 3. mapreduce chaining examples 6 | 4. joins using mapreduce 7 | 5. 
incremental aggergation using datafu 8 | -------------------------------------------------------------------------------- /mapreduce/SequenceFileTest.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.SequenceFile; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapred.JobConf; 11 | 12 | public class SequenceFileTest { 13 | 14 | public static void main(String[] args) throws IOException { 15 | 16 | JobConf conf = new JobConf(); 17 | Path sqFile = new Path(args[0]); 18 | FileSystem fs = sqFile.getFileSystem(conf); 19 | /*for local files 20 | * FileSystem fs = FileSystem.getLocal(conf); 21 | Path sqFile = new Path(args[0]);*/ 22 | 23 | 24 | SequenceFile.Writer sqWriter = SequenceFile.createWriter(fs,conf,sqFile, 25 | Text.class, 26 | LongWritable.class); 27 | sqWriter.append(new Text("key1"),new LongWritable(1)); 28 | sqWriter.close(); 29 | SequenceFile.Reader sqReader = new SequenceFile.Reader(fs,sqFile,conf); 30 | 31 | Text key = new Text(); 32 | LongWritable value = new LongWritable(); 33 | sqReader.next(key,value); 34 | 35 | System.out.println(key.toString()+" - "+value.toString()); 36 | 37 | sqReader.close(); 38 | 39 | 40 | 41 | 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /mapreduce/WordCount.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples; 2 | /** 3 | * Simple word count program 4 | */ 5 | import java.io.IOException; 6 | import java.util.*; 7 | 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.*; 10 | import org.apache.hadoop.mapred.*; 11 | 12 | public class WordCount { 13 | 14 | private static class MyMapper extends MapReduceBase implements Mapper { 15 | private final static IntWritable one = new IntWritable(1); 16 | private Text word = new Text(); 17 | 18 | public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException { 19 | String line = value.toString(); 20 | StringTokenizer tokenizer = new StringTokenizer(line); 21 | while(tokenizer.hasMoreTokens()){ 22 | word.set(tokenizer.nextToken()); 23 | output.collect(word, one); 24 | } 25 | } 26 | } 27 | 28 | private static class MyReducer extends MapReduceBase implements Reducer { 29 | public void reduce(Text key, Iterator values, OutputCollector output, Reporter reporter) throws IOException { 30 | int sum = 0; 31 | while (values.hasNext()) { 32 | sum += values.next().get(); 33 | } 34 | output.collect(key, new IntWritable(sum)); 35 | } 36 | } 37 | 38 | public static void main(String[] args) throws Exception { 39 | JobConf conf = new JobConf(WordCount.class); 40 | conf.setJobName("wordcount"); 41 | 42 | conf.setOutputKeyClass(Text.class); 43 | conf.setOutputValueClass(IntWritable.class); 44 | 45 | conf.setMapperClass(MyMapper.class); 46 | conf.setCombinerClass(MyReducer.class); 47 | conf.setReducerClass(MyReducer.class); 48 | 49 | conf.setInputFormat(TextInputFormat.class); 50 | conf.setOutputFormat(TextOutputFormat.class); 51 | 52 | FileInputFormat.setInputPaths(conf, new Path(args[0])); 53 | FileOutputFormat.setOutputPath(conf, new Path(args[1])); 54 | 55 | JobClient.runJob(conf); 56 | } 57 | } 
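Several files in this folder (e.g. MultiInputPath.java) carry a "Move to new api..." note, while WordCount.java above uses the old org.apache.hadoop.mapred API. A minimal sketch of the same word count on the newer org.apache.hadoop.mapreduce API follows; the class name NewApiWordCount is illustrative and not part of this repository:

    // Sketch only: word count using the newer org.apache.hadoop.mapreduce API.
    import java.io.IOException;
    import java.util.StringTokenizer;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class NewApiWordCount {

      public static class TokenMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
          // emit (word, 1) for every token in the line
          StringTokenizer tokenizer = new StringTokenizer(value.toString());
          while (tokenizer.hasMoreTokens()) {
            word.set(tokenizer.nextToken());
            context.write(word, ONE);
          }
        }
      }

      public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
          // sum the counts for each word
          int sum = 0;
          for (IntWritable value : values) {
            sum += value.get();
          }
          context.write(key, new IntWritable(sum));
        }
      }

      public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "wordcount-new-api");
        job.setJarByClass(NewApiWordCount.class);
        job.setMapperClass(TokenMapper.class);
        job.setCombinerClass(SumReducer.class);
        job.setReducerClass(SumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
      }
    }

The main differences from the old-API version are Job.getInstance in place of JobConf/JobClient and the Context object replacing OutputCollector and Reporter.
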
-------------------------------------------------------------------------------- /mapreduce/chaining/ChainMapperExample.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.chaining; 2 | 3 | /** 4 | * This example demonstrated how ChainMapper class can be used to chain mappers 5 | * Each mapper in the chain will be called using the output of the prev one. 6 | * Output key, Output value of the first mapper must mach with the Input key 7 | * and Input value of the second mapper. 8 | * 9 | * Output of a mapper will be used as the input of the next mapper. And again another 10 | * output file/folder will be created. So its better to delete the existing output 11 | * from the driver. 12 | * 13 | * Reducer will be called after all the mappers are called. 14 | * 15 | * TO-DO: check if reducers can be chained. 16 | */ 17 | import java.io.IOException; 18 | 19 | import org.apache.hadoop.fs.FileSystem; 20 | import org.apache.hadoop.fs.Path; 21 | import org.apache.hadoop.io.IntWritable; 22 | import org.apache.hadoop.io.LongWritable; 23 | import org.apache.hadoop.io.Text; 24 | import org.apache.hadoop.mapred.FileInputFormat; 25 | import org.apache.hadoop.mapred.FileOutputFormat; 26 | import org.apache.hadoop.mapred.JobClient; 27 | import org.apache.hadoop.mapred.JobConf; 28 | import org.apache.hadoop.mapred.TextInputFormat; 29 | import org.apache.hadoop.mapred.TextOutputFormat; 30 | import org.apache.hadoop.mapred.lib.ChainMapper; 31 | import org.apache.hadoop.mapred.lib.ChainReducer; 32 | 33 | public class ChainMapperExample { 34 | public static void main(String[] args) throws IOException { 35 | 36 | /* 37 | * This conf is used as a ref. Set only the input fileformat and output fileformat 38 | * 39 | */ 40 | JobConf conf1 = new JobConf(WordCount.class); 41 | conf1.setJobName("wordcount"); 42 | 43 | conf1.setInputFormat(TextInputFormat.class); 44 | conf1.setOutputFormat(TextOutputFormat.class); 45 | 46 | FileInputFormat.setInputPaths(conf1,new Path(args[0])); 47 | Path output = new Path(args[1]); 48 | FileSystem fileSystem = FileSystem.get(conf1); 49 | fileSystem.delete(output,true); 50 | FileOutputFormat.setOutputPath(conf1,output); 51 | 52 | 53 | /* 54 | * Local job conf files 55 | */ 56 | JobConf mapConf = new JobConf(false); 57 | JobConf reduceConf = new JobConf(false); 58 | 59 | /* 60 | * First argument is the global conf file we already created 61 | * Second is the Mapper/Reducer class we gona use 62 | * Third,fourth,fifth and sixth arguments are mapper/reducer inputkey inputvalue,outputkey and outputvalue respectively 63 | */ 64 | ChainMapper.addMapper(conf1,WordCount.WordCountMapper.class,LongWritable.class,Text.class,Text.class,IntWritable.class,true,mapConf); 65 | ChainMapper.addMapper(conf1,ToUpperCase.class,Text.class,IntWritable.class,Text.class,IntWritable.class,true,mapConf); 66 | ChainReducer.setReducer(conf1,WordCount.WordCountReducer.class,Text.class,IntWritable.class,Text.class,IntWritable.class,true,reduceConf); 67 | 68 | JobClient.runJob(conf1); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /mapreduce/chaining/ChainingJobControl.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.chaining; 2 | 3 | /** 4 | * Chains jobs using JobControl. 5 | * Steps: 6 | * 1.Create jobs 7 | * 2.Create a job control. 8 | * 3.Add jobs to the job control. 
9 | * 4.Add dependecy of jobs 10 | * 5.Start the job using JobControl.run() 11 | * 12 | */ 13 | 14 | import java.io.IOException; 15 | 16 | import org.apache.hadoop.fs.Path; 17 | import org.apache.hadoop.io.IntWritable; 18 | import org.apache.hadoop.io.Text; 19 | import org.apache.hadoop.mapred.FileInputFormat; 20 | import org.apache.hadoop.mapred.FileOutputFormat; 21 | import org.apache.hadoop.mapred.JobConf; 22 | import org.apache.hadoop.mapred.TextInputFormat; 23 | import org.apache.hadoop.mapred.TextOutputFormat; 24 | import org.apache.hadoop.mapred.jobcontrol.Job; 25 | import org.apache.hadoop.mapred.jobcontrol.JobControl; 26 | 27 | public class ChainingJobControl { 28 | 29 | public static void main(String[] args) throws IOException { 30 | JobConf conf1 = new JobConf(WordCount.class); 31 | conf1.setJobName("wordcount"); 32 | conf1.setOutputKeyClass(Text.class); 33 | conf1.setOutputValueClass(IntWritable.class); 34 | conf1.setMapperClass(WordCount.WordCountMapper.class); 35 | conf1.setCombinerClass(WordCount.WordCountReducer.class); 36 | conf1.setReducerClass(WordCount.WordCountReducer.class); 37 | conf1.setInputFormat(TextInputFormat.class); 38 | conf1.setOutputFormat(TextOutputFormat.class); 39 | FileInputFormat.setInputPaths(conf1,new Path(args[0])); 40 | Path intermediate = new Path("intermediate"); 41 | FileOutputFormat.setOutputPath(conf1,intermediate); 42 | Job job1 = new Job(conf1); 43 | System.out.println("job 1 conf created"); 44 | 45 | JobConf conf2 = new JobConf(WordCount.class); 46 | conf2.setOutputKeyClass(Text.class); 47 | conf2.setOutputValueClass(IntWritable.class); 48 | conf2.setMapperClass(LetterCount.LetterCountMapper.class); 49 | conf2.setCombinerClass(LetterCount.LetterCountReducer.class); 50 | conf2.setReducerClass(LetterCount.LetterCountReducer.class); 51 | conf2.setInputFormat(TextInputFormat.class); 52 | conf2.setOutputFormat(TextOutputFormat.class); 53 | FileInputFormat.setInputPaths(conf2,intermediate); 54 | FileOutputFormat.setOutputPath(conf2,new Path(args[1])); 55 | Job job2 = new Job(conf2); 56 | System.out.println("job 2 conf created"); 57 | 58 | JobControl jbCntrol = new JobControl("cntroller"); 59 | jbCntrol.addJob(job1); 60 | jbCntrol.addJob(job2); 61 | job2.addDependingJob(job1); 62 | System.out.println("dependency added"); 63 | jbCntrol.run(); 64 | System.out.println("Done"); 65 | } 66 | 67 | } 68 | -------------------------------------------------------------------------------- /mapreduce/chaining/ChainingSimple.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.chaining; 2 | 3 | /** 4 | * Simple job chaining example using more than one JobConf. 
5 | * Chains jobs by calling JobClient.runJob(conf) in the order required 6 | */ 7 | import java.io.IOException; 8 | 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.IntWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapred.FileInputFormat; 13 | import org.apache.hadoop.mapred.FileOutputFormat; 14 | import org.apache.hadoop.mapred.JobClient; 15 | import org.apache.hadoop.mapred.JobConf; 16 | import org.apache.hadoop.mapred.TextInputFormat; 17 | import org.apache.hadoop.mapred.TextOutputFormat; 18 | 19 | public class ChainingSimple { 20 | 21 | public static void main(String[] args) throws IOException { 22 | 23 | JobConf conf1 = new JobConf(WordCount.class); 24 | conf1.setJobName("wordcount"); 25 | 26 | conf1.setOutputKeyClass(Text.class); 27 | conf1.setOutputValueClass(IntWritable.class); 28 | conf1.setMapperClass(WordCount.WordCountMapper.class); 29 | conf1.setCombinerClass(WordCount.WordCountReducer.class); 30 | conf1.setReducerClass(WordCount.WordCountReducer.class); 31 | conf1.setInputFormat(TextInputFormat.class); 32 | conf1.setOutputFormat(TextOutputFormat.class); 33 | FileInputFormat.setInputPaths(conf1,new Path(args[0])); 34 | Path intermediate = new Path("intermediate"); 35 | FileOutputFormat.setOutputPath(conf1,intermediate); 36 | JobClient.runJob(conf1); 37 | 38 | 39 | JobConf conf2 = new JobConf(WordCount.class); 40 | conf2.setOutputKeyClass(Text.class); 41 | conf2.setOutputValueClass(IntWritable.class); 42 | conf2.setMapperClass(LetterCount.LetterCountMapper.class); 43 | conf2.setCombinerClass(LetterCount.LetterCountReducer.class); 44 | conf2.setReducerClass(LetterCount.LetterCountReducer.class); 45 | conf2.setInputFormat(TextInputFormat.class); 46 | conf2.setOutputFormat(TextOutputFormat.class); 47 | FileInputFormat.setInputPaths(conf2,intermediate); 48 | FileOutputFormat.setOutputPath(conf2,new Path(args[1])); 49 | JobClient.runJob(conf2); 50 | 51 | System.out.println("Done"); 52 | 53 | 54 | 55 | } 56 | } 57 | 58 | 59 | -------------------------------------------------------------------------------- /mapreduce/chaining/LetterCount.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.chaining; 2 | /** 3 | * Take a word and count as input 4 | * Emit the count for each letter in the word 5 | * In effect generates a value for each letter which = number of times the letter occurs * number of words in which the word occur 6 | */ 7 | import java.io.IOException; 8 | import java.util.Arrays; 9 | import java.util.HashMap; 10 | import java.util.Iterator; 11 | import java.util.Map; 12 | import java.util.StringTokenizer; 13 | 14 | import org.apache.hadoop.io.IntWritable; 15 | import org.apache.hadoop.io.LongWritable; 16 | import org.apache.hadoop.io.Text; 17 | import org.apache.hadoop.mapred.MapReduceBase; 18 | import org.apache.hadoop.mapred.Mapper; 19 | import org.apache.hadoop.mapred.OutputCollector; 20 | import org.apache.hadoop.mapred.Reducer; 21 | import org.apache.hadoop.mapred.Reporter; 22 | 23 | 24 | public class LetterCount { 25 | 26 | 27 | public static class LetterCountMapper extends MapReduceBase implements Mapper { 28 | 29 | @Override 30 | public void map(LongWritable key, Text value, 31 | OutputCollector output, Reporter reporter) 32 | throws IOException { 33 | 34 | String line = value.toString(); 35 | System.out.println("line is "+line); 36 | StringTokenizer tokenizer = new StringTokenizer(line); 37 | boolean first = true; 38 | String word = ""; 
39 | int sum = 0; 40 | while(tokenizer.hasMoreTokens()) { 41 | String next = tokenizer.nextToken(); 42 | if (first) { 43 | first = false; 44 | word = next; 45 | }else { 46 | sum += Integer.parseInt(next); 47 | } 48 | } 49 | System.out.println("word is "+word); 50 | System.out.println("sum is "+sum); 51 | 52 | for(char ch : word.toCharArray()) { 53 | output.collect(new Text(ch+""),new IntWritable(sum)); 54 | } 55 | } 56 | } 57 | 58 | public static class LetterCountReducer extends MapReduceBase implements Reducer { 59 | 60 | @Override 61 | public void reduce(Text key, Iterator values, 62 | OutputCollector output, Reporter reorter) 63 | throws IOException { 64 | 65 | System.out.println("In reducer of letter count"); 66 | int sum = 0; 67 | while(values.hasNext()) { 68 | int value = values.next().get(); 69 | System.out.println(value); 70 | sum += value; 71 | } 72 | output.collect(key,new IntWritable(sum)); 73 | } 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /mapreduce/chaining/ToUpperCase.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.chaining; 2 | 3 | 4 | /** 5 | * A mapper which converts the key to upper case 6 | */ 7 | import java.io.IOException; 8 | 9 | import org.apache.hadoop.io.IntWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapred.MapReduceBase; 12 | import org.apache.hadoop.mapred.Mapper; 13 | import org.apache.hadoop.mapred.OutputCollector; 14 | import org.apache.hadoop.mapred.Reporter; 15 | 16 | public class ToUpperCase extends MapReduceBase implements Mapper{ 17 | 18 | @Override 19 | public void map(Text key, IntWritable value, 20 | OutputCollector output, Reporter reporter) 21 | throws IOException { 22 | String keyText = key.toString().toUpperCase(); 23 | output.collect(new Text(keyText),value); 24 | 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /mapreduce/chaining/WordCount.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.chaining; 2 | /** 3 | * Word count program. 
4 | */ 5 | import java.io.IOException; 6 | import java.util.Iterator; 7 | import java.util.StringTokenizer; 8 | 9 | import org.apache.hadoop.io.IntWritable; 10 | import org.apache.hadoop.io.LongWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapred.MapReduceBase; 13 | import org.apache.hadoop.mapred.Mapper; 14 | import org.apache.hadoop.mapred.OutputCollector; 15 | import org.apache.hadoop.mapred.Reducer; 16 | import org.apache.hadoop.mapred.Reporter; 17 | 18 | public class WordCount { 19 | 20 | public static class WordCountMapper extends MapReduceBase implements Mapper { 21 | 22 | @Override 23 | public void map(LongWritable key, Text value, 24 | OutputCollector output, Reporter reporter) 25 | throws IOException { 26 | String line = value.toString(); 27 | StringTokenizer tokenizer = new StringTokenizer(line); 28 | while(tokenizer.hasMoreTokens()) { 29 | Text word = new Text(); 30 | word.set(tokenizer.nextToken()); 31 | IntWritable one = new IntWritable(1); 32 | output.collect(word,one); 33 | } 34 | } 35 | 36 | 37 | } 38 | 39 | public static class WordCountReducer extends MapReduceBase implements Reducer { 40 | 41 | @Override 42 | public void reduce(Text key, Iterator values, 43 | OutputCollector output, Reporter reporter) 44 | throws IOException { 45 | int sum = 0; 46 | while(values.hasNext()) { 47 | int value = values.next().get(); 48 | sum += value; 49 | } 50 | output.collect(key,new IntWritable(sum)); 51 | } 52 | } 53 | 54 | } -------------------------------------------------------------------------------- /mapreduce/customtypes/Comparator.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.customtypes; 2 | 3 | /** 4 | * A custom comparator for comparing Text. 
5 | * source : http://developer.yahoo.com/hadoop/tutorial/module5.html#types 6 | */ 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.io.WritableComparator; 9 | import org.apache.hadoop.io.WritableUtils; 10 | 11 | 12 | public class Comparator extends WritableComparator { 13 | 14 | protected Comparator() { 15 | super(Text.class); 16 | } 17 | 18 | public int compare(byte[] b1, int s1, int l1, 19 | byte[] b2, int s2, int l2) { 20 | System.out.println("Using custom comparator"); 21 | int n1 = WritableUtils.decodeVIntSize(b1[s1]); 22 | int n2 = WritableUtils.decodeVIntSize(b2[s2]); 23 | return compareBytes(b1, s1+n1, l1-n1, b2, s2+n2, l2-n2); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /mapreduce/customtypes/CustomPartitioner.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.customtypes; 2 | 3 | import org.apache.hadoop.io.Text; 4 | import org.apache.hadoop.mapred.JobConf; 5 | import org.apache.hadoop.mapred.Partitioner; 6 | 7 | /** 8 | * A custom partitioner to partition keys into reducers 9 | * @author vishnu 10 | * 11 | */ 12 | public class CustomPartitioner implements Partitioner{ 13 | 14 | @Override 15 | public int getPartition(Text key, Text value, int numReducers) { 16 | // TODO Auto-generated method stub 17 | System.out.println("number of reducers is "+numReducers); 18 | if (key.toString().equals("reduce")) { 19 | System.out.println("in custom partioner is returning 0"); 20 | return 0%numReducers; 21 | } else { 22 | System.out.println("in custom partioner is returning 1"); 23 | return 1%numReducers; 24 | } 25 | } 26 | 27 | 28 | /** 29 | * provides an example of bit masking which will convert negative results to positive 30 | */ 31 | /*@Override 32 | public int getPartition(Text key, IntWritable value, int numPartitions) { 33 | return (key.toString().hashCode() % numPartitions) & 0x7FFFFFFF; 34 | }*/ 35 | 36 | @Override 37 | public void configure(JobConf conf) { 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /mapreduce/customtypes/DollarInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.customtypes; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.InputSplit; 8 | import org.apache.hadoop.mapreduce.RecordReader; 9 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | 12 | /** 13 | * A Custom InputFormat which will split the lines based on $ and will ignore \n 14 | * @author vishnu 15 | * 16 | */ 17 | public class DollarInputFormat extends FileInputFormat{ 18 | 19 | 20 | @Override 21 | public RecordReader createRecordReader(InputSplit split, 22 | TaskAttemptContext context) throws IOException, InterruptedException { 23 | return new DollarRecordReader(); 24 | } 25 | 26 | 27 | 28 | } 29 | -------------------------------------------------------------------------------- /mapreduce/customtypes/DollarStreamExample.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.customtypes; 2 | /** 3 | * Simple program to test dollar($) as file delimiter 4 | */ 5 | import java.io.IOException; 6 | import java.util.*; 7 | 8 | import org.apache.hadoop.fs.Path; 9 | 
import org.apache.hadoop.io.*; 10 | import org.apache.hadoop.mapred.FileInputFormat; 11 | import org.apache.hadoop.mapred.FileOutputFormat; 12 | import org.apache.hadoop.mapred.JobConf; 13 | import org.apache.hadoop.mapred.MapReduceBase; 14 | import org.apache.hadoop.mapred.Mapper; 15 | import org.apache.hadoop.mapred.OutputCollector; 16 | import org.apache.hadoop.mapred.Reducer; 17 | import org.apache.hadoop.mapred.Reporter; 18 | import org.apache.hadoop.mapreduce.Job; 19 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 20 | 21 | 22 | /**\ 23 | * have to do it in the new api 24 | * @author vishnu 25 | * 26 | */ 27 | public class DollarStreamExample { 28 | 29 | public static class MyMapper extends MapReduceBase implements Mapper { 30 | 31 | public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException { 32 | String line = value.toString(); 33 | System.out.println("received in mapper"+line); 34 | output.collect(key, value); 35 | } 36 | } 37 | 38 | public static class MyReducer extends MapReduceBase implements Reducer { 39 | public void reduce(LongWritable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException { 40 | String temp = ""; 41 | while(values.hasNext()) { 42 | temp += values.next().toString(); 43 | } 44 | System.out.println("In reducer "+temp); 45 | output.collect(key,new Text(temp)); 46 | } 47 | } 48 | 49 | public static void main(String[] args) throws Exception { 50 | JobConf conf = new JobConf(DollarStreamExample.class); 51 | conf.setJobName("wordcount"); 52 | 53 | 54 | conf.setMapperClass(MyMapper.class); 55 | conf.setReducerClass(MyReducer.class); 56 | conf.setOutputKeyClass(LongWritable.class); 57 | conf.setOutputValueClass(Text.class); 58 | 59 | conf.setMapperClass(MyMapper.class); 60 | conf.setReducerClass(MyReducer.class); 61 | Job job = new Job(conf,"wordcount"); 62 | job.setOutputKeyClass(LongWritable.class); 63 | job.setOutputValueClass(Text.class); 64 | job.setInputFormatClass(DollarInputFormat.class); 65 | job.setOutputFormatClass(TextOutputFormat.class); 66 | 67 | FileInputFormat.setInputPaths(conf, new Path(args[0])); 68 | FileOutputFormat.setOutputPath(conf, new Path(args[1])); 69 | job.waitForCompletion(true); 70 | } 71 | } -------------------------------------------------------------------------------- /mapreduce/customtypes/IdentityReducerEx.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.customtypes; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapred.FileInputFormat; 10 | import org.apache.hadoop.mapred.FileOutputFormat; 11 | import org.apache.hadoop.mapred.JobClient; 12 | import org.apache.hadoop.mapred.JobConf; 13 | import org.apache.hadoop.mapred.MapReduceBase; 14 | import org.apache.hadoop.mapred.Mapper; 15 | import org.apache.hadoop.mapred.OutputCollector; 16 | import org.apache.hadoop.mapred.Reporter; 17 | import org.apache.hadoop.mapred.TextInputFormat; 18 | import org.apache.hadoop.mapred.TextOutputFormat; 19 | import org.apache.hadoop.mapred.lib.IdentityReducer; 20 | 21 | public class IdentityReducerEx { 22 | 23 | public static class MyMapper extends MapReduceBase implements Mapper{ 24 | 25 | @Override 26 | public void map(LongWritable key, Text value, 27 | OutputCollector output, Reporter reporter) 28 | 
throws IOException { 29 | if (value.toString().contains("vishnu")) 30 | output.collect(new Text("vishnu"),value); 31 | else 32 | output.collect(new Text("reduce"),value); 33 | } 34 | 35 | } 36 | 37 | 38 | public static void main(String[] args) throws IOException { 39 | 40 | JobConf conf1 = new JobConf(IdentityReducerEx.class); 41 | conf1.setJobName("partition_identity"); 42 | 43 | conf1.setMapperClass(MyMapper.class); 44 | conf1.setReducerClass(IdentityReducer.class); 45 | 46 | conf1.setPartitionerClass(CustomPartitioner.class); 47 | 48 | conf1.setOutputKeyClass(Text.class); 49 | conf1.setOutputValueClass(Text.class); 50 | 51 | conf1.setInputFormat(TextInputFormat.class); 52 | conf1.setOutputFormat(TextOutputFormat.class); 53 | 54 | FileInputFormat.setInputPaths(conf1,new Path(args[0])); 55 | FileOutputFormat.setOutputPath(conf1,new Path(args[1])); 56 | 57 | JobClient.runJob(conf1); 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | } 66 | 67 | } 68 | -------------------------------------------------------------------------------- /mapreduce/customtypes/Point2D.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.customtypes; 2 | 3 | /** 4 | * Custom type representing a 2D point 5 | */ 6 | import java.io.DataInput; 7 | import java.io.DataOutput; 8 | import java.io.IOException; 9 | 10 | import org.apache.hadoop.io.Writable; 11 | 12 | public class Point2D implements Writable { 13 | 14 | public float x; 15 | public float y; 16 | 17 | public Point2D(float x,float y) { 18 | this.x = x; 19 | this.y = y; 20 | } 21 | 22 | @Override 23 | public void readFields(DataInput in) throws IOException { 24 | x = in.readFloat(); 25 | y = in.readFloat(); 26 | 27 | } 28 | 29 | @Override 30 | public void write(DataOutput out) throws IOException { 31 | out.writeFloat(x); 32 | out.writeFloat(y); 33 | } 34 | 35 | @Override 36 | public String toString() { 37 | return "("+x+","+y+")"; 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /mapreduce/customtypes/RectangleCount.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.customtypes; 2 | /** 3 | * Simple word count program with custom counters and custom comparator(commented) 4 | */ 5 | import java.io.IOException; 6 | import java.util.*; 7 | 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.*; 10 | import org.apache.hadoop.mapred.*; 11 | 12 | public class RectangleCount { 13 | 14 | // A custom counter named CUSTOM_COUNT 15 | static enum CustomCounter{CUSTOM_COUNT}; 16 | private static class MyMapper extends MapReduceBase implements Mapper { 17 | 18 | public void map(Text key, RectangleKey value, OutputCollector output, Reporter reporter) throws IOException { 19 | String line = value.toString(); 20 | System.out.println("Received "+line); 21 | reporter.incrCounter(CustomCounter.CUSTOM_COUNT,1); 22 | output.collect(value,new IntWritable(1)); 23 | } 24 | } 25 | 26 | private static class MyReducer extends MapReduceBase implements Reducer { 27 | public void reduce(RectangleKey key, Iterator values, OutputCollector output, Reporter reporter) throws IOException { 28 | int sum = 0; 29 | while (values.hasNext()) { 30 | sum += values.next().get(); 31 | } 32 | output.collect(new Text(key.toString()), new IntWritable(sum)); 33 | } 34 | } 35 | 36 | public static void main(String[] args) throws Exception { 37 | JobConf conf = new JobConf(RectangleCount.class); 38 | 
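        // RectangleKey is the map output key below, so it must implement
        // WritableComparable (a plain Writable such as Point2D works as a value
        // type, but keys have to be comparable for the sort/shuffle phase).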
conf.setJobName("rectanglecount"); 39 | 40 | conf.setOutputKeyClass(RectangleKey.class); 41 | conf.setOutputValueClass(IntWritable.class); 42 | 43 | /* 44 | * Add the custom comparator for the key output class 45 | * It didnt work out by adding the comparator using WritableComparator.define() in the static block 46 | * 47 | * Add this to add the custom compartor. This comparator expects a Text key class. 48 | * conf.setOutputKeyComparatorClass(Comparator.class); 49 | */ 50 | 51 | 52 | conf.setMapperClass(MyMapper.class); 53 | conf.setReducerClass(MyReducer.class); 54 | 55 | conf.setInputFormat(RectangleInputFormat.class); 56 | conf.setOutputFormat(TextOutputFormat.class); 57 | 58 | FileInputFormat.setInputPaths(conf, new Path(args[0])); 59 | FileOutputFormat.setOutputPath(conf, new Path(args[1])); 60 | 61 | JobClient.runJob(conf); 62 | } 63 | } -------------------------------------------------------------------------------- /mapreduce/customtypes/RectangleInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.customtypes; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapred.FileInputFormat; 7 | import org.apache.hadoop.mapred.FileSplit; 8 | import org.apache.hadoop.mapred.InputSplit; 9 | import org.apache.hadoop.mapred.JobConf; 10 | import org.apache.hadoop.mapred.RecordReader; 11 | import org.apache.hadoop.mapred.Reporter; 12 | 13 | /** 14 | * A custom input format for reading a rectangle info from a file 15 | * @author vishnu 16 | * 17 | */ 18 | 19 | public class RectangleInputFormat extends FileInputFormat{ 20 | 21 | @Override 22 | public RecordReader getRecordReader(InputSplit input, 23 | JobConf conf, Reporter reporter) throws IOException { 24 | reporter.setStatus(input.toString()); 25 | return new RectangleRecordReader(conf,(FileSplit)input); 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /mapreduce/customtypes/XmlOutputDriver.java: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.examples.customtypes; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapred.FileInputFormat; 10 | import org.apache.hadoop.mapred.FileOutputFormat; 11 | import org.apache.hadoop.mapred.JobClient; 12 | import org.apache.hadoop.mapred.JobConf; 13 | import org.apache.hadoop.mapred.MapReduceBase; 14 | import org.apache.hadoop.mapred.Mapper; 15 | import org.apache.hadoop.mapred.OutputCollector; 16 | import org.apache.hadoop.mapred.Reducer; 17 | import org.apache.hadoop.mapred.Reporter; 18 | import org.apache.hadoop.mapred.TextInputFormat; 19 | 20 | 21 | public class XmlOutputDriver { 22 | 23 | private static class XmlMapper extends MapReduceBase implements Mapper { 24 | 25 | @Override 26 | public void map(LongWritable offset, Text value, 27 | OutputCollector output, Reporter reporter) 28 | throws IOException { 29 | String line = value.toString(); 30 | String[] parts = line.split(" "); 31 | Text key = new Text(parts[0]); 32 | for(int i=1;i { 39 | 40 | @Override 41 | public void reduce(Text key, Iterator values, 42 | OutputCollector output, Reporter reporter) 43 | throws IOException { 44 | while(values.hasNext()) { 45 | output.collect(key, values.next()); 46 | } 47 | } 48 | 49 | 
} 50 | 51 | public static void main(String[] args) throws IOException { 52 | JobConf conf = new JobConf(XmlOutputDriver.class); 53 | conf.setJobName("xmlwriter"); 54 | 55 | conf.setOutputKeyClass(Text.class); 56 | conf.setOutputValueClass(Text.class); 57 | 58 | conf.setMapperClass(XmlMapper.class); 59 | conf.setReducerClass(XmlReducer.class); 60 | 61 | conf.setInputFormat(TextInputFormat.class); 62 | conf.setOutputFormat(XmlOutputFormat.class); 63 | 64 | FileInputFormat.setInputPaths(conf, new Path(args[0])); 65 | FileOutputFormat.setOutputPath(conf, new Path(args[1])); 66 | 67 | JobClient.runJob(conf); 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /mapreduce/datafu_example/.gitignore: -------------------------------------------------------------------------------- 1 | incremental-hourglass/ 2 | -------------------------------------------------------------------------------- /mapreduce/datafu_example/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | incrementaljob 6 | incremental-hourglass 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | incremental-hourglass 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | org.apache.hadoop 26 | hadoop-core 27 | 1.2.1 28 | 29 | 30 | com.linkedin.datafu 31 | datafu-hourglass 32 | 0.1.3 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /mapreduce/datafu_example/src/main/java/incrementaljob/IncrementalAccumulator.java: -------------------------------------------------------------------------------- 1 | package incrementaljob; 2 | 3 | import org.apache.avro.Schema; 4 | import org.apache.avro.generic.GenericData; 5 | import org.apache.avro.generic.GenericRecord; 6 | 7 | import datafu.hourglass.model.Accumulator; 8 | 9 | public class IncrementalAccumulator implements 10 | Accumulator { 11 | private transient long sum; 12 | private transient Schema oSchema; 13 | private String outputSchemaString; 14 | 15 | public IncrementalAccumulator(String outputSchemaString) { 16 | this.outputSchemaString = outputSchemaString; 17 | } 18 | 19 | @Override 20 | public void accumulate(GenericRecord value) { 21 | this.sum += (Long) value.get("score"); 22 | } 23 | 24 | @Override 25 | public GenericRecord getFinal() { 26 | if (oSchema == null) { 27 | oSchema = new Schema.Parser().parse(outputSchemaString); 28 | } 29 | GenericRecord output = new GenericData.Record(oSchema); 30 | output.put("score", sum); 31 | return output; 32 | } 33 | 34 | @Override 35 | public void cleanup() { 36 | this.sum = 0; 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /mapreduce/datafu_example/src/main/java/incrementaljob/IncrementalAggr.java: -------------------------------------------------------------------------------- 1 | package incrementaljob; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.Arrays; 6 | import java.util.List; 7 | 8 | import org.apache.avro.Schema; 9 | import org.apache.avro.Schema.Field; 10 | import org.apache.avro.Schema.Type; 11 | import org.apache.avro.generic.GenericData; 12 | import org.apache.avro.generic.GenericRecord; 13 | import org.apache.hadoop.fs.Path; 14 | 15 | import datafu.hourglass.jobs.PartitionCollapsingIncrementalJob; 16 | import datafu.hourglass.model.Accumulator; 17 | import datafu.hourglass.model.KeyValueCollector; 18 | import 
datafu.hourglass.model.Mapper; 19 | 20 | public class IncrementalAggr { 21 | 22 | 23 | 24 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { 25 | PartitionCollapsingIncrementalJob job = new PartitionCollapsingIncrementalJob(IncrementalAggr.class); 26 | final String namespace = "incrementaljob.datafu"; 27 | 28 | 29 | //create schema for site and load it into memory. 30 | final Schema keySchema = Schema.createRecord("Key", null, namespace,false); 31 | List keyFields = new ArrayList(); 32 | Field id = new Field("exam",Schema.create(Type.STRING),null,null); 33 | keyFields.add(id); 34 | keySchema.setFields(keyFields); 35 | final String keySchemaString = keySchema.toString(true); 36 | 37 | final Schema valueSchema = Schema.createRecord("Value", null, namespace,false); 38 | List valueFields = new ArrayList(); 39 | Field value = new Field("score",Schema.create(Type.LONG),null,null); 40 | valueFields.add(value); 41 | valueSchema.setFields(valueFields); 42 | final String valueSchemaString = valueSchema.toString(true); 43 | 44 | 45 | final Schema outputSchema = Schema.createRecord("Output", null, namespace,false); 46 | List outputFields = new ArrayList(); 47 | Field result = new Field("result",Schema.create(Type.LONG),null,null); 48 | outputFields.add(result); 49 | outputSchema.setFields(outputFields); 50 | final String outputSchemaString = outputSchema.toString(true); 51 | 52 | job.setKeySchema(keySchema); 53 | job.setIntermediateValueSchema(valueSchema); 54 | job.setOutputValueSchema(valueSchema); 55 | job.setInputPaths(Arrays.asList(new Path("datafu/data/input"))); 56 | job.setOutputPath(new Path("datafu/data/output")); 57 | job.setReusePreviousOutput(true); 58 | 59 | Mapper mapper = new IncrementalMapper(keySchemaString, valueSchemaString); 60 | job.setMapper(mapper); 61 | Accumulator accumulator = new IncrementalAccumulator(outputSchemaString); 62 | job.setReducerAccumulator(accumulator); 63 | // job.setCombinerAccumulator(job.getReducerAccumulator()); 64 | // job.setUseCombiner(true); 65 | job.run(); 66 | 67 | } 68 | 69 | 70 | } 71 | -------------------------------------------------------------------------------- /mapreduce/datafu_example/src/main/java/incrementaljob/IncrementalMapper.java: -------------------------------------------------------------------------------- 1 | package incrementaljob; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.avro.Schema; 6 | import org.apache.avro.generic.GenericData; 7 | import org.apache.avro.generic.GenericRecord; 8 | 9 | import datafu.hourglass.model.KeyValueCollector; 10 | import datafu.hourglass.model.Mapper; 11 | 12 | /** 13 | * An example of incremental mapreduce using datafu 14 | * @author vishnu 15 | * 16 | */ 17 | public class IncrementalMapper implements Mapper 18 | { 19 | 20 | private transient Schema kSchema; 21 | private transient Schema vSchema; 22 | private String keySchemaString; 23 | private String valueSchemaString; 24 | 25 | public IncrementalMapper(String keySchemaString,String valueSchemaString) { 26 | this.keySchemaString = keySchemaString; 27 | this.valueSchemaString = valueSchemaString; 28 | } 29 | 30 | 31 | @Override 32 | public void map(GenericRecord input, 33 | KeyValueCollector collector) 34 | throws IOException, InterruptedException { 35 | if (kSchema == null) kSchema = new Schema.Parser().parse(keySchemaString); 36 | if (vSchema == null) vSchema = new Schema.Parser().parse(valueSchemaString); 37 | GenericRecord key = new GenericData.Record(kSchema); 38 | 
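    // NOTE: the key schema created in IncrementalAggr defines a single field named "exam";
    // putting "name" below will fail at runtime with Avro's "Not a valid schema field"
    // error, so the field names need to be kept in sync.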
key.put("name", input.get("name")); 39 | GenericRecord value = new GenericData.Record(vSchema); 40 | value.put("score",input.get("score")); 41 | collector.collect(key,value); 42 | } 43 | } 44 | 45 | 46 | -------------------------------------------------------------------------------- /mapreduce/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.vishnu 6 | mapreduce 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | mapreduce 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | org.apache.hadoop 26 | hadoop-core 27 | 1.2.1 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /mapreduce/src/main/java/com/vishnu/mapreduce/CustomMultiplOututFormat.java: -------------------------------------------------------------------------------- 1 | package com.vishnu.mapreduce; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.mapred.JobConf; 7 | import org.apache.hadoop.mapred.RecordWriter; 8 | import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat; 9 | 10 | import org.apache.hadoop.util.Progressable; 11 | 12 | public class CustomMultiplOututFormat extends MultipleTextOutputFormat{ 13 | 14 | @Override 15 | public RecordWriter getRecordWriter(FileSystem fs, JobConf job, 16 | String name, Progressable arg3) throws IOException { 17 | String newName = name.substring(0,name.indexOf("-")); 18 | System.out.println(name); 19 | System.out.println(newName); 20 | return super.getRecordWriter(fs, job, newName, arg3); 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /mapreduce/src/main/java/com/vishnu/mapreduce/CustomOutputFormatTest.java: -------------------------------------------------------------------------------- 1 | package com.vishnu.mapreduce; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Job; 9 | import org.apache.hadoop.mapreduce.Mapper; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 14 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 15 | 16 | public class CustomOutputFormatTest { 17 | 18 | public static class ParserMapper extends Mapper { 19 | 20 | Configuration conf = null; 21 | MultipleOutputs mout; 22 | 23 | 24 | 25 | public void map(Object key, Text value, Context context) 26 | throws IOException, InterruptedException { 27 | String val = value.toString(); 28 | mout.write("filename",key,new Text(val)); 29 | } 30 | } 31 | 32 | public static void main(String[] args) throws Exception { 33 | Configuration conf = new Configuration(); 34 | Job job = Job.getInstance(conf, "CustomMultiplOutput"); 35 | job.setJarByClass(CustomOutputFormatTest.class); 36 | job.setMapperClass(ParserMapper.class); 37 | job.setOutputKeyClass(Text.class); 38 | job.setOutputValueClass(Text.class); 39 | Path source = new Path(args[0]); 40 | FileInputFormat.addInputPath(job,source); 41 | CustomMultiplOututFormat.set 42 | //MultipleOutputs.addNamedOutput(job, BLUECOAT, TextOutputFormat.class, Text.class, Text.class); 43 | 
//MultipleOutputs.addNamedOutput(job, BTDIAMOND, TextOutputFormat.class, Text.class, Text.class); 44 | LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); 45 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 46 | boolean success = job.waitForCompletion(true); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /mapreduce/src/main/java/com/vishnu/mapreduce/WordCount.java: -------------------------------------------------------------------------------- 1 | package com.vishnu.mapreduce; 2 | 3 | import java.io.IOException; 4 | import java.util.StringTokenizer; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.Mapper; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | 16 | public class WordCount { 17 | 18 | public static class TokenizerMapper 19 | extends Mapper{ 20 | 21 | private Text word = new Text(); 22 | float one; 23 | float two; 24 | public void map(Object key, Text value, Context context 25 | ) throws IOException, InterruptedException { 26 | StringTokenizer itr = new StringTokenizer(value.toString()); 27 | while (itr.hasMoreTokens()) { 28 | word.set(itr.nextToken()); 29 | context.write(word,new IntWritable(1)); 30 | } 31 | } 32 | 33 | @Override 34 | protected void setup(Context context) throws IOException, 35 | InterruptedException { 36 | Configuration conf = context.getConfiguration(); 37 | one = conf.getFloat("one",0f); 38 | two = conf.getFloat("two",0f); 39 | } 40 | } 41 | 42 | public static class IntSumReducer 43 | extends Reducer { 44 | private IntWritable result = new IntWritable(); 45 | 46 | public void reduce(Text key, Iterable values, 47 | Context context 48 | ) throws IOException, InterruptedException { 49 | int sum = 0; 50 | for (IntWritable val : values) { 51 | sum += val.get(); 52 | } 53 | result.set(sum); 54 | context.write(key, result); 55 | } 56 | } 57 | 58 | public static void main(String[] args) throws Exception { 59 | Configuration conf = new Configuration(); 60 | Job job = Job.getInstance(conf, "word count"); 61 | job.setJarByClass(WordCount.class); 62 | job.setMapperClass(TokenizerMapper.class); 63 | job.setCombinerClass(IntSumReducer.class); 64 | job.setReducerClass(IntSumReducer.class); 65 | job.setOutputKeyClass(Text.class); 66 | job.setOutputValueClass(IntWritable.class); 67 | FileInputFormat.addInputPath(job, new Path(args[0])); 68 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 69 | System.exit(job.waitForCompletion(true) ? 0 : 1); 70 | } 71 | } -------------------------------------------------------------------------------- /mapreduce/src/test/java/com/vishnu/mapreduce/AppTest.java: -------------------------------------------------------------------------------- 1 | package com.vishnu.mapreduce; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 
9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Rigourous Test :-) 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /spark/.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | .cache-main 4 | 5 | #eclipse specific 6 | .classpath 7 | .project 8 | .settings/ 9 | .idea/ 10 | # sbt specific 11 | .cache 12 | .history 13 | .lib/ 14 | dist/* 15 | target/ 16 | lib_managed/ 17 | src_managed/ 18 | project/boot/ 19 | project/plugins/project/ 20 | 21 | # Scala-IDE specific 22 | .scala_dependencies 23 | .worksheet 24 | /bin/ 25 | metastore_db/ 26 | -------------------------------------------------------------------------------- /spark/README.md: -------------------------------------------------------------------------------- 1 | **Update:** This project uses spark 1.6, for Spark 2.3 examples click [here](https://github.com/soniclavier/bigdata-notebook/tree/master/spark_23) 2 | 3 | 4 | Edit your .bash_profile or .profile file in OSX/Unix or Edit your Environment variables in Windows. 5 | 6 | **OSX/Unix:** 7 | ```b 8 | vi ~/.bash_profile 9 | export SPARK_HOME=/Users/vishnu/spark-1.6.0-bin-hadoop2.6 10 | export PATH=$PATH/:$SPARK_HOME/sbin 11 | export PATH=$PATH/:$SPARK_HOME/bin 12 | ``` 13 | **To submit the application:** 14 | ```scala 15 | //start spark master 16 | $SPARK_HOME/sbin/start-master.sh 17 | 18 | //start worker 19 | //Get the spark the master url from http://localhost:8080/ 20 | $SPARK_HOME/sbin/start-slaves.sh spark://Vishnus-MacBook-Pro.local:7077 21 | 22 | spark-submit --class "package.name.Object" --master spark://your_master_server:7077 target/path/to/your/jar_file.jar 23 | ``` 24 | 25 | E.g., 26 | 27 | For running Titanic ML example 28 | ``` 29 | spark-submit --class "com.vishnu.spark.kaggle.titanic.TitanicWithPipeline" --master spark://Vishnus-MacBook-Pro.local:7077 --packages com.databricks:spark-csv_2.11:1.3.0 target/scala-2.10/spark-vishnu-assemlby-1.0.jar 30 | ``` 31 | 32 | For running Streaming Example 33 | ``` 34 | spark-submit --class "com.vishnu.spark.streaming.SocketStreaming" --master spark://Vishnus-MacBook-Pro.local:7077 target/scala-2.10/spark-vishnu-assemlby-1.0.jar 35 | ``` 36 | -------------------------------------------------------------------------------- /spark/build.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soniclavier/bigdata-notebook/a708de834fd282ba576f15f87a46f3953695a9ad/spark/build.properties -------------------------------------------------------------------------------- /spark/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.2") 2 | -------------------------------------------------------------------------------- /spark/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "4.0.0") 2 | 
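The spark module's build.sbt itself is not reproduced in this listing, so the following is only a minimal sketch of the sbt-assembly configuration that the README's submit commands appear to assume. The jar name and the scala-2.10 output path come from the README; the dependency list, versions, and merge strategy are illustrative assumptions, not the repository's actual settings.

```scala
// Hypothetical spark/build.sbt sketch; the repository's real file may differ.
name := "spark-vishnu"
version := "1.0"
scalaVersion := "2.10.6"  // README submits target/scala-2.10/spark-vishnu-assemlby-1.0.jar

// Spark is marked "provided": the standalone cluster supplies it at runtime.
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"      % "1.6.0" % "provided",
  "org.apache.spark" %% "spark-sql"       % "1.6.0" % "provided",
  "org.apache.spark" %% "spark-mllib"     % "1.6.0" % "provided",
  "org.apache.spark" %% "spark-streaming" % "1.6.0" % "provided"
)

// sbt-assembly (added in project/assembly.sbt above) builds the fat jar via `sbt assembly`,
// producing target/scala-2.10/spark-vishnu-assemlby-1.0.jar
assemblyJarName in assembly := "spark-vishnu-assemlby-1.0.jar"
assemblyMergeStrategy in assembly := {
  case PathList("META-INF", xs @ _*) => MergeStrategy.discard
  case _                             => MergeStrategy.first
}
```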
-------------------------------------------------------------------------------- /spark/pyspark-files/helloworld.py: -------------------------------------------------------------------------------- 1 | #refs : https://www.youtube.com/watch?v=08mrnJxcIWw 2 | # https://github.com/databricks/tensorframes 3 | #Spark version 2.1.1 4 | #bin/pyspark --master spark://vishnu-macbook-pro:7077 --packages databricks:tensorframes:0.2.8-s_2.11 5 | 6 | import tensorflow as tf 7 | import tensorframes as tfs 8 | 9 | df = spark.createDataFrame(zip(range(0,10), range(1,11))).toDF("x","y") 10 | df.show(10) 11 | 12 | x = tfs.row(df, "x") 13 | y = tfs.row(df, "y") 14 | 15 | output = tf.add(x, y, name="out") 16 | 17 | df2 = tfs.map_rows(output, df) 18 | 19 | df2.show() -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/Test.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark 2 | 3 | object Test { 4 | 5 | def main(args: Array[String]) { 6 | 7 | } 8 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/bClassifier.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark 2 | 3 | import org.apache.spark.mllib.classification.{ SVMModel, SVMWithSGD } 4 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics 5 | import org.apache.spark.mllib.util.MLUtils 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.SparkConf 8 | 9 | /** 10 | * An example from Spark site 11 | */ 12 | object bClassifier { 13 | 14 | def main(args: Array[String]) { 15 | val conf = new SparkConf().setAppName("bClassiier").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 16 | val sc = new SparkContext(conf) 17 | val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") 18 | 19 | // Split data into training (60%) and test (40%). 20 | val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) 21 | val training = splits(0).cache() 22 | val test = splits(1) 23 | 24 | // Run training algorithm to build the model 25 | val numIterations = 100 26 | val model = SVMWithSGD.train(training, numIterations) 27 | 28 | // Clear the default threshold. 29 | model.clearThreshold() 30 | 31 | // Compute raw scores on the test set. 32 | val scoreAndLabels = test.map { point => 33 | val score = model.predict(point.features) 34 | (score, point.label) 35 | } 36 | 37 | // Get evaluation metrics. 
38 | val metrics = new BinaryClassificationMetrics(scoreAndLabels) 39 | val auROC = metrics.areaUnderROC() 40 | 41 | println("Area under ROC = " + auROC) 42 | 43 | // Save and load model 44 | model.save(sc, "myModelPath") 45 | val sameModel = SVMModel.load(sc, "myModelPath") 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/basics/AuctionApp.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.basics 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.sql.SQLContext 6 | 7 | object AuctionApp { 8 | 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setAppName("AuctionsApp") 11 | val sc = new SparkContext(conf) 12 | 13 | val aucFile = "/user/vishnu/mapr/dev360/auctiondata.csv" 14 | val auctionRDD = sc.textFile(aucFile).map(_.split(",")).cache() 15 | } 16 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/basics/CustomPartitioner.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.basics 2 | 3 | import org.apache.spark.Partitioner 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | 7 | 8 | class MyPartitioner extends Partitioner{ 9 | def numPartitions = 10 10 | 11 | def getPartition(key: Any): Int = { 12 | key match { 13 | case s: String => s.length()%numPartitions 14 | } 15 | } 16 | } 17 | 18 | object CustomPartitioner { 19 | 20 | val conf = new SparkConf().setAppName("CustomPartitioner") 21 | val sc = new SparkContext(conf) 22 | 23 | val rdd = sc.parallelize(List("word","stream","sql","dataframe","auction","averylongword","anotherveryverylongword")) 24 | val myPart = new MyPartitioner 25 | 26 | val pairRdd = rdd.map(word=>(word,1)) 27 | val partitionedRdd = pairRdd.partitionBy(myPart) 28 | 29 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/basics/SequenceFileTest.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.basics 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.hadoop.io.Text 6 | import org.apache.hadoop.io.IntWritable 7 | 8 | object SequenceFileTest { 9 | 10 | def main(args: Array[String]): Unit = { 11 | val conf = new SparkConf().setAppName("SeqFileTest") 12 | val sc = new SparkContext(conf) 13 | 14 | 15 | //create a sequence file 16 | val data = sc.parallelize(List(("key1",1), ("key2",2))) 17 | data.saveAsSequenceFile("/usr/vishnu/spark_temp/seqfile_sample") 18 | 19 | //read from sequence file 20 | val dataLoaded = sc.sequenceFile("/usr/vishnu/spark_temp/seqfile_sample/part-00003", classOf[Text], classOf[IntWritable]) 21 | dataLoaded.foreach(println) 22 | 23 | } 24 | 25 | 26 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/basics/pairrdd.sc: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.basics 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.sql.SQLContext 6 | 7 | object pairrdd { 8 | println("Welcome to the Scala worksheet") 9 | 10 | val IncidntNum = 0 11 | val Category = 1 12 | val 
Descript = 2 13 | val DayOfWeek = 3 14 | val Date = 4 15 | val Time = 5 16 | val PdDistrict = 6 17 | val Resolution = 7 18 | val Address = 8 19 | val X = 9 20 | val Y = 10 21 | val PdId = 11 22 | 23 | val conf = new SparkConf().setAppName("pairrdd-test").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 24 | val sc = new SparkContext(conf) 25 | 26 | val sfpd = sc.textFile("/user/vishnu/mapr/dev361/sfpd.csv").map(_.split(",")) 27 | val totincs = sfpd.count() 28 | val cat = sfpd.map(x=>x(Category)).distinct.collect() 29 | 30 | val bayviewRDD = sfpd.filter(incident=>incident.contains("BAYVIEW")) 31 | 32 | val incByCat = sfpd.map(x=>(x(Category),1)) 33 | 34 | sfpd.map(x=>(x(PdDistrict),1)).reduceByKey(_+_).map(x=>(x._2,x._1)).sortByKey(false).take(4) 35 | 36 | val pdDists = sfpd.map(x=>(x(PdDistrict),x(Address))) 37 | val catRes = sfpd.map(x=>(x(PdDistrict),(x(Category),x(Resolution)))) 38 | val incCatRes = sfpd.map(x=>(x(PdDistrict),x(Address))) 39 | 40 | pdDists.join(catRes) 41 | 42 | // only if dataset can fit in memory 43 | val num_inc_dist = sfpd.map(x=>(x(PdDistrict),1)).countByKey() 44 | 45 | val catAdd = sc.textFile("/user/vishnu/mapr/dev361/J_AddCat.csv").map(x=>x.split(",")).map(x=>(x(1),x(0))) 46 | val distAdd = sc.textFile("/user/vishnu/mapr/dev361/J_AddDist.csv").map(x=>x.split(",")).map(x=>(x(1),x(0))) 47 | 48 | 49 | val incByDists = sfpd.map(x=>(x(PdDistrict),1)).reduceByKey(_+_) 50 | val inc_map = incByDists.map(x=>((x._2,x._1))) 51 | val inc_sort = incByDists.map(x=>(x._2,x._1)).sortByKey(false) 52 | val inc_group = sfpd.map(x=>(x(PdDistrict),1)).groupByKey() 53 | 54 | val incByDists2 = sfpd.map(x=>(x(PdDistrict),1)).reduceByKey(_+_,10) 55 | 56 | 57 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/basics/rdds.sc: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.basics 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | 6 | 7 | object rdds { 8 | 9 | 10 | println("Welcome to the Scala worksheet") 11 | 12 | val conf = new SparkConf().setAppName("rdd-test").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 13 | val sc = new SparkContext(conf) 14 | //define RDD 15 | val auctionRDD = sc.textFile("/user/vishnu/mapr/dev360/auctiondata.csv").map(_.split(",")) 16 | 17 | //filter transformation, applying anonymous function 18 | val xboxRDD = auctionRDD.filter(line => line.contains("xbox")) 19 | 20 | val auctionid = 0 21 | val bid = 1 22 | val bidtime = 2 23 | val bidder = 3 24 | val bidderrate = 4 25 | val openbid = 5 26 | val price = 6 27 | val itemtype = 7 28 | val daystolive = 8 29 | 30 | //how many items where sold 31 | val items_sold = auctionRDD.map(entry=>entry(auctionid)) 32 | .distinct 33 | .count 34 | 35 | //how many bids per item type 36 | val bidAuctionRDD = auctionRDD.map(entry=>(entry(itemtype),1)).reduceByKey((x,y)=>x+y) 37 | 38 | //cache 39 | bidAuctionRDD.cache 40 | 41 | bidAuctionRDD.collect 42 | 43 | 44 | 45 | 46 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/basics/streams.sc: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.basics 2 | 3 | import org.apache.spark.{ SparkConf, SparkContext } 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.streaming._ 6 | import org.apache.spark.streaming.StreamingContext._ 7 | 8 | object streams { 9 | 
println("Welcome to the Scala worksheet") 10 | 11 | val sparkConf = new SparkConf().setAppName("SensorStream") 12 | val sc = new SparkContext(sparkConf) 13 | case class Sensor(resid: String, date: String, time: String, hz: Double, disp: Double, flo: Double, sedPPM: Double, psi: Double, chlPPM: Double) extends Serializable 14 | 15 | val ssc = new StreamingContext(sc, Seconds(2)) 16 | val linesDStream = ssc.textFileStream("/user/user01/stream") 17 | linesDStream.print() 18 | linesDStream.foreachRDD(rdd => { 19 | val srdd = rdd.map(_.split(",")).map(p => Sensor(p(0), p(1), p(2), p(3).toDouble, p(4).toDouble, p(5).toDouble, p(6).toDouble, p(7).toDouble, p(8).toDouble)) 20 | val alertRDD = srdd.filter(sensor=>sensor.psi < 5.0) 21 | srdd.take(2).foreach(println) 22 | alertRDD.take(2).foreach(println) 23 | }) 24 | 25 | 26 | ssc.start() 27 | ssc.awaitTermination() 28 | 29 | 30 | } 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/blog/supportfiles/spark_session_blog_commands: -------------------------------------------------------------------------------- 1 | //LOAD 2 | val df = spark.read.json("/spark_learning/pandainfo.json") 3 | df.show 4 | df.registerTempTable("pandas") 5 | df.createOrReplaceTempView("pandas") 6 | 7 | //TABLE AND SQL 8 | spark.table("pandas") 9 | spark.sql("select name from pandas").show 10 | 11 | //UDF 12 | spark.udf.register("addone",(x:Int)=>x+1) 13 | 14 | //CREATE DATASET 15 | val ds = spark.createDataset(List(1,2,3)) 16 | val rdd = sc.parallelize(List(1,2,3)) 17 | val ds = spark.createDataset(rdd) 18 | 19 | 20 | 21 | //CREATE DATAFRAME 22 | case class Num(x:Int) 23 | val rdd = sc.parallelize(List(Num(1),Num(2),Num(3))) 24 | spark.createDataFrame(rdd).show 25 | 26 | import org.apache.spark.sql.types.{StructType,StructField,IntegerType}; 27 | import org.apache.spark.sql.Row 28 | val rowRDD = rdd.map(x=>Row(x)) 29 | val schema = StructType(Array(StructField("num", IntegerType, true))) 30 | spark.createDataFrame(rowRDD,schema).show 31 | 32 | 33 | //CATALOG 34 | spark.catalog.cacheTable("pandas") // caches the table into memory, throws Table or view not found in database exeception if not found. 
35 | spark.catalog.uncacheTable("pandas") // to remove table from memory 36 | spark.catalog.currentDatabase 37 | spark.catalog.isCached("pandas") 38 | spark.catalog.clearCache 39 | spark.catalog.listDatabases.take(1) 40 | spark.catalog.listTables("default").take(1) 41 | spark.catalog.dropTempView("pandas") //drops the table -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/graph/PropertyGraphExample.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.graph 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.graphx._ 5 | import org.apache.spark.rdd.RDD 6 | 7 | object PropertyGraphExample { 8 | 9 | def main(args: Array[String]): Unit = { 10 | //confs 11 | val conf = new SparkConf().setAppName("AirportGraph") 12 | val sc = new SparkContext(conf) 13 | 14 | //load data 15 | val airports = sc.textFile("/mapr_lab_data/data/airports.csv").map(parseAirport) 16 | val vertices = airports.map(airport => (airport.id.toLong,airport)) //note id.toLong -> we need that for creating Graph, because Graph()'s first arg takes an RDD of tuples with _0 that has a Long 17 | 18 | val routes = sc.textFile("/mapr_lab_data/data/routes.csv").map(parseRoute) 19 | val edges = routes.map(route => Edge(route.src, route.dest, route)) 20 | 21 | //create defualt vertex 22 | val defaultVertex = Airport(0,"default") 23 | 24 | //create graph 25 | val graph = Graph(vertices, edges, defaultVertex) 26 | 27 | graph.vertices.collect.foreach(println) 28 | 29 | graph.triplets.collect.foreach(println) 30 | println(graph.inDegrees) 31 | println(graph.vertices.count()) 32 | println(graph.edges.count()) 33 | 34 | graph.edges.filter{case Edge(src,dest,route) => route.dist > 1000}.count 35 | graph.edges.filter{case Edge(src,dest,route) => route.dist > 1000}.collect.foreach(println) 36 | 37 | graph.triplets.sortBy(_.attr,ascending=false).collect.foreach(println) 38 | 39 | 40 | //page rank 41 | val ranks = graph.pageRank(0.1).vertices 42 | ranks.take(3) 43 | 44 | ranks.join(vertices).sortBy(_._2._1,false).map(_._2._2).collect.foreach(println) 45 | } 46 | 47 | case class Route(src:Int, dest:Int, dist: Int) 48 | object Route{ 49 | 50 | implicit def orderingByDist[A <: Route]: Ordering[A] = 51 | Ordering.by(r => (r.dist)) 52 | } 53 | case class Airport(id:Int, name:String) 54 | 55 | def parseRoute(str:String): Route = { 56 | val p = str.split(",") 57 | new Route(p(0).toInt, p(1).toInt, p(2).toInt) 58 | } 59 | def parseAirport(str:String): Airport = { 60 | val p = str.split(",") 61 | Airport(p(0).toInt, p(1)) 62 | } 63 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/graph/res/airports.csv: -------------------------------------------------------------------------------- 1 | 1,SFO 2 | 2,ORD 3 | 3,DFW -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/graph/res/routes.csv: -------------------------------------------------------------------------------- 1 | 1,2,1800 2 | 2,3,800 3 | 3,1,1400 -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/mllib/ALSRecommender.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.mllib 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import 
org.apache.spark.mllib.recommendation.{ALS,MatrixFactorizationModel,Rating} 6 | import org.apache.spark.sql.SQLContext 7 | import org.apache.spark.sql._ 8 | 9 | object ALSRecommender { 10 | 11 | def main(args: Array[String]): Unit = { 12 | //conf 13 | val conf = new SparkConf().setAppName("MovieRecommender") 14 | val sc = new SparkContext(conf) 15 | 16 | //load data 17 | val ratingText = sc.textFile("/mapr_lab_data/data/ratings.dat") 18 | val ratingsRDD = ratingText.map(parseRating).cache() 19 | 20 | //split into training and testing set 21 | val splits = ratingsRDD.randomSplit(Array(0.8,0.2),0L) 22 | val trainingRatingsRDD = splits(0).cache 23 | val testRatingsRDD = splits(1).cache 24 | 25 | //buid ALS model 26 | val model = (new ALS().setRank(20).setIterations(10).run(trainingRatingsRDD)) 27 | 28 | val testUserProductRDD = testRatingsRDD.map{ case Rating(user,product,rating) => (user,product)} 29 | 30 | val predictionsRDD = model.predict(testUserProductRDD) 31 | 32 | val predictionsKeyed = predictionsRDD.map{case Rating(user,prod,pred) => ((user,prod),pred)} 33 | val testUserKeyed = testRatingsRDD.map{case Rating(user,prod,rating) => ((user,prod),rating)} 34 | 35 | val testAndPred = testUserKeyed.join(predictionsKeyed) 36 | 37 | //find false positive, if predicted high (>4) and actual was low (<1) 38 | val falsePositives = testAndPred.filter{case ((user,prod),(rating,pred)) => rating <= 1 && pred >= 4} 39 | 40 | //MAE (mean absolute error) 41 | val absoluteError = testAndPred.map{case ((user,prod),(rating,pred)) => Math.abs(pred-rating)} 42 | val mean = absoluteError.mean() 43 | 44 | //prediction for new user 45 | val newRatingsRDD = sc.parallelize(Array(Rating(0,260,4), Rating(0,1,3))) 46 | val unionRatingsRDD = ratingsRDD.union(newRatingsRDD) 47 | val newModel = (new ALS().setRank(20).setIterations(10).run(unionRatingsRDD)) 48 | 49 | //recommend 50 | val topRecForUser = newModel.recommendProducts(0,5) 51 | } 52 | 53 | def parseRating(str: String): Rating = { 54 | val p = str.split("::") 55 | Rating(p(0).toInt,p(1).toInt,p(2).toDouble) 56 | } 57 | 58 | //case class Rating(user:Int, movie: Int, rating: Double) no need of this since spark ml lib package is having Rating class 59 | 60 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/mllib/ALSRecommender2.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.mllib 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.mllib.recommendation.{ALS,MatrixFactorizationModel,Rating} 6 | import org.apache.spark.sql.SQLContext 7 | import org.apache.spark.sql._ 8 | 9 | object ALSRecommender2 { 10 | 11 | def main(args: Array[String]): Unit = { 12 | //conf 13 | val conf = new SparkConf().setAppName("MovieRecommender") 14 | val sc = new SparkContext(conf) 15 | val sqlContext = new SQLContext(sc) 16 | import sqlContext.implicits._ 17 | 18 | 19 | //load data 20 | val ratingText = sc.textFile("/mapr_lab_data/data/ratings.dat") 21 | val ratingsRDD = ratingText.map(parseRating).cache() 22 | 23 | val moviesDF= sc.textFile("/mapr_lab_data/data/movies.dat").map(parseMovie).toDF() 24 | val usersDF = sc.textFile("/mapr_lab_data/data/users.dat").map(parseUser).toDF() 25 | val ratingsDF = ratingsRDD.toDF() 26 | 27 | ratingsDF.registerTempTable("ratings") 28 | usersDF.registerTempTable("users") 29 | moviesDF.registerTempTable("movies") 30 | 31 | //TODO 32 | } 33 | 34 | def 
parseRating(str: String): Rating = { 35 | val p = str.split("::") 36 | Rating(p(0).toInt,p(1).toInt,p(2).toDouble) 37 | } 38 | 39 | def parseUser(str: String): User = { 40 | val fields = str.split("::") 41 | assert(fields.size == 5) 42 | User(fields(0).toInt, fields(1).toString, fields(2).toInt, fields(3).toInt, fields(4).toString) 43 | } 44 | 45 | def parseMovie(str: String): Movie = { 46 | val fields = str.split("::") 47 | assert(fields.size == 3) 48 | Movie(fields(0).toInt, fields(1)) 49 | } 50 | case class Movie(movieId: Int, title: String) 51 | case class User(userId: Int, gender: String, age: Int, occupation: Int, zip: String) 52 | //case class Rating(user:Int, movie: Int, rating: Double) no need of this since spark ml lib package is having Rating class 53 | 54 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/mllib/FeatureTransformations.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.mllib 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.feature.StandardScaler 7 | import org.apache.spark.mllib.feature.Normalizer 8 | 9 | object FeatureTransformations { 10 | 11 | def main(args: Array[String]): Unit = { 12 | val conf = new SparkConf().setAppName("FeatureTransfomrations") 13 | val sc = new SparkContext(conf) 14 | 15 | val vectors = List(Vectors.dense(Array(-2.0,5.0,1.0)),Vectors.dense(Array(2.0,0.0,1.0))) 16 | val dataset = sc.parallelize(vectors) 17 | 18 | //with mean = true, with std = true 19 | val scaler = new StandardScaler(true,true) 20 | val scalerModel = scaler.fit(dataset) 21 | scalerModel.transform(dataset).collect.foreach(println) 22 | 23 | val normalizer = new Normalizer() 24 | normalizer.transform(dataset).collect.foreach(println) 25 | 26 | } 27 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/mllib/LinearRegr.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.mllib 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.regression.LabeledPoint 6 | import org.apache.spark.mllib.feature.HashingTF 7 | import org.apache.spark.mllib.regression.LinearRegressionWithSGD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.mllib.linalg.Vectors 10 | 11 | 12 | object LinearRegr { 13 | 14 | def main(args: Array[String]): Unit = { 15 | 16 | val conf = new SparkConf().setAppName("LinearRegression") 17 | val sc = new SparkContext(conf) 18 | val sqlContext = new SQLContext(sc) 19 | 20 | val features = Array("price","numBeds","year","sqft") 21 | val path = "/spark_learning/house_data.csv" 22 | val housePrice = sc.textFile(path).map(line => Vectors.dense(line.split(",").map(_.toDouble))) 23 | 24 | val houseFeaturesLP = housePrice.map(house => LabeledPoint(house(0).toLong,house)) 25 | 26 | 27 | val lrModel = LinearRegressionWithSGD.train(houseFeaturesLP,10) 28 | 29 | println(lrModel.intercept+" "+lrModel.weights) 30 | 31 | val entry = "0,5,2016,4000" 32 | val newEntry = LabeledPoint(0,Vectors.dense(entry.split(",").map(_.toDouble))) 33 | println(lrModel.predict(newEntry.features)) 34 | 35 | 36 | 37 | } 38 | } -------------------------------------------------------------------------------- 
/spark/src/main/scala/com/vishnu/spark/mllib/LogisticRegr.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.mllib 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.feature.HashingTF 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | import org.apache.spark.mllib.classification.LogisticRegressionWithSGD 8 | 9 | object LogisticRegr { 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | val conf = new SparkConf().setAppName("LogisticRegression") 14 | val sc = new SparkContext(conf) 15 | 16 | val tf = new HashingTF(10000) 17 | 18 | val spam = sc.textFile("/spark_learning/spam.txt") 19 | val normal = sc.textFile("/spark_learning/normal.txt") 20 | 21 | val spamFeatures = spam.map(email=> tf.transform(email.split(" "))) 22 | val normalFeatures = normal.map(email=> tf.transform(email.split(" "))) 23 | 24 | val positiveLP = spamFeatures.map(features => LabeledPoint(1,features)) 25 | val negativeLP = normalFeatures.map(features => LabeledPoint(0,features)) 26 | 27 | val trainingData = positiveLP.union(negativeLP) 28 | trainingData.cache() 29 | 30 | val model = new LogisticRegressionWithSGD().run(trainingData) 31 | 32 | 33 | val newMail = tf.transform("You have won 100000$ free".split(" ")) 34 | model.predict(newMail) 35 | 36 | 37 | 38 | } 39 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/mllib/TFIDF.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.mllib 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.feature.HashingTF 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | import org.apache.spark.mllib.classification.LogisticRegressionWithSGD 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.mllib.feature.IDF 10 | 11 | object TFIDF { 12 | 13 | def main(args: Array[String]): Unit = { 14 | 15 | val conf = new SparkConf().setAppName("TFIDF") 16 | val sc = new SparkContext(conf) 17 | 18 | val tf = new HashingTF() 19 | 20 | val docs = sc.wholeTextFiles("/spark_learning") 21 | val wordsRDD: RDD[Seq[String]] = docs.map{case (name,content)=> content.split(" ")} 22 | println(wordsRDD.take(3)) 23 | val tfVectors = tf.transform(wordsRDD).cache 24 | 25 | val idf = new IDF() 26 | val idfModel = idf.fit(tfVectors) 27 | val tfIdfVectors = idfModel.transform(tfVectors) 28 | 29 | 30 | } 31 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/mllib/house_data.csv: -------------------------------------------------------------------------------- 1 | 1000000,2,2010,1500 2 | 5000000,3,2015,2000 3 | 25000,1,200,900 4 | -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/sql/FromJson.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.sql 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.sql.SQLContext 6 | 7 | object FromJson { 8 | 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setAppName("SparkSQLBasics") 11 | val sc = new SparkContext(conf) 12 | val sqlContext = new SQLContext(sc) 13 | 14 | val input = 
sqlContext.read.json("/spark_learning/testweet.json") 15 | 16 | input.registerTempTable("tweets") 17 | val texts = sqlContext.sql("select text from tweets") 18 | 19 | 20 | //udf register 21 | sqlContext.udf.register("strLen",(x:String)=>{findLength(x)}) 22 | texts.foreach(println) 23 | } 24 | 25 | def findLength(x:String) = { 26 | x.length 27 | } 28 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/sql/HiveTest.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.sql 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.sql.hive.HiveContext 6 | 7 | object HiveTest { 8 | 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setAppName("SparkSQLBasics") 11 | val sc = new SparkContext(conf) 12 | 13 | val sqlContext = new HiveContext(sc) 14 | 15 | val input = sqlContext.read.json("/spark_learning/testweet.json") 16 | input.registerTempTable("tweets") 17 | val texts = sqlContext.sql("select text from tweets") 18 | texts.saveAsTable("texts") 19 | } 20 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/sql/ToMongoDB.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.sql 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.sql.SQLContext 6 | import com.stratio.datasource._ 7 | import com.stratio.datasource.mongodb._ 8 | import com.stratio.datasource.mongodb.schema._ 9 | import com.stratio.datasource.mongodb.writer._ 10 | import com.stratio.datasource.mongodb.config.MongodbConfig._ 11 | import org.apache.spark.sql.SQLContext 12 | import com.stratio.datasource.util.Config._ 13 | import com.stratio.datasource.mongodb.config.MongodbConfigBuilder 14 | 15 | /** 16 | * Using https://github.com/Stratio/Spark-MongoDB 17 | */ 18 | object ToMongoDB { 19 | 20 | def main(args: Array[String]): Unit = { 21 | val conf = new SparkConf().setAppName("ToMongoDB") 22 | val sc = new SparkContext(conf) 23 | val sqlContext = new SQLContext(sc) 24 | 25 | val input = sqlContext.read.json("/spark_learning/testweet.json") 26 | val avroInput = sqlContext.read.format("com.databricks.spark.avro").load("/spark_learning/avro/") 27 | 28 | input.registerTempTable("tweets") 29 | val targetData = sqlContext.sql("Select * from tweets") 30 | 31 | 32 | val targetOutputBuilder = MongodbConfigBuilder( 33 | Map(Host -> List("localhost:27017"), 34 | Database -> "test", 35 | Collection -> "target", 36 | SamplingRatio -> 1.0, 37 | WriteConcern -> "normal", 38 | SplitKey -> "_id", 39 | SplitSize -> 8)) 40 | 41 | val writeConfig = targetOutputBuilder.build() 42 | 43 | // Writing data into the mongoDb table 44 | //targetData.saveToMongodb(writeConfig) 45 | //write avro data to mongodb dable 46 | avroInput.saveToMongodb(writeConfig) 47 | } 48 | 49 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/sql/res/twitter.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soniclavier/bigdata-notebook/a708de834fd282ba576f15f87a46f3953695a9ad/spark/src/main/scala/com/vishnu/spark/sql/res/twitter.avro -------------------------------------------------------------------------------- 
/spark/src/main/scala/com/vishnu/spark/sql/res/twitter.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type" : "record", 3 | "name" : "twitter_schema", 4 | "namespace" : "com.miguno.avro", 5 | "fields" : [ { 6 | "name" : "username", 7 | "type" : "string", 8 | "doc" : "Name of the user account on Twitter.com" 9 | }, { 10 | "name" : "tweet", 11 | "type" : "string", 12 | "doc" : "The content of the user's Twitter message" 13 | }, { 14 | "name" : "timestamp", 15 | "type" : "long", 16 | "doc" : "Unix epoch time in seconds" 17 | } ], 18 | "doc:" : "A basic schema for storing Twitter messages" 19 | } 20 | -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/FlumeStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.streaming._ 5 | import org.apache.spark.streaming.StreamingContext._ 6 | import org.apache.spark.streaming.flume._ 7 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 8 | 9 | 10 | object FlumeStreaming { 11 | def main(args: Array[String]) { 12 | 13 | val host = "localhost" 14 | val port = 4444 15 | val conf = new SparkConf().setAppName("FlumeStreaming").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 16 | val ssc = new StreamingContext(conf, Seconds(1)) 17 | 18 | val stream = FlumeUtils.createStream(ssc, host, port) 19 | val words = stream.flatMap(_.event.toString().split(" ")) 20 | val pairs = words.map(word => (word,1)) 21 | val wordCounts = pairs.reduceByKey(_+_) 22 | 23 | wordCounts.print() 24 | 25 | ssc.start() 26 | ssc.awaitTermination() 27 | } 28 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/KafkaDirectStream.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming 2 | 3 | 4 | import org.apache.spark._ 5 | import org.apache.spark.streaming.StreamingContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.kafka.KafkaUtils 8 | import kafka.serializer.StringDecoder 9 | 10 | 11 | 12 | /** 13 | * DirectStream approach periodically queries the kafka topic for new offset and takes in data 14 | * from previous offset to new offset as an RDD 15 | * 16 | * 1. creates as many RDD partitions as there are kafka partitions 17 | * 2. no need of write ahead log to ensure no data loss 18 | * 3. no zookeeper hence hence exactly-once guarantee can be maintained. In the case of zookeeper 19 | * there might some miss communication b/w spark and zookeeper during failures and chances are 20 | * there that some data may be read twice. 
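 * Note also that "exactly-once" here refers to reading each record exactly once; end-to-end
 * exactly-once output still requires tracking or checkpointing the offsets and writing the
 * results idempotently or transactionally.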
21 | * 22 | */ 23 | object KafkaDirectStream { 24 | 25 | def main(args :Array[String]) { 26 | val conf = new SparkConf().setAppName("KafkaStreaming").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 27 | val ssc = new StreamingContext(conf,Seconds(1)) 28 | val topics = "spark_streaming" 29 | val topicsSet = topics.split(",").toSet 30 | val brokers = "localhost:9092" 31 | val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers) 32 | val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet) 33 | val lines = messages.map(_._2) 34 | 35 | val words = lines.flatMap(_.split(" ")) 36 | val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _) 37 | wordCounts.print() 38 | 39 | // Start the computation 40 | ssc.start() 41 | ssc.awaitTermination() 42 | 43 | } 44 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/KafkaStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming 2 | 3 | import org.apache.spark.streaming.kafka._ 4 | import org.apache.spark._ 5 | import org.apache.spark.streaming._ 6 | import org.apache.spark.streaming.StreamingContext._ 7 | 8 | /** 9 | * @author : vishnu viswanath 10 | * Receiver based approach, i.e., used kafka consumer api to implement receiver 11 | * Drawback : possible loss of data incase of failures 12 | * 13 | * Solution : use write-ahead logs and Reliable receivers. 14 | * Spark provides a built in ReliableKafkaReceiver class which is not used by default. 15 | * To use this receiver, set spark.streaming.receiver.writeAheadLog.enable to true 16 | * 17 | * 18 | */ 19 | object KafkaStreaming { 20 | 21 | def main(args: Array[String]) { 22 | val conf = new SparkConf().setAppName("KafkaStreaming").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 23 | val ssc = new StreamingContext(conf, Seconds(1)) 24 | 25 | //default zookeeper quorum is localhost in single node setup 26 | val zqQuorum = "localhost" 27 | val groupId = "spark" 28 | val topics = "spark_streaming" 29 | val topicMap = topics.split(",").map((_, 1)).toMap 30 | val lines = KafkaUtils.createStream(ssc,zqQuorum,groupId,topicMap) 31 | val words = lines.map(_._2).flatMap(_.split(" ")) 32 | val pairs = words.map(word => (word,1)) 33 | val wordCounts = pairs.reduceByKey(_+_) 34 | wordCounts.print() 35 | 36 | ssc.start() 37 | ssc.awaitTermination() 38 | } 39 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/SeqFileStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.streaming._ 5 | import org.apache.spark.streaming.StreamingContext._ 6 | import org.apache.hadoop.io.Text 7 | import org.apache.hadoop.io.IntWritable 8 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat 9 | 10 | 11 | /** 12 | * An example of how to stream from sequence file 13 | */ 14 | object SeqFileStreaming { 15 | def main(args: Array[String]) { 16 | 17 | val conf = new SparkConf().setAppName("SeqFileStreaming").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 18 | val ssc = new StreamingContext(conf, Seconds(10)) 19 | val inputDir = "/usr/vishnu/spark_temp/seqfile_sample/" 20 | val keyValue = ssc.fileStream[Text,IntWritable, 
SequenceFileInputFormat[Text,IntWritable]](inputDir).map { 21 | 22 | //x.toString is needed because Text by itself is not serializ able and it will throw an error 23 | case (x,y) => (x.toString,y.get()) 24 | } 25 | keyValue.print() 26 | 27 | ssc.start() 28 | ssc.awaitTermination() 29 | } 30 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/SocketStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.streaming._ 5 | import org.apache.spark.streaming.StreamingContext._ 6 | 7 | 8 | /** 9 | * Example from spark programming guide 10 | * https://spark.apache.org/docs/1.4.1/streaming-programming-guide.html 11 | */ 12 | object SocketStreaming { 13 | def main(args: Array[String]) { 14 | 15 | val conf = new SparkConf().setAppName("BasicStreaming").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 16 | val ssc = new StreamingContext(conf, Seconds(1)) 17 | 18 | val lines = ssc.socketTextStream("localhost", 9999) 19 | val words = lines.flatMap(_.split(" ")) 20 | val pairs = words.map(word => (word,1)) 21 | val wordCounts = pairs.reduceByKey(_+_) 22 | 23 | wordCounts.print() 24 | 25 | ssc.start() 26 | ssc.awaitTermination() 27 | } 28 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/StreamingFromCheckpoint.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.streaming.StreamingContext 5 | import org.apache.spark.streaming.Seconds 6 | 7 | 8 | object StreamingFromCheckpoint { 9 | 10 | val checkpoint_dir = "/user/vishnu/spark_checkpoint" 11 | var dataDir = "" 12 | def main(args: Array[String]): Unit = { 13 | dataDir = args(0) 14 | val ssc = StreamingContext.getOrCreate(checkpoint_dir,createStreamingContext _) 15 | 16 | ssc.start() 17 | ssc.awaitTermination() 18 | } 19 | 20 | def createStreamingContext() = { 21 | println("creating new stream") 22 | val conf = new SparkConf().setAppName("StreamingRecoverFromCheckpoint") 23 | val ssc = new StreamingContext(conf,Seconds(10)) 24 | ssc.checkpoint(checkpoint_dir) 25 | val dataDirDStream = ssc.textFileStream(dataDir) 26 | dataDirDStream.print() 27 | ssc 28 | } 29 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/StreamingJoins.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.streaming._ 5 | import org.apache.spark.streaming.StreamingContext._ 6 | 7 | 8 | /** 9 | * An example with multiple receivers and stream joins 10 | * This is also an example of Multiple DStream, => this created multiple receivers 11 | */ 12 | object StreamingJoins { 13 | def main(args: Array[String]) { 14 | 15 | val conf = new SparkConf().setAppName("StreamingJoins").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 16 | val ssc = new StreamingContext(conf, Seconds(10)) 17 | 18 | val stream1 = ssc.socketTextStream("localhost", 9999) 19 | val stream2 = ssc.socketTextStream("localhost", 8888) 20 | 21 | 22 | val words1 = stream1.map(processLine) 23 | val words2 = stream2.map(processLine) 24 | val joined = words1.join(words2) 
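//join is computed per micro-batch: records from the two socket streams are paired
//when their key (the first word of the line, extracted in processLine) matches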
25 | joined.print() 26 | 27 | ssc.start() 28 | ssc.awaitTermination() 29 | } 30 | 31 | def processLine(line:String) = { 32 | val words = line.split(" ") 33 | (words(0),line) 34 | } 35 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/StreamingWindow.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.streaming.StreamingContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming._ 8 | import org.apache.spark.SparkContext._ 9 | import org.apache.spark.streaming.api._ 10 | import org.apache.spark.streaming.dstream.DStream 11 | import org.apache.spark.streaming.StreamingContext._ 12 | import org.apache.spark.SparkContext._ 13 | import org.apache.spark.sql.SQLContext 14 | 15 | object StreamingWindow { 16 | 17 | def main(args: Array[String]): Unit = { 18 | //confs 19 | val conf = new SparkConf().setAppName("StreamingWindow") 20 | val sc = new SparkContext(conf) 21 | val ssc = new StreamingContext(sc, Seconds(1)) 22 | 23 | //stream from text file 24 | val linesDStream = ssc.textFileStream("/user/vishnu/mapr/dev362"); 25 | val sensorDStream = linesDStream.map(parseSensor) 26 | 27 | //count of events by resid 28 | val counts = sensorDStream.map(sensor=>(sensor.resid,1)).reduceByKeyAndWindow((a:Int,b:Int)=>(a+b), Seconds(6), Seconds(2)) 29 | counts.print() 30 | 31 | //6 seconds data, 2 seconds window 32 | sensorDStream.window(Seconds(6),Seconds(2)).foreachRDD { 33 | rdd => 34 | if (!rdd.partitions.isEmpty) { 35 | val sqlContext = SQLContext.getOrCreate(rdd.sparkContext) 36 | import sqlContext.implicits._ 37 | import org.apache.spark.sql.functions._ 38 | 39 | val sensorDF = rdd.toDF() 40 | sensorDF.registerTempTable("sensor") 41 | 42 | val res = sqlContext.sql("SELECT resid, date, count(resid) as total FROM sensor GROUP BY resid, date") 43 | println("sensor count ") 44 | res.show 45 | val res2 = sqlContext.sql("SELECT resid, date, MAX(psi) as maxpsi, min(psi) as minpsi, avg(psi) as avgpsi FROM sensor GROUP BY resid,date") 46 | println("sensor max, min, averages ") 47 | res2.show 48 | } 49 | } 50 | 51 | 52 | println("Starting streaming") 53 | ssc.start() 54 | ssc.awaitTermination() 55 | 56 | 57 | } 58 | 59 | case class Sensor(resid: String, date: String, time: String, hz: Double, disp: Double, flo: Double, sedPPM: Double, psi: Double, chlPPM: Double) extends Serializable 60 | 61 | def parseSensor(str: String): Sensor = { 62 | val p = str.split(",") 63 | Sensor(p(0), p(1), p(2), p(3).toDouble, p(4).toDouble, p(5).toDouble, p(6).toDouble, p(7).toDouble, p(8).toDouble) 64 | } 65 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/StreamingWithCheckpoint.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.streaming._ 5 | import org.apache.spark.streaming.StreamingContext._ 6 | 7 | 8 | /** 9 | * An example of Streaming with checkpointing 10 | */ 11 | object StreamingWithCheckpointing { 12 | def main(args: Array[String]) { 13 | 14 | val conf = new SparkConf().setAppName("StreamingWithCheckpointing").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 15 | val ssc = new StreamingContext(conf, 
Seconds(1)) 16 | ssc.checkpoint("hdfs:///user/vishnu/spark_checkpoint") 17 | 18 | val linesDStream = ssc.socketTextStream("localhost", 9999) 19 | 20 | val lines = linesDStream.window(Seconds(5),Seconds(10)) 21 | val words = lines.flatMap(_.split(" ")) 22 | val pairs = words.map(word => (word,1)) 23 | 24 | pairs.checkpoint(Seconds(10)); 25 | val wordCounts = pairs.reduceByKey(_+_) 26 | 27 | wordCounts.print() 28 | 29 | ssc.start() 30 | ssc.awaitTermination() 31 | } 32 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/UpdateStateByKey.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.streaming._ 5 | import org.apache.spark.streaming.StreamingContext._ 6 | 7 | 8 | /** 9 | * An example of update state by key 10 | * Each log entry contains 11 | * based on the action, state of the userid is udpated 12 | */ 13 | object UpdateStateByKey { 14 | def main(args: Array[String]) { 15 | 16 | 17 | val conf = new SparkConf().setAppName("UpdateStateByKey").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 18 | val ssc = new StreamingContext(conf, Seconds(1)) 19 | ssc.checkpoint("hdfs:///user/vishnu/spark_checkpoint") 20 | 21 | 22 | val linesDStream = ssc.socketTextStream("localhost", 9999) 23 | 24 | //input is expected to be of the format 25 | val userActionPair = linesDStream.map(line => { 26 | val parts = line.split(" ") 27 | (parts(0),parts(1)) 28 | }) 29 | 30 | val userStates = userActionPair.updateStateByKey(updateUserState) 31 | userStates.print() 32 | 33 | ssc.start() 34 | ssc.awaitTermination() 35 | } 36 | 37 | def updateUserState(values: Seq[String], state:Option[String]) = { 38 | val currState = state.getOrElse("Unknown") 39 | var newState = Option(currState) 40 | if (!currState.equals(values.lastOption)) { 41 | if (values.lastOption != None) { 42 | newState = values.lastOption 43 | } 44 | } 45 | newState 46 | } 47 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/WindowedStream.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.streaming._ 5 | import org.apache.spark.streaming.StreamingContext._ 6 | 7 | 8 | /** 9 | * Streaming with sliding window 10 | */ 11 | object WindowedStream { 12 | def main(args: Array[String]) { 13 | 14 | val conf = new SparkConf().setAppName("StreamingWithCheckpointing").setMaster("spark://Vishnus-MacBook-Pro.local:7077") 15 | val ssc = new StreamingContext(conf, Seconds(1)) 16 | ssc.checkpoint("hdfs:///user/vishnu/spark_checkpoint") 17 | 18 | 19 | val linesDStream = ssc.socketTextStream("localhost", 9999) 20 | val lines = linesDStream.window(Seconds(10),Seconds(5)) 21 | val words = lines.flatMap(_.split(" ")) 22 | val pairs = words.map(word => (word,1)) 23 | pairs.checkpoint(Seconds(10)); 24 | val wordCounts = pairs.reduceByKey(_+_) 25 | //wordCounts.print() 26 | 27 | 28 | //reduce by key and window, will do reduce by key and use the first function to do the aggregation 29 | //and second function to do the inverse aggregation 30 | val windowedWordCount = pairs.reduceByKeyAndWindow({(x,y)=>x+y},{(x,y)=>x-y}, Seconds(10),Seconds(5)) 31 | //windowedWordCount.print() 32 | 33 | 34 | 35 | //expected input 36 | //e.g, 10.90.123.42 some 
long log content 37 | val logsDStream = ssc.socketTextStream("localhost", 8888) 38 | val ipAddress = logsDStream.map(line => line.split(" ")(0)) 39 | val count1 = ipAddress.countByValueAndWindow(Seconds(10),Seconds(5)); 40 | val count2 = ipAddress.countByWindow(Seconds(10),Seconds(5)); 41 | 42 | count1.print() 43 | count2.print() 44 | 45 | 46 | 47 | 48 | 49 | 50 | ssc.start() 51 | ssc.awaitTermination() 52 | } 53 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/akka/SendToActor.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming.akka 2 | 3 | import org.apache.spark._ 4 | import akka.actor.ActorSystem 5 | 6 | /** 7 | * This object is used to send messages to the HelloSpark Akka actor 8 | */ 9 | object SendToActor { 10 | 11 | def main(args: Array[String]) : Unit = { 12 | val actorSystem = ActorSystem("sparkMaster") 13 | 14 | val url = s"akka.tcp://sparkDriver@${SparkAkkaSource.driverHost}:${SparkAkkaSource.driverPort}/user/Supervisor0/${SparkAkkaSource.actorName}" 15 | val helloer = actorSystem.actorSelection(url) 16 | 17 | var ok = true 18 | while (ok) { 19 | val ln = readLine() 20 | ok = ln != null 21 | if (ok) { 22 | helloer ! ln 23 | } 24 | } 25 | } 26 | 27 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/akka/SparkAkkaSource.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming.akka 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.streaming._ 5 | import akka.actor.Props 6 | import org.apache.spark.streaming.receiver.ActorHelper 7 | import akka.actor.Actor 8 | 9 | /** 10 | * Example from http://www.lightbend.com/activator/template/spark-streaming-scala-akka#code/src/main/scala/StreamingApp.scala 11 | */ 12 | 13 | //INCOMPLETE 14 | 15 | class HelloSpark extends Actor with ActorHelper { 16 | 17 | override def preStart() = { 18 | println("") 19 | println("Starting HelloSpark Actor") 20 | println("") 21 | } 22 | 23 | def receive = { 24 | case s => store(s) 25 | } 26 | } 27 | 28 | object SparkAkkaSource { 29 | 30 | //fix the driver port; it is random by default, but we need to know it to build the actor URL 31 | val driverPort = 7777 32 | val driverHost = "localhost" 33 | val actorName = "HelloSparkActor" 34 | 35 | def main(args: Array[String]): Unit = { 36 | 37 | val conf = new SparkConf(false) 38 | .setMaster("local[*]") 39 | .setAppName("Spark Streaming from Akka") 40 | .set("spark.logConf", "true") 41 | .set("spark.driver.port", driverPort.toString) 42 | .set("spark.driver.host", driverHost) 43 | .set("spark.akka.logLifeCycleEvents", "true") 44 | 45 | val ssc = new StreamingContext(conf, Seconds(1)) 46 | 47 | val actorStream = ssc.actorStream[String](Props[HelloSpark], actorName) 48 | actorStream.print() 49 | 50 | ssc.start() 51 | java.util.concurrent.TimeUnit.SECONDS.sleep(3) 52 | 53 | val actorSystem = SparkEnv.get.actorSystem 54 | 55 | val url = s"akka.tcp://sparkDriver@$driverHost:$driverPort/user/vishnu/$actorName" 56 | val helloer = actorSystem.actorSelection(url) 57 | helloer ! "Hello" 58 | helloer ! "from" 59 | helloer ! "Spark Streaming" 60 | helloer ! "with" 61 | helloer ! "Scala" 62 | helloer ! "and" 63 | helloer !
"Akka" 64 | 65 | val ln = readLine() 66 | ssc.stop(stopSparkContext = true, stopGracefully = true) 67 | 68 | } 69 | 70 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/customsource/ActivityReceiver.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming.customsource 2 | 3 | import org.apache.spark.streaming.receiver.Receiver 4 | import org.apache.spark.storage.StorageLevel 5 | import scala.util.control.Breaks._ 6 | import java.net.Socket 7 | import java.io.BufferedReader 8 | import java.io.InputStreamReader 9 | 10 | 11 | case class Activity(user: String,action:String) 12 | 13 | /** 14 | * Based on https://www.mapr.com/blog/how-integrate-custom-data-sources-apache-spark?platform=hootsuite 15 | */ 16 | class ActivityReceiver(port:Int) extends Receiver[Activity] (StorageLevel.MEMORY_ONLY){ 17 | 18 | override def onStart(): Unit = { 19 | println("Activity Receiver starting") 20 | val thread = new Thread("ActivityReceiverThread") { 21 | override def run() { 22 | val socket = new Socket("localhost",port) 23 | val reader = new BufferedReader(new InputStreamReader (socket.getInputStream(), "UTF-8")) 24 | var line = "" 25 | while(!isStopped()) { 26 | var line = reader.readLine() 27 | if (line == null) break 28 | else { 29 | val parts = line.split(" ") 30 | val activity = Activity(parts(0),parts(1)) 31 | store(activity) 32 | } 33 | } 34 | } 35 | } 36 | thread.start() 37 | } 38 | 39 | override def onStop(): Unit = { 40 | stop("Activity receiver stopping") 41 | } 42 | } -------------------------------------------------------------------------------- /spark/src/main/scala/com/vishnu/spark/streaming/customsource/StreamingWithCustomSource.scala: -------------------------------------------------------------------------------- 1 | package com.vishnu.spark.streaming.customsource 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.streaming.StreamingContext 5 | import org.apache.spark.streaming.Seconds 6 | 7 | object StreamingWithCustomSource { 8 | 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setAppName("StreamingWithCustomSource") 11 | val ssc = new StreamingContext(conf,Seconds(5)) 12 | 13 | val activityDStream = ssc.receiverStream(new ActivityReceiver(9999)) 14 | activityDStream.print() 15 | 16 | ssc.start() 17 | ssc.awaitTermination() 18 | 19 | } 20 | } -------------------------------------------------------------------------------- /spark/uberjar.md: -------------------------------------------------------------------------------- 1 | ## How to create Uber/Fat jar using sbt 2 | 3 | 1. create assembly.sbt file inside your project folder. 4 | e.g., if your eclipse project root folder is spark_learn, then you should have spark_learn/project/assembly.sbt 5 | 2. add below lines to assembly.sbt 6 | ``` 7 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.2") 8 | ``` 9 | 3. To create the jar, from your sbt console type `assembly`. Note that you won't get a fat jar if you use the command `package` 10 | 4. All dependecies would now be packaged in your jar file. to exclude a jar file and its dependencies, mention it as provided. 11 | e.g., `val spark_streaming = "org.apache.spark" % "spark-streaming_2.10" % "1.6.0" % "provided"` 12 | 5. 
You might need to add some merge strategies since multiple dependencies can depend on the same dependency (e.g., A can depend on C and B can depend on another version of C). 13 | 14 | This is how my merge strategy is defined; it has worked so far for Spark: 15 | 16 | ``` 17 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => 18 | { 19 | case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first 20 | case PathList(ps @ _*) if ps.last endsWith ".html" => MergeStrategy.first 21 | case x if x.contains("unused") => MergeStrategy.last 22 | case "application.conf" => MergeStrategy.concat 23 | case "unwanted.txt" => MergeStrategy.discard 24 | case x => old(x) 25 | } 26 | } 27 | ``` 28 | -------------------------------------------------------------------------------- /spark_23/build.sbt: -------------------------------------------------------------------------------- 1 | name := "spark_23" 2 | version := "1.0" 3 | scalaVersion := "2.11.9" 4 | 5 | val sparkVersion = "2.4.0-SNAPSHOT" 6 | val kafkaVersion = "0.10.2.1" 7 | 8 | resolvers += Resolver.mavenLocal 9 | 10 | libraryDependencies += "org.apache.spark" %% "spark-sql-kafka-0-10" % sparkVersion 11 | libraryDependencies += "org.apache.spark" %% "spark-core" % sparkVersion 12 | libraryDependencies += "org.apache.spark" %% "spark-sql" % sparkVersion 13 | libraryDependencies += "org.apache.kafka" % "kafka-clients" % kafkaVersion 14 | 15 | libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.2" % Test 16 | 17 | -------------------------------------------------------------------------------- /spark_23/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 0.13.16 -------------------------------------------------------------------------------- /spark_23/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | logLevel := Level.Warn -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/ContinuousKafkaStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.streaming 2 | 3 | import java.sql.Timestamp 4 | 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.streaming.Trigger 7 | 8 | 9 | //parse raw kafka records into typed data 10 | case class CarEvent(carId: String, speed: Option[Int], acceleration: Option[Double], timestamp: Timestamp) 11 | object CarEvent { 12 | def apply(rawStr: String): CarEvent = { 13 | val parts = rawStr.split(",") 14 | CarEvent(parts(0), Some(Integer.parseInt(parts(1))), Some(java.lang.Double.parseDouble(parts(2))), new Timestamp(parts(3).toLong)) 15 | } 16 | } 17 | 18 | /** 19 | * Created by vviswanath on 2/18/18.
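 *
 * Reads car events from the "cars" Kafka topic, keeps only the cars with speed above 70,
 * and writes them to the console and to the "fastcars" Kafka topic using continuous
 * processing (Trigger.Continuous) instead of micro-batching.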
20 | */ 21 | object ContinuousKafkaStreaming { 22 | 23 | def main(args: Array[String]): Unit = { 24 | val spark = SparkSession.builder() 25 | .appName("ContinuousStreaming Kafka example") 26 | .master("local[*]") 27 | .getOrCreate() 28 | 29 | import spark.implicits._ 30 | 31 | val raw = spark 32 | .readStream 33 | .format("kafka") 34 | .option("kafka.bootstrap.servers", "localhost:9092") 35 | .option("subscribe", "cars") 36 | .load() 37 | 38 | //supported operations in Continuous Processing includes - Map, Filter, Project 39 | val fastCars = raw 40 | .selectExpr("CAST(value as STRING)") //project 41 | .map(r ⇒ CarEvent(r.getString(0))) //map 42 | .filter("speed > 70") //filter 43 | //.filter(c ⇒ c.speed.getOrElse(0) > 70) //TypedFilter not supported in continuous processing, 44 | 45 | 46 | val consoleQuery = fastCars 47 | .writeStream 48 | .format("console") 49 | .outputMode("append") 50 | //.outputMode("update") 51 | //.outputMode("complete") not supported since it requires an agg, and Continuous processing does not support aggregations. 52 | .trigger(Trigger.Continuous("1 second")) 53 | .start() 54 | 55 | 56 | val kafkaSinkQuery = fastCars 57 | .selectExpr("CAST(carId as STRING) as value") //kafka needs a value field 58 | .writeStream 59 | .format("kafka") 60 | .outputMode("update") 61 | .option("kafka.bootstrap.servers", "localhost:9092") 62 | .option("topic", "fastcars") 63 | .option("checkpointLocation", "/tmp/spark/continuousCheckpoint") 64 | .outputMode("update") 65 | .trigger(Trigger.Continuous("10 seconds")) //how often to checkpoint the offsets, 66 | .start() 67 | 68 | spark.streams.awaitAnyTermination() 69 | 70 | } 71 | 72 | } 73 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/CustomV2SourceExample.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.streaming 2 | import com.vishnuviswanath.spark.streaming.sources.netcat.NetcatSourceProvider 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.Trigger 5 | 6 | /** 7 | * Created by vviswanath on 2/20/18. 8 | * 9 | * An example that uses CustomV2 source {@link NetcatSourceProvider} 10 | */ 11 | object CustomV2SourceExample { 12 | 13 | def main(args: Array[String]): Unit = { 14 | val spark = SparkSession.builder() 15 | .appName("CustomV2 source") 16 | .master("local[*]") 17 | .getOrCreate() 18 | 19 | spark.sparkContext.setLogLevel("ERROR") 20 | 21 | import spark.implicits._ 22 | val raw = spark 23 | .readStream 24 | .format(classOf[NetcatSourceProvider].getName) 25 | .option("port", 9999) 26 | .option("host", "localhost") 27 | .option("buffSize", 100) 28 | .load() 29 | 30 | val consoleQuery = raw 31 | .selectExpr("cast(value as STRING)") 32 | .writeStream 33 | .queryName("console-query") 34 | .format("console") 35 | .outputMode("update") 36 | //.outputMode("update") 37 | //.outputMode("complete") not supported since it requires an agg, and Continuous processing does not support aggregations. 
38 | .trigger(Trigger.Continuous("3 second")) //how often to checkpoint 39 | .start() 40 | 41 | consoleQuery.awaitTermination() 42 | 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/HelloStructredStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.streaming 2 | 3 | import org.apache.spark.sql.{Dataset, SparkSession} 4 | 5 | /** 6 | * Created by vviswanath on 1/9/18. 7 | * 8 | * Word count program to get started with Spark Structured Streaming 9 | */ 10 | object HelloStructredStreaming { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | //create a spark session, and run it on local mode 15 | val spark = SparkSession.builder() 16 | .appName("HelloStructuredStreaming") 17 | .master("local[*]") 18 | .getOrCreate() 19 | 20 | import spark.implicits._ 21 | 22 | //read from a directory as text stream 23 | val readme: Dataset[String] = spark 24 | .readStream 25 | .textFile("/Users/vviswanath/Downloads/streaming_input_dir/cars/") 26 | 27 | //do word count 28 | val words = readme.flatMap(_.split(" ")) 29 | val wordCounts = words.groupBy("value").count() 30 | 31 | //run the wordCount query and write to console 32 | val query = wordCounts 33 | .writeStream 34 | .queryName("WordCount") 35 | .outputMode("complete") 36 | .format("console") 37 | .start() 38 | 39 | //wait till query.stop() is called 40 | query.awaitTermination() 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/SocketSourceStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.streaming 2 | 3 | import org.apache.spark.sql.streaming.Trigger 4 | import org.apache.spark.sql.{Dataset, SparkSession} 5 | 6 | /** 7 | * Created by vviswanath on 1/9/18. 8 | * 9 | * Wordcount from socket streams. 10 | * 11 | * nc -lk 9999 12 | */ 13 | object SocketSourceStreaming { 14 | 15 | def main(args: Array[String]): Unit = { 16 | 17 | //create a spark session, and run it on local mode 18 | val spark = SparkSession.builder() 19 | .appName("NetcatSourceStreaming") 20 | .master("local[*]") 21 | .getOrCreate() 22 | 23 | spark.sparkContext.setLogLevel("ERROR") 24 | 25 | import spark.implicits._ 26 | 27 | //read from a directory as text stream 28 | val socketData = spark 29 | .readStream 30 | .format("socket") 31 | .option("host", "localhost") 32 | .option("port", 9999) 33 | .load() 34 | 35 | //do word count 36 | val words = socketData.as[String].flatMap(_.split(" ")) 37 | val wordCounts = words.groupBy("value").count() 38 | 39 | //run the wordCount query and write to console 40 | val query = wordCounts 41 | .writeStream 42 | .queryName("WordCount") 43 | .outputMode("update") //output only the counts that changed 44 | //.outputMode("complete") //output all the counts seen till now 45 | .format("console") 46 | //.trigger(Trigger.ProcessingTime(5000)) //triggers the query every "interval" if any new element was received. 
47 | .start() 48 | 49 | 50 | 51 | //wait till query.stop() is called 52 | query.awaitTermination() 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/StreamingAggregations.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.streaming 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.types._ 5 | import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} 6 | 7 | /** 8 | * Created by vviswanath on 1/10/18. 9 | * 10 | * Explore Spark streaming aggregations. 11 | */ 12 | object StreamingAggregations { 13 | 14 | //convert aggregates into typed data 15 | case class CarEvent(car: String, speed: Option[Int], acceleration: Option[Double]) 16 | 17 | def main(args: Array[String]): Unit = { 18 | 19 | //create a spark session, and run it on local mode 20 | val spark = SparkSession.builder() 21 | .appName("StreaminAggregations") 22 | .master("local[*]") 23 | .getOrCreate() 24 | 25 | //spark.sparkContext.setLogLevel("WARN") 26 | 27 | import spark.implicits._ 28 | 29 | //define the schema 30 | val schema = StructType( 31 | StructField("car", StringType) :: 32 | StructField("speed", IntegerType) :: 33 | StructField("acceleration", DoubleType) :: Nil) 34 | 35 | //read the source 36 | val cars: DataFrame = spark 37 | .readStream 38 | .schema(schema) 39 | .csv("/Users/vviswanath/Downloads/streaming_input_dir/cars/") 40 | 41 | //do aggregates 42 | val aggregates = cars 43 | .groupBy("car") 44 | .agg( 45 | "speed" → "max", 46 | "acceleration" → "avg") 47 | .withColumnRenamed("max(speed)", "speed") 48 | .withColumnRenamed("avg(acceleration)", "acceleration") 49 | 50 | aggregates.printSchema() 51 | aggregates.explain() 52 | 53 | val typedAggregates = aggregates.as[CarEvent] 54 | val filtered = typedAggregates 55 | .filter(_.speed.exists(_ > 70)) 56 | .where("acceleration > 10") 57 | .repartition(10) 58 | 59 | val query = filtered 60 | .writeStream 61 | .queryName("fastVehicles") 62 | .partitionBy("car") 63 | .outputMode("complete") 64 | .format("console") 65 | .start() 66 | 67 | query.awaitTermination() 68 | 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/sources/netcat/NetcatContinuousReader.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.streaming.sources.netcat 2 | 3 | import java.util 4 | import java.util.Optional 5 | 6 | import org.apache.spark.sql.Row 7 | import org.apache.spark.sql.sources.v2.reader.{DataReader, DataReaderFactory} 8 | import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousReader, Offset, PartitionOffset} 9 | import org.apache.spark.sql.types.StructType 10 | 11 | /** 12 | * Created by vviswanath on 2/21/18. 
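 *
 * A ContinuousReader for the custom netcat source: it exposes a single partition whose
 * DataReader connects to the host, port and buffer size passed in through the source
 * options (localhost:9999 by default).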
13 | */ 14 | class NetcatContinuousReader(schema: StructType, 15 | sourceOptions: Map[String, String]) extends ContinuousReader { 16 | 17 | 18 | val numPartitions = 1 19 | 20 | private var offset: Offset = _ 21 | 22 | override def getStartOffset: Offset = offset 23 | 24 | override def mergeOffsets(offsets: Array[PartitionOffset]): Offset = new NetcatOffset 25 | 26 | override def setStartOffset(start: Optional[Offset]): Unit = {} 27 | 28 | override def deserializeOffset(json: String): Offset = new NetcatOffset 29 | 30 | override def commit(end: Offset): Unit = {} 31 | 32 | /** 33 | * Create a reader factory with just 1 reader. 34 | * @return 35 | */ 36 | override def createDataReaderFactories(): util.List[DataReaderFactory[Row]] = { 37 | java.util.Arrays.asList(new DataReaderFactory[Row] { 38 | val port = sourceOptions.getOrElse("port", "9999").toInt 39 | val host = sourceOptions.getOrElse("host", "localhost") 40 | val buffSize = sourceOptions.getOrElse("buffSize", "100").toInt 41 | override def createDataReader(): DataReader[Row] = new NetcatReader(port, host, buffSize) 42 | }) 43 | } 44 | 45 | override def readSchema(): StructType = schema 46 | 47 | override def stop(): Unit = {} 48 | } 49 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/sources/netcat/NetcatOffset.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.streaming.sources.netcat 2 | 3 | import org.apache.spark.sql.sources.v2 4 | import org.json4s.jackson.Serialization 5 | 6 | /** 7 | * Created by vviswanath on 2/21/18. 8 | */ 9 | class NetcatOffset extends v2.reader.streaming.Offset { 10 | 11 | override def json(): String = "{}" 12 | } 13 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/sources/netcat/NetcatReader.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.streaming.sources.netcat 2 | 3 | import java.io.{BufferedReader, InputStreamReader} 4 | import java.net.Socket 5 | 6 | import org.apache.spark.sql.Row 7 | import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousDataReader, PartitionOffset} 8 | 9 | /** 10 | * Created by vviswanath on 2/21/18. 
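 *
 * A ContinuousDataReader that opens a socket to the given host and port and emits each
 * line it reads as a single-column Row; next() blocks on readLine and returns false when
 * the stream ends.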
11 | */ 12 | class NetcatReader(port: Int, host: String, buffSize: Int) extends ContinuousDataReader[Row] { 13 | 14 | val conn = new Socket(host, port) 15 | val inReader = new BufferedReader(new InputStreamReader(conn.getInputStream)) 16 | 17 | var line: String = _ 18 | 19 | 20 | override def next(): Boolean = { 21 | line = inReader.readLine() 22 | line != null 23 | } 24 | 25 | override def get(): Row = { 26 | //print(s"read value $line") 27 | Row(line) 28 | } 29 | 30 | override def close(): Unit = { 31 | conn.close() 32 | } 33 | 34 | override def getOffset: PartitionOffset = NetcatPartitionOffset(0) 35 | } 36 | 37 | case class NetcatPartitionOffset(offset: Long) extends PartitionOffset 38 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/sources/netcat/NetcatSourceProvider.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.streaming.sources.netcat 2 | 3 | import java.util.Optional 4 | 5 | import org.apache.spark.sql.sources.DataSourceRegister 6 | import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceOptions} 7 | import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousReader 8 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 9 | 10 | import scala.collection.JavaConverters._ 11 | 12 | /** 13 | * Created by vviswanath on 2/21/18. 14 | */ 15 | class NetcatSourceProvider extends ContinuousReadSupport 16 | with DataSourceRegister { 17 | 18 | override def shortName(): String = "netcat" 19 | 20 | val netcatSchema: StructType = StructType(Seq(StructField("value", StringType))) 21 | 22 | override def createContinuousReader(schema: Optional[StructType], checkpointLocation: String, options: DataSourceOptions): ContinuousReader = { 23 | new NetcatContinuousReader(netcatSchema, options.asMap().asScala.toMap) 24 | } 25 | } -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/util/NetcatProducer.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.util 2 | 3 | import java.io.{BufferedWriter, OutputStreamWriter, PrintWriter} 4 | import java.net.ServerSocket 5 | 6 | /** 7 | * Created by vviswanath on 2/21/18. 
8 | * 9 | * A util for writing to a socket} 10 | */ 11 | object NetcatProducer { 12 | 13 | def main(args: Array[String]): Unit = { 14 | val port = 9999 15 | val server = new ServerSocket(port) 16 | val sleepInterval = 100 17 | 18 | 19 | val socket = server.accept() 20 | val outputStream = socket.getOutputStream 21 | 22 | val writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(outputStream))) 23 | 24 | val contentStream = WordsStream.stream 25 | 26 | for { 27 | word ← contentStream 28 | } { 29 | print(s"writing word $word\n") 30 | writer.println(word) 31 | writer.flush() 32 | Thread.sleep(sleepInterval) 33 | } 34 | 35 | print("close the producer?") 36 | System.in.read() 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/util/RandomCarsKafkaProducer.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.util 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 6 | 7 | import scala.annotation.tailrec 8 | import scala.util.{Random ⇒ r} 9 | /** 10 | * Created by vviswanath on 1/15/18. 11 | */ 12 | object RandomCarsKafkaProducer { 13 | 14 | def main(args: Array[String]): Unit = { 15 | val props = new Properties() 16 | props.put("bootstrap.servers", "localhost:9092") 17 | props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer") 18 | props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer") 19 | 20 | val producer = new KafkaProducer[String, String](props) 21 | val interval = 1000 22 | val topic = "cars" 23 | val numRecsToProduce: Option[Int] = None //None = infinite 24 | 25 | 26 | @tailrec 27 | def produceRecord(numRecToProduce: Option[Int]): Unit = { 28 | def generateCarRecord(topic: String): ProducerRecord[String, String] = { 29 | val carName = s"car${r.nextInt(10)}" 30 | val speed = r.nextInt(150) 31 | val acc = r.nextFloat * 100 32 | 33 | val value = s"$carName,$speed,$acc,${System.currentTimeMillis()}" 34 | print(s"Writing $value\n") 35 | val d = r.nextFloat() * 100 36 | if (d < 2) { 37 | //induce random delay 38 | println("Argh! some network dealy") 39 | Thread.sleep((d*100).toLong) 40 | } 41 | new ProducerRecord[String, String](topic,"key", value) 42 | } 43 | 44 | numRecToProduce match { 45 | case Some(x) if x > 0 ⇒ 46 | producer.send(generateCarRecord(topic)) 47 | Thread.sleep(interval) 48 | produceRecord(Some(x - 1)) 49 | 50 | case None ⇒ 51 | producer.send(generateCarRecord(topic)) 52 | Thread.sleep(interval) 53 | produceRecord(None) 54 | 55 | case _ ⇒ 56 | } 57 | } 58 | 59 | produceRecord(numRecsToProduce) 60 | 61 | 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/util/SimulateLateDateProducer.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.util 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util.{Calendar, Properties} 5 | 6 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 7 | 8 | import scala.annotation.tailrec 9 | import scala.util.{Random => r} 10 | 11 | /** 12 | * Created by vviswanath on 1/15/18. 
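 *
 * Sends a short scripted sequence of car events to the "cars" topic, including one whose
 * event timestamp is shifted back by the lateby parameter, to show how watermarking
 * handles late-arriving data.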
13 | */ 14 | object SimulateLateDateProducer { 15 | 16 | def main(args: Array[String]): Unit = { 17 | val props = new Properties() 18 | props.put("bootstrap.servers", "localhost:9092") 19 | props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer") 20 | props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer") 21 | 22 | val producer = new KafkaProducer[String, String](props) 23 | val topic = "cars" 24 | var mCount = 1 25 | 26 | def generateCarRecord(carName: String, speed: Int = r.nextInt(150), topic: String = topic, lateby: Long = 0): ProducerRecord[String, String] = { 27 | val acc = r.nextFloat * 100 28 | val nowTs = System.currentTimeMillis() 29 | val ts = nowTs - lateby 30 | val value = s"$carName,$speed,$acc,$ts" 31 | val format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") 32 | 33 | val cal = Calendar.getInstance() 34 | cal.setTimeInMillis(ts) 35 | 36 | val now = Calendar.getInstance() 37 | now.setTimeInMillis(nowTs) 38 | print(s"[$mCount] Writing $value at ${format.format(now.getTime)} with Event time = ${format.format(cal.getTime)}\n") 39 | mCount += 1 40 | new ProducerRecord[String, String](topic,"key", value) 41 | } 42 | 43 | producer.send(generateCarRecord("car1", speed = 75)) 44 | Thread.sleep(1000) 45 | producer.send(generateCarRecord("car2", speed = 20)) 46 | Thread.sleep(1000) 47 | producer.send(generateCarRecord("car2", speed = 20)) 48 | Thread.sleep(8000) 49 | producer.send(generateCarRecord("car2", speed = 20)) //this message has a hidden importance, it increments the event time 50 | Thread.sleep(3000) 51 | producer.send(generateCarRecord("car1", speed = 50, lateby = 12000)) 52 | 53 | /* 54 | this will not throw away the state for the last message even though its past the watermark, since the eventtime never got updated in between 55 | producer.send(generateCarRecord("car1", speed = 75)) 56 | Thread.sleep(1000) 57 | producer.send(generateCarRecord("car2", speed = 20)) 58 | Thread.sleep(1000) 59 | producer.send(generateCarRecord("car2", speed = 20)) 60 | Thread.sleep(1000) 61 | producer.send(generateCarRecord("car2", speed = 20)) 62 | Thread.sleep(8000) 63 | producer.send(generateCarRecord("car1", speed = 50, lateby = 12000))*/ 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/util/ToFileProducer.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.util 2 | 3 | import java.io._ 4 | 5 | /** 6 | * Created by vviswanath on 2/22/18. 7 | * 8 | * Writes to a file, rolls over every rollOverCount, stops when maxFiles #files are created. 
9 | * Sources from WordsStream.scala 10 | */ 11 | object ToFileProducer { 12 | 13 | def main(args: Array[String]): Unit = { 14 | 15 | val wordsStream = WordsStream.stream 16 | val rollOverCount = 1000 17 | var fileIndex = 0 18 | val maxFiles: Option[Int] = Some(50) 19 | 20 | val path = if (args.length > 0) args(0) else "/tmp/spark_file_stream" 21 | 22 | 23 | val filePrefix = "words_set" 24 | 25 | val stream = WordsStream.stream 26 | 27 | def rollingWriter(path: String, filePrefix: String)(index: Int, previousWriter: Option[PrintWriter]): PrintWriter = { 28 | previousWriter.foreach(w ⇒ { 29 | w.flush() 30 | w.close() 31 | }) 32 | val file = new File(s"$path/${filePrefix}_$index") 33 | print(s"new file created ${file.getAbsolutePath}\n") 34 | file.getParentFile.mkdirs() 35 | file.createNewFile() 36 | new PrintWriter(file) 37 | } 38 | 39 | val writerGen: (Int, Option[PrintWriter]) => PrintWriter = rollingWriter(path, filePrefix) 40 | 41 | var writer = writerGen(0, None) 42 | 43 | var wordsWritten = 0 44 | 45 | for { 46 | word ← stream 47 | } { 48 | if (wordsWritten == rollOverCount) { 49 | wordsWritten = 0 50 | if (maxFiles.isDefined && fileIndex + 1 > maxFiles.get) { 51 | System.exit(0) 52 | } 53 | fileIndex += 1 54 | writer = writerGen(fileIndex, Some(writer)) 55 | } 56 | writer.write(word+" ") 57 | wordsWritten += 1 58 | } 59 | 60 | writer.close() 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /spark_23/src/main/scala/com/vishnuviswanath/spark/util/WordsStream.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.util 2 | 3 | /** 4 | * Created by vviswanath on 2/22/18. 5 | */ 6 | object WordsStream { 7 | 8 | val content = "Structured Streaming is a scalable and fault-tolerant stream processing engine built on the Spark SQL engine. You can express your streaming computation the same way you would express a batch computation on static data. The Spark SQL engine will take care of running it incrementally and continuously and updating the final result as streaming data continues to arrive. You can use the Dataset/DataFrame API in Scala, Java, Python or R to express streaming aggregations, event-time windows, stream-to-batch joins, etc. The computation is executed on the same optimized Spark SQL engine. Finally, the system ensures end-to-end exactly-once fault-tolerance guarantees through checkpointing and Write Ahead Logs. In short, Structured Streaming provides fast, scalable, fault-tolerant, end-to-end exactly-once stream processing without the user having to reason about streaming.".split(" ") 9 | 10 | def infiniteWordsStream(content: Array[String], index: Int): Stream[String] = { 11 | val nextIndex = if (index == content.length - 1) 0 else index + 1 12 | content(index) #:: infiniteWordsStream(content, nextIndex) 13 | } 14 | 15 | val stream = infiniteWordsStream(content, 0) 16 | } 17 | -------------------------------------------------------------------------------- /spark_23/src/test/scala/com/vishnuviswanath/spark/streaming/HelloStructuredStreamingSpec.scala: -------------------------------------------------------------------------------- 1 | package com.vishnuviswanath.spark.streaming 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.scalatest.{BeforeAndAfterEach, FunSuite} 5 | 6 | /** 7 | * Created by vviswanath on 1/10/18. 
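 *
 * ScalaTest FunSuite skeleton that creates a local SparkSession before each test and
 * stops it after each test.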
8 | */ 9 | class HelloStructuredStreamingSpec extends FunSuite with BeforeAndAfterEach { 10 | 11 | var spark: SparkSession = _ 12 | 13 | override def beforeEach(): Unit = { 14 | spark = SparkSession.builder() 15 | .appName("unitTest") 16 | .master("local") 17 | .getOrCreate() 18 | } 19 | 20 | override def afterEach(): Unit = { 21 | spark.stop() 22 | } 23 | 24 | test("Hello structured streaming") { 25 | 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /stormkafka/.gitignore: -------------------------------------------------------------------------------- 1 | .classpath 2 | .project 3 | .settings 4 | target 5 | 6 | *.class 7 | 8 | # Mobile Tools for Java (J2ME) 9 | .mtj.tmp/ 10 | 11 | # Package Files # 12 | *.jar 13 | *.war 14 | *.ear 15 | 16 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 17 | hs_err_pid* -------------------------------------------------------------------------------- /stormkafka/README.md: -------------------------------------------------------------------------------- 1 | This is a POC on how to build a near Realtime Processing system using **Apache Storm** and **Kafka** in **Java**.
2 | 3 | Messages come into a Kafka topic, Storm picks up these messages using Kafka Spout and gives it to a Bolt, 4 | which parses and identifies the message type based on the header. 5 | 6 | Once the message type is identified, the content of the message is extracted and is sent to different bolts for 7 | persistence - SOLR bolt, MongoDB bolt or HDFS bolt. 8 | 9 | [view the blog](http://vishnuviswanath.com/realtime-storm-kafka1.html) 10 | -------------------------------------------------------------------------------- /stormkafka/src/main/java/com/vishnu/storm/Keys.java: -------------------------------------------------------------------------------- 1 | package com.vishnu.storm; 2 | 3 | /** 4 | * @author vishnu viswanath 5 | * This is an utility class. It contains the keys that should be present in the input config-file 6 | */ 7 | public class Keys { 8 | 9 | 10 | public static final String TOPOLOGY_NAME = "topology"; 11 | 12 | //kafka spout 13 | public static final String KAFKA_SPOUT_ID = "kafka-spout"; 14 | public static final String KAFKA_ZOOKEEPER = "kafka.zookeeper"; 15 | public static final String KAFKA_TOPIC = "kafa.topic"; 16 | public static final String KAFKA_ZKROOT = "kafka.zkRoot"; 17 | public static final String KAFKA_CONSUMERGROUP = "kafka.consumer.group"; 18 | public static final String KAFKA_SPOUT_COUNT = "kafkaspout.count"; 19 | 20 | //sink bolt 21 | public static final String SINK_TYPE_BOLT_ID = "sink-type-bolt"; 22 | public static final String SINK_BOLT_COUNT = "sinkbolt.count"; 23 | 24 | //solr bolt 25 | public static final String SOLR_BOLT_ID = "solr-bolt"; 26 | public static final String SOLR_BOLT_COUNT = "solrbolt.count"; 27 | public static final String SOLR_COLLECTION = "solr.collection"; 28 | public static final String SOLR_SERVER = "solr.url"; 29 | public static final String SOLR_ZOOKEEPER_HOSTS = "solr.zookeeper.hosts"; 30 | 31 | //hdfs bolt 32 | public static final String HDFS_BOLT_ID = "hdfs-bolt"; 33 | public static final String HDFS_BOLT_COUNT = "hdfsbolt.count"; 34 | public static final String HDFS_FOLDER = "hdfs.folder"; 35 | public static final String HDFS_PORT = "hdfs.port"; 36 | public static final String HDFS_HOST = "hdfs.host"; 37 | 38 | //mongodb bolt 39 | public static final String MONGO_BOLT_ID = "mongodb.bolt.id"; 40 | public static final String MONGO_HOST = "mongodb.host"; 41 | public static final String MONGO_PORT = "mongodb.port"; 42 | public static final String MONGO_DATABASE = "mongodb.database"; 43 | public static final String MONGO_COLLECTION = "mongodb.collection"; 44 | public static final String MONGO_BOLT_COUNT = "mongodbbolt.count"; 45 | 46 | 47 | 48 | 49 | } 50 | -------------------------------------------------------------------------------- /stormkafka/src/main/java/com/vishnu/storm/bolt/BoltBuilder.java: -------------------------------------------------------------------------------- 1 | package com.vishnu.storm.bolt; 2 | 3 | import java.util.Properties; 4 | 5 | import org.apache.storm.hdfs.bolt.HdfsBolt; 6 | import org.apache.storm.hdfs.bolt.format.DefaultFileNameFormat; 7 | import org.apache.storm.hdfs.bolt.format.DelimitedRecordFormat; 8 | import org.apache.storm.hdfs.bolt.format.FileNameFormat; 9 | import org.apache.storm.hdfs.bolt.format.RecordFormat; 10 | import org.apache.storm.hdfs.bolt.rotation.FileRotationPolicy; 11 | import org.apache.storm.hdfs.bolt.rotation.FileSizeRotationPolicy; 12 | import org.apache.storm.hdfs.bolt.rotation.FileSizeRotationPolicy.Units; 13 | import org.apache.storm.hdfs.bolt.sync.CountSyncPolicy; 14 
| import org.apache.storm.hdfs.bolt.sync.SyncPolicy; 15 | 16 | import com.vishnu.storm.Keys; 17 | 18 | /** 19 | * @author vishnu viswanath 20 | * This class is used for building bolts 21 | */ 22 | public class BoltBuilder { 23 | 24 | public Properties configs = null; 25 | 26 | public BoltBuilder(Properties configs) { 27 | this.configs = configs; 28 | } 29 | 30 | public SinkTypeBolt buildSinkTypeBolt() { 31 | return new SinkTypeBolt(); 32 | } 33 | 34 | public MongodbBolt buildMongodbBolt() { 35 | String host = configs.getProperty(Keys.MONGO_HOST); 36 | int port = Integer.parseInt(configs.getProperty(Keys.MONGO_PORT)); 37 | String db = configs.getProperty(Keys.MONGO_DATABASE); 38 | String collection = configs.getProperty(Keys.MONGO_COLLECTION); 39 | return new MongodbBolt(host, port, db, collection); 40 | } 41 | 42 | public SolrBolt buildSolrBolt() { 43 | String solrServerUlr = configs.getProperty(Keys.SOLR_SERVER); 44 | String collection = configs.getProperty(Keys.SOLR_COLLECTION); 45 | SolrBolt solrBolt = new SolrBolt(solrServerUlr+collection); 46 | return solrBolt; 47 | } 48 | 49 | public HdfsBolt buildHdfsBolt() { 50 | RecordFormat format = new DelimitedRecordFormat().withFieldDelimiter("|"); 51 | SyncPolicy syncPolicy = new CountSyncPolicy(1); 52 | FileRotationPolicy rotationPolicy = new FileSizeRotationPolicy(5.0f, Units.MB); 53 | FileNameFormat fileNameFormat = new DefaultFileNameFormat().withPath(configs.getProperty(Keys.HDFS_FOLDER)); 54 | String port = configs.getProperty((Keys.HDFS_PORT)); 55 | String host = configs.getProperty((Keys.HDFS_HOST)); 56 | HdfsBolt bolt = new HdfsBolt() 57 | .withFsUrl("hdfs://"+host+":"+port) 58 | .withFileNameFormat(fileNameFormat) 59 | .withRecordFormat(format) 60 | .withRotationPolicy(rotationPolicy) 61 | .withSyncPolicy(syncPolicy); 62 | return bolt; 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /stormkafka/src/main/java/com/vishnu/storm/bolt/MongodbBolt.java: -------------------------------------------------------------------------------- 1 | package com.vishnu.storm.bolt; 2 | 3 | import java.util.Map; 4 | 5 | import org.bson.Document; 6 | 7 | import com.mongodb.MongoClient; 8 | import com.mongodb.client.MongoDatabase; 9 | 10 | import backtype.storm.task.OutputCollector; 11 | import backtype.storm.task.TopologyContext; 12 | import backtype.storm.topology.OutputFieldsDeclarer; 13 | import backtype.storm.topology.base.BaseRichBolt; 14 | import backtype.storm.tuple.Tuple; 15 | 16 | 17 | public class MongodbBolt extends BaseRichBolt { 18 | /** 19 | * 20 | */ 21 | private static final long serialVersionUID = 1L; 22 | private OutputCollector collector; 23 | private MongoDatabase mongoDB; 24 | private MongoClient mongoClient; 25 | private String collection; 26 | 27 | public String host; 28 | public int port ; 29 | public String db; 30 | 31 | protected MongodbBolt(String host, int port, String db,String collection) { 32 | this.host = host; 33 | this.port = port; 34 | this.db = db; 35 | this.collection = collection; 36 | } 37 | 38 | 39 | public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) { 40 | this.collector = collector; 41 | this.mongoClient = new MongoClient(host,port); 42 | this.mongoDB = mongoClient.getDatabase(db); 43 | } 44 | 45 | 46 | public void execute(Tuple input) { 47 | 48 | Document mongoDoc = getMongoDocForInput(input); 49 | try{ 50 | mongoDB.getCollection(collection).insertOne(mongoDoc); 51 | collector.ack(input); 52 | }catch(Exception e) { 53 | 
e.printStackTrace(); 54 | collector.fail(input); 55 | } 56 | } 57 | 58 | 59 | @Override 60 | public void cleanup() { 61 | this.mongoClient.close(); 62 | } 63 | 64 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 65 | // TODO Auto-generated method stub 66 | } 67 | 68 | public Document getMongoDocForInput(Tuple input) { 69 | Document doc = new Document(); 70 | String content = (String) input.getValueByField("content"); 71 | String[] parts = content.trim().split(" "); 72 | System.out.println("Received in MongoDB bolt "+content); 73 | try { 74 | for(String part : parts) { 75 | String[] subParts = part.split(":"); 76 | String fieldName = subParts[0]; 77 | String value = subParts[1]; 78 | doc.append(fieldName, value); 79 | } 80 | } catch(Exception e) { 81 | 82 | } 83 | return doc; 84 | } 85 | 86 | 87 | 88 | } -------------------------------------------------------------------------------- /stormkafka/src/main/java/com/vishnu/storm/bolt/SinkTypeBolt.java: -------------------------------------------------------------------------------- 1 | package com.vishnu.storm.bolt; 2 | 3 | import java.util.Map; 4 | 5 | import com.vishnu.storm.Topology; 6 | 7 | import backtype.storm.task.OutputCollector; 8 | import backtype.storm.task.TopologyContext; 9 | import backtype.storm.topology.OutputFieldsDeclarer; 10 | import backtype.storm.topology.base.BaseRichBolt; 11 | import backtype.storm.tuple.Fields; 12 | import backtype.storm.tuple.Tuple; 13 | import backtype.storm.tuple.Values; 14 | 15 | /** 16 | * @author vishnu viswanath 17 | * This class parses the incoming messages and decided which bolt the message has to be passed on to 18 | * There are two cases in this example, first if of solr type and second is of hdfs type. 19 | */ 20 | public class SinkTypeBolt extends BaseRichBolt { 21 | 22 | 23 | private static final long serialVersionUID = 1L; 24 | private OutputCollector collector; 25 | 26 | 27 | public void execute(Tuple tuple) { 28 | String value = tuple.getString(0); 29 | System.out.println("Received in SinkType bolt : "+value); 30 | int index = value.indexOf(" "); 31 | if (index == -1) 32 | return; 33 | String type = value.substring(0,index); 34 | System.out.println("Type : "+type); 35 | value = value.substring(index); 36 | if(type.equals("solr")) { 37 | collector.emit(Topology.SOLR_STREAM,new Values(type,value)); 38 | System.out.println("Emitted : "+value); 39 | } else if (type.equals("hdfs")) { 40 | collector.emit(Topology.HDFS_STREAM,new Values(type,value)); 41 | System.out.println("Emitted : "+value); 42 | } else if (type.equals("mongo")) { 43 | collector.emit(Topology.MONGODB_STREAM,new Values(type,value)); 44 | System.out.println("Emitted : "+value); 45 | } 46 | collector.ack(tuple); 47 | } 48 | 49 | 50 | public void prepare(Map conf, TopologyContext context, OutputCollector collector) { 51 | this.collector = collector; 52 | 53 | } 54 | 55 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 56 | declarer.declareStream(Topology.SOLR_STREAM, new Fields( "sinkType","content" )); 57 | declarer.declareStream(Topology.HDFS_STREAM, new Fields( "sinkType","content" )); 58 | declarer.declareStream(Topology.MONGODB_STREAM, new Fields( "sinkType","content" )); 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /stormkafka/src/main/java/com/vishnu/storm/bolt/SolrBolt.java: -------------------------------------------------------------------------------- 1 | package com.vishnu.storm.bolt; 2 | 3 | import java.util.Map; 4 | 5 | 
--------------------------------------------------------------------------------
/stormkafka/src/main/java/com/vishnu/storm/bolt/SolrBolt.java:
--------------------------------------------------------------------------------
package com.vishnu.storm.bolt;

import java.util.Map;

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.common.SolrInputDocument;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Tuple;

/**
 * @author vishnu viswanath
 * This class is used for ingesting data into SOLR.
 */
public class SolrBolt extends BaseRichBolt {

    private static final long serialVersionUID = 1L;
    private OutputCollector collector;
    SolrClient solrClient;
    String solrAddress;

    /**
     * @param solrAddress url that is used to connect to solr,
     *                    e.g., http://localhost:8983/solr/collection1
     */
    public SolrBolt(String solrAddress) {
        this.solrAddress = solrAddress;
    }

    public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
        this.solrClient = new HttpSolrClient(solrAddress);
    }

    public void execute(Tuple input) {
        SolrInputDocument document = getSolrInputDocumentForInput(input);
        try {
            solrClient.add(document);
            solrClient.commit();
            collector.ack(input);
        } catch (Exception e) {
            // indexing failed; log the error and fail the tuple so it can be replayed
            e.printStackTrace();
            collector.fail(input);
        }
    }

    /**
     * Converts the tuple into a SOLR document.
     * The input will have the content in the field named "content" (this is set by the SinkTypeBolt).
     * It is assumed that the content will be of the format fieldName1:value1 fieldName2:value2 ...
     * @param input tuple emitted by the SinkTypeBolt
     * @return the SOLR document to be indexed
     */
    public SolrInputDocument getSolrInputDocumentForInput(Tuple input) {
        String content = (String) input.getValueByField("content");
        String[] parts = content.trim().split(" ");
        System.out.println("Received in SOLR bolt " + content);
        SolrInputDocument document = new SolrInputDocument();
        try {
            for (String part : parts) {
                String[] subParts = part.split(":");
                String fieldName = subParts[0];
                String value = subParts[1];
                document.addField(fieldName, value);
            }
        } catch (Exception e) {
            // malformed part; keep whatever fields were parsed so far
            e.printStackTrace();
        }
        return document;
    }

    @Override
    public void cleanup() {
    }

    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // this bolt is a sink; it does not emit any further streams
    }

}
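While testing the topology it is handy to confirm that documents are actually reaching Solr. The snippet below is a quick standalone check using the same SolrJ client, assuming Solr runs at the example address from the javadoc above; it is not part of the repository.

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.QueryResponse;

/** Standalone sanity check: counts the documents currently in the collection. */
public class SolrCountCheck {
    public static void main(String[] args) throws Exception {
        // assumption: same collection URL as in the SolrBolt javadoc example
        SolrClient client = new HttpSolrClient("http://localhost:8983/solr/collection1");
        QueryResponse response = client.query(new SolrQuery("*:*"));
        System.out.println("documents in collection: " + response.getResults().getNumFound());
        client.close();
    }
}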
--------------------------------------------------------------------------------
/stormkafka/src/main/java/com/vishnu/storm/spout/SpoutBuilder.java:
--------------------------------------------------------------------------------
package com.vishnu.storm.spout;

import storm.kafka.BrokerHosts;
import storm.kafka.KafkaSpout;
import storm.kafka.SpoutConfig;
import storm.kafka.StringScheme;
import storm.kafka.ZkHosts;

import java.util.Properties;

import com.vishnu.storm.Keys;

import backtype.storm.spout.SchemeAsMultiScheme;

/**
 * @author vishnu viswanath
 * Builds the Kafka spout that feeds the topology, using the property names defined in Keys.
 */
public class SpoutBuilder {

    public Properties configs = null;

    public SpoutBuilder(Properties configs) {
        this.configs = configs;
    }

    public KafkaSpout buildKafkaSpout() {
        BrokerHosts hosts = new ZkHosts(configs.getProperty(Keys.KAFKA_ZOOKEEPER));
        String topic = configs.getProperty(Keys.KAFKA_TOPIC);
        String zkRoot = configs.getProperty(Keys.KAFKA_ZKROOT);
        String groupId = configs.getProperty(Keys.KAFKA_CONSUMERGROUP);
        SpoutConfig spoutConfig = new SpoutConfig(hosts, topic, zkRoot, groupId);
        // read the Kafka payload as plain strings
        spoutConfig.scheme = new SchemeAsMultiScheme(new StringScheme());
        KafkaSpout kafkaSpout = new KafkaSpout(spoutConfig);
        return kafkaSpout;
    }
}
--------------------------------------------------------------------------------
/stormkafka/src/main/java/default_config.properties:
--------------------------------------------------------------------------------
topology=storm-kafka-topology

kafka-spout=kafka-spout
kafka.zookeeper=localhost:2181
kafa.topic=incoming
kafka.zkRoot=/kafka
kafka.consumer.group=sample_group
kafkaspout.count=1

sink-type-bolt=sink-type
sinkbolt.count=1

solr-bolt=solr-bolt
solrbolt.count=1
solr.collection=collection1
solr.url=http://localhost:8983/solr/
solr.zookeeper.hosts=localhost:2181

hdfs-bolt=hdfs-bolt
hdfsbolt.count=1
hdfs.folder=/from_storm
hdfs.port=9000
hdfs.host=localhost

mongodb.host=localhost
mongodb.port=27017
mongodb.database=storm
mongodb.collection=collection1
mongodb.bolt.id=mongodb-bolt
mongodbbolt.count=1
--------------------------------------------------------------------------------
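The Topology main class and the Keys constants that connect this properties file to SpoutBuilder are not part of this excerpt. As a sketch only, a local test run could look roughly like the following; the resource path, component ids and the use of LocalCluster are assumptions, not the repository's actual entry point.

import java.io.InputStream;
import java.util.Properties;

import com.vishnu.storm.bolt.SinkTypeBolt;
import com.vishnu.storm.spout.SpoutBuilder;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.topology.TopologyBuilder;
import storm.kafka.KafkaSpout;

/** Hypothetical local runner; ids and the property-loading path are assumptions. */
public class LocalRunnerSketch {

    public static void main(String[] args) throws Exception {
        // load the defaults shown above (assumes the file is on the classpath root)
        Properties configs = new Properties();
        try (InputStream in = LocalRunnerSketch.class.getResourceAsStream("/default_config.properties")) {
            configs.load(in);
        }

        // build the Kafka spout from the zookeeper/topic/zkRoot/consumer-group properties
        KafkaSpout kafkaSpout = new SpoutBuilder(configs).buildKafkaSpout();

        TopologyBuilder builder = new TopologyBuilder();
        builder.setSpout("kafka-spout", kafkaSpout, 1);
        builder.setBolt("sink-type", new SinkTypeBolt(), 1).shuffleGrouping("kafka-spout");
        // ...attach the Solr, HDFS and MongoDB bolts to their streams as sketched earlier...

        // run everything in-process for a quick test
        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology(configs.getProperty("topology"), new Config(), builder.createTopology());
    }
}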