├── .gitignore
├── README.md
├── first-example
├── pom.xml
└── src
│ ├── main
│ └── java
│ │ └── org
│ │ └── sparkexample
│ │ └── WordCountTask.java
│ └── test
│ ├── java
│ └── WordCountTaskTest.java
│ └── resources
│ └── loremipsum.txt
└── streaming-twitter-custom-receiver
├── pom.xml
└── src
├── main
├── java
│ └── org
│ │ └── sparkexample
│ │ ├── TwitterReceiver.java
│ │ └── TwitterStreamTask.java
└── resources
│ └── twitter4j.properties
└── test
└── java
└── TwitterStreamTaskTest.java
/.gitignore:
--------------------------------------------------------------------------------
1 | **/.idea/
2 |
3 | **/build/
4 | **/target/
5 |
6 | **/*.iml
7 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | spark-examples
2 | ==============
3 |
4 | ## Content
5 |
6 | To run a Java example, go to its project folder and run `mvn test`. Java 8 or above is required.
7 |
8 | - **first-example**: Performs a simple word count task. Spark, Maven, Java.
9 | - **streaming-twitter-custom-receiver**: A custom receiver that collects a stream of Twitter messages. Update `src/main/resources/twitter4j.properties` with your credentials before running it. Spark streaming, Maven, Java, Kryo serializer, Twitter custom receiver.
10 |
11 |
12 | ## More information
13 | For more tutorials and examples about Apache Spark visit [http://www.robertomarchetto.com](http://www.robertomarchetto.com/category/apache_spark)
14 |
--------------------------------------------------------------------------------
/first-example/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | org.sparkexamples
8 | first-example
9 | 1.0-SNAPSHOT
10 |
11 |
12 | 1.8
13 | 1.8
14 |
15 |
16 |
17 |
18 |
19 | org.apache.spark
20 | spark-core_2.11
21 | 2.1.0
22 |
23 |
24 |
25 |
26 | org.slf4j
27 | slf4j-log4j12
28 | 1.7.22
29 |
30 |
31 |
32 |
33 | junit
34 | junit
35 | 4.12
36 | test
37 |
38 |
39 |
40 |
--------------------------------------------------------------------------------
/first-example/src/main/java/org/sparkexample/WordCountTask.java:
--------------------------------------------------------------------------------
1 | package org.sparkexample;
2 |
3 | import org.apache.spark.SparkConf;
4 | import org.apache.spark.api.java.JavaSparkContext;
5 | import org.slf4j.Logger;
6 | import org.slf4j.LoggerFactory;
7 | import scala.Tuple2;
8 |
9 | import java.util.Arrays;
10 |
11 | import static com.google.common.base.Preconditions.checkArgument;
12 |
13 | /**
14 | * WordCountTask class, we will call this class with the test WordCountTest.
15 | */
16 | public class WordCountTask {
17 | /**
18 | * We use a logger to print the output. Sl4j is a common library which works with log4j, the
19 | * logging system used by Apache Spark.
20 | */
21 | private static final Logger LOGGER = LoggerFactory.getLogger(WordCountTask.class);
22 |
23 | /**
24 | * This is the entry point when the task is called from command line with spark-submit.sh.
25 | * See {@see http://spark.apache.org/docs/latest/submitting-applications.html}
26 | */
27 | public static void main(String[] args) {
28 | checkArgument(args.length > 0, "Please provide the path of input file as first parameter.");
29 | new WordCountTask().run(args[0]);
30 | }
31 |
32 | /**
33 | * The task body
34 | */
35 | public void run(String inputFilePath) {
36 | /*
37 | * This is the address of the Spark cluster. We will call the task from WordCountTest and we
38 | * use a local standalone cluster. [*] means use all the cores available.
39 | * See {@see http://spark.apache.org/docs/latest/submitting-applications.html#master-urls}.
40 | */
41 | String master = "local[*]";
42 |
43 | /*
44 | * Initialises a Spark context.
45 | */
46 | SparkConf conf = new SparkConf()
47 | .setAppName(WordCountTask.class.getName())
48 | .setMaster(master);
49 | JavaSparkContext context = new JavaSparkContext(conf);
50 |
51 | /*
52 | * Performs a work count sequence of tasks and prints the output with a logger.
53 | */
54 | context.textFile(inputFilePath)
55 | .flatMap(text -> Arrays.asList(text.split(" ")).iterator())
56 | .mapToPair(word -> new Tuple2<>(word, 1))
57 | .reduceByKey((a, b) -> a + b)
58 | .foreach(result -> LOGGER.info(
59 | String.format("Word [%s] count [%d].", result._1(), result._2)));
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/first-example/src/test/java/WordCountTaskTest.java:
--------------------------------------------------------------------------------
1 | import org.junit.Test;
2 | import org.sparkexample.WordCountTask;
3 |
4 | import java.net.URISyntaxException;
5 |
6 | public class WordCountTaskTest {
7 | @Test
8 | public void test() throws URISyntaxException {
9 | String inputFile = getClass().getResource("loremipsum.txt").toURI().toString();
10 | new WordCountTask().run(inputFile);
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/first-example/src/test/resources/loremipsum.txt:
--------------------------------------------------------------------------------
1 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus nec egestas tellus. Nunc efficitur nunc nunc. Fusce quis tortor sapien. Cras finibus nisl eu eros tincidunt, eget laoreet velit porta. Morbi pellentesque volutpat mollis. Quisque maximus tellus ut magna vulputate, at pharetra turpis ultricies. Donec eu quam justo. Suspendisse sit amet sollicitudin orci. Vivamus pulvinar sem in risus pulvinar dignissim. Nulla sit amet laoreet eros. Nullam sit amet erat dignissim, vulputate sapien at, tincidunt enim. Etiam nunc neque, condimentum eu dui at, vestibulum ornare odio.
2 | Fusce sed dolor pulvinar, euismod mauris eu, elementum purus. In gravida sollicitudin quam nec ultricies. Aenean vel nisl eget metus lobortis luctus a at erat. Suspendisse ut ipsum quam. Mauris id justo non ligula aliquam tristique. Phasellus volutpat quam at neque fringilla, sed condimentum diam maximus. Proin ut quam aliquet, convallis elit at, dignissim sem. Nam eu arcu purus.
3 | Cras et ligula ac mauris fringilla semper. Mauris interdum magna rhoncus pretium varius. Nulla fermentum est erat, eu interdum erat sodales nec. Quisque ornare suscipit eros, at tempus diam dapibus tristique. Morbi malesuada nibh ac justo faucibus volutpat. Curabitur nec lacus non neque euismod pharetra. Suspendisse odio ipsum, sodales vitae sapien ut, porta feugiat enim. Aliquam erat volutpat. Fusce elementum posuere dolor id auctor. Donec in ante pulvinar, malesuada purus non, tincidunt dui. Maecenas mollis in augue vitae vulputate. Donec condimentum fringilla auctor.
4 | Aenean efficitur metus justo, posuere placerat urna efficitur eget. Nullam et est eu nibh dapibus fringilla. Praesent lobortis tincidunt odio, nec dapibus odio faucibus sit amet. In faucibus, magna eu tincidunt consequat, velit risus bibendum ligula, nec aliquam nisl dui sodales ligula. Integer at dapibus metus, id pellentesque mauris. Vivamus eleifend nisi id mollis dapibus. Donec ut ex sed mauris consectetur feugiat. Quisque viverra quam purus, eu ornare massa iaculis vitae. Praesent fringilla dui nec arcu feugiat, ac posuere dui ullamcorper. Suspendisse nec velit a ipsum euismod malesuada eu non nibh. Mauris aliquam quis quam sit amet condimentum.
5 | Donec a sem dapibus, pretium elit at, fermentum dui. Etiam arcu ex, imperdiet tempor ex a,
6 | convallis condimentum erat. Aliquam ullamcorper ultricies eros, vitae cursus ligula viverra in. Quisque et viverra sem, eget vehicula metus. Nam rutrum leo quam, a vestibulum diam auctor at. Integer diam leo, consectetur eget rhoncus ac, facilisis sit amet tellus. Duis mattis placerat vulputate. Nunc eu aliquet tellus, in varius erat. Pellentesque elementum cursus dolor, condimentum consectetur enim sagittis ac. Donec vehicula ut mauris non porttitor. Vivamus rutrum nunc et egestas vulputate. Proin nec tempor velit. Aliquam eget augue mollis, cursus arcu sed, tincidunt nulla. Aenean feugiat arcu eu mauris cursus gravida.#
7 |
--------------------------------------------------------------------------------
/streaming-twitter-custom-receiver/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | org.sparkexamples
8 | streaming-twitter-custom-receiver
9 | 1.0-SNAPSHOT
10 |
11 |
12 | 1.8
13 | 1.8
14 |
15 |
16 |
17 |
18 |
19 | org.apache.spark
20 | spark-streaming_2.11
21 | 2.1.0
22 |
23 |
24 |
25 |
26 | org.twitter4j
27 | twitter4j-stream
28 | 4.0.6
29 |
30 |
31 |
32 |
33 | org.slf4j
34 | slf4j-log4j12
35 | 1.7.22
36 |
37 |
38 |
39 |
40 | junit
41 | junit
42 | 4.12
43 | test
44 |
45 |
46 |
47 |
--------------------------------------------------------------------------------
/streaming-twitter-custom-receiver/src/main/java/org/sparkexample/TwitterReceiver.java:
--------------------------------------------------------------------------------
1 | package org.sparkexample;
2 |
3 | import org.apache.spark.storage.StorageLevel;
4 | import org.apache.spark.streaming.receiver.Receiver;
5 | import org.slf4j.Logger;
6 | import org.slf4j.LoggerFactory;
7 | import twitter4j.FilterQuery;
8 | import twitter4j.StallWarning;
9 | import twitter4j.Status;
10 | import twitter4j.StatusDeletionNotice;
11 | import twitter4j.StatusListener;
12 | import twitter4j.TwitterStream;
13 | import twitter4j.TwitterStreamFactory;
14 |
15 | import static com.google.common.base.Preconditions.checkArgument;
16 |
17 | /**
18 | * Twitter stream receiver. Make sure resources/twitter4j.properties contains your Twitter
19 | * authentication values. {@see https://apps.twitter.com}.
20 | *
21 | * This receiver tracks the status messages containing the keyword twitter.
22 | */
23 | public final class TwitterReceiver extends Receiver {
24 | private static final Logger LOGGER = LoggerFactory.getLogger(TwitterReceiver.class);
25 |
26 | /**
27 | * The keywords to be tracked.
28 | */
29 | private static final String KEYWORDS = "twitter";
30 |
31 | private final TwitterStream twitterStream;
32 |
33 | private StatusListener listener;
34 |
35 | public TwitterReceiver(StorageLevel storageLevel) {
36 | super(storageLevel);
37 | checkArgument(StorageLevel.MEMORY_ONLY().equals(storageLevel),
38 | String.format("Only [%s] supported.", StorageLevel.MEMORY_ONLY().toString()));
39 | twitterStream = new TwitterStreamFactory().getInstance();
40 | }
41 |
42 | @Override
43 | public void onStart() {
44 | if (listener == null) {
45 | listener = new StreamListener();
46 | }
47 | twitterStream.addListener(listener);
48 | twitterStream.filter(createFilter());
49 | }
50 |
51 | private FilterQuery createFilter() {
52 | FilterQuery filterQuery = new FilterQuery();
53 | try {
54 | filterQuery.track(KEYWORDS);
55 | } catch (Exception e) {
56 | LOGGER.error(e.getMessage(), e);
57 | throw new IllegalArgumentException(e);
58 | }
59 | return filterQuery;
60 | }
61 |
62 | @Override
63 | public void onStop() {
64 | twitterStream.clearListeners();
65 | twitterStream.cleanUp();
66 | listener = null;
67 | }
68 |
69 | private class StreamListener implements StatusListener {
70 | @Override
71 | public void onStatus(Status status) {
72 | store(status);
73 | }
74 |
75 | @Override
76 | public void onDeletionNotice(StatusDeletionNotice statusDeletionNotice) {
77 | // Intentionally empty.
78 | }
79 |
80 | @Override
81 | public void onTrackLimitationNotice(int numberOfLimitedStatuses) {
82 | // Intentionally empty.
83 | }
84 |
85 | @Override
86 | public void onScrubGeo(long userId, long upToStatusId) {
87 | // Intentionally empty.
88 | }
89 |
90 | @Override
91 | public void onStallWarning(StallWarning warning) {
92 | // Intentionally empty.
93 | }
94 |
95 | @Override
96 | public void onException(Exception ex) {
97 | LOGGER.warn(ex.getMessage(), ex);
98 | }
99 | }
100 | }
101 |
--------------------------------------------------------------------------------
/streaming-twitter-custom-receiver/src/main/java/org/sparkexample/TwitterStreamTask.java:
--------------------------------------------------------------------------------
1 | package org.sparkexample;
2 |
3 | import com.google.common.collect.ImmutableList;
4 | import org.apache.spark.SparkConf;
5 | import org.apache.spark.storage.StorageLevel;
6 | import org.apache.spark.streaming.Durations;
7 | import org.apache.spark.streaming.api.java.JavaStreamingContext;
8 | import org.slf4j.Logger;
9 | import org.slf4j.LoggerFactory;
10 | import twitter4j.GeoLocation;
11 | import twitter4j.Status;
12 | import twitter4j.User;
13 |
14 | /**
15 | * Twitter stream example.
16 |
17 | * Make sure resources/twitter4j.properties contains your Twitter authentication values.
18 | * {@see https://apps.twitter.com}
19 | */
20 | public class TwitterStreamTask {
21 | /**
22 | * Kryo serializer offers much better performance than the default serializer.
23 | * {@see https://spark.apache.org/docs/latest/tuning.html#data-serialization}
24 | */
25 | private static final Class[] KRYO_CLASSES = ImmutableList.builder()
26 | .add(GeoLocation.class)
27 | .add(Status.class)
28 | .add(User.class)
29 | .build()
30 | .toArray(new Class[] {});
31 |
32 | /**
33 | * We use a logger to print the output. Sl4j is a common library which works with log4j, the
34 | * logging system used by Apache Spark.
35 | */
36 | private static final Logger LOGGER = LoggerFactory.getLogger(TwitterStreamTask.class);
37 |
38 | /**
39 | * This is the entry point function when the task is called with spark-submit.sh from command
40 | * line. In our example we will call the task from a WordCountTest instead.
41 | * See {@see http://spark.apache.org/docs/latest/submitting-applications.html}
42 | */
43 | public static void main(String args[]) throws InterruptedException {
44 | new TwitterStreamTask().run();
45 | }
46 |
47 | public void run() throws InterruptedException {
48 | /*
49 | * Creates a Spark local cluster with Kryo serialization enabled.
50 | */
51 | SparkConf conf = new SparkConf().setMaster("local[*]")
52 | .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
53 | .registerKryoClasses(KRYO_CLASSES)
54 | .setAppName("sparkTask");
55 |
56 | /*
57 | * Starts a streaming context with a windowing (micro batch) of 10 seconds.
58 | */
59 | JavaStreamingContext streamingContext = new JavaStreamingContext(conf, Durations.seconds(10));
60 |
61 | /*
62 | * Uses the custom Twitter receiver. For every micro batch prints the collected messages.
63 | *
64 | * coalesce(10) filters out empty micro batches by reducing the partitions.
65 | *
66 | * Make sure resources/twitter4j.properties contains your Twitter authentication values.
67 | * {@see https://apps.twitter.com}.
68 | */
69 | streamingContext.receiverStream(new TwitterReceiver(StorageLevel.MEMORY_ONLY()))
70 | .foreachRDD(
71 | rdd -> rdd.coalesce(10)
72 | .foreach(message -> LOGGER.info(message.getText())));
73 |
74 | /*
75 | * Starts the streaming task.
76 | */
77 | streamingContext.start();
78 | streamingContext.awaitTermination();
79 | }
80 | }
81 |
--------------------------------------------------------------------------------
/streaming-twitter-custom-receiver/src/main/resources/twitter4j.properties:
--------------------------------------------------------------------------------
1 | debug=false
2 | oauth.consumerKey=yourConsumerKeyHere
3 | oauth.consumerSecret=yourConsumerSecretHere
4 | oauth.accessToken=yourAccessTokenHere
5 | oauth.accessTokenSecret=yourAccessTokenSecretHere
6 |
--------------------------------------------------------------------------------
/streaming-twitter-custom-receiver/src/test/java/TwitterStreamTaskTest.java:
--------------------------------------------------------------------------------
1 | import org.junit.Test;
2 | import org.sparkexample.TwitterStreamTask;
3 |
4 | public class TwitterStreamTaskTest {
5 | @Test
6 | public void test() throws InterruptedException {
7 | new TwitterStreamTask().run();
8 | }
9 | }
10 |
--------------------------------------------------------------------------------