├── .gitignore
├── LICENSE
├── README.md
├── pyspark.submit.sh
├── quiet_logs.py
├── setup.sh
├── spark-streaming-foreachRDD-and-foreach.py
├── spark-streaming-foreachRDD-and-foreachPartition.py
├── spark-streaming-reading-files-from-a-folder.py
└── spark-streaming-reading-from-a-tcp-port.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/
jars/*
checkpoint/*
streamingData/*
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 Daniel Santana

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Spark-Streaming-Examples
Spark Streaming examples using Python

## Getting the code

    mkdir SOME_DIRECTORY
    cd SOME_DIRECTORY
    git clone https://github.com/danielsan/Spark-Streaming-Examples.git

## Preparing everything
The `setup.sh` script assumes that you already have curl, Python, MongoDB and Java installed on your system.

Before running the [setup.sh](setup.sh) script, I recommend reading its source code so you understand what it is going to do on your computer.

**This will take several minutes**

    cd Spark-Streaming-Examples && $SHELL ./setup.sh

## Running the examples with Spark

Assuming your working directory (the current directory in your shell) is `Spark-Streaming-Examples`, you can run the examples as follows:

    ./pyspark.submit.sh PYTHON_FILE.py

To run `spark-streaming-reading-files-from-a-folder.py`, for example, you can just do this:

    ./pyspark.submit.sh spark-streaming-reading-files-from-a-folder.py
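## Feeding data to the examples

The socket-based examples (`spark-streaming-reading-from-a-tcp-port.py` and the two `foreachRDD` scripts) connect to `localhost:9998`, so something must already be listening on that port when you submit them. The simplest option is Netcat, started in another terminal:

    nc -lk 9998

Every line you type there becomes input for the word count. The `foreachRDD` scripts additionally write their counts to a local MongoDB instance through `pymongo`, so both need to be available. The folder-based example watches the `./streamingData` directory instead; drop text files into it while the job is running (a small helper sketch is shown right after that example below).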
--------------------------------------------------------------------------------
/pyspark.submit.sh:
--------------------------------------------------------------------------------
#!/bin/bash
SPARK_SUBMIT=../spark.git/bin/spark-submit
JARS_DIR=jars

MONGO_DRIVER_JAR=$(ls $JARS_DIR/mongo-java-driver*jar)
HADOOP_JAR=$(ls $JARS_DIR/mongo-hadoop-core*.jar)
SPARK_JAR=$(ls $JARS_DIR/mongo-hadoop-spark*jar)

JARS=$MONGO_DRIVER_JAR,$HADOOP_JAR,$SPARK_JAR

# Use the file given as the first argument; otherwise derive the .py name
# from this script's own name (pyspark.submit.sh -> pyspark.py).
PYFILE=$(ls "$1" 2>/dev/null || echo $0 | sed 's/submit.sh/py/')

$SPARK_SUBMIT \
  --driver-class-path $SPARK_JAR \
  --jars $JARS \
  $PYFILE
--------------------------------------------------------------------------------
/quiet_logs.py:
--------------------------------------------------------------------------------
def quiet_logs(sc):
    """Silence the very chatty INFO logging that Spark produces by default."""
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
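
# quiet_logs() reaches into the JVM's log4j classes through the private
# sc._jvm py4j gateway. A minimal sketch of an alternative that uses only the
# public PySpark API (assuming Spark 1.4 or later, where setLogLevel exists):

def quiet_logs_public(sc):
    # Same effect for these examples: keep only ERROR-level output.
    sc.setLogLevel("ERROR")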
--------------------------------------------------------------------------------
/setup.sh:
--------------------------------------------------------------------------------
#!/bin/bash
export PROJECT_GIT_DIR=$(pwd)
echo "PROJECT_GIT_DIR=$PROJECT_GIT_DIR"
export BASE_DIR=$(dirname $PROJECT_GIT_DIR)
echo "BASE_DIR=$BASE_DIR"
export JARS_DIR="${PROJECT_GIT_DIR}/jars"
echo "JARS_DIR=$JARS_DIR"

# Succeeds when the current directory is a usable git working tree.
is_a_git_repo(){
  git status &> /dev/null
}

git_remote_url(){
  echo $(git remote -v | head -1 | grep -Po 'git://.+.git')
}

clone_git_repo(){
  local GITHUB_URL=$1
  local GIT_DIR=$2
  time git clone $GITHUB_URL $GIT_DIR
}

backup_dir(){
  local DIR=$1
  mv $DIR "${DIR}_$(date +'%Y%m%d_%H%M%S')"
}

cd $BASE_DIR

# getting spark from its git repo
echo "##### SPARK phase"
export SPARK_GIT_DIR="$BASE_DIR/spark.git"
export SPARK_GITHUB_URL='git://github.com/apache/spark.git'

build_spark_instance(){
  echo "##### Building SPARK instance"
  cd $SPARK_GIT_DIR
  time build/mvn -DskipTests clean package
  echo
}

clone_spark(){
  echo "##### Cloning SPARK"
  clone_git_repo $SPARK_GITHUB_URL $SPARK_GIT_DIR
  build_spark_instance
}

move_curr_dir_and_clone_spark(){
  backup_dir $SPARK_GIT_DIR
  clone_spark
}

if [ ! -d $SPARK_GIT_DIR ] ; then
  # time git clone $SPARK_GITHUB_URL $SPARK_GIT_DIR
  clone_spark
else
  cd $SPARK_GIT_DIR
  if ! is_a_git_repo ; then
    # mv $SPARK_GIT_DIR "${SPARK_GIT_DIR}_$(date +'%Y%m%d_%H%M%S')"
    # time git clone $SPARK_GITHUB_URL $SPARK_GIT_DIR
    build_spark_instance
  else
    if [ $SPARK_GITHUB_URL != $(git_remote_url) ]; then
      move_curr_dir_and_clone_spark
    else
      git checkout master
      git pull origin master
    fi
  fi
fi


# getting mongodb-hadoop from its git repo
echo "######## MongoDB Hadoop Connector phase"
export HADOOPCONN_GIT_DIR=$BASE_DIR/mongodb-hadoop.git
export HADOOPCONN_GITHUB_URL=git://github.com/mongodb/mongo-hadoop.git

build_hadoop_conn_instance(){
  echo "##### Building MongoDB Hadoop Connector JARS"
  cd $HADOOPCONN_GIT_DIR
  time ./gradlew clean jar
  echo
}

patch_hadoop_conn(){
  echo "##### Patching MongoDB Hadoop Connector Project due to a bug"
  patch $HADOOPCONN_GIT_DIR/core/src/main/java/com/mongodb/hadoop/util/MongoConfigUtil.java \
    $PROJECT_GIT_DIR/com_mongodb_hadoop_util_MongoConfigUtil.java.patch
  echo
}

clone_hadoop_conn(){
  echo "##### Cloning MongoDB Hadoop Connector Project"
  clone_git_repo $HADOOPCONN_GITHUB_URL $HADOOPCONN_GIT_DIR
  build_hadoop_conn_instance
}

move_curr_dir_and_clone_hadoop_conn(){
  backup_dir $HADOOPCONN_GIT_DIR
  clone_hadoop_conn
}

if [ ! -d $HADOOPCONN_GIT_DIR ] ; then
  # time git clone $HADOOPCONN_GITHUB_URL $HADOOPCONN_GIT_DIR
  clone_hadoop_conn
else
  cd $HADOOPCONN_GIT_DIR
  if ! is_a_git_repo ; then
    # mv $HADOOPCONN_GIT_DIR "${HADOOPCONN_GIT_DIR}_$(date +'%Y%m%d_%H%M%S')"
    # time git clone $HADOOPCONN_GITHUB_URL $HADOOPCONN_GIT_DIR
    build_hadoop_conn_instance
  else
    if [ $HADOOPCONN_GITHUB_URL != $(git_remote_url) ]; then
      move_curr_dir_and_clone_hadoop_conn
    else
      git checkout master
      git pull origin master
    fi
  fi
fi

# getting the mongo java driver from its git repo
echo "##### JAVA_DRIVER phase"
export JAVA_DRIVER_GIT_DIR="$BASE_DIR/mongo-java-driver.git"
export JAVA_DRIVER_GITHUB_URL='git://github.com/mongodb/mongo-java-driver.git'

build_java_driver_instance(){
  echo "##### Building JAVA_DRIVER jars"
  cd $JAVA_DRIVER_GIT_DIR
  time ./gradlew clean jar
  echo
}

clone_java_driver(){
  echo "##### Cloning JAVA_DRIVER"
  clone_git_repo $JAVA_DRIVER_GITHUB_URL $JAVA_DRIVER_GIT_DIR
  build_java_driver_instance
}

move_curr_dir_and_clone_java_driver(){
  backup_dir $JAVA_DRIVER_GIT_DIR
  clone_java_driver
}

if [ ! -d $JAVA_DRIVER_GIT_DIR ] ; then
  # time git clone $JAVA_DRIVER_GITHUB_URL $JAVA_DRIVER_GIT_DIR
  clone_java_driver
else
  cd $JAVA_DRIVER_GIT_DIR
  if ! is_a_git_repo ; then
    # mv $JAVA_DRIVER_GIT_DIR "${JAVA_DRIVER_GIT_DIR}_$(date +'%Y%m%d_%H%M%S')"
    # time git clone $JAVA_DRIVER_GITHUB_URL $JAVA_DRIVER_GIT_DIR
    build_java_driver_instance
  else
    if [ $JAVA_DRIVER_GITHUB_URL != $(git_remote_url) ]; then
      move_curr_dir_and_clone_java_driver
    else
      git checkout master
      git pull origin master
    fi
  fi
fi


# Dependency jars
echo "##### Copying the required JAR files to the jars directory"
test -d "$JARS_DIR" || mkdir $JARS_DIR

cp -a --no-clobber $(find $HADOOPCONN_GIT_DIR -name mongo-hadoop-core\*.jar ) $JARS_DIR
cp -a --no-clobber $(find $HADOOPCONN_GIT_DIR -name mongo-hadoop-spark\*.jar) $JARS_DIR
cp -a --no-clobber $(find $JAVA_DRIVER_GIT_DIR -name mongo-java-driver\*.jar) $JARS_DIR
# time curl https://oss.sonatype.org/content/repositories/releases/org/mongodb/mongodb-driver/3.1.1/mongodb-driver-3.1.1.jar > jars/mongodb-driver-3.1.1.jar
echo


echo "##### Installing some Python dependencies"
time sudo easy_install pytz
echo

echo "##### Done!"
--------------------------------------------------------------------------------
/spark-streaming-foreachRDD-and-foreach.py:
--------------------------------------------------------------------------------
"""
To run
    ./pyspark.submit.sh spark-streaming-foreachRDD-and-foreach.py
"""

from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

from pymongo import MongoClient

from quiet_logs import quiet_logs

# Documentation
# http://spark.apache.org/docs/latest/streaming-programming-guide.html#design-patterns-for-using-foreachrdd
def sendRecord(tup):
    word = tup[0]
    amount = tup[1]

    # A connection is opened and closed for every single record, which is the
    # costly pattern the linked documentation warns about; the foreachPartition
    # example improves on it.
    connection = MongoClient()
    test_db = connection.get_database('test')
    wordcount_coll = test_db.get_collection('wordcount_coll')
    # update_one() replaces the Collection.update() call deprecated in pymongo 3
    wordcount_coll.update_one({"_id": word}, {"$inc": {"count": amount}}, upsert=True)
    connection.close()


if __name__ == "__main__":
    conf = SparkConf().setAppName("using foreachRDD and foreach on RDD")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 2)
    ssc.checkpoint("checkpoint")

    quiet_logs(sc)

    # Create a DStream that will connect to hostname:port, in this case localhost:9998
    lines = ssc.socketTextStream("localhost", 9998)
    # lines = ssc.textFileStream('./streamingData')

    # Split each line into words
    words = lines.flatMap(lambda line: line.split(" "))

    # Count each word in each batch
    pairs = words.map(lambda word: (word, 1))

    wordCounts = pairs.reduceByKey(lambda x, y: x + y)

    # http://spark.apache.org/docs/latest/streaming-programming-guide.html#design-patterns-for-using-foreachrdd
    wordCounts.foreachRDD(lambda rdd: rdd.foreach(sendRecord))

    # Print the first ten elements of each RDD generated in this DStream to the console
    wordCounts.pprint()

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
--------------------------------------------------------------------------------
/spark-streaming-foreachRDD-and-foreachPartition.py:
--------------------------------------------------------------------------------
"""
To run
    ./pyspark.submit.sh spark-streaming-foreachRDD-and-foreachPartition.py
"""

from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

from pymongo import MongoClient

from quiet_logs import quiet_logs

# Documentation
# http://spark.apache.org/docs/latest/streaming-programming-guide.html#design-patterns-for-using-foreachrdd
def sendPartition(partition):
    # One connection per partition instead of one per record
    connection = MongoClient()
    test_db = connection.get_database('test')
    wordcount_coll = test_db.get_collection('wordcount_coll')

    for tup in partition:
        word = tup[0]
        amount = tup[1]
        wordcount_coll.update_one({"_id": word}, {"$inc": {"count": amount}}, upsert=True)

    connection.close()

if __name__ == "__main__":
    conf = SparkConf().setAppName("using foreachRDD and foreachPartition on RDD")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 2)
    ssc.checkpoint("checkpoint")

    quiet_logs(sc)

    # Create a DStream that will connect to hostname:port, in this case localhost:9998
    lines = ssc.socketTextStream('localhost', 9998)
    # lines = ssc.textFileStream('./streamingData')

    # Split each line into words
    words = lines.flatMap(lambda line: line.split(" "))

    # Count each word in each batch
    pairs = words.map(lambda word: (word, 1))

    wordCounts = pairs.reduceByKey(lambda x, y: x + y)

    # http://spark.apache.org/docs/latest/streaming-programming-guide.html#design-patterns-for-using-foreachrdd
    wordCounts.foreachRDD(lambda rdd: rdd.foreachPartition(sendPartition))

    # Print the first ten elements of each RDD generated in this DStream to the console
    wordCounts.pprint()

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
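
# The design-patterns section linked above suggests one further refinement:
# reuse a single connection per executor process across batches instead of
# opening one per partition. A minimal illustrative sketch of that idea, not
# part of the original script, assuming the same 'test.wordcount_coll'
# collection used above:

from pymongo import MongoClient

_client = None

def get_client():
    # Created lazily in each executor process, then reused by every partition
    # that the process handles.
    global _client
    if _client is None:
        _client = MongoClient()
    return _client

def sendPartitionReusingClient(partition):
    coll = get_client().get_database('test').get_collection('wordcount_coll')
    for word, amount in partition:
        coll.update_one({"_id": word}, {"$inc": {"count": amount}}, upsert=True)
    # The client is intentionally left open so later batches can reuse it.

# Usage would mirror the original:
#   wordCounts.foreachRDD(lambda rdd: rdd.foreachPartition(sendPartitionReusingClient))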
--------------------------------------------------------------------------------
/spark-streaming-reading-files-from-a-folder.py:
--------------------------------------------------------------------------------
"""
To run
    ./pyspark.submit.sh spark-streaming-reading-files-from-a-folder.py
"""

from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

from quiet_logs import quiet_logs

if __name__ == "__main__":
    conf = SparkConf().setAppName("Reading files from a directory")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 2)

    quiet_logs(sc)

    # Watch the ./streamingData directory for new text files
    lines = ssc.textFileStream('./streamingData')

    # Split each line into words
    words = lines.flatMap(lambda line: line.split(" "))

    # Count each word in each batch
    pairs = words.map(lambda word: (word, 1))

    wordCounts = pairs.reduceByKey(lambda x, y: x + y)

    # Print the first ten elements of each RDD generated in this DStream to the console
    wordCounts.pprint()

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
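
# textFileStream only picks up files that appear in the watched directory
# after the stream has started, and the Spark documentation recommends moving
# (renaming) files into place atomically so a half-written file is never read.
# A small illustrative feeder, a hypothetical helper referred to from the
# README's "Feeding data to the examples" section and not part of the repo:

import os
import time

def feed_streaming_data(text, data_dir='./streamingData'):
    # Write to a temporary name first, then rename the finished file into place.
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)
    stamp = int(time.time() * 1000)
    tmp_path = 'words_%d.txt.tmp' % stamp
    with open(tmp_path, 'w') as f:
        f.write(text + '\n')
    os.rename(tmp_path, os.path.join(data_dir, 'words_%d.txt' % stamp))

# Example, run from another shell while the folder example is running:
#   feed_streaming_data('spark streaming word count')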
--------------------------------------------------------------------------------
/spark-streaming-reading-from-a-tcp-port.py:
--------------------------------------------------------------------------------
"""
Counts words in UTF8 encoded, '\n' delimited text received from the network every second.

To run this on your local machine, you first need a Netcat server listening on the port used below
    `$ nc -lk 9998`

and then run the example
    `$ ./pyspark.submit.sh spark-streaming-reading-from-a-tcp-port.py`
"""

from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

from quiet_logs import quiet_logs

if __name__ == "__main__":
    conf = SparkConf().setAppName("Listening to a tcp port")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)

    quiet_logs(sc)

    # Connect to the Netcat server started above
    lines = ssc.socketTextStream("localhost", 9998)

    # Split each line into words
    words = lines.flatMap(lambda line: line.split(" "))

    # Count each word in each batch
    pairs = words.map(lambda word: (word, 1))

    wordCounts = pairs.reduceByKey(lambda x, y: x + y)

    # Print the first ten elements of each RDD generated in this DStream to the console
    wordCounts.pprint()

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
--------------------------------------------------------------------------------
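
# If Netcat is not available, a minimal Python stand-in for `nc -lk 9998` can
# feed the socket examples above (illustrative sketch, not part of the repo):
# it listens on localhost:9998 and forwards every line typed on stdin to the
# first client that connects (the Spark job connects as a client).

import socket
import sys

server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
server.bind(('localhost', 9998))
server.listen(1)
print('Waiting for the Spark job to connect on localhost:9998 ...')
conn, addr = server.accept()
print('Connected by %s:%d' % addr)
for line in sys.stdin:
    conn.sendall(line.encode('utf-8'))
conn.close()
server.close()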