├── .gitignore
├── LICENSE
├── README.md
├── pyspark.submit.sh
├── quiet_logs.py
├── setup.sh
├── spark-streaming-foreachRDD-and-foreach.py
├── spark-streaming-foreachRDD-and-foreachPartition.py
├── spark-streaming-reading-files-from-a-folder.py
└── spark-streaming-reading-from-a-tcp-port.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/
jars/*
checkpoint/*
streamingData/*
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 Daniel Santana

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Spark-Streaming-Examples
Spark Streaming examples using Python

## Getting the code

    mkdir SOME_DIRECTORY
    cd SOME_DIRECTORY
    git clone https://github.com/danielsan/Spark-Streaming-Examples.git

## Preparing everything
The `setup.sh` script assumes that you already have curl, Python, MongoDB and Java installed on your system.

Before running the [setup.sh](setup.sh) script, I recommend reading its source code so you understand what it is going to do on your computer.

**This will take several minutes**

    cd Spark-Streaming-Examples && $SHELL ./setup.sh

## Running the examples with Spark

Assuming your working directory (the current directory in your shell) is `Spark-Streaming-Examples`, you can run the examples as follows:

    ./pyspark.submit.sh PYTHON_FILE.py

To run `spark-streaming-reading-files-from-a-folder.py`, for example, you can just do this:

    ./pyspark.submit.sh spark-streaming-reading-files-from-a-folder.py
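## Feeding data to the examples

The socket-based examples (`spark-streaming-reading-from-a-tcp-port.py` and the two `foreachRDD` scripts) connect to `localhost:9998`, so something must already be listening on that port when you submit them. The simplest option is Netcat, started in another terminal:

    nc -lk 9998

Every line you type there becomes input for the word count. The `foreachRDD` scripts additionally write their counts to a local MongoDB instance through `pymongo`, so both need to be available. The folder-based example watches the `./streamingData` directory instead; drop text files into it while the job is running (a small helper sketch is shown right after that example below).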
--------------------------------------------------------------------------------
/pyspark.submit.sh:
--------------------------------------------------------------------------------
#!/bin/bash
SPARK_SUBMIT=../spark.git/bin/spark-submit
JARS_DIR=jars

MONGO_DRIVER_JAR=$(ls $JARS_DIR/mongo-java-driver*jar)
HADOOP_JAR=$(ls $JARS_DIR/mongo-hadoop-core*.jar)
SPARK_JAR=$(ls $JARS_DIR/mongo-hadoop-spark*jar)

JARS=$MONGO_DRIVER_JAR,$HADOOP_JAR,$SPARK_JAR

# Use the file given as the first argument; otherwise derive the .py name
# from this script's own name (pyspark.submit.sh -> pyspark.py).
PYFILE=$(ls "$1" 2>/dev/null || echo $0 | sed 's/submit.sh/py/')

$SPARK_SUBMIT \
  --driver-class-path $SPARK_JAR \
  --jars $JARS \
  $PYFILE
--------------------------------------------------------------------------------
/quiet_logs.py:
--------------------------------------------------------------------------------
def quiet_logs(sc):
    """Silence the very chatty INFO logging that Spark produces by default."""
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
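
# quiet_logs() reaches into the JVM's log4j classes through the private
# sc._jvm py4j gateway. A minimal sketch of an alternative that uses only the
# public PySpark API (assuming Spark 1.4 or later, where setLogLevel exists):

def quiet_logs_public(sc):
    # Same effect for these examples: keep only ERROR-level output.
    sc.setLogLevel("ERROR")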
--------------------------------------------------------------------------------
/setup.sh:
--------------------------------------------------------------------------------
#!/bin/bash
export PROJECT_GIT_DIR=$(pwd)
echo "PROJECT_GIT_DIR=$PROJECT_GIT_DIR"
export BASE_DIR=$(dirname $PROJECT_GIT_DIR)
echo "BASE_DIR=$BASE_DIR"
export JARS_DIR="${PROJECT_GIT_DIR}/jars"
echo "JARS_DIR=$JARS_DIR"

# Succeeds when the current directory is a usable git working tree.
is_a_git_repo(){
  git status &> /dev/null
}

git_remote_url(){
  echo $(git remote -v | head -1 | grep -Po 'git://.+.git')
}

clone_git_repo(){
  local GITHUB_URL=$1
  local GIT_DIR=$2
  time git clone $GITHUB_URL $GIT_DIR
}

backup_dir(){
  local DIR=$1
  mv $DIR "${DIR}_$(date +'%Y%m%d_%H%M%S')"
}

cd $BASE_DIR

# getting spark from its git repo
echo "##### SPARK phase"
export SPARK_GIT_DIR="$BASE_DIR/spark.git"
export SPARK_GITHUB_URL='git://github.com/apache/spark.git'

build_spark_instance(){
  echo "##### Building SPARK instance"
  cd $SPARK_GIT_DIR
  time build/mvn -DskipTests clean package
  echo
}

clone_spark(){
  echo "##### Cloning SPARK"
  clone_git_repo $SPARK_GITHUB_URL $SPARK_GIT_DIR
  build_spark_instance
}

move_curr_dir_and_clone_spark(){
  backup_dir $SPARK_GIT_DIR
  clone_spark
}

if [ ! -d $SPARK_GIT_DIR ] ; then
  # time git clone $SPARK_GITHUB_URL $SPARK_GIT_DIR
  clone_spark
else
  cd $SPARK_GIT_DIR
  if ! is_a_git_repo ; then
    # mv $SPARK_GIT_DIR "${SPARK_GIT_DIR}_$(date +'%Y%m%d_%H%M%S')"
    # time git clone $SPARK_GITHUB_URL $SPARK_GIT_DIR
    build_spark_instance
  else
    if [ $SPARK_GITHUB_URL != $(git_remote_url) ]; then
      move_curr_dir_and_clone_spark
    else
      git checkout master
      git pull origin master
    fi
  fi
fi


# getting mongodb-hadoop from its git repo
echo "######## MongoDB Hadoop Connector phase"
export HADOOPCONN_GIT_DIR=$BASE_DIR/mongodb-hadoop.git
export HADOOPCONN_GITHUB_URL=git://github.com/mongodb/mongo-hadoop.git

build_hadoop_conn_instance(){
  echo "##### Building MongoDB Hadoop Connector JARS"
  cd $HADOOPCONN_GIT_DIR
  time ./gradlew clean jar
  echo
}

patch_hadoop_conn(){
  echo "##### Patching MongoDB Hadoop Connector Project due to a bug"
  patch $HADOOPCONN_GIT_DIR/core/src/main/java/com/mongodb/hadoop/util/MongoConfigUtil.java \
    $PROJECT_GIT_DIR/com_mongodb_hadoop_util_MongoConfigUtil.java.patch
  echo
}

clone_hadoop_conn(){
  echo "##### Cloning MongoDB Hadoop Connector Project"
  clone_git_repo $HADOOPCONN_GITHUB_URL $HADOOPCONN_GIT_DIR
  build_hadoop_conn_instance
}

move_curr_dir_and_clone_hadoop_conn(){
  backup_dir $HADOOPCONN_GIT_DIR
  clone_hadoop_conn
}

if [ ! -d $HADOOPCONN_GIT_DIR ] ; then
  # time git clone $HADOOPCONN_GITHUB_URL $HADOOPCONN_GIT_DIR
  clone_hadoop_conn
else
  cd $HADOOPCONN_GIT_DIR
  if ! is_a_git_repo ; then
    # mv $HADOOPCONN_GIT_DIR "${HADOOPCONN_GIT_DIR}_$(date +'%Y%m%d_%H%M%S')"
    # time git clone $HADOOPCONN_GITHUB_URL $HADOOPCONN_GIT_DIR
    build_hadoop_conn_instance
  else
    if [ $HADOOPCONN_GITHUB_URL != $(git_remote_url) ]; then
      move_curr_dir_and_clone_hadoop_conn
    else
      git checkout master
      git pull origin master
    fi
  fi
fi

# getting the mongo java driver from its git repo
echo "##### JAVA_DRIVER phase"
export JAVA_DRIVER_GIT_DIR="$BASE_DIR/mongo-java-driver.git"
export JAVA_DRIVER_GITHUB_URL='git://github.com/mongodb/mongo-java-driver.git'

build_java_driver_instance(){
  echo "##### Building JAVA_DRIVER jars"
  cd $JAVA_DRIVER_GIT_DIR
  time ./gradlew clean jar
  echo
}

clone_java_driver(){
  echo "##### Cloning JAVA_DRIVER"
  clone_git_repo $JAVA_DRIVER_GITHUB_URL $JAVA_DRIVER_GIT_DIR
  build_java_driver_instance
}

move_curr_dir_and_clone_java_driver(){
  backup_dir $JAVA_DRIVER_GIT_DIR
  clone_java_driver
}

if [ ! -d $JAVA_DRIVER_GIT_DIR ] ; then
  # time git clone $JAVA_DRIVER_GITHUB_URL $JAVA_DRIVER_GIT_DIR
  clone_java_driver
else
  cd $JAVA_DRIVER_GIT_DIR
  if ! is_a_git_repo ; then
    # mv $JAVA_DRIVER_GIT_DIR "${JAVA_DRIVER_GIT_DIR}_$(date +'%Y%m%d_%H%M%S')"
    # time git clone $JAVA_DRIVER_GITHUB_URL $JAVA_DRIVER_GIT_DIR
    build_java_driver_instance
  else
    if [ $JAVA_DRIVER_GITHUB_URL != $(git_remote_url) ]; then
      move_curr_dir_and_clone_java_driver
    else
      git checkout master
      git pull origin master
    fi
  fi
fi


# Dependency jars
echo "##### Copying the required JAR files to the jars directory"
test -d "$JARS_DIR" || mkdir $JARS_DIR

cp -a --no-clobber $(find $HADOOPCONN_GIT_DIR -name mongo-hadoop-core\*.jar ) $JARS_DIR
cp -a --no-clobber $(find $HADOOPCONN_GIT_DIR -name mongo-hadoop-spark\*.jar) $JARS_DIR
cp -a --no-clobber $(find $JAVA_DRIVER_GIT_DIR -name mongo-java-driver\*.jar) $JARS_DIR
# time curl https://oss.sonatype.org/content/repositories/releases/org/mongodb/mongodb-driver/3.1.1/mongodb-driver-3.1.1.jar > jars/mongodb-driver-3.1.1.jar
echo


echo "##### Installing some Python dependencies"
time sudo easy_install pytz
echo

echo "##### Done!"
--------------------------------------------------------------------------------
/spark-streaming-foreachRDD-and-foreach.py:
--------------------------------------------------------------------------------
"""
To run
    ./pyspark.submit.sh spark-streaming-foreachRDD-and-foreach.py
"""

from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

from pymongo import MongoClient

from quiet_logs import quiet_logs

# Documentation
# http://spark.apache.org/docs/latest/streaming-programming-guide.html#design-patterns-for-using-foreachrdd
def sendRecord(tup):
    word = tup[0]
    amount = tup[1]

    # A connection is opened and closed for every single record, which is the
    # costly pattern the linked documentation warns about; the foreachPartition
    # example improves on it.
    connection = MongoClient()
    test_db = connection.get_database('test')
    wordcount_coll = test_db.get_collection('wordcount_coll')
    # update_one() replaces the Collection.update() call deprecated in pymongo 3
    wordcount_coll.update_one({"_id": word}, {"$inc": {"count": amount}}, upsert=True)
    connection.close()


if __name__ == "__main__":
    conf = SparkConf().setAppName("using foreachRDD and foreach on RDD")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 2)
    ssc.checkpoint("checkpoint")

    quiet_logs(sc)

    # Create a DStream that will connect to hostname:port, in this case localhost:9998
    lines = ssc.socketTextStream("localhost", 9998)
    # lines = ssc.textFileStream('./streamingData')

    # Split each line into words
    words = lines.flatMap(lambda line: line.split(" "))

    # Count each word in each batch
    pairs = words.map(lambda word: (word, 1))

    wordCounts = pairs.reduceByKey(lambda x, y: x + y)

    # http://spark.apache.org/docs/latest/streaming-programming-guide.html#design-patterns-for-using-foreachrdd
    wordCounts.foreachRDD(lambda rdd: rdd.foreach(sendRecord))

    # Print the first ten elements of each RDD generated in this DStream to the console
    wordCounts.pprint()

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
--------------------------------------------------------------------------------
/spark-streaming-foreachRDD-and-foreachPartition.py:
--------------------------------------------------------------------------------
"""
To run
    ./pyspark.submit.sh spark-streaming-foreachRDD-and-foreachPartition.py
"""

from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

from pymongo import MongoClient

from quiet_logs import quiet_logs

# Documentation
# http://spark.apache.org/docs/latest/streaming-programming-guide.html#design-patterns-for-using-foreachrdd
def sendPartition(partition):
    # One connection per partition instead of one per record
    connection = MongoClient()
    test_db = connection.get_database('test')
    wordcount_coll = test_db.get_collection('wordcount_coll')

    for tup in partition:
        word = tup[0]
        amount = tup[1]
        wordcount_coll.update_one({"_id": word}, {"$inc": {"count": amount}}, upsert=True)

    connection.close()

if __name__ == "__main__":
    conf = SparkConf().setAppName("using foreachRDD and foreachPartition on RDD")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 2)
    ssc.checkpoint("checkpoint")

    quiet_logs(sc)

    # Create a DStream that will connect to hostname:port, in this case localhost:9998
    lines = ssc.socketTextStream('localhost', 9998)
    # lines = ssc.textFileStream('./streamingData')

    # Split each line into words
    words = lines.flatMap(lambda line: line.split(" "))

    # Count each word in each batch
    pairs = words.map(lambda word: (word, 1))

    wordCounts = pairs.reduceByKey(lambda x, y: x + y)

    # http://spark.apache.org/docs/latest/streaming-programming-guide.html#design-patterns-for-using-foreachrdd
    wordCounts.foreachRDD(lambda rdd: rdd.foreachPartition(sendPartition))

    # Print the first ten elements of each RDD generated in this DStream to the console
    wordCounts.pprint()

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
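
# The design-patterns section linked above suggests one further refinement:
# reuse a single connection per executor process across batches instead of
# opening one per partition. A minimal illustrative sketch of that idea, not
# part of the original script, assuming the same 'test.wordcount_coll'
# collection used above:

from pymongo import MongoClient

_client = None

def get_client():
    # Created lazily in each executor process, then reused by every partition
    # that the process handles.
    global _client
    if _client is None:
        _client = MongoClient()
    return _client

def sendPartitionReusingClient(partition):
    coll = get_client().get_database('test').get_collection('wordcount_coll')
    for word, amount in partition:
        coll.update_one({"_id": word}, {"$inc": {"count": amount}}, upsert=True)
    # The client is intentionally left open so later batches can reuse it.

# Usage would mirror the original:
#   wordCounts.foreachRDD(lambda rdd: rdd.foreachPartition(sendPartitionReusingClient))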
--------------------------------------------------------------------------------
/spark-streaming-reading-files-from-a-folder.py:
--------------------------------------------------------------------------------
"""
To run
    ./pyspark.submit.sh spark-streaming-reading-files-from-a-folder.py
"""

from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

from quiet_logs import quiet_logs

if __name__ == "__main__":
    conf = SparkConf().setAppName("Reading files from a directory")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 2)

    quiet_logs(sc)

    # Watch the ./streamingData directory for new text files
    lines = ssc.textFileStream('./streamingData')

    # Split each line into words
    words = lines.flatMap(lambda line: line.split(" "))

    # Count each word in each batch
    pairs = words.map(lambda word: (word, 1))

    wordCounts = pairs.reduceByKey(lambda x, y: x + y)

    # Print the first ten elements of each RDD generated in this DStream to the console
    wordCounts.pprint()

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
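
# textFileStream only picks up files that appear in the watched directory
# after the stream has started, and the Spark documentation recommends moving
# (renaming) files into place atomically so a half-written file is never read.
# A small illustrative feeder, a hypothetical helper referred to from the
# README's "Feeding data to the examples" section and not part of the repo:

import os
import time

def feed_streaming_data(text, data_dir='./streamingData'):
    # Write to a temporary name first, then rename the finished file into place.
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)
    stamp = int(time.time() * 1000)
    tmp_path = 'words_%d.txt.tmp' % stamp
    with open(tmp_path, 'w') as f:
        f.write(text + '\n')
    os.rename(tmp_path, os.path.join(data_dir, 'words_%d.txt' % stamp))

# Example, run from another shell while the folder example is running:
#   feed_streaming_data('spark streaming word count')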
--------------------------------------------------------------------------------
/spark-streaming-reading-from-a-tcp-port.py:
--------------------------------------------------------------------------------
"""
Counts words in UTF8 encoded, '\n' delimited text received from the network every second.

To run this on your local machine, you first need a Netcat server listening on the port used below
    `$ nc -lk 9998`

and then run the example
    `$ ./pyspark.submit.sh spark-streaming-reading-from-a-tcp-port.py`
"""

from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

from quiet_logs import quiet_logs

if __name__ == "__main__":
    conf = SparkConf().setAppName("Listening to a tcp port")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)

    quiet_logs(sc)

    # Connect to the Netcat server started above
    lines = ssc.socketTextStream("localhost", 9998)

    # Split each line into words
    words = lines.flatMap(lambda line: line.split(" "))

    # Count each word in each batch
    pairs = words.map(lambda word: (word, 1))

    wordCounts = pairs.reduceByKey(lambda x, y: x + y)

    # Print the first ten elements of each RDD generated in this DStream to the console
    wordCounts.pprint()

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
--------------------------------------------------------------------------------
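
# If Netcat is not available, a minimal Python stand-in for `nc -lk 9998` can
# feed the socket examples above (illustrative sketch, not part of the repo):
# it listens on localhost:9998 and forwards every line typed on stdin to the
# first client that connects (the Spark job connects as a client).

import socket
import sys

server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
server.bind(('localhost', 9998))
server.listen(1)
print('Waiting for the Spark job to connect on localhost:9998 ...')
conn, addr = server.accept()
print('Connected by %s:%d' % addr)
for line in sys.stdin:
    conn.sendall(line.encode('utf-8'))
conn.close()
server.close()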