├── NYC_neighborhoods
│   ├── ZillowNeighborhoods-NY.dbf
│   ├── ZillowNeighborhoods-NY.shp
│   ├── ZillowNeighborhoods-NY.shx
│   ├── ZillowNeighborhoods-NY.prj
│   ├── prep-nbhds.py
│   └── nbhd.jsonl
├── launch-streams.sh
├── bootstrap.sh
├── spark-streaming-hdfs.py
├── Vagrantfile
├── spark-streaming-console.py
├── spark-streaming-memory.py
├── README.md
└── spark-streaming-hdfs-memory.py

/NYC_neighborhoods/ZillowNeighborhoods-NY.dbf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/adaltas/spark-streaming-pyspark/HEAD/NYC_neighborhoods/ZillowNeighborhoods-NY.dbf
--------------------------------------------------------------------------------
/NYC_neighborhoods/ZillowNeighborhoods-NY.shp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/adaltas/spark-streaming-pyspark/HEAD/NYC_neighborhoods/ZillowNeighborhoods-NY.shp
--------------------------------------------------------------------------------
/NYC_neighborhoods/ZillowNeighborhoods-NY.shx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/adaltas/spark-streaming-pyspark/HEAD/NYC_neighborhoods/ZillowNeighborhoods-NY.shx
--------------------------------------------------------------------------------
/NYC_neighborhoods/ZillowNeighborhoods-NY.prj:
--------------------------------------------------------------------------------
GEOGCS["GCS_North_American_1983",DATUM["D_North_American_1983",SPHEROID["GRS_1980",6378137,298.257222101]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]]
--------------------------------------------------------------------------------
/NYC_neighborhoods/prep-nbhds.py:
--------------------------------------------------------------------------------
import shapefile
import json
with open('nbhd.jsonl', 'w') as outfile:
    sf = shapefile.Reader("ZillowNeighborhoods-NY")
    shapeRecs = sf.shapeRecords()
    for n in shapeRecs:
        State, County, City, Name, RegionID = n.record[:]
        if City != 'New York': continue
        if County != 'New York': continue  # New York County corresponds to Manhattan borough
        json.dump({"name": Name, "coord": n.shape.points}, outfile)
        outfile.write('\n')
--------------------------------------------------------------------------------
/launch-streams.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# execute as root or kafka user from master02.cluster host

su - kafka -c '( curl -s https://training.ververica.com/trainingData/nycTaxiRides.gz | \
  zcat | \
  split -l 10000 --filter="/usr/hdp/current/kafka-broker/bin/kafka-console-producer.sh \
    --broker-list master02.cluster:6667 --topic taxirides; sleep 0.2" \
  > /dev/null ) &'

su - kafka -c '( curl -s https://training.ververica.com/trainingData/nycTaxiFares.gz | \
  zcat | \
  split -l 10000 --filter="/usr/hdp/current/kafka-broker/bin/kafka-console-producer.sh \
    --broker-list master02.cluster:6667 --topic taxifares; sleep 0.2" \
  > /dev/null ) &'
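
# The producers above assume the `taxirides` and `taxifares` Kafka topics already exist.
# A minimal sketch for creating them beforehand, assuming Zookeeper listens on
# master02.cluster:2181 (adjust the host, partitions and replication factor to the cluster):
#
#   /usr/hdp/current/kafka-broker/bin/kafka-topics.sh --create --topic taxirides \
#     --zookeeper master02.cluster:2181 --partitions 1 --replication-factor 1
#   /usr/hdp/current/kafka-broker/bin/kafka-topics.sh --create --topic taxifares \
#     --zookeeper master02.cluster:2181 --partitions 1 --replication-factor 1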
--------------------------------------------------------------------------------
/bootstrap.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# Bootstrap each machine with the Ambari prerequisites / prepare the environment
echo "*** BOOTSTRAPPING NODE $NODE ***"

yum upgrade -q -y
yum install -q -y epel-release
yum install -q -y htop
yum install -q -y vim
yum install -q -y wget

cat > /etc/hosts <<- EOF
10.10.10.11 master01.cluster master01
10.10.10.12 master02.cluster master02
10.10.10.16 worker01.cluster worker01
10.10.10.17 worker02.cluster worker02
EOF
cat >> /etc/sysconfig/network <<- EOF
NETWORKING=yes
HOSTNAME=$HNAME
EOF
sysctl -w fs.file-max=100000
yum install -y -q ntp
systemctl enable ntpd
systemctl start ntpd
systemctl disable firewalld
service firewalld stop
echo umask 0022 >> /etc/profile
setenforce 0
sed -i -e 's/SELINUX=enforcing/SELINUX=disabled/g' /etc/selinux/config

# For Ambari 2.7.3 the minimum Java is Oracle JDK 1.8.0_77
# Java installation skipped - ambari-server can download it itself and share it with
# all ambari-agents

# Install Python 3 from the EPEL repository (alongside the existing Python 2.7.5)
yum install -y -q python36
yum install -y -q python36-setuptools
easy_install-3.6 -q pip
--------------------------------------------------------------------------------
/spark-streaming-hdfs.py:
--------------------------------------------------------------------------------
'''
Submit from master02.cluster as the spark user:

spark-submit \
    --master yarn --deploy-mode client \
    --num-executors 2 --executor-cores 1 \
    --executor-memory 5g --driver-memory 4g \
    --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.0 \
    --conf spark.sql.hive.thriftServer.singleSession=true \
    /vagrant/spark-streaming-hdfs.py

* The application reads data from the Kafka topics, parses the Kafka messages, and dumps the unaltered raw data to HDFS
* Two streaming queries:
    * The `PersistRawTaxiRides` query persists the raw taxi rides data under the HDFS path /user/spark/datalake/RidesRaw
    * The `PersistRawTaxiFares` query persists the raw taxi fares data under the HDFS path /user/spark/datalake/FaresRaw
'''
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import expr
from pyspark.sql.functions import avg
from pyspark.sql.functions import window

def parse_data_from_kafka_message(sdf, schema):
    from pyspark.sql.functions import split
    assert sdf.isStreaming == True, "DataFrame doesn't receive streaming data"
    col = split(sdf['value'], ',')  # split attributes to nested array in one Column
    # now expand col to multiple top-level columns
    for idx, field in enumerate(schema):
        sdf = sdf.withColumn(field.name, col.getItem(idx).cast(field.dataType))
    return sdf.select([field.name for field in schema])

spark = SparkSession.builder \
    .appName("Spark Structured Streaming from Kafka") \
    .getOrCreate()

sdfRides = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "master02.cluster:6667") \
    .option("subscribe", "taxirides") \
    .option("startingOffsets", "latest") \
    .load() \
    .selectExpr("CAST(value AS STRING)")

sdfFares = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "master02.cluster:6667") \
    .option("subscribe", "taxifares") \
    .option("startingOffsets", "latest") \
    .load() \
    .selectExpr("CAST(value AS STRING)")

taxiFaresSchema = StructType([ \
    StructField("rideId", LongType()), StructField("taxiId", LongType()), \
    StructField("driverId", LongType()), StructField("startTime", TimestampType()), \
    StructField("paymentType", StringType()), StructField("tip", FloatType()), \
    StructField("tolls", FloatType()), StructField("totalFare", FloatType())])

taxiRidesSchema = StructType([ \
    StructField("rideId", LongType()), StructField("isStart", StringType()), \
    StructField("endTime", TimestampType()), StructField("startTime", TimestampType()), \
    StructField("startLon", FloatType()), StructField("startLat", FloatType()), \
    StructField("endLon", FloatType()), StructField("endLat", FloatType()), \
    StructField("passengerCnt", ShortType()), StructField("taxiId", LongType()), \
    StructField("driverId", LongType())])

sdfRides = parse_data_from_kafka_message(sdfRides, taxiRidesSchema)
sdfFares = parse_data_from_kafka_message(sdfFares, taxiFaresSchema)

from pyspark.sql.functions import year, month, dayofmonth

sdfRides.withColumn("year", year("startTime")) \
    .withColumn("month", month("startTime")) \
    .withColumn("day", dayofmonth("startTime")) \
    .writeStream \
    .queryName("PersistRawTaxiRides") \
    .outputMode("append") \
    .format("parquet") \
    .option("path", "datalake/RidesRaw") \
    .option("checkpointLocation", "checkpoints/RidesRaw") \
    .partitionBy("year", "month", "day") \
    .start()

sdfFares.withColumn("year", year("startTime")) \
    .withColumn("month", month("startTime")) \
    .withColumn("day", dayofmonth("startTime")) \
    .writeStream \
    .queryName("PersistRawTaxiFares") \
    .outputMode("append") \
    .format("parquet") \
    .option("path", "datalake/FaresRaw") \
    .option("checkpointLocation", "checkpoints/FaresRaw") \
    .partitionBy("year", "month", "day") \
    .start()

# Notice that the relative paths resolve against the spark user's HDFS home directory:
# `datalake/FaresRaw` amounts to `/user/spark/datalake/FaresRaw` and
# `checkpoints/FaresRaw` to `/user/spark/checkpoints/FaresRaw` on HDFS

# Block the driver until a streaming query terminates; otherwise the script would
# exit right after start() and stop both queries
spark.streams.awaitAnyTermination()
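
# A minimal sketch of reading the persisted data back in a separate batch session;
# because the sink is partitioned by year/month/day, filters on those columns prune
# whole HDFS directories (the filter values below are purely illustrative):
#
#   ridesRaw = spark.read.parquet("datalake/RidesRaw")
#   ridesRaw.filter("year = 2013 AND month = 1").groupBy("day").count().show()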
--------------------------------------------------------------------------------
/Vagrantfile:
--------------------------------------------------------------------------------
# -*- mode: ruby -*-
# vi: set ft=ruby :

box = "centos/7"

$ambari_server=<