├── .gitignore
├── Makefile
├── README.md
├── schema.sql
├── scripts
│   └── start_with_local_kafka.sh
└── transform
    ├── Dockerfile
    ├── transform.py
    └── vader_sentiment_lexicon.txt

/.gitignore:
--------------------------------------------------------------------------------
transform.tar.gz
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
MEMSQL_CONTAINER = twitter-demo-memsql
KAFKA_CONTAINER = twitter-demo-kafka
TRANSFORM_BUILDER = pipelines-twitter-demo-transform


.PHONY: run-memsql
run-memsql: schema.sql
	docker run \
		-d -p 3306:3306 -p 9000:9000 \
		--name ${MEMSQL_CONTAINER} \
		-v ${PWD}/schema.sql:/schema.sql \
		memsql/quickstart


.PHONY: run-kafka
run-kafka:
	docker run --name ${KAFKA_CONTAINER} \
		-d -p 9092:9092 -p 2181:2181 \
		-e PRODUCE_TWITTER=1 \
		-e TWITTER_CONSUMER_KEY \
		-e TWITTER_CONSUMER_SECRET \
		-e TWITTER_ACCESS_TOKEN \
		-e TWITTER_ACCESS_SECRET \
		memsql/kafka


.PHONY: stop-kafka
stop-kafka:
	docker rm -f ${KAFKA_CONTAINER}


.PHONY: run-memsql-local
run-memsql-local: schema.sql
	docker run \
		-d -p 3306:3306 -p 9000:9000 \
		--name ${MEMSQL_CONTAINER} \
		--link ${KAFKA_CONTAINER}:kafka \
		-v ${PWD}/scripts/start_with_local_kafka.sh:/start.sh \
		-v ${PWD}/schema.sql:/schema.sql.tpl \
		memsql/quickstart /start.sh


.PHONY: stop-memsql
stop-memsql:
	docker rm -f ${MEMSQL_CONTAINER}


.PHONY: sql-console
sql-console:
	docker run \
		-it --link ${MEMSQL_CONTAINER}:memsql \
		memsql/quickstart \
		memsql-shell example


# You can use this target to build your own transform and upload it somewhere
# (or mount it in the container).
transform.tar.gz: transform
	docker build -t ${TRANSFORM_BUILDER} transform
	@-docker rm ${TRANSFORM_BUILDER} >/dev/null 2>&1
	docker create --name ${TRANSFORM_BUILDER} ${TRANSFORM_BUILDER}
	docker cp ${TRANSFORM_BUILDER}:transform.tar.gz transform.tar.gz
	docker rm ${TRANSFORM_BUILDER}


################################################################
# Deployment targets
# Must have access to the MemSQL organization on Docker Hub and
# access to upload files to the MemSQL S3 download bucket.

.PHONY: upload-transform
upload-transform: transform.tar.gz
	aws s3api put-object \
		--bucket download.memsql.com \
		--key pipelines-twitter-demo/transform.tar.gz \
		--acl public-read \
		--body transform.tar.gz
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
MemSQL Pipelines Twitter Demo
=============================

This example project demonstrates a streaming analytics use case built with
MemSQL Pipelines. Specifically, the demo captures tweets and retweet counts
for posts pertaining to the 30 NBA teams.

To run the demo you will need Docker installed and the `docker` client
available on your PATH.
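
If you want to confirm that Docker is ready before you start, a quick sanity
check like the one below works with any reasonably recent Docker install (the
exact output will vary):

```bash
# Verify that the docker client can reach a running daemon
docker version
docker info
```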

Example Usage
-------------

```bash
# Start the MemSQL container on your machine
make run-memsql

# Open up the Pipelines page on MemSQL Ops
# Note that your hostname may be different if you are using Docker Toolbox or similar software
open http://localhost:9000/data/pipelines

# Open a MemSQL client for the local MemSQL container
make sql-console

# Stop the MemSQL container when you are done
make stop-memsql
```


Running Apache Kafka Locally
----------------------------

The base instructions above use a public Kafka stream provided by MemSQL,
which is heavily rate-limited and carries a relatively low volume of data.
You can avoid this limit by running the Kafka broker and its associated
Twitter producer locally with Docker, like so:

```bash

# First we need to set up the Kafka stream...

# Go to https://apps.twitter.com/ to get a set of tokens for the Twitter API.
# Then export those tokens into your environment:
# (replace TODO with the relevant token)

export TWITTER_CONSUMER_KEY=TODO
export TWITTER_CONSUMER_SECRET=TODO
export TWITTER_ACCESS_TOKEN=TODO
export TWITTER_ACCESS_SECRET=TODO

# Start the Kafka container on your machine
make run-kafka

# Then we can run the local MemSQL cluster
make run-memsql-local

# Open up the Pipelines page on MemSQL Ops
# Note that your hostname may be different if you are using Docker Toolbox or similar software
open http://localhost:9000/data/pipelines

# Open a MemSQL client for the local MemSQL container
make sql-console

# Stop the containers when you are done
make stop-memsql
make stop-kafka
```


Running Analytical Queries on the Data Set
------------------------------------------

Use `make sql-console` to open a SQL prompt into the MemSQL database that is
running the Twitter pipelines. You should see two running pipelines -
`twitter_pipeline` and `twitter_sentiment_pipeline` - and two tables where
these pipelines store data - `tweets` and `tweet_sentiment` - both of which
are steadily growing in size. See `schema.sql` for exactly how the pipelines
and tables are defined.

Within the MemSQL database, you can run simple analytical queries on the
`tweets` and `tweet_sentiment` tables. For example, the following query
retrieves 10 random tweets and shows each tweet's sentiment scores (positive,
neutral and negative).

```sql
SELECT
    REPLACE(text, "\n", "") AS text,
    positive, neutral, negative
FROM
    example.tweets t
    JOIN example.tweet_sentiment ts ON t.id = ts.id
ORDER BY RAND()
LIMIT 10\G
```
--------------------------------------------------------------------------------
/schema.sql:
--------------------------------------------------------------------------------
DROP DATABASE IF EXISTS example;
CREATE DATABASE example;
USE example;


-- A table set up to receive raw data from Kafka.
CREATE TABLE tweets (

  -- These fields are present in each record we get from Kafka. `tweet` is an
  -- arbitrary JSON blob.
  id BIGINT,
  ts TIMESTAMP,
  tweet JSON,

  -- These are examples of computed columns.
  -- We use MemSQL JSON subselect syntax to create extra columns derived
  -- from `tweet`.
  text AS tweet::$text PERSISTED TEXT,
  retweet_count AS tweet::%retweet_count PERSISTED INT,
  team_location AS CASE
    WHEN (text LIKE '%hawks%') THEN 'Atlanta'
    WHEN (text LIKE '%celtics%') THEN 'Boston'
    WHEN (text LIKE '%nets%') THEN 'Brooklyn'
    WHEN (text LIKE '%hornets%') THEN 'Charlotte'
    WHEN (text LIKE '%bulls%') THEN 'Chicago'
    WHEN (text LIKE '%cavaliers%') THEN 'Cleveland'
    WHEN (text LIKE '%mavericks%') THEN 'Dallas'
    WHEN (text LIKE '%nuggets%') THEN 'Denver'
    WHEN (text LIKE '%pistons%') THEN 'Detroit'
    WHEN (text LIKE '%warriors%') THEN 'Golden State'
    WHEN (text LIKE '%rockets%') THEN 'Houston'
    WHEN (text LIKE '%pacers%') THEN 'Indiana'
    WHEN (text LIKE '%clippers%') THEN 'Los Angeles'
    WHEN (text LIKE '%lakers%') THEN 'Los Angeles'
    WHEN (text LIKE '%grizzlies%') THEN 'Memphis'
    WHEN (text LIKE '%heat%') THEN 'Miami'
    WHEN (text LIKE '%bucks%') THEN 'Milwaukee'
    WHEN (text LIKE '%timberwolves%') THEN 'Minnesota'
    WHEN (text LIKE '%pelicans%') THEN 'New Orleans'
    WHEN (text LIKE '%knicks%') THEN 'New York'
    WHEN (text LIKE '%thunder%') THEN 'Oklahoma City'
    WHEN (text LIKE '%magic%') THEN 'Orlando'
    WHEN (text LIKE '%sixers%') THEN 'Philadelphia'
    WHEN (text LIKE '%suns%') THEN 'Phoenix'
    WHEN (text LIKE '%blazers%') THEN 'Portland'
    WHEN (text LIKE '%kings%') THEN 'Sacramento'
    WHEN (text LIKE '%spurs%') THEN 'San Antonio'
    WHEN (text LIKE '%raptors%') THEN 'Toronto'
    WHEN (text LIKE '%jazz%') THEN 'Utah'
    WHEN (text LIKE '%wizards%') THEN 'Washington'
    ELSE 'Unknown' END PERSISTED TEXT,
  team AS CASE
    WHEN (text LIKE '%hawks%') THEN 'Hawks'
    WHEN (text LIKE '%celtics%') THEN 'Celtics'
    WHEN (text LIKE '%nets%') THEN 'Nets'
    WHEN (text LIKE '%hornets%') THEN 'Hornets'
    WHEN (text LIKE '%bulls%') THEN 'Bulls'
    WHEN (text LIKE '%cavaliers%') THEN 'Cavaliers'
    WHEN (text LIKE '%mavericks%') THEN 'Mavericks'
    WHEN (text LIKE '%nuggets%') THEN 'Nuggets'
    WHEN (text LIKE '%pistons%') THEN 'Pistons'
    WHEN (text LIKE '%warriors%') THEN 'Warriors'
    WHEN (text LIKE '%rockets%') THEN 'Rockets'
    WHEN (text LIKE '%pacers%') THEN 'Pacers'
    WHEN (text LIKE '%clippers%') THEN 'Clippers'
    WHEN (text LIKE '%lakers%') THEN 'Lakers'
    WHEN (text LIKE '%grizzlies%') THEN 'Grizzlies'
    WHEN (text LIKE '%heat%') THEN 'Heat'
    WHEN (text LIKE '%bucks%') THEN 'Bucks'
    WHEN (text LIKE '%timberwolves%') THEN 'Timberwolves'
    WHEN (text LIKE '%pelicans%') THEN 'Pelicans'
    WHEN (text LIKE '%knicks%') THEN 'Knicks'
    WHEN (text LIKE '%thunder%') THEN 'Thunder'
    WHEN (text LIKE '%magic%') THEN 'Magic'
    WHEN (text LIKE '%sixers%') THEN 'Sixers'
    WHEN (text LIKE '%suns%') THEN 'Suns'
    WHEN (text LIKE '%blazers%') THEN 'Blazers'
    WHEN (text LIKE '%kings%') THEN 'Kings'
    WHEN (text LIKE '%spurs%') THEN 'Spurs'
    WHEN (text LIKE '%raptors%') THEN 'Raptors'
    WHEN (text LIKE '%jazz%') THEN 'Jazz'
    WHEN (text LIKE '%wizards%') THEN 'Wizards'
    ELSE 'Unknown' END PERSISTED TEXT,
  created AS FROM_UNIXTIME(`tweet`::$created_at) PERSISTED DATETIME,

  KEY(id) USING CLUSTERED COLUMNSTORE,
  SHARD KEY(id)
);

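
-- (Illustrative example, not part of the original schema: once data is
-- flowing, the persisted computed columns above can be queried directly,
-- for instance
--   SELECT team, team_location, COUNT(*) AS tweet_count
--   FROM tweets GROUP BY team, team_location ORDER BY tweet_count DESC;
-- to get per-team tweet counts.)
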
-- A MemSQL Pipeline. Everything inside "CREATE PIPELINE AS" is a normal
-- "LOAD DATA" statement.
CREATE PIPELINE twitter_pipeline AS

-- The "source" of this pipeline is a Kafka broker and topic.
LOAD DATA KAFKA "public-kafka.memcompute.com:9092/tweets-json"

-- The "sink" of this pipeline is a MemSQL table. In this case, our
-- destination table has a unique key, so we REPLACE rows if we get a new
-- record with a key that already exists in the table.
INTO TABLE tweets

-- Our example Kafka topic contains tab-separated data: a tweet ID, and a
-- JSON blob representing a tweet.
FIELDS TERMINATED BY "\t"

-- Our tab-separated data from Kafka will be written to these two columns in
-- the destination table.
(id, tweet);


-- A table of sentiment scores by tweet ID.
CREATE TABLE tweet_sentiment (
  id BIGINT,

  -- These fields are output by the sentiment analyzer. "compound" ranges
  -- from -1 to 1 and generally represents sentiment on a scale of "bad" to
  -- "good". The other three numbers each range from 0 to 1 and together give
  -- a more precise picture of the detected sentiment.
  compound FLOAT,
  positive FLOAT,
  negative FLOAT,
  neutral FLOAT,

  KEY(id) USING CLUSTERED COLUMNSTORE,
  SHARD KEY(id)
);

-- A MemSQL Pipeline with a transform.
CREATE PIPELINE twitter_sentiment_pipeline AS
LOAD DATA KAFKA "public-kafka.memcompute.com:9092/tweets-json"

-- Here, we specify an executable that will transform each record that
-- passes through the pipeline. In this case, our transform function takes
-- JSON blobs from Twitter and performs a sentiment analysis on the tweet
-- text, returning a tweet ID and a set of scores.
-- The transform can be loaded from a local file (for example, one mounted
-- into the container):
-- WITH TRANSFORM (
--   "file://localhost/transform.tar.gz",
--   "transform.py", "")

-- It can also be downloaded from a URL:
WITH TRANSFORM (
  "http://download.memsql.com/pipelines-twitter-demo/transform.tar.gz",
  "transform.py", "")

INTO TABLE tweet_sentiment
FIELDS TERMINATED BY "\t"
(id, compound, positive, negative, neutral);


CREATE VIEW tweet_scores AS
SELECT
  t.id, t.tweet, t.text, t.retweet_count,
  t.ts, t.team, ts.compound,
  ts.positive, ts.negative, ts.neutral
FROM
  tweets AS t
  INNER JOIN tweet_sentiment AS ts
  ON t.id = ts.id;


CREATE VIEW tweets_per_team AS
SELECT
  FORMAT(COUNT(*), 0) AS tweet_count, team
FROM tweet_scores t
GROUP BY team;


CREATE VIEW tweets_per_sentiment_per_team_timeseries AS
SELECT
  FROM_UNIXTIME(TRUNCATE(UNIX_TIMESTAMP(ts) / 60, 0) * 60) AS ts_bucket,
  TRUNCATE(compound, 1) AS sentiment_bucket,
  FORMAT(COUNT(*), 0) AS tweet_volume,
  team
FROM tweet_scores t
GROUP BY ts_bucket, sentiment_bucket, team;


CREATE VIEW sentiment_histogram AS
SELECT
  sentiment_bucket,
  SUM(IF(team = 'Hawks', tweet_volume, 0)) AS hawks_tweets,
  SUM(IF(team = 'Celtics', tweet_volume, 0)) AS celtics_tweets,
  SUM(IF(team = 'Nets', tweet_volume, 0)) AS nets_tweets,
  SUM(IF(team = 'Hornets', tweet_volume, 0)) AS hornets_tweets,
  SUM(IF(team = 'Bulls', tweet_volume, 0)) AS bulls_tweets,
  SUM(IF(team = 'Cavaliers', tweet_volume, 0)) AS cavaliers_tweets,
  SUM(IF(team = 'Mavericks', tweet_volume, 0)) AS mavericks_tweets,
  SUM(IF(team = 'Nuggets', tweet_volume, 0)) AS nuggets_tweets,
  SUM(IF(team = 'Pistons', tweet_volume, 0)) AS pistons_tweets,
  SUM(IF(team = 'Warriors', tweet_volume, 0)) AS warriors_tweets,
  SUM(IF(team = 'Rockets', tweet_volume, 0)) AS rockets_tweets,
  SUM(IF(team = 'Pacers', tweet_volume, 0)) AS pacers_tweets,
  SUM(IF(team = 'Clippers', tweet_volume, 0)) AS clippers_tweets,
  SUM(IF(team = 'Lakers', tweet_volume, 0)) AS lakers_tweets,
  SUM(IF(team = 'Grizzlies', tweet_volume, 0)) AS grizzlies_tweets,
  SUM(IF(team = 'Heat', tweet_volume, 0)) AS heat_tweets,
  SUM(IF(team = 'Bucks', tweet_volume, 0)) AS bucks_tweets,
  SUM(IF(team = 'Timberwolves', tweet_volume, 0)) AS timberwolves_tweets,
  SUM(IF(team = 'Pelicans', tweet_volume, 0)) AS pelicans_tweets,
  SUM(IF(team = 'Knicks', tweet_volume, 0)) AS knicks_tweets,
  SUM(IF(team = 'Thunder', tweet_volume, 0)) AS thunder_tweets,
  SUM(IF(team = 'Magic', tweet_volume, 0)) AS magic_tweets,
  SUM(IF(team = 'Sixers', tweet_volume, 0)) AS sixers_tweets,
  SUM(IF(team = 'Suns', tweet_volume, 0)) AS suns_tweets,
  SUM(IF(team = 'Blazers', tweet_volume, 0)) AS blazers_tweets,
  SUM(IF(team = 'Kings', tweet_volume, 0)) AS kings_tweets,
  SUM(IF(team = 'Spurs', tweet_volume, 0)) AS spurs_tweets,
  SUM(IF(team = 'Raptors', tweet_volume, 0)) AS raptors_tweets,
  SUM(IF(team = 'Jazz', tweet_volume, 0)) AS jazz_tweets,
  SUM(IF(team = 'Wizards', tweet_volume, 0)) AS wizards_tweets
FROM tweets_per_sentiment_per_team_timeseries t
GROUP BY sentiment_bucket
ORDER BY sentiment_bucket;


CREATE VIEW timeseries_histogram AS
SELECT
  ts_bucket,
  SUM(IF(team = 'Hawks', tweet_volume, 0)) AS hawks_tweets,
  SUM(IF(team = 'Celtics', tweet_volume, 0)) AS celtics_tweets,
  SUM(IF(team = 'Nets', tweet_volume, 0)) AS nets_tweets,
  SUM(IF(team = 'Hornets', tweet_volume, 0)) AS hornets_tweets,
  SUM(IF(team = 'Bulls', tweet_volume, 0)) AS bulls_tweets,
  SUM(IF(team = 'Cavaliers', tweet_volume, 0)) AS cavaliers_tweets,
  SUM(IF(team = 'Mavericks', tweet_volume, 0)) AS mavericks_tweets,
  SUM(IF(team = 'Nuggets', tweet_volume, 0)) AS nuggets_tweets,
  SUM(IF(team = 'Pistons', tweet_volume, 0)) AS pistons_tweets,
  SUM(IF(team = 'Warriors', tweet_volume, 0)) AS warriors_tweets,
  SUM(IF(team = 'Rockets', tweet_volume, 0)) AS rockets_tweets,
  SUM(IF(team = 'Pacers', tweet_volume, 0)) AS pacers_tweets,
  SUM(IF(team = 'Clippers', tweet_volume, 0)) AS clippers_tweets,
  SUM(IF(team = 'Lakers', tweet_volume, 0)) AS lakers_tweets,
  SUM(IF(team = 'Grizzlies', tweet_volume, 0)) AS grizzlies_tweets,
  SUM(IF(team = 'Heat', tweet_volume, 0)) AS heat_tweets,
  SUM(IF(team = 'Bucks', tweet_volume, 0)) AS bucks_tweets,
  SUM(IF(team = 'Timberwolves', tweet_volume, 0)) AS timberwolves_tweets,
  SUM(IF(team = 'Pelicans', tweet_volume, 0)) AS pelicans_tweets,
  SUM(IF(team = 'Knicks', tweet_volume, 0)) AS knicks_tweets,
  SUM(IF(team = 'Thunder', tweet_volume, 0)) AS thunder_tweets,
  SUM(IF(team = 'Magic', tweet_volume, 0)) AS magic_tweets,
  SUM(IF(team = 'Sixers', tweet_volume, 0)) AS sixers_tweets,
  SUM(IF(team = 'Suns', tweet_volume, 0)) AS suns_tweets,
  SUM(IF(team = 'Blazers', tweet_volume, 0)) AS blazers_tweets,
  SUM(IF(team = 'Kings', tweet_volume, 0)) AS kings_tweets,
  SUM(IF(team = 'Spurs', tweet_volume, 0)) AS spurs_tweets,
  SUM(IF(team = 'Raptors', tweet_volume, 0)) AS raptors_tweets,
  SUM(IF(team = 'Jazz', tweet_volume, 0)) AS jazz_tweets,
  SUM(IF(team = 'Wizards', tweet_volume, 0)) AS wizards_tweets
FROM tweets_per_sentiment_per_team_timeseries t
GROUP BY ts_bucket
ORDER BY ts_bucket;


-- We make sure that our pipelines do not start reading the Kafka topic from
-- the beginning; instead, they start reading new data as it comes in.
ALTER PIPELINE twitter_pipeline SET OFFSETS LATEST;
ALTER PIPELINE twitter_sentiment_pipeline SET OFFSETS LATEST;

START PIPELINE twitter_pipeline;
START PIPELINE twitter_sentiment_pipeline;
--------------------------------------------------------------------------------
/scripts/start_with_local_kafka.sh:
--------------------------------------------------------------------------------
#!/bin/bash

KAFKA_HOST="$KAFKA_PORT_9092_TCP_ADDR"

sed "s_public-kafka.memcompute.com:9092/tweets-json_$KAFKA_HOST/tweets-json_" /schema.sql.tpl > /schema.sql

exec /memsql-entrypoint.sh memsqld
--------------------------------------------------------------------------------
/transform/Dockerfile:
--------------------------------------------------------------------------------
FROM debian:8.6

RUN apt-get update && \
    apt-get install -y python-dev python-pip && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# upgrade pip
RUN pip install -U pip
# compile the dependencies
RUN pip install --target /python_deps nltk numpy

# construct the transform
ADD . /transform

# add dependencies
RUN mv /python_deps /transform

# package the transform
RUN cd /transform && tar czf /transform.tar.gz .
--------------------------------------------------------------------------------
/transform/transform.py:
--------------------------------------------------------------------------------
#!/usr/bin/python

import json
import os
import struct
import sys

# Dependencies are stored in a folder called python_deps
# relative to this script. This is set up by the Dockerfile.

SCRIPT_DIR = os.path.join(os.path.dirname(__file__))
sys.path.append(os.path.join(SCRIPT_DIR, "python_deps"))

# Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious
# Rule-based Model for Sentiment Analysis of Social Media Text.
# Eighth International Conference on Weblogs and Social Media
# (ICWSM-14). Ann Arbor, MI, June 2014.
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# This class uses a file that we included in the transform tarball that we put
# in the 'WITH TRANSFORM' statement.
LEXICON_LOCAL_PATH = "vader_sentiment_lexicon.txt"
LEXICON_PATH = os.path.join(SCRIPT_DIR, LEXICON_LOCAL_PATH)

# A text sentiment analyzer that doesn't require a training step.
MODEL = SentimentIntensityAnalyzer(lexicon_file=LEXICON_PATH)


# This is a bit of boilerplate that handles the way that incoming records are
# encoded. Data is streamed to stdin, and each record is prefixed with 8 bytes
# indicating how long the record is. This Python generator reads individual
# records and yields them one by one.
def transform_records():
    while True:
        byte_len = sys.stdin.read(8)
        if len(byte_len) == 8:
            byte_len = struct.unpack("L", byte_len)[0]
            result = sys.stdin.read(byte_len)
            yield result
        else:
            assert len(byte_len) == 0, byte_len
            return


# Iterate over the records that we receive from Kafka.
for bytes in transform_records():

    # Parse the tab-separated record.
    (id, tweet_str) = bytes.split("\t")

    # Convert the tweet JSON blob from double-escaped to single-escaped, so we
    # can parse it.
    tweet = json.loads(tweet_str.replace('\\\\', '\\'))

    # Extract the text from the tweet, and run it through the sentiment model.
    text = tweet["text"]
    scores = MODEL.polarity_scores(text)

    # Output, in tab-separated format with a newline at the end, the tweet ID
    # plus all of the fields returned by the sentiment analyzer.
    out_record = (
        id, scores["compound"], scores["pos"], scores["neg"], scores["neu"])

    out_str = "\t".join([str(field) for field in out_record])
    out = b"%s\n" % out_str

    sys.stdout.write(out)
--------------------------------------------------------------------------------
/transform/vader_sentiment_lexicon.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/memsql/pipelines-twitter-demo/0acfd5095d308f94f965aff08f85894e88867b96/transform/vader_sentiment_lexicon.txt
--------------------------------------------------------------------------------
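
A note on testing the transform locally (an illustrative sketch, not part of
the original repo): transform.py expects each record on stdin to be prefixed
with an 8-byte native unsigned-long length, so on a 64-bit machine you can
frame a single fake record yourself. This assumes Python 2 is available, that
`nltk` and `numpy` are importable (for example via
`pip install --target transform/python_deps nltk numpy`, mirroring the
Dockerfile), and that the real vader_sentiment_lexicon.txt is present in
transform/:

```bash
python2 - <<'EOF' | python2 transform/transform.py
import json, struct, sys
# One fake record: a tweet ID, a tab, then a JSON blob with a "text" field.
record = "1\t" + json.dumps({"text": "The warriors played a great game!"})
# 8-byte native unsigned-long length prefix, as transform_records() expects.
sys.stdout.write(struct.pack("L", len(record)) + record)
EOF
```

The expected output is a single tab-separated line: the tweet ID followed by
the compound, positive, negative and neutral scores.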