├── .gitignore ├── LICENSE ├── README.md ├── aws_config ├── .gitignore ├── __init__.py ├── configure │ ├── __init__.py │ ├── config_utils.py │ ├── configure_elasticsearch.py │ ├── configure_flask.py │ ├── configure_kafka.py │ ├── configure_spark.py │ ├── configure_storm.py │ └── templates │ │ ├── elasticsearch.yml │ │ ├── kafka-server-start.sh │ │ ├── kafka.server.properties │ │ ├── spark-env.sh │ │ ├── storm.yaml │ │ └── zoo.cfg ├── create_clusters.py ├── discover.py ├── host_install_scripts │ ├── elasticsearch_install.sh │ ├── flask_install.sh │ ├── kafka_install.sh │ ├── spark_install.sh │ └── storm_install.sh └── straw_service_config.sh ├── config └── config.properties ├── data ├── .gitignore ├── queries.bigrams.gz ├── queries.small ├── tweets.big.sample.gz └── tweets.small ├── local_demo ├── launch_demo_ui.sh ├── launch_local_cluster.sh ├── mock_firehose.sh └── prerequisites.sh ├── src ├── frontend │ ├── app │ │ ├── __init__.py │ │ ├── query_subscriber.py │ │ ├── static │ │ │ ├── assets │ │ │ │ ├── favicon.ico │ │ │ │ └── straw.pdf │ │ │ ├── css │ │ │ │ ├── bootstrap-theme.css │ │ │ │ ├── bootstrap-theme.css.map │ │ │ │ ├── bootstrap-theme.min.css │ │ │ │ ├── bootstrap.css │ │ │ │ ├── bootstrap.css.map │ │ │ │ ├── bootstrap.min.css │ │ │ │ └── theme.css │ │ │ ├── fonts │ │ │ │ ├── glyphicons-halflings-regular.eot │ │ │ │ ├── glyphicons-halflings-regular.svg │ │ │ │ ├── glyphicons-halflings-regular.ttf │ │ │ │ ├── glyphicons-halflings-regular.woff │ │ │ │ └── glyphicons-halflings-regular.woff2 │ │ │ └── js │ │ │ │ ├── bootstrap.js │ │ │ │ ├── bootstrap.min.js │ │ │ │ ├── customize.min.js │ │ │ │ ├── docs.min.js │ │ │ │ ├── ie10-viewport-bug-workaround.js │ │ │ │ ├── jquery.js │ │ │ │ └── npm.js │ │ ├── straw_app.py │ │ ├── templates │ │ │ ├── about.html │ │ │ └── index.html │ │ └── views.py │ ├── launch.sh │ └── run.py ├── kafka_stream_eater │ ├── kafka_stream_consumer.py │ ├── kafka_stream_producer.py │ └── third_party │ │ └── kafka-docker-master │ │ ├── Dockerfile │ │ ├── LICENSE │ │ ├── README.md │ │ ├── broker-list.sh │ │ ├── docker-compose-single-broker.yml │ │ ├── docker-compose.yml │ │ ├── download-kafka.sh │ │ ├── start-kafka-shell.sh │ │ └── start-kafka.sh ├── luwak_search │ ├── .gitignore │ ├── pom.xml │ ├── run_luwak_topology.sh │ ├── src │ │ └── straw │ │ │ └── storm │ │ │ ├── LuwakSearchTopology.java │ │ │ ├── bolt │ │ │ └── LuwakSearchBolt.java │ │ │ └── util │ │ │ ├── ConfigurationManager.java │ │ │ ├── Counter.java │ │ │ ├── LuwakHelper.java │ │ │ ├── RequestsHelper.java │ │ │ └── ScheduledMessageCounter.java │ └── submit_topology.sh └── storming_search │ ├── .gitignore │ ├── pom.xml │ ├── run_search_topology.sh │ ├── src │ └── straw │ │ └── storm │ │ ├── StreamingSearchTopology.java │ │ ├── bolt │ │ └── SearchBolt.java │ │ └── util │ │ ├── ConfigurationManager.java │ │ ├── Counter.java │ │ ├── PercolatorHelper.java │ │ ├── RequestsHelper.java │ │ └── ScheduledMessageCounter.java │ └── submit_search_topology.sh ├── test ├── launch_luwak_test_cluster.sh └── launch_percolator_test_cluster.sh └── util ├── docker_elasticsearch.sh ├── elasticsearch_index_demo.py ├── elasticsearch_percolator_demo.py ├── kafka_add_documents.sh ├── kafka_add_queries.sh ├── query_maker.py ├── redis_pub_sub_demo.py ├── stage_demo_mode.sh └── tweet_sampler.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | 
.Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Ryan Walker 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | straw 2 | ================= 3 | A platform for real-time streaming search 4 | 5 | #### Table of Contents 6 | * [Overview](#overview) 7 | * [What's included:](#whats-included) 8 | * [Architecture](#architecture) 9 | * [Getting started](#getting-started) 10 | * [Running locally](#running-locally) 11 | * [Deploy to AWS](#deploy-to-aws) 12 | * [Prerequisites:](#prerequisites) 13 | * [Steps:](#steps) 14 | * [Submitting topologies](#submitting-topologies) 15 | * [Configuring Redis](#configuring-redis) 16 | * [Benchmarking and simulation](#benchmarking-and-simulation) 17 | * [Measuring throughput](#measuring-throughput) 18 | * [Generating/simulating data](#generatingsimulating-data) 19 | 20 | 21 | 22 | 23 | ## Overview 24 | The goal of this project is to provide a clean, scalable architecture for real-time search on streaming data. Additionally, the project contains utilities to provide some very simple throughput benchmarking of Elasticsearch Percolators vs Lucence-Luwak. 
A full writeup of the project can be found at: 25 | 26 | http://blog.ryanwalker.us/2015/11/building-streaming-search-platform.html 27 | 28 | This project was inspired by the following excellent blog posts on streaming search: 29 | - http://www.confluent.io/blog/real-time-full-text-search-with-luwak-and-samza/ 30 | - http://www.flax.co.uk/blog/2015/07/27/a-performance-comparison-of-streamed-search-implementations/ 31 | 32 | I completed this project as a Fellow in the 2015C Insight Data Engineering Silicon Valley program. 33 | 34 | The typical use case for a streaming search system involves many users who are interested in running Lucene-style queries against a streaming data source in real time. For example, investors might want to register queries for positive or negative mentions of companies in the Twitter firehose and then receive real-time alerts about matches for their queries. This project provides a base architecture for such a system. In particular, it aims to support: 35 | 36 | - Many diverse users registering queries 37 | - Full Lucene query capabilities against streaming text sources 38 | - Scaling in both the volume of data and in the number of queries 39 | 40 | ## What's included: 41 | - Automated AWS cluster deployment utilities using boto3 42 | - Java-based Storm implementation: 43 | - KafkaSpout for query and document spouts 44 | - Two flavors of streaming search bolts: 45 | - [Elasticsearch-Percolators](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-percolate.html) 46 | - Pure Lucene with [Luwak](https://github.com/flaxsearch/luwak) 47 | - Storm topology for streaming search and configuration management 48 | - Scripts to populate document streams, including Twitter API sampling utilities 49 | - Simple Python Flask web UI 50 | - Testing and other utilities, including Docker components so that the entire topology can run on a local machine 51 | 52 | ## Architecture 53 | The core of the platform is an Apache Storm cluster which parallelizes the work of real-time streaming search. Internally, the Storm cluster consumes messages from a Kafka cluster, and these messages are distributed to bolts which each contain a Lucene-Luwak index. The project contains a demo Flask UI which handles subscriptions with a Redis PUBSUB system. 54 | 55 | The key layers of the system are: 56 | 57 | - Real-time ingestion via Kafka from a streaming source (e.g. the Twitter firehose) 58 | - Storm cluster to distribute tweets from Kafka to workers; each worker contains a Lucene instance with Luwak 59 | - Publish-subscribe system (Redis) which receives matches and delivers them back to the application server 60 | - Application server (Python Flask) which registers queries from users and serves matches 61 | 62 | More about the architecture can be found at: 63 | http://straw.ryanwalker.us/about 64 |
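To make the publish-subscribe layer concrete, here is a minimal sketch of how a producer could publish a match on a per-query Redis channel and how the application server could listen for it. It uses the `redis-py` client; the channel naming and connection settings are illustrative assumptions, not the exact scheme used by `query_subscriber.py`:

```
import redis

# illustrative connection settings; in a deployment these come from configuration
r = redis.StrictRedis(host="localhost", port=6379)

def publish_match(query_id, document):
    # a producer (e.g. a search bolt) announces a hit on the query's channel
    r.publish(query_id, document)

def listen_for_matches(query_id):
    # the application server subscribes to the query's channel and
    # forwards each matching document back to the subscribed user
    pubsub = r.pubsub()
    pubsub.subscribe(query_id)
    for message in pubsub.listen():
        if message["type"] == "message":
            yield message["data"].decode("utf-8")
```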
65 | ## Getting started 66 | 67 | There are two options for running straw. For development, you can run a mini version of the entire platform on a single local machine. In local mode, the dependent services run in Docker containers. For production, you can deploy the system to the cloud. The project supports a fully automated deployment to AWS with fully customizable cluster configurations. 68 | 69 | ### Running locally 70 | 71 | Minimum supported requirements: Ubuntu 14.04 with Docker 1.8.0 or better 72 | 73 | UPDATE: I've added utility scripts to make launching the demo mode a bit simpler. Now, you can just do the following steps: 74 | 75 | 1. `cd local_demo` 76 | 2. Install the prerequisites: `./prerequisites.sh` 77 | 3. Run `./launch_local_cluster.sh` 78 | 4. In a separate shell, run `./launch_demo_ui.sh` 79 | 5. In a separate shell, run `./mock_firehose.sh` 80 | 6. Open a web browser and point to [http://localhost:5000](http://localhost:5000) 81 | 7. Type "Justin Bieber" or some other common Twitter query (only 100k unique documents can be found in the mock stream). 82 | 83 | For reference, here are the old step-by-step launch instructions: 84 | 85 | 1. Install [docker-compose](http://docs.docker.com/compose/install/) and redis-server 86 | 2. Run `util/stage_demo_mode.sh`. This will create Docker containers for Kafka with Zookeeper and Elasticsearch and will populate these services with some example data. [BUG: You may have to run this script twice!] 87 | 3. `cd src/storming_search` OR `src/luwak_search`, depending on which flavor of search you want to build 88 | 4. Run `mvn package` 89 | 5. Run `./run_luwak_topology.sh`. This will start the local Storm cluster with the Luwak topology. 90 | 6. In a separate terminal, start the webserver frontend by calling `./run.py` from `src/frontend` 91 | 7. Open a browser and point to the frontend UI. By default: [http://localhost:5000](http://localhost:5000) 92 | 8. Enter a query that will likely generate lots of hits, e.g. "Justin Bieber". Note: there are only 100k sampled tweets included with the repo, but there are utility scripts for collecting more. 93 | 9. To start a simulated tweet stream, `cd util` and run `./kafka_add_documents.sh`. 94 | 95 | ### Deploy to AWS 96 | #### Prerequisites: 97 | 98 | 1. Install the aws cli: `sudo apt-get install awscli` 99 | 2. Install Python boto3: `sudo pip3 install boto3` 100 | 3. Set your default configurations by calling `aws configure` 101 | 4. Modify the settings in `aws_config/straw_service_config.sh` to your own AWS account information and then 102 | ``` 103 | source straw_service_config.sh 104 | ``` 105 | 106 | #### Steps: 107 | 108 | 1. `cd aws_config` 109 | 2. `./create_clusters.py --help` to get instructions about this AWS creation script, then follow those instructions. 110 | 3. Once all resources are created, `cd configure`. This directory contains scripts to configure each of the individual services; you'll need to run each of these to configure the corresponding resource, e.g. `./configure_elasticsearch.py`. 111 | 4. Once the resources are configured, run 112 | ``` 113 | ./discover.py 114 | ``` 115 | to see the list of services and their IPs. 116 | 117 | #### Submitting topologies 118 | To submit or run topologies, you need to install Storm on your machine (or, even better, on a dedicated machine within the subnet of the Storm cluster). Install Storm as follows: 119 | ``` 120 | sudo apt-get update 121 | sudo apt-get install openjdk-7-jdk 122 | wget http://mirrors.gigenet.com/apache/storm/apache-storm-0.9.5/apache-storm-0.9.5.tar.gz -P ~/Downloads 123 | sudo tar zxvf ~/Downloads/apache-storm*.gz -C /usr/local 124 | sudo mv /usr/local/apache-storm* /usr/local/storm 125 | ``` 126 | Then edit `/usr/local/storm/conf/storm.yaml` by adding the line 127 | ```nimbus.host: 10.X.X.X``` 128 | using either the private or public IP of the nimbus node. If you use a public IP, you need to update the security group. If you use a private IP, you need to be running from within the subnet. 129 | 130 | Next, you need to tell Storm where all of your cluster resources reside. To do this, 131 | ``` 132 | vi config/config.properties 133 | ``` 134 | Enter the private IPs of your system resources, following this template. We are assuming that all of the resources live on the same subnet in the cluster.
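For illustration, here is a sketch of what such a properties file might look like. The key names and addresses are hypothetical placeholders; use the property names already present in the repo's `config/config.properties`, since those are the ones the topology's configuration loader expects:

```
# hypothetical keys and addresses, for illustration only;
# follow the actual template in config/config.properties
kafka.host=10.0.0.11:9092
zookeeper.host=10.0.0.11:2181
elasticsearch.host=10.0.0.21
redis.host=10.0.0.31
redis.port=6379
```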
135 | 136 | You should now switch into the source directory of either the Luwak or Elasticsearch topology and build the topology, e.g. 137 | ``` 138 | cd /home/ubuntu/straw/src/luwak_search 139 | mvn clean 140 | mvn package 141 | ``` 142 | Finally, you can submit the topology to the cluster (whose nimbus node you specified in `storm.yaml` above) by executing 143 | ``` 144 | ./submit_topology.sh 145 | ``` 146 | 147 | 148 | ### Configuring Redis 149 | The included webserver and the query result pipeline both rely on Redis as a publish-subscribe system. Redis can also be used to collect the benchmarking statistics for profiling Luwak and Elasticsearch. 150 | 151 | Install Redis on the same server as the webserver and modify the bind interface: 152 | ``` 153 | # set bind 0.0.0.0 in redis.conf: 154 | sudo apt-get install redis-server 155 | sudo vi /etc/redis/redis.conf 156 | ``` 157 | If you want to use a separate Redis instance for the benchmarking, you should repeat the above step on a different AWS machine and update the global configuration `config/config.properties`. 158 | 159 | ## Benchmarking and simulation 160 | A goal of the straw project was to allow for benchmarking of the Lucene-Luwak package in a distributed context. 161 | 162 | ### Measuring throughput 163 | I measure throughput through the search bolts of the Storm cluster in a simple way: start a stopwatch in a background thread. Each bolt has a counter which gets incremented each time a document gets checked against the search engine. When the stopwatch hits 10 seconds, collect the data from each counter, publish the result to a Redis DB, and reset the counter. 164 | 165 | ### Generating/simulating data 166 | For benchmarking and simulations, you'll need a way to generate tweets and queries. For this purpose, I've added many tools to the `straw/util` directory. In particular, the scripts 167 | ``` 168 | ./kafka_add_documents.sh 169 | ./kafka_add_queries.sh 170 | ``` 171 | can be used to add documents and queries from sample files. Some small example data files are found in ```straw/data```. For a long-running simulation, you can run ```./kafka_add_documents.sh``` in a cron job to periodically put documents into the Kafka cluster. NOTE: Kafka has been configured to purge documents after 1 hour. 172 | 173 | You can easily harvest your own tweet data from the Twitter API. Try the following helper script, which uses Twython to read from the Twitter streaming sample API: 174 | ``` 175 | ./tweet_sampler.py --help 176 | ``` 177 | You'll need to export your Twitter credentials as environment variables to run this and other scripts, e.g. 178 | ``` 179 | source my_twitter_credentials 180 | ``` 181 | where `my_twitter_credentials` looks like 182 | ``` 183 | export TWITTER_ACCESS_TOKEN=... 184 | export TWITTER_SECRET_TOKEN=... 185 | export TWITTER_CONSUMER_TOKEN=... 186 | export TWITTER_CONSUMER_SECRET=... 187 | 188 | ``` 189 | To generate many reasonably complex queries for the benchmarking studies, the included query maker utility might be helpful: 190 | ``` 191 | ./query_maker.py --help 192 | ``` 193 | This script takes a sample of tweets and uses NLTK to compute bigram frequencies. The most frequent bigrams are then converted into queries that Straw can parse. For ease of use, I've included `data/queries.bigrams` in the repo. This is a collection of 100,000 generated bigram queries collected from a sample of 20 million tweets.
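For illustration, here is a rough sketch of the bigram approach. This is not the actual `query_maker.py`; the real script's tokenization and output format may differ (for example, it may emit records formatted for Kafka):

```
#!/usr/bin/env python3
# Rough sketch: count bigrams in a file of tweets with NLTK and print the
# most frequent ones as simple two-term queries.
import sys
from collections import Counter
import nltk

def top_bigram_queries(tweet_file, n=1000):
    counts = Counter()
    with open(tweet_file, encoding="utf-8") as f:
        for line in f:
            # naive tokenization; keep only alphabetic tokens
            tokens = [t.lower() for t in line.split() if t.isalpha()]
            counts.update(nltk.bigrams(tokens))
    # join each frequent bigram into a whitespace-separated query string
    return [" ".join(pair) for pair, _ in counts.most_common(n)]

if __name__ == "__main__":
    for query in top_bigram_queries(sys.argv[1]):
        print(query)
```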
194 | 195 | -------------------------------------------------------------------------------- /aws_config/.gitignore: -------------------------------------------------------------------------------- 1 | *.tmp 2 | -------------------------------------------------------------------------------- /aws_config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwalk/straw/1940d521538635d5eab394d0ed4c87caf366b0c7/aws_config/__init__.py -------------------------------------------------------------------------------- /aws_config/configure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwalk/straw/1940d521538635d5eab394d0ed4c87caf366b0c7/aws_config/configure/__init__.py -------------------------------------------------------------------------------- /aws_config/configure/config_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | def quiet_wrap(cmd): 4 | return(" ".join(["nohup",cmd, "< /dev/null > std.out 2> std.err &"])) 5 | -------------------------------------------------------------------------------- /aws_config/configure/configure_elasticsearch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # 3 | # Configure Kafka on ec2 instances 4 | # 5 | import boto3, os, sys 6 | sys.path.append("..") 7 | from botocore.exceptions import ClientError as BotoClientError 8 | from time import sleep 9 | from create_clusters import get_tag, keyfile 10 | 11 | # configuration 12 | my_instances_filters = [{ 'Name': 'instance-state-name', 'Values': ['running']}, {'Name':'tag-value', 'Values':[get_tag('elasticsearch-node')]}] 13 | 14 | if __name__=="__main__": 15 | 16 | # find all the host nodes 17 | ec2 = boto3.resource('ec2') 18 | hosts = [] 19 | private_ips = [] 20 | reservations = ec2.instances.filter( Filters = my_instances_filters ) 21 | for instance in reservations: 22 | print("ID: {0:<15}\tIP: {1:<15}".format(instance.instance_id, instance.public_ip_address)) 23 | hosts.append(instance.public_ip_address) 24 | private_ips.append(instance.private_ip_address) 25 | 26 | if len(hosts) != len(private_ips): 27 | raise(RuntimeError("Host and private ips not consistent!")) 28 | 29 | if len(hosts) == 0: 30 | raise(RuntimeError("No hosts found.")) 31 | 32 | ####################################################################### 33 | # Elasticsearch 34 | ####################################################################### 35 | print("Starting elasticsearch configuration...") 36 | 37 | # create a temporary config file 38 | with open("templates/elasticsearch.yml.tmp", "w") as tmpfile: 39 | with open("templates/elasticsearch.yml","r") as f: 40 | # copy over the template 41 | for l in f: 42 | tmpfile.write(l) 43 | 44 | # add cloud credentials 45 | # hack: boto3 doesn't yet offer a way to access the store configuration values 46 | S = boto3._get_default_session() 47 | profile = S._session.full_config['profiles']['default'] 48 | 49 | # add profile information to elasticsearch config to enable cloud discovery 50 | tmpfile.write("cloud.aws.access_key: {0}\n".format(profile['aws_access_key_id'])) 51 | tmpfile.write("cloud.aws.secret_key: {0}\n".format(profile['aws_secret_access_key'])) 52 | tmpfile.write("cloud.aws.region: {0}\n".format(profile['region'])) 53 | tmpfile.write("discovery.type: ec2\n") 54 | tmpfile.write("discovery.ec2.groups: 
{0}\n".format(get_tag('elasticsearch-security-group'))) 55 | #tmpfile.write("discovery.ec2.host_type: public_ip\n") 56 | tmpfile.write("cluster.name: {0}\n".format(get_tag('elasticsearch-cluster'))) 57 | 58 | # build the command queue 59 | cmd_str = [] 60 | for h in hosts: 61 | # add commands to queue 62 | cmd_str.append("scp -i {0} {1} ubuntu@{2}:elasticsearch.yml".format(keyfile, tmpfile.name, h)) 63 | cmd_str.append("ssh -i {0} ubuntu@{1} sudo mv elasticsearch.yml /etc/elasticsearch/elasticsearch.yml".format(keyfile, h)) 64 | 65 | # start each node 66 | cmd_str.extend(["ssh -i {0} ubuntu@{1} \"sudo service elasticsearch start\"".format(keyfile, h) for h in hosts]) 67 | 68 | # execute the remote commands 69 | for cmd in cmd_str: 70 | print(cmd) 71 | res=os.system(cmd) 72 | if res!=0: 73 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 74 | -------------------------------------------------------------------------------- /aws_config/configure/configure_flask.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # 3 | # Configure Kafka on ec2 instances 4 | # 5 | import boto3, os, sys 6 | sys.path.append("..") 7 | from botocore.exceptions import ClientError as BotoClientError 8 | from time import sleep 9 | from create_clusters import get_tag, keyfile 10 | from config_utils import quiet_wrap 11 | 12 | # configuration 13 | my_instances_filters = [{ 'Name': 'instance-state-name', 'Values': ['running']}, {'Name':'tag-value', 'Values':[get_tag('flask-node')]}] 14 | 15 | if __name__=="__main__": 16 | 17 | # find all the host nodes 18 | ec2 = boto3.resource('ec2') 19 | hosts = [] 20 | private_ips = [] 21 | public_dns = [] 22 | reservations = ec2.instances.filter( Filters = my_instances_filters ) 23 | for instance in reservations: 24 | print("ID: {0:<15}\tIP: {1:<15}".format(instance.instance_id, instance.public_ip_address)) 25 | hosts.append(instance.public_ip_address) 26 | private_ips.append(instance.private_ip_address) 27 | public_dns.append(instance.public_dns_name) 28 | 29 | if len(hosts) != len(private_ips): 30 | raise(RuntimeError("Host and private ips not consistent!")) 31 | 32 | if len(hosts) == 0: 33 | raise(RuntimeError("No hosts found.")) 34 | 35 | ####################################################################### 36 | # flask 37 | ####################################################################### 38 | cmd_str = [] 39 | for h in hosts: 40 | print("Starting flask configuration...") 41 | cmd_str.append("(cd ../../src/ && tar -zcvf frontend.tmp.tar.gz frontend)") 42 | cmd_str.append("(cd ../../src/ && scp -i {0} frontend.tmp.tar.gz ubuntu@{1}:)".format(keyfile, h)) 43 | cmd_str.append("(cd ../../src/ && rm frontend.tmp.tar.gz)") 44 | cmd_str.append("ssh -i {0} ubuntu@{1} tar xvf frontend.tmp.tar.gz".format(keyfile, h)) 45 | 46 | # launch webapp 47 | cmd_str.append("ssh -i {0} ubuntu@{1} \"{2}\"".format(keyfile, h, quiet_wrap("sudo ./frontend/run.py"))) 48 | 49 | # execute the remote commands 50 | for cmd in cmd_str: 51 | print(cmd) 52 | res=os.system(cmd) 53 | if res!=0: 54 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 55 | 56 | for a in public_dns: 57 | print("Straw Frontend:\thttp://{0}:5000".format(a)) 58 | 59 | -------------------------------------------------------------------------------- /aws_config/configure/configure_kafka.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 
2 | # 3 | # Configure Kafka on ec2 instances 4 | # 5 | 6 | import boto3, os, sys 7 | from botocore.exceptions import ClientError as BotoClientError 8 | from time import sleep 9 | sys.path.append("..") 10 | from create_clusters import get_tag, keyfile 11 | 12 | # configuration 13 | my_instances_filters = [{ 'Name': 'instance-state-name', 'Values': ['running']}, {'Name':'tag-value', 'Values':[get_tag('kafka-node')]}] 14 | 15 | if __name__=="__main__": 16 | 17 | # find all the host nodes 18 | ec2 = boto3.resource('ec2') 19 | hosts = [] 20 | private_ips = [] 21 | reservations = ec2.instances.filter( Filters = my_instances_filters ) 22 | for instance in reservations: 23 | print("ID: {0:<15}\tIP: {1:<15}".format(instance.instance_id, instance.public_ip_address)) 24 | hosts.append(instance.public_ip_address) 25 | private_ips.append(instance.private_ip_address) 26 | 27 | if len(hosts) != len(private_ips): 28 | raise(RuntimeError("Host and private ips not consistent!")) 29 | 30 | if len(hosts) == 0: 31 | raise(RuntimeError("No hosts found.")) 32 | 33 | ####################################################################### 34 | # ZOOKEEPER 35 | ####################################################################### 36 | # just a little hacking to inject some settings into the templates 37 | # TODO: parallelize this to save some boot time 38 | print("Starting zookeeper configuration...") 39 | zooid = 1 40 | for h in hosts: 41 | cmd_str = [] 42 | with open("templates/zoo.cfg.tmp", "w") as tmpfile: 43 | with open("templates/zoo.cfg","r") as f: 44 | # copy over the template 45 | for l in f: 46 | tmpfile.write(l) 47 | 48 | # append the server settings 49 | host_strings= ["server.{0}={1}:2888:3888".format(i+1,private_ips[i]) for i in range(len(hosts))] 50 | for s in host_strings: 51 | tmpfile.write(s + "\n") 52 | cmd_str.append("scp -i {0} {1} ubuntu@{2}:zoo.cfg".format(keyfile, tmpfile.name, h)) 53 | cmd_str.append("ssh -i {0} ubuntu@{1} sudo mv zoo.cfg /etc/zookeeper/conf/zoo.cfg".format(keyfile, h)) 54 | 55 | # Assign the zookeeper ids 56 | cmd_str.append("ssh -i {0} ubuntu@{1} \" echo 'echo {2} > /var/lib/zookeeper/myid' | sudo -s\" ".format(keyfile, h, zooid)) 57 | zooid+=1 58 | 59 | # execute the remote commands 60 | for cmd in cmd_str: 61 | print(cmd) 62 | res=os.system(cmd) 63 | if res!=0: 64 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 65 | 66 | # start each zookeeper 67 | cmd_str = ["ssh -i {0} ubuntu@{1} sudo service zookeeper restart".format(keyfile, h) for h in hosts] 68 | for cmd in cmd_str: 69 | print(cmd) 70 | res=os.system(cmd) 71 | if res!=0: 72 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 73 | 74 | ####################################################################### 75 | # Kafka 76 | ####################################################################### 77 | print("Starting kafka configuration...") 78 | broker_id = 0 79 | kafka_start_script = "templates/kafka-server-start.sh" 80 | for i,h in enumerate(hosts): 81 | cmd_str = [] 82 | with open("templates/kafka.server.properties.tmp", "w") as tmpfile: 83 | with open("templates/kafka.server.properties","r") as f: 84 | # copy over the template 85 | for l in f: 86 | tmpfile.write(l) 87 | 88 | # advertise host's private IP 89 | # tmpfile.write("advertised.host.name: {0}\n".format(h)) 90 | 91 | # add zookeeper info 92 | host_strings= ["{0}:2181".format(private_ips[i]) for i in range(len(hosts))] 93 | 
tmpfile.write("zookeeper.connect={0}\n".format(",".join(host_strings))) 94 | 95 | # set broker id 96 | tmpfile.write("broker.id={0}\n".format(broker_id)) 97 | broker_id+=1 98 | 99 | # add commands to queue 100 | cmd_str.append("scp -i {0} {1} ubuntu@{2}:server.properties".format(keyfile, tmpfile.name, h)) 101 | cmd_str.append("ssh -i {0} ubuntu@{1} sudo mv server.properties /usr/local/kafka/config/server.properties".format(keyfile, h)) 102 | cmd_str.append("scp -i {0} {1} ubuntu@{2}:kafka-server-start.sh".format(keyfile, kafka_start_script, h)) 103 | cmd_str.append("ssh -i {0} ubuntu@{1} sudo mv kafka-server-start.sh /usr/local/kafka/bin/kafka-server-start.sh ".format(keyfile, h)) 104 | 105 | # execute the remote commands 106 | for cmd in cmd_str: 107 | print(cmd) 108 | res=os.system(cmd) 109 | if res!=0: 110 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 111 | 112 | # start each kafka 113 | cmd_str = ["ssh -i {0} ubuntu@{1} \"nohup sudo /usr/local/kafka/bin/kafka-server-start.sh /usr/local/kafka/config/server.properties < /dev/null > std.out 2> std.err &\"".format(keyfile, h) for h in hosts] 114 | 115 | for cmd in cmd_str: 116 | print(cmd) 117 | res=os.system(cmd) 118 | if res!=0: 119 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 120 | 121 | 122 | # create the documents and queries topics on one of the Kafka nodes 123 | 124 | h = hosts[0] 125 | cmd_str = ["ssh -i {0} ubuntu@{1} /usr/local/kafka/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor {2} --partitions {3} --topic documents".format(keyfile, h, 2, 5), "ssh -i {0} ubuntu@{1} /usr/local/kafka/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor {2} --partitions {3} --topic queries".format(keyfile, h, 3, 1)] 126 | 127 | for cmd in cmd_str: 128 | print(cmd) 129 | res=os.system(cmd) 130 | if res!=0: 131 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 132 | -------------------------------------------------------------------------------- /aws_config/configure/configure_spark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # 3 | # Configure Kafka on ec2 instances 4 | # 5 | 6 | import boto3, os, sys 7 | from botocore.exceptions import ClientError as BotoClientError 8 | from time import sleep 9 | sys.path.append("..") 10 | from create_clusters import get_tag, keyfile 11 | 12 | # configuration 13 | my_instances_filters = [{ 'Name': 'instance-state-name', 'Values': ['running']}, {'Name':'tag-value', 'Values':[get_tag('spark-node')]}] 14 | 15 | if __name__=="__main__": 16 | 17 | # find all the host nodes 18 | ec2 = boto3.resource('ec2') 19 | hosts = [] 20 | private_ips = [] 21 | reservations = ec2.instances.filter( Filters = my_instances_filters ) 22 | for instance in reservations: 23 | print("ID: {0:<15}\tIP: {1:<15}".format(instance.instance_id, instance.public_ip_address)) 24 | hosts.append(instance.public_ip_address) 25 | private_ips.append(instance.private_ip_address) 26 | 27 | if len(hosts) != len(private_ips): 28 | raise(RuntimeError("Host and private ips not consistent!")) 29 | 30 | if len(hosts) == 0: 31 | raise(RuntimeError("No hosts found.")) 32 | 33 | # Identify master node 34 | master = hosts[0] 35 | ####################################################################### 36 | # Spark requires passwordless SSH 37 | ####################################################################### 38 | 
cmd_str = [] 39 | 40 | # generate a key on the master 41 | cmd_str.append("ssh -i {0} ubuntu@{1} \"sudo apt-get -y install ssh rsync && ssh-keygen -f ~/.ssh/id_rsa -t rsa -P \'\' \"".format(keyfile, hosts[0])) 42 | 43 | # download public key temporarily 44 | cmd_str.append("scp -i {0} ubuntu@{1}:.ssh/id_rsa.pub {2}".format(keyfile, master, "templates/key.tmp")) 45 | 46 | # auth public key for all hosts 47 | for h in hosts: 48 | cmd_str.append("scp -i {0} {1} ubuntu@{2}:".format(keyfile, "templates/key.tmp", h)) 49 | cmd_str.append("ssh -i {0} ubuntu@{1} \"cat key.tmp >> ~/.ssh/authorized_keys\"".format(keyfile, h)) 50 | 51 | for cmd in cmd_str: 52 | print(cmd) 53 | res=os.system(cmd) 54 | if res!=0: 55 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 56 | 57 | ####################################################################### 58 | # Spark 59 | ####################################################################### 60 | print("Starting Spark configuration...") 61 | for i,h in enumerate(hosts): 62 | cmd_str = [] 63 | with open("templates/spark-env.sh.tmp", "w") as tmpfile: 64 | with open("templates/spark-env.sh","r") as f: 65 | # copy over the template 66 | for l in f: 67 | tmpfile.write(l) 68 | 69 | # advertise host's private IP 70 | tmpfile.write("export SPARK_PUBLIC_DNS={0}\n".format(private_ips[i])) 71 | 72 | # add commands to queue 73 | cmd_str.append("scp -i {0} {1} ubuntu@{2}:".format(keyfile, tmpfile.name, h)) 74 | cmd_str.append("ssh -i {0} ubuntu@{1} sudo mv spark-env.sh.tmp /usr/local/spark/conf/spark-env.sh".format(keyfile, h)) 75 | 76 | # execute the remote commands 77 | for cmd in cmd_str: 78 | print(cmd) 79 | res=os.system(cmd) 80 | if res!=0: 81 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 82 | 83 | # send the slaves file to the master 84 | with open("templates/slaves.tmp", "w") as tmpfile: 85 | for i,h in enumerate(hosts[1:]): 86 | tmpfile.write("{0}\n".format(private_ips[i])) 87 | 88 | # add commands to queue 89 | cmd_str.append("scp -i {0} {1} ubuntu@{2}:".format(keyfile, tmpfile.name, master)) 90 | cmd_str.append("ssh -i {0} ubuntu@{1} sudo mv slaves.tmp /usr/local/spark/conf/slaves".format(keyfile, master)) 91 | 92 | # start spark on the master 93 | cmd_str.append("ssh -i {0} ubuntu@{1} /usr/local/spark/sbin/start-all.sh".format(keyfile, master)) 94 | 95 | for cmd in cmd_str: 96 | print(cmd) 97 | res=os.system(cmd) 98 | if res!=0: 99 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 100 | 101 | 102 | 103 | -------------------------------------------------------------------------------- /aws_config/configure/configure_storm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # 3 | # Configure Kafka on ec2 instances 4 | # 5 | import boto3, os, argparse, sys 6 | sys.path.append("..") 7 | from botocore.exceptions import ClientError as BotoClientError 8 | from time import sleep 9 | from create_clusters import get_tag, keyfile 10 | from config_utils import quiet_wrap 11 | 12 | # configuration 13 | my_instances_filters = [{ 'Name': 'instance-state-name', 'Values': ['running']}, {'Name':'tag-value', 'Values':[get_tag('storm-node')]}] 14 | 15 | if __name__=="__main__": 16 | 17 | # argument help 18 | parser = argparse.ArgumentParser(description='Configure the storm cluster.') 19 | parser.add_argument('--elasticsearch', help='Collocate elasticsearch with Storm cluster.', action='store_true') 
20 | args = parser.parse_args() 21 | 22 | # find all the host nodes 23 | ec2 = boto3.resource('ec2') 24 | hosts = [] 25 | private_ips = [] 26 | reservations = ec2.instances.filter( Filters = my_instances_filters ) 27 | for instance in reservations: 28 | print("ID: {0:<15}\tIP: {1:<15}".format(instance.instance_id, instance.public_ip_address)) 29 | hosts.append(instance.public_ip_address) 30 | private_ips.append(instance.private_ip_address) 31 | 32 | if len(hosts) != len(private_ips): 33 | raise(RuntimeError("Host and private ips not consistent!")) 34 | 35 | if len(hosts) == 0: 36 | raise(RuntimeError("No hosts found.")) 37 | 38 | ####################################################################### 39 | # ZOOKEEPER 40 | ####################################################################### 41 | # just a little hacking to inject some settings into the templates 42 | # TODO: parallelize this to save some boot time 43 | print("Starting zookeeper configuration...") 44 | zooid = 1 45 | for h in hosts: 46 | cmd_str = [] 47 | with open("templates/zoo.cfg.tmp", "w") as tmpfile: 48 | with open("templates/zoo.cfg","r") as f: 49 | # copy over the template 50 | for l in f: 51 | tmpfile.write(l) 52 | 53 | # append the server settings 54 | host_strings= ["server.{0}={1}:2888:3888".format(i+1,private_ips[i]) for i in range(len(hosts))] 55 | for s in host_strings: 56 | tmpfile.write(s + "\n") 57 | cmd_str.append("scp -i {0} {1} ubuntu@{2}:zoo.cfg".format(keyfile, tmpfile.name, h)) 58 | cmd_str.append("ssh -i {0} ubuntu@{1} sudo mv zoo.cfg /etc/zookeeper/conf/zoo.cfg".format(keyfile, h)) 59 | 60 | # Assign the zookeeper ids 61 | cmd_str.append("ssh -i {0} ubuntu@{1} \" echo 'echo {2} > /var/lib/zookeeper/myid' | sudo -s\" ".format(keyfile, h, zooid)) 62 | zooid+=1 63 | 64 | # execute the remote commands 65 | for cmd in cmd_str: 66 | print(cmd) 67 | res=os.system(cmd) 68 | if res!=0: 69 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 70 | 71 | # start each zookeeper 72 | cmd_str = ["ssh -i {0} ubuntu@{1} sudo service zookeeper restart".format(keyfile, h) for h in hosts] 73 | for cmd in cmd_str: 74 | print(cmd) 75 | res=os.system(cmd) 76 | if res!=0: 77 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 78 | 79 | ####################################################################### 80 | # Storm 81 | ####################################################################### 82 | print("Starting Storm configuration...") 83 | for h in hosts: 84 | cmd_str = [] 85 | with open("templates/storm.yaml.tmp", "w") as tmpfile: 86 | with open("templates/storm.yaml.tmp","r") as f: 87 | # copy over the template 88 | for l in f: 89 | tmpfile.write(l) 90 | 91 | # add zookeeper info 92 | tmpfile.write("storm.zookeeper.servers:\n") 93 | host_strings= [" - \"{0}\"\n".format(private_ips[i]) for i in range(len(hosts))] 94 | for v in host_strings: 95 | tmpfile.write(v) 96 | 97 | # declare the master 98 | tmpfile.write("nimbus.host: \"{0}\"\n".format(private_ips[0])) 99 | 100 | # path to stateful info 101 | tmpfile.write("storm.local.dir: \"/usr/local/storm/local_state\"\n") 102 | 103 | # supervisor info 104 | # supervisor.slots.ports: 105 | # - 6700 106 | # - 6701 107 | # etc.. 
108 | tmpfile.write("supervisor.slots.ports:\n") 109 | tmpfile.write("".join([" -{0}\n".format([6700 + i for i in range(len(hosts))])])) 110 | 111 | # add commands to queue 112 | cmd_str.append("scp -i {0} {1} ubuntu@{2}:storm.yaml".format(keyfile, tmpfile.name, h)) 113 | cmd_str.append("ssh -i {0} ubuntu@{1} sudo mv storm.yaml /usr/local/storm/conf/storm.yaml".format(keyfile, h)) 114 | 115 | if h==hosts[0]: 116 | # start nimbus 117 | cmd_str.append("ssh -i {0} ubuntu@{1} \"{2}\"".format(keyfile, h, quiet_wrap("sudo /usr/local/storm/bin/storm nimbus"))) 118 | # web ui 119 | cmd_str.append("ssh -i {0} ubuntu@{1} \"{2}\"".format(keyfile, h, quiet_wrap("sudo /usr/local/storm/bin/storm ui"))) 120 | else: 121 | cmd_str.append("ssh -i {0} ubuntu@{1} \"{2}\"".format(keyfile, h, quiet_wrap("sudo /usr/local/storm/bin/storm supervisor"))) 122 | 123 | # execute the remote commands 124 | for cmd in cmd_str: 125 | print(cmd) 126 | res=os.system(cmd) 127 | if res!=0: 128 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 129 | 130 | # print some info 131 | # TODO: retag master and open its 8080 port. 132 | print("Master: {0}".format(hosts[0])) 133 | print("\n".join(["Worker: "+ h for h in hosts[1:]])) 134 | 135 | if args.elasticsearch == True: 136 | ####################################################################### 137 | # Collocated Elasticsearch 138 | ####################################################################### 139 | 140 | cmd_str = [] 141 | for h in hosts: 142 | cmd_str.append("scp -i {0} {1} ubuntu@{2}:".format(keyfile, "../host_install_scripts/elasticsearch_install.sh", h)) 143 | cmd_str.append("ssh -i {0} ubuntu@{1} sudo ./elasticsearch_install.sh".format(keyfile, h)) 144 | 145 | # execute the remote commands 146 | for cmd in cmd_str: 147 | print(cmd) 148 | res=os.system(cmd) 149 | if res!=0: 150 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 151 | 152 | print("Starting elasticsearch configuration...") 153 | # create a temporary config file 154 | with open("templates/elasticsearch.yml.tmp", "w") as tmpfile: 155 | with open("templates/elasticsearch.yml","r") as f: 156 | # copy over the template 157 | for l in f: 158 | tmpfile.write(l) 159 | 160 | # add cloud credentials 161 | # hack: boto3 doesn't yet offer a way to access the store configuration values 162 | S = boto3._get_default_session() 163 | profile = S._session.full_config['profiles']['default'] 164 | 165 | # add profile information to elasticsearch config to enable cloud discovery 166 | tmpfile.write("cloud.aws.access_key: {0}\n".format(profile['aws_access_key_id'])) 167 | tmpfile.write("cloud.aws.secret_key: {0}\n".format(profile['aws_secret_access_key'])) 168 | tmpfile.write("cloud.aws.region: {0}\n".format(profile['region'])) 169 | tmpfile.write("discovery.type: ec2\n") 170 | tmpfile.write("discovery.ec2.groups: {0}\n".format(get_tag('elasticsearch-security-group'))) 171 | #tmpfile.write("discovery.ec2.host_type: public_ip\n") 172 | tmpfile.write("cluster.name: {0}\n".format(get_tag('elasticsearch-cluster'))) 173 | 174 | # build the command queue 175 | cmd_str = [] 176 | for h in hosts: 177 | # add commands to queue 178 | cmd_str.append("scp -i {0} {1} ubuntu@{2}:elasticsearch.yml".format(keyfile, tmpfile.name, h)) 179 | cmd_str.append("ssh -i {0} ubuntu@{1} sudo mv elasticsearch.yml /etc/elasticsearch/elasticsearch.yml".format(keyfile, h)) 180 | 181 | # start each node 182 | cmd_str.extend(["ssh -i {0} ubuntu@{1} \"sudo service 
elasticsearch start\"".format(keyfile, h) for h in hosts]) 183 | 184 | # execute the remote commands 185 | for cmd in cmd_str: 186 | print(cmd) 187 | res=os.system(cmd) 188 | if res!=0: 189 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 190 | 191 | 192 | -------------------------------------------------------------------------------- /aws_config/configure/templates/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | ##################### Elasticsearch Configuration Example ##################### 2 | 3 | # This file contains an overview of various configuration settings, 4 | # targeted at operations staff. Application developers should 5 | # consult the guide at . 6 | # 7 | # The installation procedure is covered at 8 | # . 9 | # 10 | # Elasticsearch comes with reasonable defaults for most settings, 11 | # so you can try it out without bothering with configuration. 12 | # 13 | # Most of the time, these defaults are just fine for running a production 14 | # cluster. If you're fine-tuning your cluster, or wondering about the 15 | # effect of certain configuration option, please _do ask_ on the 16 | # mailing list or IRC channel [http://elasticsearch.org/community]. 17 | 18 | # Any element in the configuration can be replaced with environment variables 19 | # by placing them in ${...} notation. For example: 20 | # 21 | #node.rack: ${RACK_ENV_VAR} 22 | 23 | # For information on supported formats and syntax for the config file, see 24 | # 25 | 26 | 27 | ################################### Cluster ################################### 28 | 29 | # Cluster name identifies your cluster for auto-discovery. If you're running 30 | # multiple clusters on the same network, make sure you're using unique names. 31 | # 32 | #cluster.name: elasticsearch 33 | 34 | 35 | #################################### Node ##################################### 36 | 37 | # Node names are generated dynamically on startup, so you're relieved 38 | # from configuring them manually. You can tie this node to a specific name: 39 | # 40 | #node.name: "Franz Kafka" 41 | 42 | # Every node can be configured to allow or deny being eligible as the master, 43 | # and to allow or deny to store the data. 44 | # 45 | # Allow this node to be eligible as a master node (enabled by default): 46 | # 47 | #node.master: true 48 | # 49 | # Allow this node to store data (enabled by default): 50 | # 51 | #node.data: true 52 | 53 | # You can exploit these settings to design advanced cluster topologies. 54 | # 55 | # 1. You want this node to never become a master node, only to hold data. 56 | # This will be the "workhorse" of your cluster. 57 | # 58 | #node.master: false 59 | #node.data: true 60 | # 61 | # 2. You want this node to only serve as a master: to not store any data and 62 | # to have free resources. This will be the "coordinator" of your cluster. 63 | # 64 | #node.master: true 65 | #node.data: false 66 | # 67 | # 3. You want this node to be neither master nor data node, but 68 | # to act as a "search load balancer" (fetching data from nodes, 69 | # aggregating results, etc.) 70 | # 71 | #node.master: false 72 | #node.data: false 73 | 74 | # Use the Cluster Health API [http://localhost:9200/_cluster/health], the 75 | # Node Info API [http://localhost:9200/_nodes] or GUI tools 76 | # such as , 77 | # , 78 | # and 79 | # to inspect the cluster state. 
80 | 81 | # A node can have generic attributes associated with it, which can later be used 82 | # for customized shard allocation filtering, or allocation awareness. An attribute 83 | # is a simple key value pair, similar to node.key: value, here is an example: 84 | # 85 | #node.rack: rack314 86 | 87 | # By default, multiple nodes are allowed to start from the same installation location 88 | # to disable it, set the following: 89 | #node.max_local_storage_nodes: 1 90 | 91 | 92 | #################################### Index #################################### 93 | 94 | # You can set a number of options (such as shard/replica options, mapping 95 | # or analyzer definitions, translog settings, ...) for indices globally, 96 | # in this file. 97 | # 98 | # Note, that it makes more sense to configure index settings specifically for 99 | # a certain index, either when creating it or by using the index templates API. 100 | # 101 | # See and 102 | # 103 | # for more information. 104 | 105 | # Set the number of shards (splits) of an index (5 by default): 106 | # 107 | #index.number_of_shards: 5 108 | 109 | # Set the number of replicas (additional copies) of an index (1 by default): 110 | # 111 | #index.number_of_replicas: 1 112 | 113 | # Note, that for development on a local machine, with small indices, it usually 114 | # makes sense to "disable" the distributed features: 115 | # 116 | #index.number_of_shards: 1 117 | #index.number_of_replicas: 0 118 | 119 | # These settings directly affect the performance of index and search operations 120 | # in your cluster. Assuming you have enough machines to hold shards and 121 | # replicas, the rule of thumb is: 122 | # 123 | # 1. Having more *shards* enhances the _indexing_ performance and allows to 124 | # _distribute_ a big index across machines. 125 | # 2. Having more *replicas* enhances the _search_ performance and improves the 126 | # cluster _availability_. 127 | # 128 | # The "number_of_shards" is a one-time setting for an index. 129 | # 130 | # The "number_of_replicas" can be increased or decreased anytime, 131 | # by using the Index Update Settings API. 132 | # 133 | # Elasticsearch takes care about load balancing, relocating, gathering the 134 | # results from nodes, etc. Experiment with different settings to fine-tune 135 | # your setup. 136 | 137 | # Use the Index Status API () to inspect 138 | # the index status. 139 | 140 | 141 | #################################### Paths #################################### 142 | 143 | # Path to directory containing configuration (this file and logging.yml): 144 | # 145 | #path.conf: /path/to/conf 146 | 147 | # Path to directory where to store index data allocated for this node. 148 | # 149 | #path.data: /path/to/data 150 | # 151 | # Can optionally include more than one location, causing data to be striped across 152 | # the locations (a la RAID 0) on a file level, favouring locations with most free 153 | # space on creation. For example: 154 | # 155 | #path.data: /path/to/data1,/path/to/data2 156 | 157 | # Path to temporary files: 158 | # 159 | #path.work: /path/to/work 160 | 161 | # Path to log files: 162 | # 163 | #path.logs: /path/to/logs 164 | 165 | # Path to where plugins are installed: 166 | # 167 | #path.plugins: /path/to/plugins 168 | 169 | 170 | #################################### Plugin ################################### 171 | 172 | # If a plugin listed here is not installed for current node, the node will not start. 
173 | # 174 | #plugin.mandatory: mapper-attachments,lang-groovy 175 | 176 | 177 | ################################### Memory #################################### 178 | 179 | # Elasticsearch performs poorly when JVM starts swapping: you should ensure that 180 | # it _never_ swaps. 181 | # 182 | # Set this property to true to lock the memory: 183 | # 184 | #bootstrap.mlockall: true 185 | 186 | # Make sure that the ES_MIN_MEM and ES_MAX_MEM environment variables are set 187 | # to the same value, and that the machine has enough memory to allocate 188 | # for Elasticsearch, leaving enough memory for the operating system itself. 189 | # 190 | # You should also make sure that the Elasticsearch process is allowed to lock 191 | # the memory, eg. by using `ulimit -l unlimited`. 192 | 193 | 194 | ############################## Network And HTTP ############################### 195 | 196 | # Elasticsearch, by default, binds itself to the 0.0.0.0 address, and listens 197 | # on port [9200-9300] for HTTP traffic and on port [9300-9400] for node-to-node 198 | # communication. (the range means that if the port is busy, it will automatically 199 | # try the next port). 200 | 201 | # Set the bind address specifically (IPv4 or IPv6): 202 | # 203 | #network.bind_host: 192.168.0.1 204 | 205 | # Set the address other nodes will use to communicate with this node. If not 206 | # set, it is automatically derived. It must point to an actual IP address. 207 | # 208 | #network.publish_host: 192.168.0.1 209 | 210 | # Set both 'bind_host' and 'publish_host': 211 | # 212 | #network.host: 192.168.0.1 213 | 214 | # Set a custom port for the node to node communication (9300 by default): 215 | # 216 | #transport.tcp.port: 9300 217 | 218 | # Enable compression for all communication between nodes (disabled by default): 219 | # 220 | #transport.tcp.compress: true 221 | 222 | # Set a custom port to listen for HTTP traffic: 223 | # 224 | #http.port: 9200 225 | 226 | # Set a custom allowed content length: 227 | # 228 | #http.max_content_length: 100mb 229 | 230 | # Disable HTTP completely: 231 | # 232 | #http.enabled: false 233 | 234 | 235 | ################################### Gateway ################################### 236 | 237 | # The gateway allows for persisting the cluster state between full cluster 238 | # restarts. Every change to the state (such as adding an index) will be stored 239 | # in the gateway, and when the cluster starts up for the first time, 240 | # it will read its state from the gateway. 241 | 242 | # There are several types of gateway implementations. For more information, see 243 | # . 244 | 245 | # The default gateway type is the "local" gateway (recommended): 246 | # 247 | #gateway.type: local 248 | 249 | # Settings below control how and when to start the initial recovery process on 250 | # a full cluster restart (to reuse as much local data as possible when using shared 251 | # gateway). 252 | 253 | # Allow recovery process after N nodes in a cluster are up: 254 | # 255 | #gateway.recover_after_nodes: 1 256 | 257 | # Set the timeout to initiate the recovery process, once the N nodes 258 | # from previous setting are up (accepts time value): 259 | # 260 | #gateway.recover_after_time: 5m 261 | 262 | # Set how many nodes are expected in this cluster. 
Once these N nodes 263 | # are up (and recover_after_nodes is met), begin recovery process immediately 264 | # (without waiting for recover_after_time to expire): 265 | # 266 | #gateway.expected_nodes: 2 267 | 268 | 269 | ############################# Recovery Throttling ############################# 270 | 271 | # These settings allow to control the process of shards allocation between 272 | # nodes during initial recovery, replica allocation, rebalancing, 273 | # or when adding and removing nodes. 274 | 275 | # Set the number of concurrent recoveries happening on a node: 276 | # 277 | # 1. During the initial recovery 278 | # 279 | #cluster.routing.allocation.node_initial_primaries_recoveries: 4 280 | # 281 | # 2. During adding/removing nodes, rebalancing, etc 282 | # 283 | #cluster.routing.allocation.node_concurrent_recoveries: 2 284 | 285 | # Set to throttle throughput when recovering (eg. 100mb, by default 20mb): 286 | # 287 | #indices.recovery.max_bytes_per_sec: 20mb 288 | 289 | # Set to limit the number of open concurrent streams when 290 | # recovering a shard from a peer: 291 | # 292 | #indices.recovery.concurrent_streams: 5 293 | 294 | 295 | ################################## Discovery ################################## 296 | 297 | # Discovery infrastructure ensures nodes can be found within a cluster 298 | # and master node is elected. Multicast discovery is the default. 299 | 300 | # Set to ensure a node sees N other master eligible nodes to be considered 301 | # operational within the cluster. This should be set to a quorum/majority of 302 | # the master-eligible nodes in the cluster. 303 | # 304 | #discovery.zen.minimum_master_nodes: 1 305 | 306 | # Set the time to wait for ping responses from other nodes when discovering. 307 | # Set this option to a higher value on a slow or congested network 308 | # to minimize discovery failures: 309 | # 310 | #discovery.zen.ping.timeout: 3s 311 | 312 | # For more information, see 313 | # 314 | 315 | # Unicast discovery allows to explicitly control which nodes will be used 316 | # to discover the cluster. It can be used when multicast is not present, 317 | # or to restrict the cluster communication-wise. 318 | # 319 | # 1. Disable multicast discovery (enabled by default): 320 | # 321 | #discovery.zen.ping.multicast.enabled: false 322 | # 323 | # 2. Configure an initial list of master nodes in the cluster 324 | # to perform discovery when new nodes (master or data) are started: 325 | # 326 | #discovery.zen.ping.unicast.hosts: ["host1", "host2:port"] 327 | 328 | # EC2 discovery allows to use AWS EC2 API in order to perform discovery. 329 | # 330 | # You have to install the cloud-aws plugin for enabling the EC2 discovery. 331 | # 332 | # For more information, see 333 | # 334 | # 335 | # See 336 | # for a step-by-step tutorial. 337 | 338 | # GCE discovery allows to use Google Compute Engine API in order to perform discovery. 339 | # 340 | # You have to install the cloud-gce plugin for enabling the GCE discovery. 341 | # 342 | # For more information, see . 343 | 344 | # Azure discovery allows to use Azure API in order to perform discovery. 345 | # 346 | # You have to install the cloud-azure plugin for enabling the Azure discovery. 347 | # 348 | # For more information, see . 349 | 350 | ################################## Slow Log ################################## 351 | 352 | # Shard level query and fetch threshold logging. 
353 | 354 | #index.search.slowlog.threshold.query.warn: 10s 355 | #index.search.slowlog.threshold.query.info: 5s 356 | #index.search.slowlog.threshold.query.debug: 2s 357 | #index.search.slowlog.threshold.query.trace: 500ms 358 | 359 | #index.search.slowlog.threshold.fetch.warn: 1s 360 | #index.search.slowlog.threshold.fetch.info: 800ms 361 | #index.search.slowlog.threshold.fetch.debug: 500ms 362 | #index.search.slowlog.threshold.fetch.trace: 200ms 363 | 364 | #index.indexing.slowlog.threshold.index.warn: 10s 365 | #index.indexing.slowlog.threshold.index.info: 5s 366 | #index.indexing.slowlog.threshold.index.debug: 2s 367 | #index.indexing.slowlog.threshold.index.trace: 500ms 368 | 369 | ################################## GC Logging ################################ 370 | 371 | #monitor.jvm.gc.young.warn: 1000ms 372 | #monitor.jvm.gc.young.info: 700ms 373 | #monitor.jvm.gc.young.debug: 400ms 374 | 375 | #monitor.jvm.gc.old.warn: 10s 376 | #monitor.jvm.gc.old.info: 5s 377 | #monitor.jvm.gc.old.debug: 2s 378 | 379 | ################################## Security ################################ 380 | 381 | # Uncomment if you want to enable JSONP as a valid return transport on the 382 | # http server. With this enabled, it may pose a security risk, so disabling 383 | # it unless you need it is recommended (it is disabled by default). 384 | # 385 | #http.jsonp.enable: true 386 | 387 | 388 | 389 | 390 | -------------------------------------------------------------------------------- /aws_config/configure/templates/kafka-server-start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | export JMX_PORT=${JMX_PORT:-9999} 19 | 20 | if [ $# -lt 1 ]; 21 | then 22 | echo "USAGE: $0 [-daemon] server.properties" 23 | exit 1 24 | fi 25 | base_dir=$(dirname $0) 26 | 27 | if [ "x$KAFKA_LOG4J_OPTS" = "x" ]; then 28 | export KAFKA_LOG4J_OPTS="-Dlog4j.configuration=file:$base_dir/../config/log4j.properties" 29 | fi 30 | 31 | if [ "x$KAFKA_HEAP_OPTS" = "x" ]; then 32 | export KAFKA_HEAP_OPTS="-Xmx1G -Xms1G" 33 | fi 34 | 35 | EXTRA_ARGS="-name kafkaServer -loggc" 36 | 37 | COMMAND=$1 38 | case $COMMAND in 39 | -daemon) 40 | EXTRA_ARGS="-daemon "$EXTRA_ARGS 41 | shift 42 | ;; 43 | *) 44 | ;; 45 | esac 46 | 47 | exec $base_dir/kafka-run-class.sh $EXTRA_ARGS kafka.Kafka $@ 48 | -------------------------------------------------------------------------------- /aws_config/configure/templates/kafka.server.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. 
See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # see kafka.server.KafkaConfig for additional details and defaults 16 | 17 | ############################# Server Basics ############################# 18 | # SEE END OF FILE 19 | # The id of the broker. This must be set to a unique integer for each broker. 20 | # broker.id=0 21 | 22 | ############################# Socket Server Settings ############################# 23 | 24 | # The port the socket server listens on 25 | port=9092 26 | 27 | # Hostname the broker will bind to. If not set, the server will bind to all interfaces 28 | #host.name=localhost 29 | 30 | # Hostname the broker will advertise to producers and consumers. If not set, it uses the 31 | # value for "host.name" if configured. Otherwise, it will use the value returned from 32 | # java.net.InetAddress.getCanonicalHostName(). 33 | #advertised.host.name= 34 | 35 | # The port to publish to ZooKeeper for clients to use. If this is not set, 36 | # it will publish the same port that the broker binds to. 37 | #advertised.port= 38 | 39 | # The number of threads handling network requests 40 | num.network.threads=3 41 | 42 | # The number of threads doing disk I/O 43 | num.io.threads=8 44 | 45 | # The send buffer (SO_SNDBUF) used by the socket server 46 | socket.send.buffer.bytes=102400 47 | 48 | # The receive buffer (SO_RCVBUF) used by the socket server 49 | socket.receive.buffer.bytes=102400 50 | 51 | # The maximum size of a request that the socket server will accept (protection against OOM) 52 | socket.request.max.bytes=104857600 53 | 54 | 55 | ############################# Log Basics ############################# 56 | 57 | # A comma seperated list of directories under which to store log files 58 | log.dirs=/tmp/kafka-logs 59 | 60 | # The default number of log partitions per topic. More partitions allow greater 61 | # parallelism for consumption, but this will also result in more files across 62 | # the brokers. 63 | num.partitions=1 64 | 65 | # The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. 66 | # This value is recommended to be increased for installations with data dirs located in RAID array. 67 | num.recovery.threads.per.data.dir=1 68 | 69 | ############################# Log Flush Policy ############################# 70 | 71 | # Messages are immediately written to the filesystem but by default we only fsync() to sync 72 | # the OS cache lazily. The following configurations control the flush of data to disk. 73 | # There are a few important trade-offs here: 74 | # 1. Durability: Unflushed data may be lost if you are not using replication. 75 | # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. 76 | # 3. 
Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. 77 | # The settings below allow one to configure the flush policy to flush data after a period of time or 78 | # every N messages (or both). This can be done globally and overridden on a per-topic basis. 79 | 80 | # The number of messages to accept before forcing a flush of data to disk 81 | #log.flush.interval.messages=10000 82 | 83 | # The maximum amount of time a message can sit in a log before we force a flush 84 | log.flush.interval.ms=500 85 | 86 | ############################# Log Retention Policy ############################# 87 | 88 | # The following configurations control the disposal of log segments. The policy can 89 | # be set to delete segments after a period of time, or after a given size has accumulated. 90 | # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens 91 | # from the end of the log. 92 | 93 | # The minimum age of a log file to be eligible for deletion 94 | log.retention.hours=1 95 | 96 | # A size-based retention policy for logs. Segments are pruned from the log as long as the remaining 97 | # segments don't drop below log.retention.bytes. 98 | #log.retention.bytes=1073741824 99 | 100 | # The maximum size of a log segment file. When this size is reached a new log segment will be created. 101 | log.segment.bytes=1073741824 102 | 103 | # The interval at which log segments are checked to see if they can be deleted according 104 | # to the retention policies 105 | log.retention.check.interval.ms=300000 106 | 107 | # By default the log cleaner is disabled and the log retention policy will default to just delete segments after their retention expires. 108 | # If log.cleaner.enable=true is set the cleaner will be enabled and individual logs can then be marked for log compaction. 109 | log.cleaner.enable=false 110 | 111 | ############################# Zookeeper ############################# 112 | 113 | # Zookeeper connection string (see zookeeper docs for details). 114 | # This is a comma separated host:port pairs, each corresponding to a zk 115 | # server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002". 116 | # You can also append an optional chroot string to the urls to specify the 117 | # root directory for all kafka znodes. 118 | 119 | # Timeout in ms for connecting to zookeeper 120 | zookeeper.connection.timeout.ms=6000 121 | 122 | # 123 | # INJECTED CONFIGURATION [SEE ABOVE FOR PARAMETER DETAILS] 124 | # 125 | 126 | 127 | -------------------------------------------------------------------------------- /aws_config/configure/templates/spark-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This file is sourced when running various Spark programs. 4 | # Copy it as spark-env.sh and edit that to configure Spark for your site. 
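# As an illustration only (these values are not part of the template and would
# need to be tuned per site), a customised copy might pin the bind address and
# worker memory:
#   export SPARK_LOCAL_IP=10.0.0.12
#   export SPARK_MASTER_IP=10.0.0.12
#   export SPARK_WORKER_MEMORY=4g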
5 | 6 | # Options read when launching programs locally with 7 | # ./bin/run-example or ./bin/spark-submit 8 | # - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files 9 | # - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node 10 | # - SPARK_PUBLIC_DNS, to set the public dns name of the driver program 11 | # - SPARK_CLASSPATH, default classpath entries to append 12 | 13 | # Options read by executors and drivers running inside the cluster 14 | # - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node 15 | # - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program 16 | # - SPARK_CLASSPATH, default classpath entries to append 17 | # - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data 18 | # - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos 19 | 20 | # Options read in YARN client mode 21 | # - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files 22 | # - SPARK_EXECUTOR_INSTANCES, Number of workers to start (Default: 2) 23 | # - SPARK_EXECUTOR_CORES, Number of cores for the workers (Default: 1). 24 | # - SPARK_EXECUTOR_MEMORY, Memory per Worker (e.g. 1000M, 2G) (Default: 1G) 25 | # - SPARK_DRIVER_MEMORY, Memory for Master (e.g. 1000M, 2G) (Default: 512 Mb) 26 | # - SPARK_YARN_APP_NAME, The name of your application (Default: Spark) 27 | # - SPARK_YARN_QUEUE, The hadoop queue to use for allocation requests (Default: ‘default’) 28 | # - SPARK_YARN_DIST_FILES, Comma separated list of files to be distributed with the job. 29 | # - SPARK_YARN_DIST_ARCHIVES, Comma separated list of archives to be distributed with the job. 30 | 31 | # Options for the daemons used in the standalone deploy mode 32 | # - SPARK_MASTER_IP, to bind the master to a different IP address or hostname 33 | # - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master 34 | # - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y") 35 | # - SPARK_WORKER_CORES, to set the number of cores to use on this machine 36 | # - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g) 37 | # - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker 38 | # - SPARK_WORKER_INSTANCES, to set the number of worker processes per node 39 | # - SPARK_WORKER_DIR, to set the working directory of worker processes 40 | # - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y") 41 | # - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y") 42 | # - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y") 43 | # - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y") 44 | # - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers 45 | 46 | # Generic options for the daemons used in the standalone deploy mode 47 | # - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf) 48 | # - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs) 49 | # - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp) 50 | # - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER) 51 | # - SPARK_NICENESS The scheduling priority for daemons. 
(Default: 0) 52 | 53 | export JAVA_HOME=/usr 54 | export SPARK_WORKER_CORES=$(echo $(nproc)*3 | bc) 55 | -------------------------------------------------------------------------------- /aws_config/configure/templates/storm.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | ########### These MUST be filled in for a storm configuration 18 | # storm.zookeeper.servers: 19 | # - "server1" 20 | # - "server2" 21 | # 22 | # nimbus.host: "nimbus" 23 | # 24 | # 25 | # ##### These may optionally be filled in: 26 | # 27 | ## List of custom serializations 28 | # topology.kryo.register: 29 | # - org.mycompany.MyType 30 | # - org.mycompany.MyType2: org.mycompany.MyType2Serializer 31 | # 32 | ## List of custom kryo decorators 33 | # topology.kryo.decorators: 34 | # - org.mycompany.MyDecorator 35 | # 36 | ## Locations of the drpc servers 37 | # drpc.servers: 38 | # - "server1" 39 | # - "server2" 40 | 41 | ## Metrics Consumers 42 | # topology.metrics.consumer.register: 43 | # - class: "backtype.storm.metric.LoggingMetricsConsumer" 44 | # parallelism.hint: 1 45 | # - class: "org.mycompany.MyMetricsConsumer" 46 | # parallelism.hint: 1 47 | # argument: 48 | # - endpoint: "metrics-collector.mycompany.org" 49 | -------------------------------------------------------------------------------- /aws_config/configure/templates/zoo.cfg: -------------------------------------------------------------------------------- 1 | # http://hadoop.apache.org/zookeeper/docs/current/zookeeperAdmin.html 2 | 3 | # The number of milliseconds of each tick 4 | tickTime=2000 5 | # The number of ticks that the initial 6 | # synchronization phase can take 7 | initLimit=10 8 | # The number of ticks that can pass between 9 | # sending a request and getting an acknowledgement 10 | syncLimit=5 11 | # the directory where the snapshot is stored. 12 | dataDir=/var/lib/zookeeper 13 | # Place the dataLogDir to a separate physical disc for better performance 14 | # dataLogDir=/disk2/zookeeper 15 | 16 | # the port at which the clients will connect 17 | clientPort=2181 18 | 19 | # To avoid seeks ZooKeeper allocates space in the transaction log file in 20 | # blocks of preAllocSize kilobytes. The default block size is 64M. One reason 21 | # for changing the size of the blocks is to reduce the block size if snapshots 22 | # are taken more often. (Also, see snapCount). 23 | #preAllocSize=65536 24 | 25 | # Clients can submit requests faster than ZooKeeper can process them, 26 | # especially if there are a lot of clients. 
To prevent ZooKeeper from running 27 | # out of memory due to queued requests, ZooKeeper will throttle clients so that 28 | # there is no more than globalOutstandingLimit outstanding requests in the 29 | # system. The default limit is 1,000.ZooKeeper logs transactions to a 30 | # transaction log. After snapCount transactions are written to a log file a 31 | # snapshot is started and a new transaction log file is started. The default 32 | # snapCount is 10,000. 33 | #snapCount=1000 34 | 35 | # If this option is defined, requests will be will logged to a trace file named 36 | # traceFile.year.month.day. 37 | #traceFile= 38 | 39 | # Leader accepts client connections. Default value is "yes". The leader machine 40 | # coordinates updates. For higher update throughput at thes slight expense of 41 | # read throughput the leader can be configured to not accept clients and focus 42 | # on coordination. 43 | #leaderServes=yes 44 | 45 | # specify all zookeeper servers 46 | # The fist port is used by followers to connect to the leader 47 | # The second one is used for leader election 48 | 49 | 50 | -------------------------------------------------------------------------------- /aws_config/create_clusters.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # 3 | # Create the resources for Straw cluster on AWS 4 | # 5 | # RUN aws configure prior to executing this script. 6 | # 7 | # 8 | import boto3, os, argparse 9 | from time import sleep 10 | 11 | ############################# 12 | # CONFIG 13 | ############################# 14 | try: 15 | keyfile = os.environ["AWS_PEM_FILE"] 16 | pemkey=os.environ["PEM_KEY"] 17 | tag_prefix = os.environ["TAG_PREFIX"] 18 | except KeyError as e: 19 | print("Can't find PEM and/or tag ENV variable. You must export values for AWS_PEM FILE and TAG_PREFIX.") 20 | raise e 21 | 22 | # network settings -- only single subnet right now 23 | vpc_cidr = "10.0.0.0/27" 24 | subnet_cidr = "10.0.0.0/27" 25 | 26 | # node settings 27 | kafka_instances=5 28 | elasticsearch_instances=3 29 | storm_instances=7 30 | 31 | # initializtion files 32 | path = "host_install_scripts" 33 | kafka_initfile = os.path.join(path, "kafka_install.sh") 34 | elasticsearch_initfile = os.path.join(path, "elasticsearch_install.sh") 35 | storm_initfile = os.path.join(path, "storm_install.sh") 36 | flask_initfile = os.path.join(path, "flask_install.sh") 37 | spark_initfile = os.path.join(path, "spark_install.sh") 38 | 39 | # base AWS settings 40 | base_aws_image = 'ami-5189a661' 41 | 42 | # services 43 | services = ['kafka', 'elasticsearch', 'storm', 'flask', 'spark'] 44 | 45 | ############################### 46 | # helper methods 47 | def get_tag(name): 48 | # all service tags will be prefixed with the "tag_prefix" value 49 | return (tag_prefix + "-" + name) 50 | 51 | 52 | ############################### 53 | if __name__=="__main__": 54 | 55 | # argument help 56 | parser = argparse.ArgumentParser(description='Launch AWS EC2 instances for the straw cluster.') 57 | parser.add_argument('service', help='Name of service to start one of {0}. 
Specify \'all\' to launch all services.'.format(services)) 58 | args = parser.parse_args() 59 | 60 | # boto3 api 61 | ec2 = boto3.resource('ec2') 62 | 63 | 64 | ############################################################ 65 | # 66 | # NETWORKING -- common to all services 67 | # 68 | ############################################################ 69 | # check if vpc already exists 70 | vpcid = None 71 | for v in ec2.vpcs.filter(Filters=[{'Name':'tag-value','Values':[get_tag('vpc')]}]): 72 | vpcid = v.id 73 | 74 | # create the vpc 75 | if vpcid is None: 76 | my_vpc = ec2.create_vpc(CidrBlock=vpc_cidr) 77 | vpc = ec2.Vpc(my_vpc.id) 78 | vpc.modify_attribute(VpcId=my_vpc.id, EnableDnsSupport={'Value':True}) 79 | vpc.modify_attribute(VpcId=my_vpc.id, EnableDnsHostnames={'Value':True}) 80 | vpc.create_tags(Tags=[{'Key':'Name', 'Value':get_tag('vpc')}]) 81 | else: 82 | vpc = ec2.Vpc(vpcid) 83 | 84 | # 85 | # Create a single subnet in vpc 86 | # 87 | # subnets 88 | subnetid = None 89 | for v in vpc.subnets.filter(Filters=[{'Name':'tag-value','Values':[get_tag('subnet')]}]): 90 | subnetid = v.id 91 | if subnetid is None: 92 | subnet = vpc.create_subnet(CidrBlock=subnet_cidr) 93 | subnet.create_tags(Tags=[{'Key':'Name', 'Value':get_tag('subnet')}]) 94 | else: 95 | subnet = ec2.Subnet(subnetid) 96 | 97 | # Find the gateway id; gateway is automatically created with the subnet? 98 | gatewayid = None 99 | for v in vpc.internet_gateways.filter(Filters=[{'Name':'tag-value','Values':[get_tag('gateway')]}]): 100 | gatewayid = v.id 101 | if gatewayid is None: 102 | gateway = ec2.create_internet_gateway() 103 | gateway.create_tags(Tags=[{'Key':'Name', 'Value':get_tag('gateway')}]) 104 | gateway.attach_to_vpc(VpcId=vpc.id) 105 | 106 | # 107 | # Create a route table 108 | # 109 | rtid = None 110 | for v in vpc.route_tables.filter(Filters=[{'Name':'tag-value','Values':[get_tag('route_table')]}]): 111 | rtid = v.id 112 | break 113 | if rtid is None: 114 | rt = ec2.create_route_table(VpcId=vpc.id) 115 | rt.associate_with_subnet(SubnetId=subnet.id) 116 | rt.create_route(GatewayId=gateway.id, DestinationCidrBlock='0.0.0.0/0') 117 | rt.create_tags(Tags=[{'Key':'Name', 'Value':get_tag('route_table')}]) 118 | 119 | # 120 | # Create a security group -- just one for the vpc right now. 121 | # 122 | tag = get_tag('security-group') 123 | description = 'A security group for kafka clusters.' 
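    # The IpPermissions below are applied to both ingress and egress: they allow
    # every TCP port between hosts in 10.0.0.0/16 (a superset of the 10.0.0.0/27
    # VPC CIDR defined above) plus SSH on port 22 from anywhere for administration.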
124 | sgid = None 125 | for v in vpc.security_groups.filter(Filters=[{'Name':'group-name','Values':[tag]}]): 126 | sgid = v.id 127 | if sgid is None: 128 | security_group = ec2.create_security_group(GroupName=tag, Description=description, VpcId=vpc.id) 129 | # permissions 130 | IpPermissions=[ 131 | { 132 | 'IpProtocol': 'tcp', 133 | 'FromPort': 0, 134 | 'ToPort': 65535, 135 | 'IpRanges': [ 136 | { 137 | 'CidrIp': '10.0.0.0/16' 138 | }, 139 | ], 140 | }, 141 | { 142 | 'IpProtocol': 'tcp', 143 | 'FromPort': 22, 144 | 'ToPort': 22, 145 | 'IpRanges': [ 146 | { 147 | 'CidrIp': '0.0.0.0/0' 148 | }, 149 | ], 150 | } 151 | ] 152 | security_group.authorize_egress(IpPermissions=IpPermissions) 153 | security_group.authorize_ingress(IpPermissions=IpPermissions) 154 | else: 155 | security_group = ec2.SecurityGroup(sgid) 156 | 157 | ################################################################ 158 | # 159 | # Services 160 | # 161 | ################################################################ 162 | 163 | if args.service.lower() in ['all','kafka']: 164 | ######################################### 165 | # KAFKA CLUSTER 166 | ######################################### 167 | print("Creating a Kafka cluster...") 168 | # 169 | # EC2 Instances 170 | # 171 | shellcodefile=os.path.abspath(kafka_initfile) 172 | shellfile = open(shellcodefile,'r').read() 173 | pemfile =os.path.abspath(keyfile) 174 | instances = ec2.create_instances( 175 | MinCount=kafka_instances, 176 | MaxCount=kafka_instances, 177 | UserData=shellfile, 178 | KeyName=pemkey, 179 | ImageId=base_aws_image, 180 | InstanceType='m4.large', 181 | NetworkInterfaces=[{'SubnetId': subnet.id, 'DeviceIndex':0, 'Groups':[security_group.id], 'AssociatePublicIpAddress':True}], 182 | BlockDeviceMappings=[ 183 | { 184 | 'VirtualName': 'ephemeral0', 185 | 'DeviceName': '/dev/sda1', 186 | 'Ebs': { 187 | 'VolumeSize': 128, 188 | 'VolumeType': 'gp2' # standard for magnetic, gp2 for SSD 189 | } 190 | } 191 | ] 192 | ) 193 | 194 | # tag instances and assign a public ip 195 | tag='kafka-node' 196 | print("Sleep 60 seconds to give instances time to configure...") 197 | sleep(60) 198 | for v in instances: 199 | v.create_tags(Tags=[{'Key':'Name', 'Value':get_tag(tag)}]) 200 | # elastic ip assignment 201 | #address = client.allocate_address() 202 | #client.associate_address(InstanceId=v.instance_id, PublicIp=address['PublicIp']) 203 | print("SERVICE: {0:<15}\tID: {1:<15}\tIP: {2:<15}\tDNS: {3:<15}".format(tag, v.instance_id, v.public_ip_address, v.public_dns_name)) 204 | 205 | if args.service.lower() in ['all', 'elasticsearch']: 206 | ######################################### 207 | # ELASTICSEARCH CLUSTER 208 | ######################################### 209 | print("Creating an Elasticsearch cluster...") 210 | # 211 | # Create a security group for elasticsearch 212 | # world access to 9200,9300 should modify for production 213 | # 214 | sgid = None 215 | tag = get_tag('elasticsearch-security-group') 216 | description = 'A security group for elasticsearch clusters.' 
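        # Besides the intra-VPC and SSH rules used for the other services, this
        # group opens 9200 (the HTTP/REST API) and 9300 (the transport port that
        # matches elasticsearch_port=9300 in config.properties) to 0.0.0.0/0;
        # convenient for a demo, but worth restricting in production.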
217 | for v in ec2.security_groups.filter(Filters=[{'Name':'group-name','Values':[tag]}]): 218 | sgid = v.id 219 | if sgid is None: 220 | security_group = ec2.create_security_group(GroupName=tag, Description=description, VpcId=vpc.id) 221 | 222 | # permissions 223 | IpPermissions=[ 224 | { 225 | 'IpProtocol': 'tcp', 226 | 'FromPort': 0, 227 | 'ToPort': 65535, 228 | 'IpRanges': [ 229 | { 230 | 'CidrIp': '10.0.0.0/16' 231 | }, 232 | ], 233 | }, 234 | { 235 | 'IpProtocol': 'tcp', 236 | 'FromPort': 22, 237 | 'ToPort': 22, 238 | 'IpRanges': [ 239 | { 240 | 'CidrIp': '0.0.0.0/0' 241 | } 242 | ] 243 | }, 244 | { 245 | 'IpProtocol': 'tcp', 246 | 'FromPort': 9200, 247 | 'ToPort': 9200, 248 | 'IpRanges': [ 249 | { 250 | 'CidrIp': '0.0.0.0/0' 251 | } 252 | ] 253 | }, 254 | { 255 | 'IpProtocol': 'tcp', 256 | 'FromPort': 9300, 257 | 'ToPort': 9300, 258 | 'IpRanges': [ 259 | { 260 | 'CidrIp': '0.0.0.0/0' 261 | } 262 | ] 263 | } 264 | ] 265 | security_group.authorize_egress(IpPermissions=IpPermissions) 266 | security_group.authorize_ingress(IpPermissions=IpPermissions) 267 | else: 268 | security_group = ec2.SecurityGroup(sgid) 269 | 270 | # 271 | # EC2 Instances 272 | # 273 | shellcodefile=os.path.abspath(elasticsearch_initfile) 274 | shellfile = open(shellcodefile,'r').read() 275 | pemfile =os.path.abspath(keyfile) 276 | instances = ec2.create_instances( 277 | MinCount=elasticsearch_instances, 278 | MaxCount=elasticsearch_instances, 279 | UserData=shellfile, 280 | KeyName=pemkey, 281 | ImageId=base_aws_image, 282 | InstanceType='m4.large', 283 | NetworkInterfaces=[{'SubnetId': subnet.id, 'DeviceIndex':0, 'Groups':[security_group.id], 'AssociatePublicIpAddress':True}] 284 | ) 285 | 286 | # tag instances and assign a public ip 287 | tag='elasticsearch-node' 288 | print("Sleep 60 seconds to give instances time to configure...") 289 | sleep(60) 290 | for v in instances: 291 | v.create_tags(Tags=[{'Key':'Name', 'Value':get_tag(tag)}]) 292 | print("SERVICE: {0:<15}\tID: {1:<15}\tIP: {2:<15}\tDNS: {3:<15}".format(tag, v.instance_id, v.public_ip_address, v.public_dns_name)) 293 | 294 | if args.service.lower() in ['all', 'storm']: 295 | ######################################### 296 | # STORM CLUSTER 297 | ######################################### 298 | print("Creating a Storm cluster...") 299 | # 300 | # EC2 Instances 301 | # 302 | shellcodefile=os.path.abspath(storm_initfile) 303 | shellfile = open(shellcodefile,'r').read() 304 | pemfile =os.path.abspath(keyfile) 305 | instances = ec2.create_instances( 306 | MinCount=storm_instances, 307 | MaxCount=storm_instances, 308 | UserData=shellfile, 309 | KeyName=pemkey, 310 | ImageId=base_aws_image, 311 | InstanceType='m4.xlarge', 312 | NetworkInterfaces=[{'SubnetId': subnet.id, 'DeviceIndex':0, 'Groups':[security_group.id], 'AssociatePublicIpAddress':True}], 313 | BlockDeviceMappings=[ 314 | { 315 | 'VirtualName': 'ephemeral0', 316 | 'DeviceName': '/dev/sda1', 317 | 'Ebs': { 318 | 'VolumeSize': 64, 319 | 'VolumeType': 'gp2' # standard for magnetic, gp2 for SSD 320 | } 321 | } 322 | ] 323 | ) 324 | 325 | # tag instances and assign a public ip 326 | tag='storm-node' 327 | print("Sleep 60 seconds to give instances time to configure...") 328 | sleep(60) 329 | for v in instances: 330 | v.create_tags(Tags=[{'Key':'Name', 'Value':get_tag(tag)}]) 331 | print("SERVICE: {0:<15}\tID: {1:<15}\tIP: {2:<15}\tDNS: {3:<15}".format(tag, v.instance_id, v.public_ip_address, v.public_dns_name)) 332 | 333 | if args.service.lower() in ['all', 'spark']: 334 | 
######################################### 335 | # Spark 336 | ######################################### 337 | print("Creating Spark Cluster...") 338 | # 339 | # EC2 Instances 340 | # 341 | shellcodefile=os.path.abspath(spark_initfile) 342 | shellfile = open(shellcodefile,'r').read() 343 | pemfile =os.path.abspath(keyfile) 344 | instances = ec2.create_instances( 345 | MinCount=4, 346 | MaxCount=4, 347 | UserData=shellfile, 348 | KeyName=pemkey, 349 | ImageId=base_aws_image, 350 | InstanceType='m4.large', 351 | NetworkInterfaces=[{'SubnetId': subnet.id, 'DeviceIndex':0, 'Groups':[security_group.id], 'AssociatePublicIpAddress':True}], 352 | BlockDeviceMappings=[ 353 | { 354 | 'VirtualName': 'ephemeral0', 355 | 'DeviceName': '/dev/sda1', 356 | 'Ebs': { 357 | 'VolumeSize': 32, 358 | 'VolumeType': 'gp2' # standard for magnetic, gp2 for SSD 359 | } 360 | } 361 | ] 362 | ) 363 | 364 | # tag instances and assign a public ip 365 | tag='spark-node' 366 | print("Sleep 60 seconds to give instances time to configure...") 367 | sleep(60) 368 | for v in instances: 369 | v.create_tags(Tags=[{'Key':'Name', 'Value':get_tag(tag)}]) 370 | print("SERVICE: {0:<15}\tID: {1:<15}\tIP: {2:<15}\tDNS: {3:<15}".format(tag, v.instance_id, v.public_ip_address, v.public_dns_name)) 371 | 372 | if args.service.lower() in ['all', 'storm']: 373 | ######################################### 374 | # STORM CLUSTER 375 | ######################################### 376 | print("Creating a Storm cluster...") 377 | # 378 | # EC2 Instances 379 | # 380 | shellcodefile=os.path.abspath(storm_initfile) 381 | shellfile = open(shellcodefile,'r').read() 382 | pemfile =os.path.abspath(keyfile) 383 | instances = ec2.create_instances( 384 | MinCount=storm_instances, 385 | MaxCount=storm_instances, 386 | UserData=shellfile, 387 | KeyName=pemkey, 388 | ImageId=base_aws_image, 389 | InstanceType='m4.xlarge', 390 | NetworkInterfaces=[{'SubnetId': subnet.id, 'DeviceIndex':0, 'Groups':[security_group.id], 'AssociatePublicIpAddress':True}], 391 | BlockDeviceMappings=[ 392 | { 393 | 'VirtualName': 'ephemeral0', 394 | 'DeviceName': '/dev/sda1', 395 | 'Ebs': { 396 | 'VolumeSize': 64, 397 | 'VolumeType': 'gp2' # standard for magnetic, gp2 for SSD 398 | } 399 | } 400 | ] 401 | ) 402 | 403 | # tag instances and assign a public ip 404 | tag='storm-node' 405 | print("Sleep 60 seconds to give instances time to configure...") 406 | sleep(60) 407 | for v in instances: 408 | v.create_tags(Tags=[{'Key':'Name', 'Value':get_tag(tag)}]) 409 | print("SERVICE: {0:<15}\tID: {1:<15}\tIP: {2:<15}\tDNS: {3:<15}".format(tag, v.instance_id, v.public_ip_address, v.public_dns_name)) 410 | 411 | 412 | -------------------------------------------------------------------------------- /aws_config/discover.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # 3 | # Discover straw cluster resources running on AWS 4 | # 5 | import boto3, argparse 6 | from create_clusters import services, get_tag, keyfile 7 | 8 | class ServicesList: 9 | '''Container class for AWS services info''' 10 | def __init__(self): 11 | ec2 = boto3.resource('ec2') 12 | client= boto3.client('ec2') 13 | filt=[{'Name': 'instance-state-name', 'Values': ['running']},{'Name':'tag-value','Values':[get_tag(s+'-node') for s in services]}] 14 | self.services = [] 15 | for v in ec2.instances.filter(Filters=filt): 16 | self.services.append(v) 17 | 18 | def print(self): 19 | for v in self.services: 20 | print("SERVICE: {0:<15}\tID: {1:<15}\tIP: {2:<15} PRIVATE IP: 
{3:<15}".format(v.tags[0]['Value'], 21 | v.instance_id, v.public_ip_address, v.private_ip_address)) 22 | 23 | def make_config_file(self, filename): 24 | '''create a straw config file for AWS''' 25 | 26 | def find_first_service(s): 27 | '''find the FIRST listed service with tag post-fix s 28 | NOTE: We implicitly assume that the list of AWS services is fixed. 29 | We should fix that by identifying the leader nodes among each service 30 | type. 31 | ''' 32 | for v in self.services: 33 | if v.tags[0]['Value']==get_tag(s): 34 | return(v.private_ip_address) 35 | 36 | with open(filename,"w") as f: 37 | header = """# 38 | # config for straw.storm application 39 | # 40 | 41 | """ 42 | f.write(header) 43 | elasticsearch = """ 44 | # elasticsearch settings 45 | elasticsearch_host={0} 46 | elasticsearch_port=9300 47 | elasticsearch_cluster_name={1} 48 | index_name=documents 49 | document_type=document 50 | """.format(find_first_service("elasticsearch-node"), get_tag("elasticsearch-cluster")) 51 | f.write(elasticsearch) 52 | 53 | kafka = """ 54 | # kafka settings 55 | zookeeper_host={0} 56 | zookeeper_port=2181 57 | kafka_query_topic=queries 58 | kafka_document_topic=documents 59 | """.format(find_first_service("kafka-node")) 60 | f.write(kafka) 61 | 62 | redis = """ 63 | # redis 64 | redis_host={0} 65 | redis_port=6379 66 | """.format(find_first_service("flask-node")) 67 | f.write(redis) 68 | print("Wrote config file {0}.".format(f.name)) 69 | 70 | if __name__=="__main__": 71 | 72 | # argument help 73 | parser = argparse.ArgumentParser(description='Discover AWS ec2 instances for the straw cluster.') 74 | args = parser.add_argument("--configure", help="Write a configuration file", action="store_true") 75 | args = parser.parse_args() 76 | 77 | # boto3 78 | S = ServicesList() 79 | S.print() 80 | 81 | if args.configure: 82 | S.make_config_file("config.properties.tmp") 83 | -------------------------------------------------------------------------------- /aws_config/host_install_scripts/elasticsearch_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Config/install elasticsearch 1.7 on Ubuntu 14.04 5 | # 6 | # Forked from https://gist.github.com/ricardo-rossi/8265589463915837429d 7 | # and modified by rwalker. 
8 | # 9 | # 10 | 11 | ### Agree to stupid oracle license nonsense 12 | ### See http://stackoverflow.com/questions/19275856/auto-yes-to-the-license-agreement-on-sudo-apt-get-y-install-oracle-java7-instal 13 | echo debconf shared/accepted-oracle-license-v1-1 select true | sudo debconf-set-selections 14 | echo debconf shared/accepted-oracle-license-v1-1 seen true | sudo debconf-set-selections 15 | 16 | ### Install Java 8 17 | apt-get install -y python-software-properties 18 | add-apt-repository -y ppa:webupd8team/java 19 | apt-get update 20 | apt-get install -y oracle-java8-installer 21 | 22 | ### Download and install the Public Signing Key 23 | wget -qO - https://packages.elastic.co/GPG-KEY-elasticsearch | apt-key add - 24 | 25 | ### Setup Repository 26 | echo "deb http://packages.elastic.co/elasticsearch/1.7/debian stable main" | tee -a /etc/apt/sources.list.d/elk.list 27 | 28 | ### Install Elasticsearch 29 | #apt-get purge elasticsearch -y 30 | apt-get update && sudo apt-get install elasticsearch -y 31 | 32 | ### node discovery plugin for AWS 33 | /usr/share/elasticsearch/bin/plugin install elasticsearch/elasticsearch-cloud-aws/2.5.0 34 | 35 | ### start elasticsearch: 36 | # service elasticsearch start 37 | ### To test: 38 | # curl :9200 39 | 40 | -------------------------------------------------------------------------------- /aws_config/host_install_scripts/flask_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ########################################################## 3 | # Flask Webserver setup 4 | ########################################################## 5 | 6 | # python3 discouraged: http://flask.pocoo.org/docs/0.10/python3/ 7 | sudo apt-get -y update 8 | sudo apt-get install -y python-pip python-dev build-essential 9 | sudo pip install flask 10 | sudo pip install flask-session 11 | 12 | # install redis 13 | sudo apt-get install -y redis-server 14 | sudo apt-get install -y supervisor 15 | -------------------------------------------------------------------------------- /aws_config/host_install_scripts/kafka_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # install java and zookeeper 4 | apt-get update 5 | apt-get install -y default-jre 6 | apt-get install -y zookeeperd 7 | 8 | # install kafka 9 | mkdir -p ~/Downloads 10 | wget "http://mirror.cc.columbia.edu/pub/software/apache/kafka/0.8.2.1/kafka_2.11-0.8.2.1.tgz" -P ~/Downloads 11 | tar zxvf ~/Downloads/kafka_2.11-0.8.2.1.tgz -C /usr/local 12 | mv /usr/local/kafka_2.11-0.8.2.1 /usr/local/kafka 13 | 14 | 15 | -------------------------------------------------------------------------------- /aws_config/host_install_scripts/spark_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ### Agree to stupid oracle license nonsense 4 | ### See http://stackoverflow.com/questions/19275856/auto-yes-to-the-license-agreement-on-sudo-apt-get-y-install-oracle-java7-instal 5 | echo debconf shared/accepted-oracle-license-v1-1 select true | sudo debconf-set-selections 6 | echo debconf shared/accepted-oracle-license-v1-1 seen true | sudo debconf-set-selections 7 | 8 | ### Install Java 8 9 | apt-get update 10 | apt-get install -y python-software-properties 11 | add-apt-repository -y ppa:webupd8team/java 12 | apt-get update 13 | apt-get install -y oracle-java8-installer 14 | 15 | ### 16 | apt-get install -y scala 17 | 18 | # Install sbt 19 | wget 
https://dl.bintray.com/sbt/debian/sbt-0.13.7.deb -P ~/Downloads 20 | dpkg -i ~/Downloads/sbt-0.13.7.deb 21 | apt-get install sbt 22 | 23 | # Install Spark 24 | wget http://apache.mirrors.tds.net/spark/spark-1.4.1/spark-1.4.1-bin-hadoop2.4.tgz -P ~/Downloads 25 | tar zxvf ~/Downloads/spark-1.4.1-bin-hadoop2.4.tgz -C /usr/local 26 | sudo mv /usr/local/spark-1.4.1-bin-hadoop2.4 /usr/local/spark 27 | sudo chown -R ubuntu /usr/local/spark 28 | 29 | 30 | -------------------------------------------------------------------------------- /aws_config/host_install_scripts/storm_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # install java and zookeeper 4 | apt-get update 5 | apt-get install -y default-jre 6 | apt-get install -y zookeeperd 7 | apt-get install -y supervisor 8 | 9 | # install storm 10 | wget "http://mirrors.gigenet.com/apache/storm/apache-storm-0.9.5/apache-storm-0.9.5.tar.gz" -P ~/Downloads 11 | tar zxvf ~/Downloads/apache-storm*.gz -C /usr/local 12 | mv /usr/local/apache-storm* /usr/local/storm 13 | 14 | # inject some new config 15 | echo "export STORM_HOME=/usr/local/storm" | tee -a /home/ubuntu/.profile /home/ubuntu/.bashrc 16 | echo "export PATH=$PATH:$STORM_HOME/bin" | tee -a /home/ubuntu/.profile /home/ubuntu/.bashrc 17 | 18 | 19 | # create space for local state 20 | mkdir /usr/local/storm/local_state 21 | chown ubuntu /usr/local/storm/local_state 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /aws_config/straw_service_config.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | #Source this file to set enviornment vars for config 3 | export AWS_PEM_FILE=/home/ryan/projects/insight/accounts/rwalker.pem 4 | export PEM_KEY=rwalker 5 | export TAG_PREFIX=rwalker 6 | 7 | -------------------------------------------------------------------------------- /config/config.properties: -------------------------------------------------------------------------------- 1 | # 2 | # config for straw.storm application 3 | # 4 | 5 | # elasticsearch settings 6 | elasticsearch_host=localhost 7 | elasticsearch_port=9300 8 | elasticsearch_cluster_name=elasticsearch 9 | index_name=documents 10 | document_type=document 11 | 12 | # kafka settings 13 | zookeeper_host=127.0.0.1 14 | zookeeper_port=2181 15 | kafka_query_topic=queries 16 | kafka_document_topic=documents 17 | 18 | # redis 19 | redis_host=127.0.0.1 20 | redis_port=6379 21 | 22 | # redis_analytics db 23 | redis_analytics_host=127.0.0.1 24 | redis_analytics_port=6379 25 | 26 | # storm settings 27 | search.bolt.number.tasks=1 28 | search.bolts=5 29 | document.spouts=5 30 | query.spouts=3 31 | workers=6 32 | 33 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | tweets.big.sample 2 | queries.bigrams 3 | -------------------------------------------------------------------------------- /data/queries.bigrams.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwalk/straw/1940d521538635d5eab394d0ed4c87caf366b0c7/data/queries.bigrams.gz -------------------------------------------------------------------------------- /data/queries.small: -------------------------------------------------------------------------------- 1 | { "type": "terms-query", "terms": [ "coffee", "toast"], 
"minimum-match": 1 } 2 | { "type": "terms-query", "terms": ["Keith Richards"], "minimum-match": 1 } 3 | { "type": "terms-query", "terms": [ "justin", "beiber"], "minimum-match": 2} 4 | { "type": "terms-query", "terms": ["bonsai","tree"], "minimum-match": 1} 5 | -------------------------------------------------------------------------------- /data/tweets.big.sample.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwalk/straw/1940d521538635d5eab394d0ed4c87caf366b0c7/data/tweets.big.sample.gz -------------------------------------------------------------------------------- /local_demo/launch_demo_ui.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Launch the local UI for running in DEMO mode 4 | # Visit http://localhost:5000 in a browser to see the results 5 | # 6 | # 7 | ( 8 | cd ../src/frontend && 9 | ./run.py -p 5000 --debug 10 | ) 11 | -------------------------------------------------------------------------------- /local_demo/launch_local_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This script helps run the demo mode with the Luwak search topology on a local cluster 4 | # The below commands stage system resources in docker containers and then builds 5 | # the main Luwak topology storm cluster. 6 | # 7 | # You'll want to launch the WEB UI in a seperate terminal/process. 8 | # For this: 9 | # cd src/frontend 10 | # ./run.py. 11 | # See run.py -h for help. 12 | # 13 | # To simulate the twitter firehose, run ./mock_firehose.sh 14 | # 15 | 16 | # startup local resources in docker containers 17 | ( cd ../util && 18 | ./stage_demo_mode.sh 19 | ) 20 | 21 | # build libraries 22 | (cd ../src/luwak_search && 23 | mvn clean && 24 | mvn package 25 | ) 26 | 27 | # launch local storm cluster with luwak topology 28 | (cd ../src/luwak_search && 29 | ./run_luwak_topology.sh 30 | ) 31 | 32 | 33 | -------------------------------------------------------------------------------- /local_demo/mock_firehose.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script simulates the twitter firehose by repeatedly 3 | # adding the collection of 100k tweets included with this repo into Kafka. 4 | # 5 | # data can be found in data/tweets.big.sample 6 | # 7 | ( 8 | cd ../util && 9 | while true ; do 10 | ./kafka_add_documents.sh 11 | echo "Sleeping 10 so I don't swamp your puny local cluster!" 12 | sleep 10 13 | done 14 | ) 15 | -------------------------------------------------------------------------------- /local_demo/prerequisites.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # This script attempts to install necassary dependencies for straw running on Ubuntu 14.04 5 | # 6 | # NOTE: You will need to install docker manually, using at least version 1.8.0. 
7 | # 8 | # DOCKER INSTALL INSTRUCTIONS from http://docs.docker.com/engine/installation/ubuntulinux/ 9 | # 10 | # sudo apt-key adv --keyserver hkp://pgp.mit.edu:80 --recv-keys 58118E89F3A912897C070ADBF76221572C52609D 11 | # sudo vi /etc/apt/sources.list.d/docker.list 12 | # Add the line "deb https://apt.dockerproject.org/repo ubuntu-trusty main" and save 13 | # sudo apt-get install docker-engine 14 | # sudo service docker start 15 | # sudo usermod -aG docker $USER 16 | # Log out and log in again 17 | 18 | 19 | # redis 20 | sudo apt-get update 21 | sudo apt-get install redis-server python3-pip python-pip maven openjdk-7-jdk 22 | 23 | # docker compose 24 | ( 25 | curl -L https://github.com/docker/compose/releases/download/1.5.0/docker-compose-`uname -s`-`uname -m` > tmp && 26 | sudo mv tmp /usr/local/bin/docker-compose && 27 | sudo chmod +x /usr/local/bin/docker-compose 28 | ) 29 | 30 | # python2 packages -- flask only recommended for python2 31 | sudo pip install redis flask flask-session kafka-python 32 | 33 | # python3 packages 34 | sudo pip3 install kafka-python 35 | -------------------------------------------------------------------------------- /src/frontend/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwalk/straw/1940d521538635d5eab394d0ed4c87caf366b0c7/src/frontend/app/__init__.py -------------------------------------------------------------------------------- /src/frontend/app/query_subscriber.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Consume matches from subscribed queries 4 | # 5 | import redis 6 | from time import sleep 7 | 8 | def message_handler(data, message): 9 | data.append((message['channel'], message['data'])) 10 | 11 | class QuerySubscriber: 12 | 13 | def __init__(self, host, port, msg_handler): 14 | ''' Query subscriber takes an arbitrary msg_handler which is a function 15 | of a single variable message.''' 16 | pool = redis.ConnectionPool(host='localhost', port=6379) 17 | r = redis.Redis(connection_pool=pool) 18 | self.connection = r.pubsub(ignore_subscribe_messages=True) 19 | self.queries = [] 20 | self._thread = None 21 | self.handler = msg_handler 22 | 23 | def add_query(self, query): 24 | # add query to list 25 | self.queries.append(query) 26 | 27 | # stop the existing thread and start new one with the full set of queries 28 | self._update() 29 | 30 | def start(self): 31 | queries = dict((k,v) for (k,v) in [(k,self.handler) for k in self.queries]) 32 | if len(queries)>0: 33 | self.connection.subscribe(**queries) 34 | self._thread = self.connection.run_in_thread(sleep_time=0.001) 35 | else: 36 | self._thread = None 37 | 38 | def _update(self): 39 | # WARNING: We might drop some messages here 40 | if self._thread is not None: 41 | self._thread.stop() 42 | self._thread=None 43 | self.start() 44 | 45 | def close(self): 46 | try: 47 | self._thread.stop() 48 | except: 49 | pass 50 | 51 | if __name__=="__main__": 52 | mydata = [] 53 | subscriber = QuerySubscriber("localhost", 6379, lambda x: message_handler(mydata, x) ) 54 | query = None 55 | while True: 56 | if query is None: 57 | query = raw_input("Please enter the topic you'd like to follow: ") 58 | else: 59 | subscriber.add_query(query) 60 | query = None 61 | print mydata 62 | -------------------------------------------------------------------------------- /src/frontend/app/static/assets/favicon.ico: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwalk/straw/1940d521538635d5eab394d0ed4c87caf366b0c7/src/frontend/app/static/assets/favicon.ico -------------------------------------------------------------------------------- /src/frontend/app/static/assets/straw.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwalk/straw/1940d521538635d5eab394d0ed4c87caf366b0c7/src/frontend/app/static/assets/straw.pdf -------------------------------------------------------------------------------- /src/frontend/app/static/css/theme.css: -------------------------------------------------------------------------------- 1 | body { 2 | padding-top: 70px; 3 | padding-bottom: 30px; 4 | } 5 | 6 | .theme-dropdown .dropdown-menu { 7 | position: static; 8 | display: block; 9 | margin-bottom: 20px; 10 | } 11 | 12 | .theme-showcase > p > .btn { 13 | margin: 5px 0; 14 | } 15 | 16 | .theme-showcase .navbar .container { 17 | width: auto; 18 | } 19 | 20 | .highlight { background-color: yellow } 21 | -------------------------------------------------------------------------------- /src/frontend/app/static/fonts/glyphicons-halflings-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwalk/straw/1940d521538635d5eab394d0ed4c87caf366b0c7/src/frontend/app/static/fonts/glyphicons-halflings-regular.eot -------------------------------------------------------------------------------- /src/frontend/app/static/fonts/glyphicons-halflings-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwalk/straw/1940d521538635d5eab394d0ed4c87caf366b0c7/src/frontend/app/static/fonts/glyphicons-halflings-regular.ttf -------------------------------------------------------------------------------- /src/frontend/app/static/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwalk/straw/1940d521538635d5eab394d0ed4c87caf366b0c7/src/frontend/app/static/fonts/glyphicons-halflings-regular.woff -------------------------------------------------------------------------------- /src/frontend/app/static/fonts/glyphicons-halflings-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwalk/straw/1940d521538635d5eab394d0ed4c87caf366b0c7/src/frontend/app/static/fonts/glyphicons-halflings-regular.woff2 -------------------------------------------------------------------------------- /src/frontend/app/static/js/ie10-viewport-bug-workaround.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * IE10 viewport hack for Surface/desktop Windows 8 bug 3 | * Copyright 2014-2015 Twitter, Inc. 
4 | * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) 5 | */ 6 | 7 | // See the Getting Started docs for more information: 8 | // http://getbootstrap.com/getting-started/#support-ie10-width 9 | 10 | (function () { 11 | 'use strict'; 12 | 13 | if (navigator.userAgent.match(/IEMobile\/10\.0/)) { 14 | var msViewportStyle = document.createElement('style') 15 | msViewportStyle.appendChild( 16 | document.createTextNode( 17 | '@-ms-viewport{width:auto!important}' 18 | ) 19 | ) 20 | document.querySelector('head').appendChild(msViewportStyle) 21 | } 22 | 23 | })(); 24 | -------------------------------------------------------------------------------- /src/frontend/app/static/js/npm.js: -------------------------------------------------------------------------------- 1 | // This file is autogenerated via the `commonjs` Grunt task. You can require() this file in a CommonJS environment. 2 | require('../../js/transition.js') 3 | require('../../js/alert.js') 4 | require('../../js/button.js') 5 | require('../../js/carousel.js') 6 | require('../../js/collapse.js') 7 | require('../../js/dropdown.js') 8 | require('../../js/modal.js') 9 | require('../../js/tooltip.js') 10 | require('../../js/popover.js') 11 | require('../../js/scrollspy.js') 12 | require('../../js/tab.js') 13 | require('../../js/affix.js') -------------------------------------------------------------------------------- /src/frontend/app/straw_app.py: -------------------------------------------------------------------------------- 1 | import thread, redis 2 | from kafka import SimpleProducer, KafkaClient 3 | from flask import Flask, session 4 | from flask.ext.session import Session 5 | from query_subscriber import QuerySubscriber 6 | from views import attach_views 7 | from datetime import datetime 8 | 9 | def highlight(word): 10 | return("{0}".format(word)) 11 | 12 | class StrawAppBase: 13 | 14 | def __init__(self, config): 15 | 16 | app = Flask(__name__) 17 | app.secret_key = 'i love to search full text in real time' 18 | 19 | # attach a redis connection pool 20 | app.pool = redis.ConnectionPool(host="localhost", port=6379) 21 | 22 | # user -> channels mapping 23 | app.user_channels = {} 24 | 25 | # how to handle messages that enter the stream from redis pub sub 26 | def redis_message_handler(msg): 27 | redis_connection = redis.Redis(connection_pool=app.pool) 28 | # get channel and content of incoming message 29 | channel = msg['channel'] 30 | data = msg['data'] 31 | 32 | # word highlighting -- TODO: this would be better to do in the search engine! 
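            # highlight() is intended to wrap a matched word in markup styled by the
            # .highlight rule in static/css/theme.css (yellow background). The
            # lower()/replace() pass below is a naive substring match, so it can also
            # hit fragments inside longer words; a word-boundary regex is one possible
            # alternative (illustration only, not what runs here):
            #   import re
            #   data = re.sub(r"\b" + re.escape(w) + r"\b", highlight(w), data, flags=re.IGNORECASE)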
33 | query = redis_connection.get(channel) 34 | words = list(set(query.split(" "))) 35 | for w in words: 36 | data=data.lower().replace(w.lower(), highlight(w.lower())) 37 | 38 | # find users subscribed to this channel 39 | if app.user_channels.get(channel) is not None: 40 | for user in app.user_channels.get(channel): 41 | redis_connection.lpush(user, data) 42 | else: 43 | # no more users for this channel, unsubscribe from it 44 | redis_connection.unsubscribe(channel) 45 | 46 | # Add Redis query subscriber to app 47 | app.disp = [] 48 | app.subscriber = QuerySubscriber("localhost", 6379, redis_message_handler) 49 | 50 | # setup kafka producer in the app 51 | kafka = KafkaClient("{0}:{1}".format(config["zookeeper_host"], 9092)) 52 | app.producer = SimpleProducer(kafka) 53 | 54 | # add the app 55 | self.app = app 56 | 57 | def clear_user(self, uid): 58 | redis_connection = redis.Redis(connection_pool=self.app.pool) 59 | # print("Trying to clean for user {0}".format(uid)) 60 | # find all the queries to which the user is subscribed 61 | # and remove them from the subscribers list for each query. 62 | for qid in redis_connection.lrange(uid+"-queries", 0, -1): 63 | try: 64 | self.app.user_channels[qid].remove(uid) 65 | except KeyError: 66 | pass 67 | 68 | # remove the user-queries 69 | redis_connection.delete(uid+"-queries") 70 | 71 | # remove the stored results 72 | redis_connection.delete(uid) 73 | 74 | def get_straw_app(config): 75 | base = StrawAppBase(config) 76 | app = base.app 77 | app.clear_user = base.clear_user 78 | attach_views(app) 79 | return app 80 | -------------------------------------------------------------------------------- /src/frontend/app/templates/about.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | Straw -- a platform for streaming search 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 49 | 50 |
[about.html: the template markup was not preserved in this export; the visible page text is the "Straw" heading and the tagline "Straw is a platform for real-time, full text search on streaming data."]
-------------------------------------------------------------------------------- /src/frontend/app/templates/index.html: --------------------------------------------------------------------------------
[index.html: the template markup was not preserved in this export; the page carries the title "Straw -- a platform for streaming search", the "Straw" heading, the tagline "Straw is a platform for real-time, full text search on streaming data.", the prompt "Subscribe to real-time alerts from the (simulated) Twitter firehose.", a query submission form, and a Jinja2 block ("{% if query_list|length>0 %} ... {% endif %}") that lists the user's active queries.]
108 | 109 | 110 | 112 | 113 | 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /src/frontend/app/views.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | ''' 3 | Define the views for the straw web app 4 | ''' 5 | from flask import render_template, session, request, render_template, jsonify, Flask, make_response 6 | from time import sleep 7 | from kafka.common import FailedPayloadsError, NotLeaderForPartitionError, KafkaUnavailableError 8 | import md5, redis 9 | import json, uuid 10 | 11 | MAX_RESULTS = 100 12 | EXPIRATION = 1 13 | def attach_views(app): 14 | 15 | @app.route('/_fetch_messages') 16 | def fetch_messages(): 17 | # get a redis connection 18 | redis_connection = redis.Redis(connection_pool=app.pool) 19 | 20 | # update the query list in the view 21 | if session.get('sid') is not None: 22 | matches = redis_connection.lrange(session.get('sid'), 0, MAX_RESULTS) 23 | return jsonify(result=matches) 24 | 25 | @app.route('/', methods=['GET']) 26 | def index(): 27 | if session.get('sid') is None: 28 | session['sid'] = uuid.uuid4().hex 29 | try: 30 | query_list = session['queries'] 31 | except KeyError: 32 | query_list = [] 33 | return render_template('index.html', query_list=query_list) 34 | 35 | @app.route('/', methods=['POST']) 36 | def search_box_control(): 37 | '''add to or clear the list of queries.''' 38 | 39 | # we need a session 40 | if session.get('sid') is None: 41 | raise RuntimeError("No session.") 42 | sid = session.get('sid') 43 | 44 | # get a redis connection 45 | redis_connection = redis.Redis(connection_pool=app.pool) 46 | 47 | # if clear button pressed: 48 | if 'clear' in request.form: 49 | app.clear_user(session.get('sid')) 50 | if session.has_key('queries'): 51 | del session['queries'] 52 | return render_template("index.html", query_list=[], session=session) 53 | 54 | # create a new query 55 | text = request.form['text'].lower().split(" ") 56 | 57 | # generate a unique query id 58 | msg = {"type":"terms-query","terms":text,"minimum-match":len(text)} 59 | data = json.dumps(msg) 60 | qid = md5.new(data).hexdigest() 61 | query_string = " ".join(text) 62 | 63 | # add the qid and value to the query lookup store 64 | try: 65 | session['queries'].append(query_string) 66 | except KeyError: 67 | # sanity: clear any queries stored for this user but not in the session. 68 | redis_connection.delete(sid+"-queries") 69 | session['queries'] = [query_string] 70 | 71 | # try three times to do the post to kafka. 72 | post_success = False 73 | for i in range(3): 74 | try: 75 | app.producer.send_messages("queries", data) 76 | except (FailedPayloadsError, NotLeaderForPartitionError, KafkaUnavailableError) as e: 77 | # wait a bit and try again 78 | print("Failed to post query {0} to kafka. 
Try #{1}".format(data, i)) 79 | sleep(0.25) 80 | continue 81 | post_success=True 82 | break 83 | 84 | if post_success==True: 85 | # subscribe the user to the query 86 | try: 87 | app.user_channels[qid].add(sid) 88 | except KeyError: 89 | app.user_channels[qid] = set([sid]) 90 | app.subscriber.add_query(qid) 91 | 92 | # link the id to the query text 93 | redis_connection.set(qid, " ".join(text)) 94 | 95 | # add query to the list of things the user has subscribed to 96 | redis_connection.lpush(sid +"-queries", qid) 97 | 98 | # update the query list in the view 99 | query_list = session["queries"] 100 | return render_template("index.html", query_list=query_list) 101 | 102 | @app.route('/about') 103 | def about(): 104 | return render_template('%s.html' % 'about') 105 | 106 | 107 | @app.route('/straw.pdf') 108 | def pdf(): 109 | return app.send_static_file('assets/straw.pdf') 110 | -------------------------------------------------------------------------------- /src/frontend/launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | (cd /home/ubuntu/straw/src/frontend && \ 3 | ./run.py -p 80 ) 4 | sleep 5 5 | 6 | -------------------------------------------------------------------------------- /src/frontend/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from app.straw_app import get_straw_app 3 | import redis 4 | import argparse 5 | 6 | if __name__=="__main__": 7 | 8 | # arg parsing 9 | parser = argparse.ArgumentParser(description="Launch straw webserver frontend") 10 | parser.add_argument("-p", "--port", default=5000, help="port, default 5000") 11 | parser.add_argument("--debug", help="Use flask debug mode, default False.", action="store_true") 12 | args = parser.parse_args() 13 | 14 | with open("../../config/config.properties", "r") as f: 15 | lines = f.readlines() 16 | 17 | config={} 18 | for line in lines: 19 | if line.find("=")!=-1: 20 | ls = line.split("=") 21 | config[ls[0]]=ls[1] 22 | config["debug"]=args.debug 23 | 24 | # get the app and clear the redis db 25 | app = get_straw_app(config) 26 | redis_connection = redis.Redis(connection_pool=app.pool) 27 | redis_connection.flushall() 28 | app.run(host='0.0.0.0', port=int(args.port), debug = args.debug) 29 | 30 | -------------------------------------------------------------------------------- /src/kafka_stream_eater/kafka_stream_consumer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # 3 | # Put documents from the stream into Kafka 4 | # 5 | import argparse 6 | from kafka import KafkaConsumer 7 | 8 | if __name__=="__main__": 9 | 10 | # arg parsing 11 | parser = argparse.ArgumentParser(description="Consume messages from kafka.") 12 | parser.add_argument("host", help="A Kafka host") 13 | parser.add_argument("topic", help="A topic to consume") 14 | parser.add_argument("-p","--port", default="9092", help="A Kafka port, default 9092.") 15 | args = parser.parse_args() 16 | 17 | # get a client 18 | #consumer = KafkaConsumer('documents', group_id='straw', bootstrap_servers=["{0}:{1}".format(args.host, args.port)]) 19 | print("Trying to get messages from topic {0} on {1}:{2}".format(args.topic, args.host, args.port)) 20 | consumer = KafkaConsumer(args.topic, bootstrap_servers=["{0}:{1}".format(args.host, args.port)], auto_offset_reset='smallest') 21 | 22 | # read through the file and send messages to Kafka in chunks 23 | for message in consumer: 24 | # message 
value is raw byte string -- decode if necessary! 25 | # e.g., for unicode: `message.value.decode('utf-8')` 26 | print("{0}:{1}:{2}: key={3} value={4}".format(message.topic, message.partition, 27 | message.offset, message.key, 28 | message.value.decode('utf-8'))) 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /src/kafka_stream_eater/kafka_stream_producer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # 3 | # Put documents from the stream into Kafka 4 | # 5 | 6 | import argparse 7 | from kafka import SimpleProducer, KafkaClient 8 | from time import sleep 9 | 10 | def chunk_iterable(A,n): 11 | '''An iterable that contains the iterates of A divided into lists of size n. 12 | A iterable 13 | n int, size of chunk 14 | ''' 15 | cnt = 0 16 | chunk = [] 17 | for v in A: 18 | if cnt0: 26 | yield(chunk) 27 | 28 | if __name__=="__main__": 29 | 30 | # arg parsing 31 | parser = argparse.ArgumentParser(description="Feed Kafka a stream from a file") 32 | parser.add_argument("file", help="A file of data, one datum per line") 33 | parser.add_argument("host", help="Public IP address of a Kafka node") 34 | parser.add_argument("topic", help="Kafka topic to feed") 35 | parser.add_argument("-p", "--port", default=9092, help="port for zookeeper, default 9092") 36 | parser.add_argument("-c", "--chunksize", default=100, help="Number of messages to send at one time, default 100") 37 | parser.add_argument("-d", "--delay", default=0, help="Delay in ms between shipment of chunks to Kafka, default 0") 38 | args = parser.parse_args() 39 | 40 | # get a client 41 | print("Connecting to Kafka node {0}:{1}".format(args.host, args.port)) 42 | kafka = KafkaClient("{0}:{1}".format(args.host, args.port)) 43 | producer = SimpleProducer(kafka) 44 | 45 | # read through the file and send messages to Kafka in chunks 46 | with open(args.file, "rb") as f: 47 | for chunk in chunk_iterable(f, args.chunksize): 48 | print("Sending {0} messages to topic {1} on {2}".format(len(chunk), args.topic, args.host)) 49 | producer.send_messages(args.topic, *chunk) 50 | sleep(1.0*int(args.delay)/1000.0) 51 | -------------------------------------------------------------------------------- /src/kafka_stream_eater/third_party/kafka-docker-master/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:trusty 2 | 3 | MAINTAINER Wurstmeister 4 | 5 | ENV KAFKA_VERSION="0.8.2.1" SCALA_VERSION="2.9.2" 6 | 7 | RUN apt-get update && apt-get install -y unzip openjdk-6-jdk wget curl git docker.io jq 8 | 9 | ADD download-kafka.sh /tmp/download-kafka.sh 10 | RUN /tmp/download-kafka.sh 11 | RUN tar xf /tmp/kafka_${SCALA_VERSION}-${KAFKA_VERSION}.tgz -C /opt 12 | 13 | VOLUME ["/kafka"] 14 | 15 | ENV KAFKA_HOME /opt/kafka_${SCALA_VERSION}-${KAFKA_VERSION} 16 | ADD start-kafka.sh /usr/bin/start-kafka.sh 17 | ADD broker-list.sh /usr/bin/broker-list.sh 18 | CMD start-kafka.sh 19 | -------------------------------------------------------------------------------- /src/kafka_stream_eater/third_party/kafka-docker-master/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /src/kafka_stream_eater/third_party/kafka-docker-master/README.md: -------------------------------------------------------------------------------- 1 | kafka-docker 2 | ============ 3 | 4 | Dockerfile for [Apache Kafka](http://kafka.apache.org/) 5 | 6 | The image is available directly from https://registry.hub.docker.com/ 7 | 8 | ##Pre-Requisites 9 | 10 | - install docker-compose [https://docs.docker.com/compose/install/](https://docs.docker.com/compose/install/) 11 | - modify the ```KAFKA_ADVERTISED_HOST_NAME``` in ```docker-compose.yml``` to match your docker host IP (Note: Do not use localhost or 127.0.0.1 as the host ip if you want to run multiple brokers.) 12 | - if you want to customise any Kafka parameters, simply add them as environment variables in ```docker-compose.yml```, e.g. in order to increase the ```message.max.bytes``` parameter set the environment to ```KAFKA_MESSAGE_MAX_BYTES: 2000000```. To turn off automatic topic creation set ```KAFKA_AUTO_CREATE_TOPICS_ENABLE: 'false'``` 13 | 14 | ##Usage 15 | 16 | Start a cluster: 17 | 18 | - ```docker-compose up -d ``` 19 | 20 | Add more brokers: 21 | 22 | - ```docker-compose scale kafka=3``` 23 | 24 | Destroy a cluster: 25 | 26 | - ```docker-compose stop``` 27 | 28 | ##Note 29 | 30 | The default ```docker-compose.yml``` should be seen as a starting point. By default each broker will get a new port number and broker id on restart. Depending on your use case this might not be desirable. If you need to use specific ports and broker ids, modify the docker-compose configuration accordingly, e.g. [docker-compose-single-broker.yml](https://github.com/wurstmeister/kafka-docker/blob/master/docker-compose-single-broker.yml): 31 | 32 | - ```docker-compose -f docker-compose-single-broker.yml up``` 33 | 34 | ##Broker IDs 35 | 36 | If you don't specify a broker id in your docker-compose file, it will automatically be generated based on the name that docker-compose gives the container. This allows scaling up and down. In this case it is recommended to use the ```--no-recreate``` option of docker-compose to ensure that containers are not re-created and thus keep their names and ids. 
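For example, scaling and then re-issuing `up` with that flag keeps the existing brokers' names and ids:

- ```docker-compose scale kafka=3```
- ```docker-compose up -d --no-recreate```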
37 | 38 | 39 | ##Automatically create topics 40 | 41 | If you want to have kafka-docker automatically create topics in Kafka during 42 | creation, a ```KAFKA_CREATE_TOPICS``` environment variable can be 43 | added in ```docker-compose.yml```. 44 | 45 | Here is an example snippet from ```docker-compose.yml```: 46 | 47 | environment: 48 | KAFKA_CREATE_TOPICS: "Topic1:1:3,Topic2:1:1" 49 | 50 | ```Topic 1``` will have 1 partition and 3 replicas, ```Topic 2``` will have 1 partition and 1 replica. 51 | 52 | ##Tutorial 53 | 54 | [http://wurstmeister.github.io/kafka-docker/](http://wurstmeister.github.io/kafka-docker/) 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /src/kafka_stream_eater/third_party/kafka-docker-master/broker-list.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CONTAINERS=$(docker ps | grep 9092 | awk '{print $1}') 4 | BROKERS=$(for CONTAINER in $CONTAINERS; do docker port $CONTAINER 9092 | sed -e "s/0.0.0.0:/$HOST_IP:/g"; done) 5 | echo $BROKERS | sed -e 's/ /,/g' 6 | -------------------------------------------------------------------------------- /src/kafka_stream_eater/third_party/kafka-docker-master/docker-compose-single-broker.yml: -------------------------------------------------------------------------------- 1 | zookeeper: 2 | image: wurstmeister/zookeeper 3 | ports: 4 | - "2181:2181" 5 | kafka: 6 | image: wurstmeister/kafka:0.8.2.0 7 | ports: 8 | - "9092:9092" 9 | links: 10 | - zookeeper:zk 11 | environment: 12 | KAFKA_ADVERTISED_HOST_NAME: 192.168.59.103 13 | volumes: 14 | - /var/run/docker.sock:/var/run/docker.sock 15 | -------------------------------------------------------------------------------- /src/kafka_stream_eater/third_party/kafka-docker-master/docker-compose.yml: -------------------------------------------------------------------------------- 1 | zookeeper: 2 | image: wurstmeister/zookeeper 3 | ports: 4 | - "localhost:2181:2181" 5 | kafka: 6 | build: . 
7 | ports: 8 | - "localhost:9092:9092" 9 | links: 10 | - zookeeper:zk 11 | environment: 12 | KAFKA_CREATE_TOPICS: "queries:1:1,documents:1:1" 13 | KAFKA_ADVERTISED_HOST_NAME: "0.0.0.0" 14 | volumes: 15 | - /var/run/docker.sock:/var/run/docker.sock 16 | -------------------------------------------------------------------------------- /src/kafka_stream_eater/third_party/kafka-docker-master/download-kafka.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mirror=$(curl --stderr /dev/null https://www.apache.org/dyn/closer.cgi\?as_json\=1 | jq -r '.preferred') 4 | url="${mirror}kafka/${KAFKA_VERSION}/kafka_${SCALA_VERSION}-${KAFKA_VERSION}.tgz" 5 | wget -q "${url}" -O "/tmp/kafka_${SCALA_VERSION}-${KAFKA_VERSION}.tgz" 6 | -------------------------------------------------------------------------------- /src/kafka_stream_eater/third_party/kafka-docker-master/start-kafka-shell.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker run --rm -v /var/run/docker.sock:/var/run/docker.sock -e HOST_IP=$1 -e ZK=$2 -i -t wurstmeister/kafka:0.8.2.0 /bin/bash 3 | -------------------------------------------------------------------------------- /src/kafka_stream_eater/third_party/kafka-docker-master/start-kafka.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ -z "$KAFKA_ADVERTISED_PORT" ]]; then 4 | export KAFKA_ADVERTISED_PORT=$(docker port `hostname` 9092 | sed -r "s/.*:(.*)/\1/g") 5 | fi 6 | if [[ -z "$KAFKA_BROKER_ID" ]]; then 7 | export KAFKA_BROKER_ID=$(docker inspect `hostname` | jq --raw-output '.[0] | .Name' | awk -F_ '{print $3}') 8 | fi 9 | if [[ -z "$KAFKA_LOG_DIRS" ]]; then 10 | export KAFKA_LOG_DIRS="/kafka/kafka-logs-$KAFKA_BROKER_ID" 11 | fi 12 | if [[ -z "$KAFKA_ZOOKEEPER_CONNECT" ]]; then 13 | export KAFKA_ZOOKEEPER_CONNECT=$(env | grep ZK.*PORT_2181_TCP= | sed -e 's|.*tcp://||' | paste -sd ,) 14 | fi 15 | 16 | if [[ -n "$KAFKA_HEAP_OPTS" ]]; then 17 | sed -r -i "s/(export KAFKA_HEAP_OPTS)=\"(.*)\"/\1=\"$KAFKA_HEAP_OPTS\"/g" $KAFKA_HOME/bin/kafka-server-start.sh 18 | unset KAFKA_HEAP_OPTS 19 | fi 20 | 21 | for VAR in `env` 22 | do 23 | if [[ $VAR =~ ^KAFKA_ && ! $VAR =~ ^KAFKA_HOME ]]; then 24 | kafka_name=`echo "$VAR" | sed -r "s/KAFKA_(.*)=.*/\1/g" | tr '[:upper:]' '[:lower:]' | tr _ .` 25 | env_var=`echo "$VAR" | sed -r "s/(.*)=.*/\1/g"` 26 | if egrep -q "(^|^#)$kafka_name=" $KAFKA_HOME/config/server.properties; then 27 | sed -r -i "s@(^|^#)($kafka_name)=(.*)@\2=${!env_var}@g" $KAFKA_HOME/config/server.properties #note that no config values may contain an '@' char 28 | else 29 | echo "$kafka_name=${!env_var}" >> $KAFKA_HOME/config/server.properties 30 | fi 31 | fi 32 | done 33 | 34 | 35 | $KAFKA_HOME/bin/kafka-server-start.sh $KAFKA_HOME/config/server.properties & 36 | KAFKA_SERVER_PID=$! 
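# Note: the loop below blocks until the broker is actually listening on port 9092;
# only then is each KAFKA_CREATE_TOPICS entry (README format "name:partitions:replicas")
# split on ':' and passed to kafka-topics.sh.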
37 | 38 | while netstat -lnt | awk '$4 ~ /:9092$/ {exit 1}'; do sleep 1; done 39 | 40 | if [[ -n $KAFKA_CREATE_TOPICS ]]; then 41 | IFS=','; for topicToCreate in $KAFKA_CREATE_TOPICS; do 42 | IFS=':' read -a topicConfig <<< "$topicToCreate" 43 | $KAFKA_HOME/bin/kafka-topics.sh --create --zookeeper $KAFKA_ZOOKEEPER_CONNECT --replication-factor ${topicConfig[2]} --partition ${topicConfig[1]} --topic "${topicConfig[0]}" 44 | done 45 | fi 46 | 47 | wait $KAFKA_SERVER_PID 48 | -------------------------------------------------------------------------------- /src/luwak_search/.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .settings/ 3 | .classpath 4 | .project 5 | 6 | /target 7 | -------------------------------------------------------------------------------- /src/luwak_search/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 12 | 14 | 4.0.0 15 | github.com.rwalk333 16 | storming-luwak-search 17 | jar 18 | 0.0.1 19 | storming-search 20 | 21 | 22 | org.apache.lucene 23 | 5.3.0 24 | UTF-8 25 | UTF-8 26 | 1.7 27 | 28 | 29 | 30 | 31 | 32 | 33 | ${lucene.group} 34 | lucene-core 35 | ${lucene.version} 36 | 37 | 38 | ${lucene.group} 39 | lucene-memory 40 | ${lucene.version} 41 | 42 | 43 | ${lucene.group} 44 | lucene-analyzers-common 45 | ${lucene.version} 46 | 47 | 48 | ${lucene.group} 49 | lucene-queries 50 | ${lucene.version} 51 | 52 | 53 | ${lucene.group} 54 | lucene-queryparser 55 | ${lucene.version} 56 | 57 | 58 | 59 | 60 | 61 | 62 | org.apache.storm 63 | storm-kafka 64 | 0.9.2-incubating 65 | 66 | 67 | org.apache.kafka 68 | kafka_2.10 69 | 0.8.2.1 70 | 72 | 73 | 74 | 75 | org.apache.zookeeper 76 | zookeeper 77 | 78 | 79 | log4j 80 | log4j 81 | 82 | 83 | 84 | 85 | org.apache.kafka 86 | kafka-clients 87 | 0.8.2.1 88 | 89 | 90 | 91 | org.apache.storm 92 | storm-core 93 | 0.9.3 94 | 95 | provided 96 | 97 | 98 | redis.clients 99 | jedis 100 | 2.7.2 101 | jar 102 | compile 103 | 104 | 105 | com.github.flaxsearch 106 | luwak 107 | 1.2.0 108 | 109 | 110 | org.json 111 | json 112 | 20131018 113 | 114 | 115 | 116 | src 117 | test/jvm 118 | 119 | 120 | org.apache.maven.plugins 121 | maven-compiler-plugin 122 | 2.5.1 123 | 124 | 1.7 125 | 1.7 126 | 127 | 128 | 129 | org.apache.maven.plugins 130 | maven-shade-plugin 131 | 1.4 132 | 133 | true 134 | 135 | 136 | 137 | package 138 | 139 | shade 140 | 141 | 142 | 143 | 145 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | org.codehaus.mojo 154 | exec-maven-plugin 155 | 1.2.1 156 | 157 | 158 | 159 | exec 160 | 161 | 162 | 163 | 164 | java 165 | true 166 | false 167 | compile 168 | ${storm.topology} 169 | 170 | 171 | 172 | 173 | 174 | 175 | central 176 | Maven Repository Switchboard 177 | default 178 | http://repo1.maven.org/maven2 179 | 180 | false 181 | 182 | 183 | 184 | clojars.org 185 | http://clojars.org/repo 186 | 187 | 188 | mvnrepo 189 | mvnrepo 190 | http://mvnrepository.com 191 | 192 | 193 | mvnrepo2 194 | mvnrepo2 195 | http://repo1.maven.org/maven2/ 196 | 197 | 198 | 199 | -------------------------------------------------------------------------------- /src/luwak_search/run_luwak_topology.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export STRAW_CONFIG=`pwd`/../../config/config.properties 3 | echo "USING CONFIG FILE: $STRAW_CONFIG" 4 | mvn compile exec:java -Dstorm.topology=straw.storm.LuwakSearchTopology 5 | -------------------------------------------------------------------------------- 
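run_luwak_topology.sh points STRAW_CONFIG at config/config.properties, and the topology in the next file looks up every setting it needs from that file by key. As a rough illustration only (key names are taken from the ConfigurationManager.put(...) calls below; all hosts, ports, topic names and counts are placeholders, not the repository's shipped config/config.properties), such a file might look like:

# illustrative sketch -- values are placeholders
zookeeper_host=localhost
zookeeper_port=2181
kafka_query_topic=queries
kafka_document_topic=documents
document_type=document
redis_host=localhost
redis_port=6379
redis_analytics_host=localhost
redis_analytics_port=6379
query.spouts=1
document.spouts=1
search.bolts=1
search.bolt.number.tasks=1
workers=1

The Elasticsearch-based topology under src/storming_search additionally expects elasticsearch_host, elasticsearch_port, elasticsearch_cluster_name and index_name keys.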
/src/luwak_search/src/straw/storm/LuwakSearchTopology.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package straw.storm; 19 | 20 | import backtype.storm.Config; 21 | import backtype.storm.LocalCluster; 22 | import backtype.storm.StormSubmitter; 23 | import backtype.storm.topology.TopologyBuilder; 24 | import backtype.storm.utils.Utils; 25 | 26 | import storm.kafka.*; 27 | import straw.storm.bolt.LuwakSearchBolt; 28 | import straw.storm.util.ConfigurationManager; 29 | 30 | /** 31 | * This is a basic example of a Storm topology, following the example 32 | * https://github.com/buildlackey/cep/tree/master/storm%2Bkafka 33 | * 34 | */ 35 | public class LuwakSearchTopology { 36 | 37 | public static void main(String[] args) throws Exception { 38 | /* 39 | * Define and packaged a topology to submit to a storm cluster 40 | */ 41 | 42 | 43 | /* 44 | * CONFIGURATION 45 | * TODO: Better config management; should throw meaningful errors 46 | * when a config value is called but not defined. 
47 | * 48 | */ 49 | ConfigurationManager config_manager = new ConfigurationManager(); 50 | config_manager.put("document_type", "document_type"); 51 | config_manager.put("kafka_query_topic", "kafka_query_topic"); 52 | config_manager.put("kafka_document_topic", "kafka_document_topic"); 53 | config_manager.put("zookeeper_host", "zookeeper_host"); 54 | config_manager.put("zookeeper_port", "zookeeper_port"); 55 | config_manager.put("redis_host", "redis_host"); 56 | config_manager.put("redis_port", "redis_port"); 57 | config_manager.put("redis_analytics_host", "redis_analytics_host"); 58 | config_manager.put("redis_analytics_port", "redis_analytics_port"); 59 | config_manager.put("search.bolts", "search.bolts"); 60 | config_manager.put("document.spouts", "document.spouts"); 61 | config_manager.put("query.spouts", "query.spouts"); 62 | config_manager.put("workers", "workers"); 63 | config_manager.put("search.bolt.number.tasks", "search.bolt.number.tasks"); 64 | Config config = config_manager.get(); 65 | 66 | /* 67 | * KafkaSpout configuration 68 | */ 69 | // offset management 70 | String zkroot = "/brokers"; // the root path in Zookeeper for the spout to store the consumer offsets 71 | String zkid = "ids"; // an id for this consumer for storing the consumer offsets in Zookeeper 72 | 73 | // set zookeeper host 74 | BrokerHosts brokerHosts = new ZkHosts( String.format("%s:%s", 75 | config.get("zookeeper_host").toString(), 76 | config.get("zookeeper_port")).toString(), zkroot); 77 | 78 | // kafka topics 79 | String query_topic = config.get("kafka_query_topic").toString(); 80 | String document_topic = config.get("kafka_document_topic").toString(); 81 | 82 | // define spouts 83 | SpoutConfig query_spout_config = new SpoutConfig(brokerHosts, query_topic, zkroot, zkid); 84 | query_spout_config.forceFromStart=true; 85 | SpoutConfig document_spout_config = new SpoutConfig(brokerHosts, document_topic, zkroot, zkid); 86 | document_spout_config.forceFromStart=true; 87 | 88 | // add a string scheme to the spouts 89 | document_spout_config.scheme = new KeyValueSchemeAsMultiScheme(new StringKeyValueScheme()); 90 | query_spout_config.scheme = new KeyValueSchemeAsMultiScheme(new StringKeyValueScheme()); 91 | 92 | // topology definition 93 | // distribute documents randomly to bolts; queries are localized in memory at the bolt so we need to broadcast them 94 | TopologyBuilder builder = new TopologyBuilder(); 95 | builder.setSpout("query-spout", new KafkaSpout(query_spout_config), Integer.parseInt(config.get("query.spouts").toString())); 96 | builder.setSpout("document-spout", new KafkaSpout(document_spout_config), Integer.parseInt(config.get("document.spouts").toString())); 97 | builder.setBolt("search-bolt", new LuwakSearchBolt(), Integer.parseInt(config.get("search.bolts").toString())) 98 | .allGrouping("query-spout") 99 | .shuffleGrouping("document-spout"); 100 | 101 | // topology submission 102 | if (args != null && args.length > 0) { 103 | config.setNumWorkers(Integer.parseInt(config.get("workers").toString())); 104 | StormSubmitter.submitTopologyWithProgressBar(args[0], config, builder.createTopology()); 105 | } 106 | else { 107 | LocalCluster cluster = new LocalCluster(); 108 | cluster.submitTopology("streaming-search-topology", config, builder.createTopology()); 109 | 110 | // run for a while then die 111 | Utils.sleep(50000000); 112 | cluster.killTopology("streaming-search-topology"); 113 | cluster.shutdown(); 114 | 115 | } 116 | } 117 | } 118 | 
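For reference, the query spout above consumes JSON strings shaped like the ones the Flask frontend posts to Kafka: {"type": "terms-query", "terms": [...], "minimum-match": N}. Below is a minimal sketch of publishing one such query by hand, reusing the legacy kafka-python SimpleProducer API already used in src/kafka_stream_eater/kafka_stream_producer.py; the broker address is a placeholder:

#!/usr/bin/python3
# Sketch only: publish a single terms-query to the "queries" topic.
import json
from kafka import SimpleProducer, KafkaClient

kafka = KafkaClient("localhost:9092")   # placeholder broker address
producer = SimpleProducer(kafka)
terms = ["big", "data"]
query = {"type": "terms-query", "terms": terms, "minimum-match": len(terms)}
producer.send_messages("queries", json.dumps(query).encode("utf-8"))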
-------------------------------------------------------------------------------- /src/luwak_search/src/straw/storm/bolt/LuwakSearchBolt.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package straw.storm.bolt; 19 | 20 | 21 | 22 | import java.io.IOException; 23 | import java.util.Map; 24 | import java.util.Timer; 25 | 26 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 27 | 28 | import redis.clients.jedis.Jedis; 29 | import redis.clients.jedis.JedisPool; 30 | import redis.clients.jedis.JedisPoolConfig; 31 | import straw.storm.util.Counter; 32 | import straw.storm.util.LuwakHelper; 33 | import straw.storm.util.RequestsHelper; 34 | import straw.storm.util.ScheduledMessageCounter; 35 | import uk.co.flax.luwak.InputDocument; 36 | import uk.co.flax.luwak.Matches; 37 | import uk.co.flax.luwak.Monitor; 38 | import uk.co.flax.luwak.MonitorQuery; 39 | import uk.co.flax.luwak.QueryMatch; 40 | import uk.co.flax.luwak.matchers.SimpleMatcher; 41 | import uk.co.flax.luwak.presearcher.TermFilteredPresearcher; 42 | import uk.co.flax.luwak.queryparsers.LuceneQueryParser; 43 | import backtype.storm.task.OutputCollector; 44 | import backtype.storm.task.TopologyContext; 45 | import backtype.storm.topology.OutputFieldsDeclarer; 46 | import backtype.storm.topology.base.BaseRichBolt; 47 | import backtype.storm.tuple.Fields; 48 | import backtype.storm.tuple.Tuple; 49 | import backtype.storm.tuple.Values; 50 | 51 | 52 | /** 53 | * This bolt aggregates counts from multiple upstream bolts. 
54 | */ 55 | public class LuwakSearchBolt extends BaseRichBolt { 56 | 57 | private OutputCollector collector; 58 | private Map conf; 59 | private static JedisPool pool; 60 | private Monitor monitor; 61 | private Counter counter; 62 | 63 | @SuppressWarnings("rawtypes") 64 | @Override 65 | public void prepare(Map conf, TopologyContext context, OutputCollector collector) { 66 | this.conf = conf; 67 | this.collector = collector; 68 | 69 | // prepare the redis client 70 | pool = new JedisPool(new JedisPoolConfig(), conf.get("redis_host").toString()); 71 | 72 | // count message throughput 73 | counter = new Counter(); 74 | ScheduledMessageCounter message_counter = new ScheduledMessageCounter(counter, conf); 75 | Timer time = new Timer(); // Instantiate Timer Object 76 | time.schedule(message_counter, 0, 10000); // Create Repetitively task for every 30 secs 77 | 78 | // luwak 79 | try { 80 | this.monitor = new Monitor(new LuceneQueryParser("text", new StandardAnalyzer()), new TermFilteredPresearcher()); 81 | } catch (IOException e) { 82 | // TODO Auto-generated catch block 83 | e.printStackTrace(); 84 | } 85 | 86 | } 87 | 88 | @Override 89 | public void execute(Tuple tuple) { 90 | 91 | // process the tuple 92 | String sourcename = tuple.getSourceComponent(); 93 | String data = tuple.getValue(0).toString(); 94 | 95 | // either we get a query and we need to add it to the index 96 | // or we get a document and we need to do a search 97 | // Values("query", request_id, user_id, query_id, query) 98 | // Values("document", source, document) 99 | if(sourcename.toLowerCase().contains("query")){ 100 | // add queries 101 | MonitorQuery query = LuwakHelper.make_query(data); 102 | 103 | //register the query 104 | try { 105 | // System.out.println(query.toString()); 106 | monitor.update(query); 107 | } catch (IOException e) { 108 | // TODO Auto-generated catch block 109 | e.printStackTrace(); 110 | } 111 | } 112 | else if (sourcename.toLowerCase().contains("document")){ 113 | // try to parse as document 114 | String text = LuwakHelper.extract_text(data); 115 | 116 | //Build a document to check against the percolator 117 | InputDocument doc = null; 118 | if (text != null){ 119 | doc = InputDocument.builder(RequestsHelper.generate_unique_identifier(data)) 120 | .addField("text", text, new StandardAnalyzer()) 121 | .build(); 122 | } 123 | 124 | // pass the document through Luwak 125 | if (doc != null) { 126 | try { 127 | Matches matches = monitor.match(doc, SimpleMatcher.FACTORY); 128 | 129 | //Handle the result which is the set of queries in the percolator 130 | for(QueryMatch match : matches) { 131 | // System.out.println("Query: " + match.toString() + " matched document " + text); 132 | // emit results 133 | // collector.emit(new Values(data)); 134 | //System.out.println(match.toString()); 135 | 136 | // publish the result to jedis 137 | try (Jedis jedis_client = pool.getResource()) { 138 | jedis_client.publish(match.getQueryId(), text); 139 | } 140 | } 141 | } catch (IOException e) { 142 | e.printStackTrace(); 143 | } 144 | } 145 | } 146 | 147 | // acknowledge 148 | collector.ack(tuple); 149 | 150 | // we completed a search, so we need to update the counter 151 | counter.count+=1; 152 | } 153 | 154 | 155 | @Override 156 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 157 | declarer.declare(new Fields("document")); 158 | } 159 | 160 | @Override 161 | public void cleanup() { 162 | pool.destroy(); 163 | } 164 | 165 | 166 | } 167 | 
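Matches found by this bolt are published to Redis on a channel named by the query id, which the frontend derives as the md5 hex digest of the exact query JSON string it posted to Kafka (see views.py). Below is a minimal subscriber sketch, assuming a locally reachable Redis; the host, port, and query are placeholders, and the digest only lines up if the JSON string is byte-for-byte identical to the one the frontend sent:

#!/usr/bin/python3
# Sketch only: print documents matching one registered query.
import hashlib, json, redis

# must be the exact JSON string that was posted to Kafka for the ids to agree
query_json = json.dumps({"type": "terms-query", "terms": ["big", "data"], "minimum-match": 2})
qid = hashlib.md5(query_json.encode("utf-8")).hexdigest()

r = redis.StrictRedis(host="localhost", port=6379, db=0)   # placeholder host/port
p = r.pubsub(ignore_subscribe_messages=True)
p.subscribe(qid)
for message in p.listen():
    print(message["data"])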
-------------------------------------------------------------------------------- /src/luwak_search/src/straw/storm/util/ConfigurationManager.java: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * 4 | * This class is a simple wrapper around the storm configuration object. 5 | * For straw, we store all configuration in a config file whose location is 6 | * given by the value of the enviornment variable STRAW_CONFIG. 7 | * 8 | * The purpose of this class is to allow for a simple interface where 9 | * we can ask storm to set a configuration value "storm_property" based on 10 | * the value of "system_name" in the STRAW_CONFIG file. An error is thrown 11 | * when the "system_name" is not found in the config file. 12 | * 13 | */ 14 | package straw.storm.util; 15 | 16 | import java.io.FileInputStream; 17 | import java.io.IOException; 18 | import java.io.InputStream; 19 | import java.util.Map; 20 | import java.util.Properties; 21 | import backtype.storm.Config; 22 | 23 | public class ConfigurationManager { 24 | 25 | private Map env = System.getenv(); 26 | private String config_filename; 27 | private Properties prop = new Properties(); 28 | private Config config = new Config(); 29 | 30 | public ConfigurationManager() { 31 | // read config file location from sys 32 | config_filename = env.get("STRAW_CONFIG"); 33 | if(config_filename==null) 34 | { 35 | throw new RuntimeException("Couldn't access config file, did you set STRAW_CONFIG in enviornment?"); 36 | } 37 | 38 | // load the properties 39 | InputStream input = null; 40 | try { 41 | input = new FileInputStream(config_filename); 42 | prop.load(input); 43 | } catch (IOException ex) { 44 | ex.printStackTrace(); 45 | } finally { 46 | if (input != null) { 47 | try { 48 | input.close(); 49 | } catch (IOException e) { 50 | e.printStackTrace(); 51 | } 52 | } 53 | } 54 | 55 | } 56 | 57 | // add a setting to the config 58 | public void put(String storm_property, String system_name){ 59 | if (prop.getProperty(system_name)==null){ 60 | throw new RuntimeException("Property "+system_name+" not found in config file " + config_filename +"."); 61 | } 62 | config.put(storm_property, prop.getProperty(system_name)); 63 | } 64 | 65 | // return the storm config object 66 | public Config get(){ 67 | return config; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/luwak_search/src/straw/storm/util/Counter.java: -------------------------------------------------------------------------------- 1 | package straw.storm.util; 2 | 3 | // wrapper class to hold bolt's throughput count 4 | public class Counter { 5 | public int count=0; 6 | } -------------------------------------------------------------------------------- /src/luwak_search/src/straw/storm/util/LuwakHelper.java: -------------------------------------------------------------------------------- 1 | package straw.storm.util; 2 | 3 | 4 | 5 | import org.apache.commons.lang.StringUtils; 6 | 7 | import org.json.JSONArray; 8 | import org.json.JSONObject; 9 | 10 | import uk.co.flax.luwak.MonitorQuery; 11 | 12 | public class LuwakHelper { 13 | 14 | public static String extract_text(String data) { 15 | // parse input JSON 16 | JSONObject obj; 17 | String text = null; 18 | try { 19 | obj = new JSONObject(data); 20 | text = obj.getString("text"); 21 | } 22 | catch (org.json.JSONException e) { 23 | // TODO: Bad json passed 24 | // System.out.println("JSON PARSER FAILED TO HANDLE: " + data); 25 | //e.printStackTrace(); 26 | } 27 
| 28 | return text; 29 | } 30 | 31 | public static MonitorQuery make_query(String data){ 32 | // build a query out of the data JSON string 33 | MonitorQuery qb = null; 34 | JSONObject obj = null; 35 | try { 36 | obj = new JSONObject(data); 37 | } 38 | catch (org.json.JSONException e) { 39 | System.out.println("JSON PARSER FAILED TO HANDLE: " + data); 40 | e.printStackTrace(); 41 | } 42 | 43 | String type = obj.getString("type"); 44 | 45 | // terms query parser 46 | if(type.equalsIgnoreCase("terms-query")) { 47 | Integer minimum_match = obj.getInt("minimum-match"); 48 | JSONArray arr = obj.getJSONArray("terms"); 49 | 50 | if (arr!=null){ 51 | String[] string_arry = new String[arr.length()]; 52 | // use length of array if minimum match not provided 53 | if (minimum_match==null){ 54 | minimum_match=arr.length(); 55 | } 56 | for(int i=0; i 2 | 12 | 14 | 4.0.0 15 | github.com.rwalk333 16 | storming-search 17 | jar 18 | 0.0.1 19 | storming-search 20 | 21 | 22 | 23 | org.elasticsearch 24 | elasticsearch 25 | 1.7.0 26 | 27 | 28 | 29 | org.apache.storm 30 | storm-kafka 31 | 0.9.2-incubating 32 | 33 | 34 | org.apache.kafka 35 | kafka_2.10 36 | 0.8.2.1 37 | 39 | 40 | 41 | 42 | org.apache.zookeeper 43 | zookeeper 44 | 45 | 46 | log4j 47 | log4j 48 | 49 | 50 | 51 | 52 | org.apache.kafka 53 | kafka-clients 54 | 0.8.2.1 55 | 56 | 57 | 58 | org.apache.storm 59 | storm-core 60 | 0.9.3 61 | 62 | provided 63 | 64 | 65 | redis.clients 66 | jedis 67 | 2.7.2 68 | jar 69 | compile 70 | 71 | 72 | commons-collections 73 | commons-collections 74 | 3.2.1 75 | 76 | 77 | 78 | org.apache.httpcomponents 79 | httpclient 80 | 4.5.1 81 | 82 | 83 | org.apache.httpcomponents 84 | httpclient-cache 85 | 4.5.1 86 | 87 | 88 | org.apache.httpcomponents 89 | httpmime 90 | 4.5.1 91 | 92 | 93 | org.apache.httpcomponents 94 | fluent-hc 95 | 4.5.1 96 | 97 | 98 | org.json 99 | json 100 | 20131018 101 | 102 | 103 | 104 | src 105 | test/jvm 106 | 107 | 108 | org.apache.maven.plugins 109 | maven-compiler-plugin 110 | 2.5.1 111 | 112 | 1.7 113 | 1.7 114 | 115 | 116 | 117 | org.apache.maven.plugins 118 | maven-shade-plugin 119 | 1.4 120 | 121 | true 122 | 123 | 124 | 125 | package 126 | 127 | shade 128 | 129 | 130 | 131 | 133 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | org.codehaus.mojo 142 | exec-maven-plugin 143 | 1.2.1 144 | 145 | 146 | 147 | exec 148 | 149 | 150 | 151 | 152 | java 153 | true 154 | false 155 | compile 156 | ${storm.topology} 157 | 158 | 159 | 160 | 161 | 162 | 163 | central 164 | Maven Repository Switchboard 165 | default 166 | http://repo1.maven.org/maven2 167 | 168 | false 169 | 170 | 171 | 172 | clojars.org 173 | http://clojars.org/repo 174 | 175 | 176 | mvnrepo 177 | mvnrepo 178 | http://mvnrepository.com 179 | 180 | 181 | mvnrepo2 182 | mvnrepo2 183 | http://repo1.maven.org/maven2/ 184 | 185 | 186 | 187 | -------------------------------------------------------------------------------- /src/storming_search/run_search_topology.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export STRAW_CONFIG=`pwd`/../../config/config.properties 3 | echo "USING CONFIG FILE: $STRAW_CONFIG" 4 | mvn compile exec:java -Dstorm.topology=straw.storm.StreamingSearchTopology 5 | -------------------------------------------------------------------------------- /src/storming_search/src/straw/storm/StreamingSearchTopology.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or 
more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package straw.storm; 19 | 20 | import backtype.storm.Config; 21 | import backtype.storm.LocalCluster; 22 | import backtype.storm.StormSubmitter; 23 | import backtype.storm.topology.TopologyBuilder; 24 | import backtype.storm.utils.Utils; 25 | import storm.kafka.*; 26 | import straw.storm.bolt.SearchBolt; 27 | import straw.storm.util.ConfigurationManager; 28 | 29 | /** 30 | * This is the Topology for Streaming Search 31 | */ 32 | public class StreamingSearchTopology { 33 | 34 | public static void main(String[] args) throws Exception { 35 | 36 | /* 37 | * CONFIGURATION 38 | * TODO: Better config management; should throw meaningful errors 39 | * when a config value is called but not defined. 40 | * 41 | */ 42 | ConfigurationManager config_manager = new ConfigurationManager(); 43 | config_manager.put("elasticsearch_host", "elasticsearch_host"); 44 | config_manager.put("elasticsearch_cluster_name", "elasticsearch_cluster_name"); 45 | config_manager.put("elasticsearch_port", "elasticsearch_port"); 46 | config_manager.put("index_name", "index_name"); 47 | config_manager.put("document_type", "document_type"); 48 | config_manager.put("kafka_query_topic", "kafka_query_topic"); 49 | config_manager.put("kafka_document_topic", "kafka_document_topic"); 50 | config_manager.put("zookeeper_host", "zookeeper_host"); 51 | config_manager.put("zookeeper_port", "zookeeper_port"); 52 | config_manager.put("redis_host", "redis_host"); 53 | config_manager.put("redis_port", "redis_port"); 54 | config_manager.put("redis_analytics_host", "redis_analytics_host"); 55 | config_manager.put("redis_analytics_port", "redis_analytics_port"); 56 | config_manager.put("search.bolts", "search.bolts"); 57 | config_manager.put("document.spouts", "document.spouts"); 58 | config_manager.put("query.spouts", "query.spouts"); 59 | config_manager.put("workers", "workers"); 60 | config_manager.put("search.bolt.number.tasks", "search.bolt.number.tasks"); 61 | Config config = config_manager.get(); 62 | 63 | /* 64 | * KafkaSpout configuration 65 | */ 66 | // offset management 67 | String zkroot = "/brokers"; // the root path in Zookeeper for the spout to store the consumer offsets 68 | String zkid = "ids"; // an id for this consumer for storing the consumer offsets in Zookeeper 69 | 70 | // set zookeeper host 71 | BrokerHosts brokerHosts = new ZkHosts( String.format("%s:%s", 72 | config.get("zookeeper_host").toString(), 73 | config.get("zookeeper_port")).toString(), zkroot); 74 | 75 | // kafka topics 76 | String query_topic = config.get("kafka_query_topic").toString(); 77 | String document_topic = config.get("kafka_document_topic").toString(); 78 | 79 | // define spouts 80 | SpoutConfig query_spout_config = new SpoutConfig(brokerHosts, query_topic, zkroot, zkid); 81 | 
query_spout_config.forceFromStart=true; 82 | SpoutConfig document_spout_config = new SpoutConfig(brokerHosts, document_topic, zkroot, zkid); 83 | document_spout_config.forceFromStart=true; 84 | 85 | // add a string scheme to the spouts 86 | document_spout_config.scheme = new KeyValueSchemeAsMultiScheme(new StringKeyValueScheme()); 87 | query_spout_config.scheme = new KeyValueSchemeAsMultiScheme(new StringKeyValueScheme()); 88 | 89 | 90 | // distribute queries and documents randomly to bolts (since Elasticsearch is centralized, we don't need to broadcast queries). 91 | TopologyBuilder builder = new TopologyBuilder(); 92 | builder.setSpout("query-spout", new KafkaSpout(query_spout_config), Integer.parseInt(config.get("query.spouts").toString())); 93 | builder.setSpout("document-spout", new KafkaSpout(document_spout_config), Integer.parseInt(config.get("document.spouts").toString())); 94 | builder.setBolt("search-bolt", new SearchBolt(), Integer.parseInt(config.get("search.bolts").toString())) 95 | .setNumTasks(Integer.parseInt(config.get("search.bolt.number.tasks").toString())) 96 | .shuffleGrouping("query-spout") 97 | .shuffleGrouping("document-spout"); 98 | 99 | // topology submission 100 | if (args != null && args.length > 0) { 101 | config.setNumWorkers(Integer.parseInt(config.get("workers").toString())); 102 | StormSubmitter.submitTopologyWithProgressBar(args[0], config, builder.createTopology()); 103 | } 104 | else { 105 | LocalCluster cluster = new LocalCluster(); 106 | cluster.submitTopology("streaming-search-topology", config, builder.createTopology()); 107 | 108 | // run for a while then die 109 | Utils.sleep(50000000); 110 | cluster.killTopology("streaming-search-topology"); 111 | cluster.shutdown(); 112 | 113 | } 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/storming_search/src/straw/storm/bolt/SearchBolt.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package straw.storm.bolt; 19 | 20 | import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; 21 | 22 | import java.io.IOException; 23 | import java.util.Map; 24 | import java.util.Timer; 25 | 26 | import org.elasticsearch.ElasticsearchException; 27 | import org.elasticsearch.action.percolate.PercolateResponse; 28 | import org.elasticsearch.client.transport.TransportClient; 29 | import org.elasticsearch.common.settings.ImmutableSettings; 30 | import org.elasticsearch.common.settings.Settings; 31 | import org.elasticsearch.common.transport.InetSocketTransportAddress; 32 | import org.elasticsearch.common.xcontent.XContentBuilder; 33 | import org.elasticsearch.common.xcontent.XContentFactory; 34 | import org.elasticsearch.index.query.QueryBuilder; 35 | 36 | import redis.clients.jedis.Jedis; 37 | import redis.clients.jedis.JedisPool; 38 | import redis.clients.jedis.JedisPoolConfig; 39 | import straw.storm.util.Counter; 40 | import straw.storm.util.PercolatorHelper; 41 | import straw.storm.util.RequestsHelper; 42 | import straw.storm.util.ScheduledMessageCounter; 43 | import backtype.storm.task.OutputCollector; 44 | import backtype.storm.task.TopologyContext; 45 | import backtype.storm.topology.OutputFieldsDeclarer; 46 | import backtype.storm.topology.base.BaseRichBolt; 47 | import backtype.storm.tuple.Fields; 48 | import backtype.storm.tuple.Tuple; 49 | import backtype.storm.tuple.Values; 50 | 51 | 52 | /** 53 | * This bolt aggregates counts from multiple upstream bolts. 54 | */ 55 | public class SearchBolt extends BaseRichBolt { 56 | 57 | private OutputCollector collector; 58 | private Map conf; 59 | private TransportClient client; 60 | private static JedisPool pool; 61 | private Counter counter; 62 | 63 | @SuppressWarnings("rawtypes") 64 | @Override 65 | public void prepare(Map conf, TopologyContext context, OutputCollector collector) { 66 | this.conf = conf; 67 | this.collector = collector; 68 | SearchBolt.pool = new JedisPool(new JedisPoolConfig(), conf.get("redis_host").toString()); 69 | 70 | // prepare the search engine 71 | String host = conf.get("elasticsearch_host").toString(); 72 | String cluster_name = conf.get("elasticsearch_cluster_name").toString(); 73 | int port = Integer.parseInt(conf.get("elasticsearch_port").toString()); 74 | Settings settings = ImmutableSettings.settingsBuilder().put("cluster.name", cluster_name).build(); 75 | client = new TransportClient(settings) 76 | .addTransportAddress(new InetSocketTransportAddress(host, port)); 77 | counter = new Counter(); 78 | 79 | // count messages periodically 80 | ScheduledMessageCounter message_counter = new ScheduledMessageCounter(counter, conf); 81 | Timer time = new Timer(); // Instantiate Timer Object 82 | time.schedule(message_counter, 0, 10000); // Create Repetitively task for every 30 secs 83 | 84 | } 85 | 86 | @Override 87 | public void execute(Tuple tuple) { 88 | 89 | // process the tuple recieved from kafka 90 | String sourcename = tuple.getSourceComponent(); 91 | String data = tuple.getValue(0).toString(); 92 | 93 | // either we get a query and we need to add it to the index 94 | // or we get a document and we need to do a search 95 | // Values("query", request_id, user_id, query_id, query) 96 | // Values("document", source, document) 97 | if(sourcename.toLowerCase().contains("query")){ 98 | // add queries 99 | QueryBuilder query = PercolatorHelper.make_query(data); 100 | 101 | //register the query in the percolator 102 | if (query != null ) { 103 | try { 104 | 
client.prepareIndex(conf.get("index_name").toString(), ".percolator", RequestsHelper.generate_unique_identifier(data)) 105 | .setSource(jsonBuilder() 106 | .startObject() 107 | .field("query", query) // Register the query 108 | .field("format", "objects") 109 | .endObject()) 110 | .setRefresh(true) // Needed when the query shall be available immediately 111 | .execute().actionGet(); 112 | } catch (ElasticsearchException e) { 113 | // TODO Auto-generated catch block 114 | e.printStackTrace(); 115 | } catch (IOException e) { 116 | // TODO Auto-generated catch block 117 | e.printStackTrace(); 118 | } 119 | } 120 | } 121 | else if (sourcename.toLowerCase().contains("document")){ 122 | // try to parse as document 123 | String text = PercolatorHelper.extract_text(data); 124 | 125 | //Build a document to check against the percolator 126 | XContentBuilder docBuilder = null; 127 | if (text != null){ 128 | try { 129 | docBuilder = XContentFactory.jsonBuilder().startObject(); 130 | docBuilder.field("doc").startObject(); //This is needed to designate the document 131 | docBuilder.field("text", text); 132 | docBuilder.endObject(); //End of the doc field 133 | docBuilder.endObject(); //End of the JSON root object 134 | } catch (IOException e) { 135 | // TODO Auto-generated catch block 136 | e.printStackTrace(); 137 | } 138 | } 139 | 140 | if (docBuilder != null) { 141 | //Percolate 142 | PercolateResponse response = client.preparePercolate() 143 | .setIndices(conf.get("index_name").toString()) 144 | .setDocumentType(conf.get("document_type").toString()) 145 | .setSource(docBuilder).execute().actionGet(); 146 | 147 | //Handle the result which is the set of queries in the percolator 148 | for(PercolateResponse.Match match : response) { 149 | // emit results 150 | collector.emit(new Values(data)); 151 | 152 | // publish the result to jedis 153 | try (Jedis jedis_client = pool.getResource()) { 154 | jedis_client.publish(match.getId().toString(), text); 155 | } 156 | } 157 | } 158 | } 159 | 160 | // acknowledge 161 | collector.ack(tuple); 162 | 163 | // update the counter 164 | counter.count+=1; 165 | } 166 | 167 | @Override 168 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 169 | declarer.declare(new Fields("document")); 170 | } 171 | 172 | @Override 173 | public void cleanup() { 174 | client.close(); 175 | pool.destroy(); 176 | } 177 | 178 | } 179 | -------------------------------------------------------------------------------- /src/storming_search/src/straw/storm/util/ConfigurationManager.java: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * 4 | * This class is a simple wrapper around the storm configuration object. 5 | * For straw, we store all configuration in a config file whose location is 6 | * given by the value of the enviornment variable STRAW_CONFIG. 7 | * 8 | * The purpose of this class is to allow for a simple interface where 9 | * we can ask storm to set a configuration value "storm_property" based on 10 | * the value of "system_name" in the STRAW_CONFIG file. An error is thrown 11 | * when the "system_name" is not found in the config file. 
12 | * 13 | */ 14 | package straw.storm.util; 15 | 16 | import java.io.FileInputStream; 17 | import java.io.IOException; 18 | import java.io.InputStream; 19 | import java.util.Map; 20 | import java.util.Properties; 21 | import backtype.storm.Config; 22 | 23 | public class ConfigurationManager { 24 | 25 | private Map env = System.getenv(); 26 | private String config_filename; 27 | private Properties prop = new Properties(); 28 | private Config config = new Config(); 29 | 30 | public ConfigurationManager() { 31 | // read config file location from sys 32 | config_filename = env.get("STRAW_CONFIG"); 33 | if(config_filename==null) 34 | { 35 | throw new RuntimeException("Couldn't access config file, did you set STRAW_CONFIG in enviornment?"); 36 | } 37 | 38 | // load the properties 39 | InputStream input = null; 40 | try { 41 | input = new FileInputStream(config_filename); 42 | prop.load(input); 43 | } catch (IOException ex) { 44 | ex.printStackTrace(); 45 | } finally { 46 | if (input != null) { 47 | try { 48 | input.close(); 49 | } catch (IOException e) { 50 | e.printStackTrace(); 51 | } 52 | } 53 | } 54 | 55 | } 56 | 57 | // add a setting to the config 58 | public void put(String storm_property, String system_name){ 59 | if (prop.getProperty(system_name)==null){ 60 | throw new RuntimeException("Property "+system_name+" not found in config file " + config_filename +"."); 61 | } 62 | config.put(storm_property, prop.getProperty(system_name)); 63 | } 64 | 65 | // return the storm config object 66 | public Config get(){ 67 | return config; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/storming_search/src/straw/storm/util/Counter.java: -------------------------------------------------------------------------------- 1 | package straw.storm.util; 2 | 3 | // wrapper class to hold bolt's throughput count 4 | public class Counter { 5 | public int count=0; 6 | } -------------------------------------------------------------------------------- /src/storming_search/src/straw/storm/util/PercolatorHelper.java: -------------------------------------------------------------------------------- 1 | package straw.storm.util; 2 | 3 | import static org.elasticsearch.index.query.QueryBuilders.termsQuery; 4 | 5 | import java.io.IOException; 6 | 7 | import org.elasticsearch.common.xcontent.XContentBuilder; 8 | import org.elasticsearch.common.xcontent.XContentFactory; 9 | import org.elasticsearch.index.query.QueryBuilder; 10 | import org.json.JSONArray; 11 | import org.json.JSONObject; 12 | 13 | public class PercolatorHelper { 14 | 15 | public static String extract_text(String data) { 16 | // parse input JSON 17 | JSONObject obj = null; 18 | String text = null; 19 | try { 20 | obj = new JSONObject(data); 21 | text = obj.getString("text"); 22 | } 23 | catch (org.json.JSONException e) { 24 | // TODO: Bad json passed 25 | // System.out.println("JSON PARSER FAILED TO HANDLE: " + data); 26 | //e.printStackTrace(); 27 | } 28 | 29 | return text; 30 | } 31 | 32 | public static QueryBuilder make_query(String data){ 33 | // build a query out of the data JSON string 34 | QueryBuilder qb = null; 35 | JSONObject obj = null; 36 | try { 37 | obj = new JSONObject(data); 38 | } 39 | catch (org.json.JSONException e) { 40 | System.out.println("JSON PARSER FAILED TO HANDLE: " + data); 41 | e.printStackTrace(); 42 | } 43 | 44 | String type = obj.getString("type"); 45 | 46 | // terms query parser 47 | if(type.equalsIgnoreCase("terms-query")) { 48 | Integer minimum_match = 
obj.getInt("minimum-match"); 49 | JSONArray arr = obj.getJSONArray("terms"); 50 | 51 | if (arr!=null){ 52 | String[] string_arry = new String[arr.length()]; 53 | // use length of array if minimum match not provided 54 | if (minimum_match==null){ 55 | minimum_match=arr.length(); 56 | } 57 | for(int i=0; i0: 16 | return([int(r['_id']) for r in result['matches']][0]) 17 | 18 | if __name__=="__main__": 19 | 20 | # argument help 21 | parser = argparse.ArgumentParser(description='Create and test Elasticsearch percolators.') 22 | parser.add_argument('file', help='File of tweets, one json doc per line.') 23 | parser.add_argument('host', help='Elasticsearch host.') 24 | parser.add_argument('-p','--port', default=9200, help='port, default is 9200') 25 | args = parser.parse_args() 26 | 27 | # index and document type constants 28 | INDEX_NAME = "documents" 29 | TYPE = "document" 30 | 31 | # get a client 32 | es = Elasticsearch(hosts=[{"host":args.host, "port":args.port}]) 33 | 34 | # create an index, ignore if it exists already 35 | es.indices.delete(index='documents', ignore=400) 36 | es.indices.create(index='documents', ignore=400, body={ 37 | "mappings": { 38 | "document": { 39 | "properties": { 40 | "message": { 41 | "type": "string" 42 | } 43 | } 44 | } 45 | } 46 | } 47 | ) 48 | 49 | ########################### 50 | # add some percolators 51 | ########################### 52 | query_table=[] 53 | queries = ['new york', 'facebook', 'cheese', 'mountain', 'zoology', 'artist', 'tech', 'big data'] 54 | for q in queries: 55 | es.create(index='documents', doc_type='.percolator', body={'query': {'match': {'message': q}}}, id=len(query_table)) 56 | query_table.append(q) 57 | 58 | # now we can do some stream searches. 59 | counter = 0 60 | with open(args.file, 'rb') as f: 61 | for line in f: 62 | counter+=1 63 | try: 64 | tweet=json.loads(line.decode('utf-8').strip()) 65 | msg = tweet['text'] 66 | perco_match = perco_parse(es.percolate(index='documents', doc_type='document', body={'doc':{'message':msg}})) 67 | if perco_match is not None: 68 | print("{0}:{1}:{2}".format(counter, query_table[perco_match], msg)) 69 | except(ValueError) as e: 70 | print("BAD VALUE") 71 | 72 | 73 | -------------------------------------------------------------------------------- /util/kafka_add_documents.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # add data to kafka 4 | (cd ../src/kafka_stream_eater \ 5 | && ./kafka_stream_producer.py ../../data/tweets.big.sample localhost documents --delay 2 \ 6 | ) 7 | 8 | -------------------------------------------------------------------------------- /util/kafka_add_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -ne 1 ]; then 4 | echo "USAGE: $0 [number of queries]" 5 | else 6 | echo "Posting $1 queries to Kafka" 7 | 8 | # take top n from bigrams file 9 | head -n $1 ../data/queries.bigrams > queries.tmp 10 | 11 | # add data to kafka 12 | ../src/kafka_stream_eater/kafka_stream_producer.py queries.tmp localhost queries 13 | 14 | rm queries.tmp 15 | fi 16 | -------------------------------------------------------------------------------- /util/query_maker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | """ 3 | This script suggests lucene queries based on bigram collocations in a sample tweet file 4 | """ 5 | import nltk, argparse, re, json 6 | from nltk.collocations import * 7 | from 
nltk.corpus import stopwords 8 | from nltk.tokenize import word_tokenize 9 | 10 | # not a robust RE, but it will probably work fine here 11 | link_re = re.compile(r"http[s]*://.*?($|\s)") 12 | p_re = re.compile(r"[!.?;,]") 13 | 14 | if __name__=="__main__": 15 | 16 | # arg parsing 17 | parser = argparse.ArgumentParser(description="Generate lucene queries from collocation data in a sample file") 18 | parser.add_argument("input", help="File containing text corpus.") 19 | parser.add_argument("output", help="An output file.") 20 | parser.add_argument("-n","--number", default=100, help="Size of nbest list.") 21 | args = parser.parse_args() 22 | 23 | # extraction 24 | count=0 25 | docs = [] 26 | mystopwords = stopwords.words("english") 27 | mystopwords.extend(['rt']) 28 | with open(args.input, "rb") as f: 29 | for l in f: 30 | count+=1 31 | try: 32 | tweet = json.loads(l.decode("utf-8").strip()) 33 | except ValueError: 34 | print("Bad tweet found at line {0}.".format(count)) 35 | tweet = {} 36 | 37 | if tweet.get("text"): 38 | tweet = tweet["text"] 39 | tweet = tweet.lower() 40 | tweet = re.sub(link_re, "", tweet).strip().replace("\"","") 41 | tweet = re.sub(p_re, "", tweet) 42 | # the word tokenizer generally expects single-sentence input; tweets may not satisfy that, 43 | # but it probably doesn't matter for what we are doing here. 44 | tokens = [t.replace("'","") for t in tweet.split() if t not in mystopwords] 45 | docs.append(tokens) 46 | 47 | # get the nbest list 48 | bigram_measures = nltk.collocations.BigramAssocMeasures() 49 | finder = BigramCollocationFinder.from_documents(docs) 50 | finder.apply_freq_filter(5) 51 | nbest = finder.nbest(bigram_measures.raw_freq, int(args.number)) 52 | 53 | # create a file of queries 54 | #for k,v in finder.ngram_fd.items(): 55 | # print(k,v) 56 | 57 | with open(args.output, "w") as f: 58 | for q in nbest: 59 | query = { "type":"terms-query", "terms":list(q), "minimum-match":len(q) } 60 | f.write(json.dumps(query) + "\n") 61 | -------------------------------------------------------------------------------- /util/redis_pub_sub_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # 3 | # Simple example of redis pubsub management in Python. 4 | # 5 | import redis 6 | from time import sleep 7 | 8 | if __name__=="__main__": 9 | 10 | # open the connection 11 | pool = redis.ConnectionPool(host='localhost', port=6379, db=0) 12 | r = redis.StrictRedis(connection_pool=pool) 13 | p = r.pubsub(ignore_subscribe_messages=True) 14 | 15 | # define what to do with incoming messages
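# Note: in the full straw pipeline the search bolts publish matching tweet text to a
# Redis channel named after the matching query's id (see SearchBolt above), and the
# frontend subscribes to those channels. This standalone demo simply subscribes to
# whatever channel names you type and prints the messages it receives.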
16 | def message_handler(message): 17 | print('MSG:', message['data']) 18 | 19 | query = input("Please enter the topic you'd like to follow: ") 20 | 21 | # subscribe to the first topic in a background thread 22 | queries = {query: message_handler} 23 | p.subscribe(**queries) 24 | thread = p.run_in_thread(sleep_time=0.001) 25 | query = None 26 | 27 | # listen for a new query; when we get one, start a new thread with the updated 28 | # set of subscriptions and then stop the old one 29 | # warning: we could receive duplicate messages in the time it takes to bring up the new thread 30 | while True: 31 | if query is None: 32 | query = input("Please enter the topic you'd like to follow: ") 33 | else: 34 | # the old thread is now out of date (since it doesn't have all our subscriptions) 35 | thread_stale = thread 36 | 37 | # start a new thread with the full set of subscriptions 38 | queries[query] = message_handler 39 | query = None 40 | p.subscribe(**queries) 41 | thread = p.run_in_thread(sleep_time=0.001) 42 | 43 | # now kill off the old thread. 44 | thread_stale.stop() 45 | -------------------------------------------------------------------------------- /util/stage_demo_mode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # unpack local data 4 | ( cd ../data/ && 5 | gunzip tweets.big.sample.gz -f -k 6 | ) 7 | 8 | 9 | # start kafka 10 | (cd ../src/kafka_stream_eater/third_party/kafka-docker-master && docker-compose stop && docker-compose rm -f && docker-compose up -d --force-recreate) 11 | 12 | # start elasticsearch 13 | ./docker_elasticsearch.sh 14 | 15 | -------------------------------------------------------------------------------- /util/tweet_sampler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | ''' 3 | Sample from the twitter API and post results to a file or to Kafka. 4 | 5 | To use, set credentials as environment variables, e.g. 6 | 7 | export TWITTER_ACCESS_TOKEN=...
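export TWITTER_SECRET_TOKEN=...
export TWITTER_CONSUMER_TOKEN=...
export TWITTER_CONSUMER_SECRET=...

(these are the four variables read from os.environ below)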
8 | 9 | or 10 | 11 | source myfile 12 | 13 | where myfile exports the authorization variables 14 | ''' 15 | 16 | import twython, json, re, argparse, subprocess, os, sys, time 17 | from socket import timeout 18 | from kafka import SimpleProducer, KafkaClient 19 | 20 | #################### 21 | # Constants 22 | #################### 23 | access_token = os.environ["TWITTER_ACCESS_TOKEN"] 24 | access_token_secret = os.environ["TWITTER_SECRET_TOKEN"] 25 | consumer_key = os.environ["TWITTER_CONSUMER_TOKEN"] 26 | consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"] 27 | 28 | class StrawStreamer(twython.TwythonStreamer): 29 | 30 | def __init__(self, APP_KEY, APP_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET, outfile): 31 | super(StrawStreamer, self).__init__(APP_KEY, APP_SECRET,OAUTH_TOKEN, OAUTH_TOKEN_SECRET) 32 | self.outfile=outfile 33 | 34 | def on_success(self, data): 35 | if 'text' in data: 36 | self.outfile.write((json.dumps(data)+u'\n').encode('utf-8')) 37 | 38 | def on_error(self, status_code, data): 39 | print(status_code) 40 | 41 | class KafkaStrawStreamer(twython.TwythonStreamer): 42 | def __init__(self, APP_KEY, APP_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET, host, port): 43 | super(KafkaStrawStreamer, self).__init__(APP_KEY, APP_SECRET,OAUTH_TOKEN, OAUTH_TOKEN_SECRET) 44 | 45 | # connect to Kafka 46 | print("Connecting to Kafka node {0}:{1}".format(host, port)) 47 | kafka = KafkaClient("{0}:{1}".format(host, port)) 48 | self.producer = BufferedSimpleProducer(kafka, 100) 49 | 50 | def on_success(self, data): 51 | # TODO: add message queue so we can pass messages in bulk 52 | if 'text' in data: 53 | msg = (json.dumps(data)+u'\n').encode('utf-8') 54 | self.producer.send_messages(args.topic, msg) 55 | 56 | def on_error(self, status_code, data): 57 | print(status_code) 58 | 59 | class BufferedSimpleProducer: 60 | def __init__(self, kafka, chunk_size): 61 | self.producer = SimpleProducer(kafka) 62 | self.queues = {} 63 | self.chunk_size = chunk_size 64 | 65 | def send_messages(self, topic, msg): 66 | if topic not in self.queues: 67 | self.queues[topic]=[] 68 | if len(self.queues[topic])