├── .gitignore ├── LICENSE ├── README.md ├── aws_config ├── .gitignore ├── __init__.py ├── configure │ ├── __init__.py │ ├── config_utils.py │ ├── configure_elasticsearch.py │ ├── configure_flask.py │ ├── configure_kafka.py │ ├── configure_spark.py │ ├── configure_storm.py │ └── templates │ │ ├── elasticsearch.yml │ │ ├── kafka-server-start.sh │ │ ├── kafka.server.properties │ │ ├── spark-env.sh │ │ ├── storm.yaml │ │ └── zoo.cfg ├── create_clusters.py ├── discover.py ├── host_install_scripts │ ├── elasticsearch_install.sh │ ├── flask_install.sh │ ├── kafka_install.sh │ ├── spark_install.sh │ └── storm_install.sh └── straw_service_config.sh ├── config └── config.properties ├── data ├── .gitignore ├── queries.bigrams.gz ├── queries.small ├── tweets.big.sample.gz └── tweets.small ├── local_demo ├── launch_demo_ui.sh ├── launch_local_cluster.sh ├── mock_firehose.sh └── prerequisites.sh ├── src ├── frontend │ ├── app │ │ ├── __init__.py │ │ ├── query_subscriber.py │ │ ├── static │ │ │ ├── assets │ │ │ │ ├── favicon.ico │ │ │ │ └── straw.pdf │ │ │ ├── css │ │ │ │ ├── bootstrap-theme.css │ │ │ │ ├── bootstrap-theme.css.map │ │ │ │ ├── bootstrap-theme.min.css │ │ │ │ ├── bootstrap.css │ │ │ │ ├── bootstrap.css.map │ │ │ │ ├── bootstrap.min.css │ │ │ │ └── theme.css │ │ │ ├── fonts │ │ │ │ ├── glyphicons-halflings-regular.eot │ │ │ │ ├── glyphicons-halflings-regular.svg │ │ │ │ ├── glyphicons-halflings-regular.ttf │ │ │ │ ├── glyphicons-halflings-regular.woff │ │ │ │ └── glyphicons-halflings-regular.woff2 │ │ │ └── js │ │ │ │ ├── bootstrap.js │ │ │ │ ├── bootstrap.min.js │ │ │ │ ├── customize.min.js │ │ │ │ ├── docs.min.js │ │ │ │ ├── ie10-viewport-bug-workaround.js │ │ │ │ ├── jquery.js │ │ │ │ └── npm.js │ │ ├── straw_app.py │ │ ├── templates │ │ │ ├── about.html │ │ │ └── index.html │ │ └── views.py │ ├── launch.sh │ └── run.py ├── kafka_stream_eater │ ├── kafka_stream_consumer.py │ ├── kafka_stream_producer.py │ └── third_party │ │ └── kafka-docker-master │ │ ├── Dockerfile │ │ ├── LICENSE │ │ ├── README.md │ │ ├── broker-list.sh │ │ ├── docker-compose-single-broker.yml │ │ ├── docker-compose.yml │ │ ├── download-kafka.sh │ │ ├── start-kafka-shell.sh │ │ └── start-kafka.sh ├── luwak_search │ ├── .gitignore │ ├── pom.xml │ ├── run_luwak_topology.sh │ ├── src │ │ └── straw │ │ │ └── storm │ │ │ ├── LuwakSearchTopology.java │ │ │ ├── bolt │ │ │ └── LuwakSearchBolt.java │ │ │ └── util │ │ │ ├── ConfigurationManager.java │ │ │ ├── Counter.java │ │ │ ├── LuwakHelper.java │ │ │ ├── RequestsHelper.java │ │ │ └── ScheduledMessageCounter.java │ └── submit_topology.sh └── storming_search │ ├── .gitignore │ ├── pom.xml │ ├── run_search_topology.sh │ ├── src │ └── straw │ │ └── storm │ │ ├── StreamingSearchTopology.java │ │ ├── bolt │ │ └── SearchBolt.java │ │ └── util │ │ ├── ConfigurationManager.java │ │ ├── Counter.java │ │ ├── PercolatorHelper.java │ │ ├── RequestsHelper.java │ │ └── ScheduledMessageCounter.java │ └── submit_search_topology.sh ├── test ├── launch_luwak_test_cluster.sh └── launch_percolator_test_cluster.sh └── util ├── docker_elasticsearch.sh ├── elasticsearch_index_demo.py ├── elasticsearch_percolator_demo.py ├── kafka_add_documents.sh ├── kafka_add_queries.sh ├── query_maker.py ├── redis_pub_sub_demo.py ├── stage_demo_mode.sh └── tweet_sampler.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | 
.Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Ryan Walker 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | straw 2 | ================= 3 | A platform for real-time streaming search 4 | 5 | #### Table of Contents 6 | * [Overview](#overview) 7 | * [What's included:](#whats-included) 8 | * [Architecture](#architecture) 9 | * [Getting started](#getting-started) 10 | * [Running locally](#running-locally) 11 | * [Deploy to AWS](#deploy-to-aws) 12 | * [Prerequisites:](#prerequisites) 13 | * [Steps:](#steps) 14 | * [Submitting topologies](#submitting-topologies) 15 | * [Configuring Redis](#configuring-redis) 16 | * [Benchmarking and simulation](#benchmarking-and-simulation) 17 | * [Measuring throughput](#measuring-throughput) 18 | * [Generating/simulating data](#generatingsimulating-data) 19 | 20 | 21 | 22 | 23 | ## Overview 24 | The goal of this project is to provide a clean, scalable architecture for real-time search on streaming data. Additionally, the project contains utilities to provide some very simple throughput benchmarking of Elasticsearch Percolators vs Lucence-Luwak. 
A full writeup of the project can be found at: 25 | 26 | http://blog.ryanwalker.us/2015/11/building-streaming-search-platform.html 27 | 28 | This project was inspired by the following excellent blog posts on streaming search: 29 | - http://www.confluent.io/blog/real-time-full-text-search-with-luwak-and-samza/ 30 | - http://www.flax.co.uk/blog/2015/07/27/a-performance-comparison-of-streamed-search-implementations/ 31 | 32 | I completed this project as a Fellow in the 2015C Insight Data Engineering Silicon Valley program. 33 | 34 | The typical use case for a streaming search system involves many users who are interested in running Lucene-style queries against a streaming data source in real time. For example, investors might want to register queries for positive or negative mentions of companies in the Twitter firehose and then receive real-time alerts about matches for their queries. This project provides a base architecture for such a system. In particular, it aims to support: 35 | 36 | - Many diverse users registering queries 37 | - Full Lucene query capabilities against streaming text sources 38 | - Scaling in both the volume of data and in the number of queries 39 | 40 | ## What's included: 41 | - Automated AWS cluster deployment utilities using boto3 42 | - Java-based Storm implementation: 43 | - KafkaSpout for query and document spouts 44 | - Two flavors of streaming search bolts: 45 | - [Elasticsearch-Percolators](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-percolate.html) 46 | - Pure Lucene with [Luwak](https://github.com/flaxsearch/luwak) 47 | - Storm topology for streaming search and configuration management 48 | - Scripts to populate document streams, including Twitter API sampling utilities 49 | - Simple Python Flask web UI 50 | - Testing and other utilities, including Docker components so that the entire topology can run on a local machine 51 | 52 | ## Architecture 53 | The core of the platform is an Apache Storm cluster which parallelizes the work of real-time streaming search. Internally, the Storm cluster consumes messages from a Kafka cluster, and these messages are distributed to bolts which each contain a Lucene-Luwak index. The project contains a demo Flask UI which handles subscriptions with a Redis PUBSUB system. 54 | 55 | The key layers of the system are: 56 | 57 | - Real-time ingestion via Kafka from a streaming source (e.g. the Twitter firehose) 58 | - Storm cluster to distribute tweets from Kafka to workers; each worker contains a Lucene instance with Luwak 59 | - Publish-subscribe system (Redis) which receives matches and delivers them back to the application server 60 | - Application server (Python Flask) which registers queries from users and serves matches 61 | 62 | More about the architecture can be found at: 63 | http://straw.ryanwalker.us/about 64 |
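To make the publish-subscribe layer concrete, here is a minimal sketch of how a producer could publish a match on a per-query Redis channel and how the application server could listen for it. It uses the `redis-py` client; the channel naming and connection settings are illustrative assumptions, not the exact scheme used by `query_subscriber.py`:

```
import redis

# illustrative connection settings; in a deployment these come from configuration
r = redis.StrictRedis(host="localhost", port=6379)

def publish_match(query_id, document):
    # a producer (e.g. a search bolt) announces a hit on the query's channel
    r.publish(query_id, document)

def listen_for_matches(query_id):
    # the application server subscribes to the query's channel and
    # forwards each matching document back to the subscribed user
    pubsub = r.pubsub()
    pubsub.subscribe(query_id)
    for message in pubsub.listen():
        if message["type"] == "message":
            yield message["data"].decode("utf-8")
```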
65 | ## Getting started 66 | 67 | There are two options for running straw. For development, you can run a mini version of the entire platform on a single local machine. In local mode, the dependent services run in Docker containers. For production, you can deploy the system to the cloud. The project supports a fully automated deployment to AWS with fully customizable cluster configurations. 68 | 69 | ### Running locally 70 | 71 | Minimum supported requirements: Ubuntu 14.04 with Docker 1.8.0 or better 72 | 73 | UPDATE: I've added utility scripts to make launching the demo mode a bit simpler. Now, you can just do the following steps: 74 | 75 | 1. `cd local_demo` 76 | 2. Install the prerequisites: `./prerequisites.sh` 77 | 3. Run `./launch_local_cluster.sh` 78 | 4. In a separate shell, run `./launch_demo_ui.sh` 79 | 5. In a separate shell, run `./mock_firehose.sh` 80 | 6. Open a web browser and point to [http://localhost:5000](http://localhost:5000) 81 | 7. Type "Justin Bieber" or some other common Twitter query (only 100k unique documents can be found in the mock stream). 82 | 83 | For reference, here are the old step-by-step launch instructions: 84 | 85 | 1. Install [docker-compose](http://docs.docker.com/compose/install/) and redis-server 86 | 2. Run `util/stage_demo_mode.sh`. This will create Docker containers for Kafka with Zookeeper and Elasticsearch and will populate these services with some example data. [BUG: You may have to run this script twice!] 87 | 3. `cd src/storming_search` OR `src/luwak_search`, depending on which flavor of search you want to build 88 | 4. Run `mvn package` 89 | 5. Run `./run_luwak_topology.sh`. This will start the local Storm cluster with the Luwak topology. 90 | 6. In a separate terminal, start the webserver frontend by calling `./run.py` from `src/frontend` 91 | 7. Open a browser and point to the frontend UI. By default: [http://localhost:5000](http://localhost:5000) 92 | 8. Enter a query that will likely generate lots of hits, e.g. "Justin Bieber". Note: there are only 100k sampled tweets included with the repo, but there are utility scripts for collecting more. 93 | 9. To start a simulated tweet stream, `cd util` and run `./kafka_add_documents.sh`. 94 | 95 | ### Deploy to AWS 96 | #### Prerequisites: 97 | 98 | 1. Install the aws cli: `sudo apt-get install awscli` 99 | 2. Install Python boto3: `sudo pip3 install boto3` 100 | 3. Set your default configurations by calling `aws configure` 101 | 4. Modify the settings in `aws_config/straw_service_config.sh` to your own AWS account information and then 102 | ``` 103 | source straw_service_config.sh 104 | ``` 105 | 106 | #### Steps: 107 | 108 | 1. `cd aws_config` 109 | 2. `./create_clusters.py --help` to get instructions about this AWS creation script, then follow those instructions. 110 | 3. Once all resources are created, `cd configure`. This directory contains scripts to configure each of the individual services; you'll need to run each of these to configure the corresponding resource, e.g. `./configure_elasticsearch.py`. 111 | 4. Once the resources are configured, run 112 | ``` 113 | ./discover.py 114 | ``` 115 | to see the list of services and their IPs. 116 | 117 | #### Submitting topologies 118 | To submit or run topologies, you need to install Storm on your machine (or, even better, on a dedicated machine within the subnet of the Storm cluster). Install Storm as follows: 119 | ``` 120 | sudo apt-get update 121 | sudo apt-get install openjdk-7-jdk 122 | wget http://mirrors.gigenet.com/apache/storm/apache-storm-0.9.5/apache-storm-0.9.5.tar.gz -P ~/Downloads 123 | sudo tar zxvf ~/Downloads/apache-storm*.gz -C /usr/local 124 | sudo mv /usr/local/apache-storm* /usr/local/storm 125 | ``` 126 | Then edit `/usr/local/storm/conf/storm.yaml` by adding the line 127 | ```nimbus.host: 10.X.X.X``` 128 | using either the private or public IP of the nimbus node. If you use a public IP, you need to update the security group. If you use a private IP, you need to be running from within the subnet. 129 | 130 | Next, you need to tell Storm where all of your cluster resources reside. To do this, 131 | ``` 132 | vi config/config.properties 133 | ``` 134 | Enter the private IPs of your system resources, following this template. We are assuming that all of the resources live on the same subnet in the cluster.
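For illustration, here is a sketch of what such a properties file might look like. The key names and addresses are hypothetical placeholders; use the property names already present in the repo's `config/config.properties`, since those are the ones the topology's configuration loader expects:

```
# hypothetical keys and addresses, for illustration only;
# follow the actual template in config/config.properties
kafka.host=10.0.0.11:9092
zookeeper.host=10.0.0.11:2181
elasticsearch.host=10.0.0.21
redis.host=10.0.0.31
redis.port=6379
```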
135 | 136 | You should now switch into the source directory of either the Luwak or Elasticsearch topology and build the topology, e.g. 137 | ``` 138 | cd /home/ubuntu/straw/src/luwak_search 139 | mvn clean 140 | mvn package 141 | ``` 142 | Finally, you can submit the topology to the cluster (whose nimbus node you specified in `storm.yaml` above) by executing 143 | ``` 144 | ./submit_topology.sh 145 | ``` 146 | 147 | 148 | ### Configuring Redis 149 | The included webserver and the query result pipeline both rely on Redis as a publish-subscribe system. Redis can also be used to collect the benchmarking statistics for profiling Luwak and Elasticsearch. 150 | 151 | Install Redis on the same server as the webserver and modify the bind interface: 152 | ``` 153 | # set bind 0.0.0.0 in redis.conf: 154 | sudo apt-get install redis-server 155 | sudo vi /etc/redis/redis.conf 156 | ``` 157 | If you want to use a separate Redis instance for the benchmarking, you should repeat the above step on a different AWS machine and update the global configuration `config/config.properties`. 158 | 159 | ## Benchmarking and simulation 160 | A goal of the straw project was to allow for benchmarking of the Lucene-Luwak package in a distributed context. 161 | 162 | ### Measuring throughput 163 | I measure throughput through the search bolts of the Storm cluster in a simple way: start a stopwatch in a background thread. Each bolt has a counter which gets incremented each time a document gets checked against the search engine. When the stopwatch hits 10 seconds, collect the data from each counter, publish the result to a Redis DB, and reset the counter. 164 | 165 | ### Generating/simulating data 166 | For benchmarking and simulations, you'll need a way to generate tweets and queries. For this purpose, I've added many tools to the `straw/util` directory. In particular, the scripts 167 | ``` 168 | ./kafka_add_documents.sh 169 | ./kafka_add_queries.sh 170 | ``` 171 | can be used to add documents and queries from sample files. Some small example data files are found in ```straw/data```. For a long-running simulation, you can run ```./kafka_add_documents.sh``` in a cron job to periodically put documents into the Kafka cluster. NOTE: Kafka has been configured to purge documents after 1 hour. 172 | 173 | You can easily harvest your own tweet data from the Twitter API. Try the following helper script, which uses Twython to read from the Twitter streaming sample API: 174 | ``` 175 | ./tweet_sampler.py --help 176 | ``` 177 | You'll need to export your Twitter credentials as environment variables to run this and other scripts, e.g. 178 | ``` 179 | source my_twitter_credentials 180 | ``` 181 | where `my_twitter_credentials` looks like 182 | ``` 183 | export TWITTER_ACCESS_TOKEN=... 184 | export TWITTER_SECRET_TOKEN=... 185 | export TWITTER_CONSUMER_TOKEN=... 186 | export TWITTER_CONSUMER_SECRET=... 187 | 188 | ``` 189 | To generate many reasonably complex queries for the benchmarking studies, the included query maker utility might be helpful: 190 | ``` 191 | ./query_maker.py --help 192 | ``` 193 | This script takes a sample of tweets and uses NLTK to compute bigram frequencies. The most frequent bigrams are then converted into queries that Straw can parse. For ease of use, I've included `data/queries.bigrams` in the repo. This is a collection of 100,000 generated bigram queries collected from a sample of 20 million tweets.
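For illustration, here is a rough sketch of the bigram approach. This is not the actual `query_maker.py`; the real script's tokenization and output format may differ (for example, it may emit records formatted for Kafka):

```
#!/usr/bin/env python3
# Rough sketch: count bigrams in a file of tweets with NLTK and print the
# most frequent ones as simple two-term queries.
import sys
from collections import Counter
import nltk

def top_bigram_queries(tweet_file, n=1000):
    counts = Counter()
    with open(tweet_file, encoding="utf-8") as f:
        for line in f:
            # naive tokenization; keep only alphabetic tokens
            tokens = [t.lower() for t in line.split() if t.isalpha()]
            counts.update(nltk.bigrams(tokens))
    # join each frequent bigram into a whitespace-separated query string
    return [" ".join(pair) for pair, _ in counts.most_common(n)]

if __name__ == "__main__":
    for query in top_bigram_queries(sys.argv[1]):
        print(query)
```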
194 | 195 | -------------------------------------------------------------------------------- /aws_config/.gitignore: -------------------------------------------------------------------------------- 1 | *.tmp 2 | -------------------------------------------------------------------------------- /aws_config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwalk/straw/1940d521538635d5eab394d0ed4c87caf366b0c7/aws_config/__init__.py -------------------------------------------------------------------------------- /aws_config/configure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwalk/straw/1940d521538635d5eab394d0ed4c87caf366b0c7/aws_config/configure/__init__.py -------------------------------------------------------------------------------- /aws_config/configure/config_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | def quiet_wrap(cmd): 4 | return(" ".join(["nohup",cmd, "< /dev/null > std.out 2> std.err &"])) 5 | -------------------------------------------------------------------------------- /aws_config/configure/configure_elasticsearch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # 3 | # Configure Kafka on ec2 instances 4 | # 5 | import boto3, os, sys 6 | sys.path.append("..") 7 | from botocore.exceptions import ClientError as BotoClientError 8 | from time import sleep 9 | from create_clusters import get_tag, keyfile 10 | 11 | # configuration 12 | my_instances_filters = [{ 'Name': 'instance-state-name', 'Values': ['running']}, {'Name':'tag-value', 'Values':[get_tag('elasticsearch-node')]}] 13 | 14 | if __name__=="__main__": 15 | 16 | # find all the host nodes 17 | ec2 = boto3.resource('ec2') 18 | hosts = [] 19 | private_ips = [] 20 | reservations = ec2.instances.filter( Filters = my_instances_filters ) 21 | for instance in reservations: 22 | print("ID: {0:<15}\tIP: {1:<15}".format(instance.instance_id, instance.public_ip_address)) 23 | hosts.append(instance.public_ip_address) 24 | private_ips.append(instance.private_ip_address) 25 | 26 | if len(hosts) != len(private_ips): 27 | raise(RuntimeError("Host and private ips not consistent!")) 28 | 29 | if len(hosts) == 0: 30 | raise(RuntimeError("No hosts found.")) 31 | 32 | ####################################################################### 33 | # Elasticsearch 34 | ####################################################################### 35 | print("Starting elasticsearch configuration...") 36 | 37 | # create a temporary config file 38 | with open("templates/elasticsearch.yml.tmp", "w") as tmpfile: 39 | with open("templates/elasticsearch.yml","r") as f: 40 | # copy over the template 41 | for l in f: 42 | tmpfile.write(l) 43 | 44 | # add cloud credentials 45 | # hack: boto3 doesn't yet offer a way to access the store configuration values 46 | S = boto3._get_default_session() 47 | profile = S._session.full_config['profiles']['default'] 48 | 49 | # add profile information to elasticsearch config to enable cloud discovery 50 | tmpfile.write("cloud.aws.access_key: {0}\n".format(profile['aws_access_key_id'])) 51 | tmpfile.write("cloud.aws.secret_key: {0}\n".format(profile['aws_secret_access_key'])) 52 | tmpfile.write("cloud.aws.region: {0}\n".format(profile['region'])) 53 | tmpfile.write("discovery.type: ec2\n") 54 | tmpfile.write("discovery.ec2.groups: 
{0}\n".format(get_tag('elasticsearch-security-group'))) 55 | #tmpfile.write("discovery.ec2.host_type: public_ip\n") 56 | tmpfile.write("cluster.name: {0}\n".format(get_tag('elasticsearch-cluster'))) 57 | 58 | # build the command queue 59 | cmd_str = [] 60 | for h in hosts: 61 | # add commands to queue 62 | cmd_str.append("scp -i {0} {1} ubuntu@{2}:elasticsearch.yml".format(keyfile, tmpfile.name, h)) 63 | cmd_str.append("ssh -i {0} ubuntu@{1} sudo mv elasticsearch.yml /etc/elasticsearch/elasticsearch.yml".format(keyfile, h)) 64 | 65 | # start each node 66 | cmd_str.extend(["ssh -i {0} ubuntu@{1} \"sudo service elasticsearch start\"".format(keyfile, h) for h in hosts]) 67 | 68 | # execute the remote commands 69 | for cmd in cmd_str: 70 | print(cmd) 71 | res=os.system(cmd) 72 | if res!=0: 73 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 74 | -------------------------------------------------------------------------------- /aws_config/configure/configure_flask.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # 3 | # Configure Kafka on ec2 instances 4 | # 5 | import boto3, os, sys 6 | sys.path.append("..") 7 | from botocore.exceptions import ClientError as BotoClientError 8 | from time import sleep 9 | from create_clusters import get_tag, keyfile 10 | from config_utils import quiet_wrap 11 | 12 | # configuration 13 | my_instances_filters = [{ 'Name': 'instance-state-name', 'Values': ['running']}, {'Name':'tag-value', 'Values':[get_tag('flask-node')]}] 14 | 15 | if __name__=="__main__": 16 | 17 | # find all the host nodes 18 | ec2 = boto3.resource('ec2') 19 | hosts = [] 20 | private_ips = [] 21 | public_dns = [] 22 | reservations = ec2.instances.filter( Filters = my_instances_filters ) 23 | for instance in reservations: 24 | print("ID: {0:<15}\tIP: {1:<15}".format(instance.instance_id, instance.public_ip_address)) 25 | hosts.append(instance.public_ip_address) 26 | private_ips.append(instance.private_ip_address) 27 | public_dns.append(instance.public_dns_name) 28 | 29 | if len(hosts) != len(private_ips): 30 | raise(RuntimeError("Host and private ips not consistent!")) 31 | 32 | if len(hosts) == 0: 33 | raise(RuntimeError("No hosts found.")) 34 | 35 | ####################################################################### 36 | # flask 37 | ####################################################################### 38 | cmd_str = [] 39 | for h in hosts: 40 | print("Starting flask configuration...") 41 | cmd_str.append("(cd ../../src/ && tar -zcvf frontend.tmp.tar.gz frontend)") 42 | cmd_str.append("(cd ../../src/ && scp -i {0} frontend.tmp.tar.gz ubuntu@{1}:)".format(keyfile, h)) 43 | cmd_str.append("(cd ../../src/ && rm frontend.tmp.tar.gz)") 44 | cmd_str.append("ssh -i {0} ubuntu@{1} tar xvf frontend.tmp.tar.gz".format(keyfile, h)) 45 | 46 | # launch webapp 47 | cmd_str.append("ssh -i {0} ubuntu@{1} \"{2}\"".format(keyfile, h, quiet_wrap("sudo ./frontend/run.py"))) 48 | 49 | # execute the remote commands 50 | for cmd in cmd_str: 51 | print(cmd) 52 | res=os.system(cmd) 53 | if res!=0: 54 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 55 | 56 | for a in public_dns: 57 | print("Straw Frontend:\thttp://{0}:5000".format(a)) 58 | 59 | -------------------------------------------------------------------------------- /aws_config/configure/configure_kafka.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 
2 | # 3 | # Configure Kafka on ec2 instances 4 | # 5 | 6 | import boto3, os, sys 7 | from botocore.exceptions import ClientError as BotoClientError 8 | from time import sleep 9 | sys.path.append("..") 10 | from create_clusters import get_tag, keyfile 11 | 12 | # configuration 13 | my_instances_filters = [{ 'Name': 'instance-state-name', 'Values': ['running']}, {'Name':'tag-value', 'Values':[get_tag('kafka-node')]}] 14 | 15 | if __name__=="__main__": 16 | 17 | # find all the host nodes 18 | ec2 = boto3.resource('ec2') 19 | hosts = [] 20 | private_ips = [] 21 | reservations = ec2.instances.filter( Filters = my_instances_filters ) 22 | for instance in reservations: 23 | print("ID: {0:<15}\tIP: {1:<15}".format(instance.instance_id, instance.public_ip_address)) 24 | hosts.append(instance.public_ip_address) 25 | private_ips.append(instance.private_ip_address) 26 | 27 | if len(hosts) != len(private_ips): 28 | raise(RuntimeError("Host and private ips not consistent!")) 29 | 30 | if len(hosts) == 0: 31 | raise(RuntimeError("No hosts found.")) 32 | 33 | ####################################################################### 34 | # ZOOKEEPER 35 | ####################################################################### 36 | # just a little hacking to inject some settings into the templates 37 | # TODO: parallelize this to save some boot time 38 | print("Starting zookeeper configuration...") 39 | zooid = 1 40 | for h in hosts: 41 | cmd_str = [] 42 | with open("templates/zoo.cfg.tmp", "w") as tmpfile: 43 | with open("templates/zoo.cfg","r") as f: 44 | # copy over the template 45 | for l in f: 46 | tmpfile.write(l) 47 | 48 | # append the server settings 49 | host_strings= ["server.{0}={1}:2888:3888".format(i+1,private_ips[i]) for i in range(len(hosts))] 50 | for s in host_strings: 51 | tmpfile.write(s + "\n") 52 | cmd_str.append("scp -i {0} {1} ubuntu@{2}:zoo.cfg".format(keyfile, tmpfile.name, h)) 53 | cmd_str.append("ssh -i {0} ubuntu@{1} sudo mv zoo.cfg /etc/zookeeper/conf/zoo.cfg".format(keyfile, h)) 54 | 55 | # Assign the zookeeper ids 56 | cmd_str.append("ssh -i {0} ubuntu@{1} \" echo 'echo {2} > /var/lib/zookeeper/myid' | sudo -s\" ".format(keyfile, h, zooid)) 57 | zooid+=1 58 | 59 | # execute the remote commands 60 | for cmd in cmd_str: 61 | print(cmd) 62 | res=os.system(cmd) 63 | if res!=0: 64 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 65 | 66 | # start each zookeeper 67 | cmd_str = ["ssh -i {0} ubuntu@{1} sudo service zookeeper restart".format(keyfile, h) for h in hosts] 68 | for cmd in cmd_str: 69 | print(cmd) 70 | res=os.system(cmd) 71 | if res!=0: 72 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 73 | 74 | ####################################################################### 75 | # Kafka 76 | ####################################################################### 77 | print("Starting kafka configuration...") 78 | broker_id = 0 79 | kafka_start_script = "templates/kafka-server-start.sh" 80 | for i,h in enumerate(hosts): 81 | cmd_str = [] 82 | with open("templates/kafka.server.properties.tmp", "w") as tmpfile: 83 | with open("templates/kafka.server.properties","r") as f: 84 | # copy over the template 85 | for l in f: 86 | tmpfile.write(l) 87 | 88 | # advertise host's private IP 89 | # tmpfile.write("advertised.host.name: {0}\n".format(h)) 90 | 91 | # add zookeeper info 92 | host_strings= ["{0}:2181".format(private_ips[i]) for i in range(len(hosts))] 93 | 
tmpfile.write("zookeeper.connect={0}\n".format(",".join(host_strings))) 94 | 95 | # set broker id 96 | tmpfile.write("broker.id={0}\n".format(broker_id)) 97 | broker_id+=1 98 | 99 | # add commands to queue 100 | cmd_str.append("scp -i {0} {1} ubuntu@{2}:server.properties".format(keyfile, tmpfile.name, h)) 101 | cmd_str.append("ssh -i {0} ubuntu@{1} sudo mv server.properties /usr/local/kafka/config/server.properties".format(keyfile, h)) 102 | cmd_str.append("scp -i {0} {1} ubuntu@{2}:kafka-server-start.sh".format(keyfile, kafka_start_script, h)) 103 | cmd_str.append("ssh -i {0} ubuntu@{1} sudo mv kafka-server-start.sh /usr/local/kafka/bin/kafka-server-start.sh ".format(keyfile, h)) 104 | 105 | # execute the remote commands 106 | for cmd in cmd_str: 107 | print(cmd) 108 | res=os.system(cmd) 109 | if res!=0: 110 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 111 | 112 | # start each kafka 113 | cmd_str = ["ssh -i {0} ubuntu@{1} \"nohup sudo /usr/local/kafka/bin/kafka-server-start.sh /usr/local/kafka/config/server.properties < /dev/null > std.out 2> std.err &\"".format(keyfile, h) for h in hosts] 114 | 115 | for cmd in cmd_str: 116 | print(cmd) 117 | res=os.system(cmd) 118 | if res!=0: 119 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 120 | 121 | 122 | # create the documents and queries topics on one of the Kafka nodes 123 | 124 | h = hosts[0] 125 | cmd_str = ["ssh -i {0} ubuntu@{1} /usr/local/kafka/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor {2} --partitions {3} --topic documents".format(keyfile, h, 2, 5), "ssh -i {0} ubuntu@{1} /usr/local/kafka/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor {2} --partitions {3} --topic queries".format(keyfile, h, 3, 1)] 126 | 127 | for cmd in cmd_str: 128 | print(cmd) 129 | res=os.system(cmd) 130 | if res!=0: 131 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 132 | -------------------------------------------------------------------------------- /aws_config/configure/configure_spark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # 3 | # Configure Kafka on ec2 instances 4 | # 5 | 6 | import boto3, os, sys 7 | from botocore.exceptions import ClientError as BotoClientError 8 | from time import sleep 9 | sys.path.append("..") 10 | from create_clusters import get_tag, keyfile 11 | 12 | # configuration 13 | my_instances_filters = [{ 'Name': 'instance-state-name', 'Values': ['running']}, {'Name':'tag-value', 'Values':[get_tag('spark-node')]}] 14 | 15 | if __name__=="__main__": 16 | 17 | # find all the host nodes 18 | ec2 = boto3.resource('ec2') 19 | hosts = [] 20 | private_ips = [] 21 | reservations = ec2.instances.filter( Filters = my_instances_filters ) 22 | for instance in reservations: 23 | print("ID: {0:<15}\tIP: {1:<15}".format(instance.instance_id, instance.public_ip_address)) 24 | hosts.append(instance.public_ip_address) 25 | private_ips.append(instance.private_ip_address) 26 | 27 | if len(hosts) != len(private_ips): 28 | raise(RuntimeError("Host and private ips not consistent!")) 29 | 30 | if len(hosts) == 0: 31 | raise(RuntimeError("No hosts found.")) 32 | 33 | # Identify master node 34 | master = hosts[0] 35 | ####################################################################### 36 | # Spark requires passwordless SSH 37 | ####################################################################### 38 | 
cmd_str = [] 39 | 40 | # generate a key on the master 41 | cmd_str.append("ssh -i {0} ubuntu@{1} \"sudo apt-get -y install ssh rsync && ssh-keygen -f ~/.ssh/id_rsa -t rsa -P \'\' \"".format(keyfile, hosts[0])) 42 | 43 | # download public key temporarily 44 | cmd_str.append("scp -i {0} ubuntu@{1}:.ssh/id_rsa.pub {2}".format(keyfile, master, "templates/key.tmp")) 45 | 46 | # auth public key for all hosts 47 | for h in hosts: 48 | cmd_str.append("scp -i {0} {1} ubuntu@{2}:".format(keyfile, "templates/key.tmp", h)) 49 | cmd_str.append("ssh -i {0} ubuntu@{1} \"cat key.tmp >> ~/.ssh/authorized_keys\"".format(keyfile, h)) 50 | 51 | for cmd in cmd_str: 52 | print(cmd) 53 | res=os.system(cmd) 54 | if res!=0: 55 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 56 | 57 | ####################################################################### 58 | # Spark 59 | ####################################################################### 60 | print("Starting Spark configuration...") 61 | for i,h in enumerate(hosts): 62 | cmd_str = [] 63 | with open("templates/spark-env.sh.tmp", "w") as tmpfile: 64 | with open("templates/spark-env.sh","r") as f: 65 | # copy over the template 66 | for l in f: 67 | tmpfile.write(l) 68 | 69 | # advertise host's private IP 70 | tmpfile.write("export SPARK_PUBLIC_DNS={0}\n".format(private_ips[i])) 71 | 72 | # add commands to queue 73 | cmd_str.append("scp -i {0} {1} ubuntu@{2}:".format(keyfile, tmpfile.name, h)) 74 | cmd_str.append("ssh -i {0} ubuntu@{1} sudo mv spark-env.sh.tmp /usr/local/spark/conf/spark-env.sh".format(keyfile, h)) 75 | 76 | # execute the remote commands 77 | for cmd in cmd_str: 78 | print(cmd) 79 | res=os.system(cmd) 80 | if res!=0: 81 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 82 | 83 | # send the slaves file to the master 84 | with open("templates/slaves.tmp", "w") as tmpfile: 85 | for i,h in enumerate(hosts[1:]): 86 | tmpfile.write("{0}\n".format(private_ips[i])) 87 | 88 | # add commands to queue 89 | cmd_str.append("scp -i {0} {1} ubuntu@{2}:".format(keyfile, tmpfile.name, master)) 90 | cmd_str.append("ssh -i {0} ubuntu@{1} sudo mv slaves.tmp /usr/local/spark/conf/slaves".format(keyfile, master)) 91 | 92 | # start spark on the master 93 | cmd_str.append("ssh -i {0} ubuntu@{1} /usr/local/spark/sbin/start-all.sh".format(keyfile, master)) 94 | 95 | for cmd in cmd_str: 96 | print(cmd) 97 | res=os.system(cmd) 98 | if res!=0: 99 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 100 | 101 | 102 | 103 | -------------------------------------------------------------------------------- /aws_config/configure/configure_storm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # 3 | # Configure Kafka on ec2 instances 4 | # 5 | import boto3, os, argparse, sys 6 | sys.path.append("..") 7 | from botocore.exceptions import ClientError as BotoClientError 8 | from time import sleep 9 | from create_clusters import get_tag, keyfile 10 | from config_utils import quiet_wrap 11 | 12 | # configuration 13 | my_instances_filters = [{ 'Name': 'instance-state-name', 'Values': ['running']}, {'Name':'tag-value', 'Values':[get_tag('storm-node')]}] 14 | 15 | if __name__=="__main__": 16 | 17 | # argument help 18 | parser = argparse.ArgumentParser(description='Configure the storm cluster.') 19 | parser.add_argument('--elasticsearch', help='Collocate elasticsearch with Storm cluster.', action='store_true') 
20 | args = parser.parse_args() 21 | 22 | # find all the host nodes 23 | ec2 = boto3.resource('ec2') 24 | hosts = [] 25 | private_ips = [] 26 | reservations = ec2.instances.filter( Filters = my_instances_filters ) 27 | for instance in reservations: 28 | print("ID: {0:<15}\tIP: {1:<15}".format(instance.instance_id, instance.public_ip_address)) 29 | hosts.append(instance.public_ip_address) 30 | private_ips.append(instance.private_ip_address) 31 | 32 | if len(hosts) != len(private_ips): 33 | raise(RuntimeError("Host and private ips not consistent!")) 34 | 35 | if len(hosts) == 0: 36 | raise(RuntimeError("No hosts found.")) 37 | 38 | ####################################################################### 39 | # ZOOKEEPER 40 | ####################################################################### 41 | # just a little hacking to inject some settings into the templates 42 | # TODO: parallelize this to save some boot time 43 | print("Starting zookeeper configuration...") 44 | zooid = 1 45 | for h in hosts: 46 | cmd_str = [] 47 | with open("templates/zoo.cfg.tmp", "w") as tmpfile: 48 | with open("templates/zoo.cfg","r") as f: 49 | # copy over the template 50 | for l in f: 51 | tmpfile.write(l) 52 | 53 | # append the server settings 54 | host_strings= ["server.{0}={1}:2888:3888".format(i+1,private_ips[i]) for i in range(len(hosts))] 55 | for s in host_strings: 56 | tmpfile.write(s + "\n") 57 | cmd_str.append("scp -i {0} {1} ubuntu@{2}:zoo.cfg".format(keyfile, tmpfile.name, h)) 58 | cmd_str.append("ssh -i {0} ubuntu@{1} sudo mv zoo.cfg /etc/zookeeper/conf/zoo.cfg".format(keyfile, h)) 59 | 60 | # Assign the zookeeper ids 61 | cmd_str.append("ssh -i {0} ubuntu@{1} \" echo 'echo {2} > /var/lib/zookeeper/myid' | sudo -s\" ".format(keyfile, h, zooid)) 62 | zooid+=1 63 | 64 | # execute the remote commands 65 | for cmd in cmd_str: 66 | print(cmd) 67 | res=os.system(cmd) 68 | if res!=0: 69 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 70 | 71 | # start each zookeeper 72 | cmd_str = ["ssh -i {0} ubuntu@{1} sudo service zookeeper restart".format(keyfile, h) for h in hosts] 73 | for cmd in cmd_str: 74 | print(cmd) 75 | res=os.system(cmd) 76 | if res!=0: 77 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 78 | 79 | ####################################################################### 80 | # Storm 81 | ####################################################################### 82 | print("Starting Storm configuration...") 83 | for h in hosts: 84 | cmd_str = [] 85 | with open("templates/storm.yaml.tmp", "w") as tmpfile: 86 | with open("templates/storm.yaml.tmp","r") as f: 87 | # copy over the template 88 | for l in f: 89 | tmpfile.write(l) 90 | 91 | # add zookeeper info 92 | tmpfile.write("storm.zookeeper.servers:\n") 93 | host_strings= [" - \"{0}\"\n".format(private_ips[i]) for i in range(len(hosts))] 94 | for v in host_strings: 95 | tmpfile.write(v) 96 | 97 | # declare the master 98 | tmpfile.write("nimbus.host: \"{0}\"\n".format(private_ips[0])) 99 | 100 | # path to stateful info 101 | tmpfile.write("storm.local.dir: \"/usr/local/storm/local_state\"\n") 102 | 103 | # supervisor info 104 | # supervisor.slots.ports: 105 | # - 6700 106 | # - 6701 107 | # etc.. 
108 | tmpfile.write("supervisor.slots.ports:\n") 109 | tmpfile.write("".join([" -{0}\n".format([6700 + i for i in range(len(hosts))])])) 110 | 111 | # add commands to queue 112 | cmd_str.append("scp -i {0} {1} ubuntu@{2}:storm.yaml".format(keyfile, tmpfile.name, h)) 113 | cmd_str.append("ssh -i {0} ubuntu@{1} sudo mv storm.yaml /usr/local/storm/conf/storm.yaml".format(keyfile, h)) 114 | 115 | if h==hosts[0]: 116 | # start nimbus 117 | cmd_str.append("ssh -i {0} ubuntu@{1} \"{2}\"".format(keyfile, h, quiet_wrap("sudo /usr/local/storm/bin/storm nimbus"))) 118 | # web ui 119 | cmd_str.append("ssh -i {0} ubuntu@{1} \"{2}\"".format(keyfile, h, quiet_wrap("sudo /usr/local/storm/bin/storm ui"))) 120 | else: 121 | cmd_str.append("ssh -i {0} ubuntu@{1} \"{2}\"".format(keyfile, h, quiet_wrap("sudo /usr/local/storm/bin/storm supervisor"))) 122 | 123 | # execute the remote commands 124 | for cmd in cmd_str: 125 | print(cmd) 126 | res=os.system(cmd) 127 | if res!=0: 128 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 129 | 130 | # print some info 131 | # TODO: retag master and open its 8080 port. 132 | print("Master: {0}".format(hosts[0])) 133 | print("\n".join(["Worker: "+ h for h in hosts[1:]])) 134 | 135 | if args.elasticsearch == True: 136 | ####################################################################### 137 | # Collocated Elasticsearch 138 | ####################################################################### 139 | 140 | cmd_str = [] 141 | for h in hosts: 142 | cmd_str.append("scp -i {0} {1} ubuntu@{2}:".format(keyfile, "../host_install_scripts/elasticsearch_install.sh", h)) 143 | cmd_str.append("ssh -i {0} ubuntu@{1} sudo ./elasticsearch_install.sh".format(keyfile, h)) 144 | 145 | # execute the remote commands 146 | for cmd in cmd_str: 147 | print(cmd) 148 | res=os.system(cmd) 149 | if res!=0: 150 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 151 | 152 | print("Starting elasticsearch configuration...") 153 | # create a temporary config file 154 | with open("templates/elasticsearch.yml.tmp", "w") as tmpfile: 155 | with open("templates/elasticsearch.yml","r") as f: 156 | # copy over the template 157 | for l in f: 158 | tmpfile.write(l) 159 | 160 | # add cloud credentials 161 | # hack: boto3 doesn't yet offer a way to access the store configuration values 162 | S = boto3._get_default_session() 163 | profile = S._session.full_config['profiles']['default'] 164 | 165 | # add profile information to elasticsearch config to enable cloud discovery 166 | tmpfile.write("cloud.aws.access_key: {0}\n".format(profile['aws_access_key_id'])) 167 | tmpfile.write("cloud.aws.secret_key: {0}\n".format(profile['aws_secret_access_key'])) 168 | tmpfile.write("cloud.aws.region: {0}\n".format(profile['region'])) 169 | tmpfile.write("discovery.type: ec2\n") 170 | tmpfile.write("discovery.ec2.groups: {0}\n".format(get_tag('elasticsearch-security-group'))) 171 | #tmpfile.write("discovery.ec2.host_type: public_ip\n") 172 | tmpfile.write("cluster.name: {0}\n".format(get_tag('elasticsearch-cluster'))) 173 | 174 | # build the command queue 175 | cmd_str = [] 176 | for h in hosts: 177 | # add commands to queue 178 | cmd_str.append("scp -i {0} {1} ubuntu@{2}:elasticsearch.yml".format(keyfile, tmpfile.name, h)) 179 | cmd_str.append("ssh -i {0} ubuntu@{1} sudo mv elasticsearch.yml /etc/elasticsearch/elasticsearch.yml".format(keyfile, h)) 180 | 181 | # start each node 182 | cmd_str.extend(["ssh -i {0} ubuntu@{1} \"sudo service 
elasticsearch start\"".format(keyfile, h) for h in hosts]) 183 | 184 | # execute the remote commands 185 | for cmd in cmd_str: 186 | print(cmd) 187 | res=os.system(cmd) 188 | if res!=0: 189 | raise(RuntimeError("Something went wrong executing {0} Got exit: {1}".format(cmd, res))) 190 | 191 | 192 | -------------------------------------------------------------------------------- /aws_config/configure/templates/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | ##################### Elasticsearch Configuration Example ##################### 2 | 3 | # This file contains an overview of various configuration settings, 4 | # targeted at operations staff. Application developers should 5 | # consult the guide at . 6 | # 7 | # The installation procedure is covered at 8 | # . 9 | # 10 | # Elasticsearch comes with reasonable defaults for most settings, 11 | # so you can try it out without bothering with configuration. 12 | # 13 | # Most of the time, these defaults are just fine for running a production 14 | # cluster. If you're fine-tuning your cluster, or wondering about the 15 | # effect of certain configuration option, please _do ask_ on the 16 | # mailing list or IRC channel [http://elasticsearch.org/community]. 17 | 18 | # Any element in the configuration can be replaced with environment variables 19 | # by placing them in ${...} notation. For example: 20 | # 21 | #node.rack: ${RACK_ENV_VAR} 22 | 23 | # For information on supported formats and syntax for the config file, see 24 | # 25 | 26 | 27 | ################################### Cluster ################################### 28 | 29 | # Cluster name identifies your cluster for auto-discovery. If you're running 30 | # multiple clusters on the same network, make sure you're using unique names. 31 | # 32 | #cluster.name: elasticsearch 33 | 34 | 35 | #################################### Node ##################################### 36 | 37 | # Node names are generated dynamically on startup, so you're relieved 38 | # from configuring them manually. You can tie this node to a specific name: 39 | # 40 | #node.name: "Franz Kafka" 41 | 42 | # Every node can be configured to allow or deny being eligible as the master, 43 | # and to allow or deny to store the data. 44 | # 45 | # Allow this node to be eligible as a master node (enabled by default): 46 | # 47 | #node.master: true 48 | # 49 | # Allow this node to store data (enabled by default): 50 | # 51 | #node.data: true 52 | 53 | # You can exploit these settings to design advanced cluster topologies. 54 | # 55 | # 1. You want this node to never become a master node, only to hold data. 56 | # This will be the "workhorse" of your cluster. 57 | # 58 | #node.master: false 59 | #node.data: true 60 | # 61 | # 2. You want this node to only serve as a master: to not store any data and 62 | # to have free resources. This will be the "coordinator" of your cluster. 63 | # 64 | #node.master: true 65 | #node.data: false 66 | # 67 | # 3. You want this node to be neither master nor data node, but 68 | # to act as a "search load balancer" (fetching data from nodes, 69 | # aggregating results, etc.) 70 | # 71 | #node.master: false 72 | #node.data: false 73 | 74 | # Use the Cluster Health API [http://localhost:9200/_cluster/health], the 75 | # Node Info API [http://localhost:9200/_nodes] or GUI tools 76 | # such as , 77 | # , 78 | # and 79 | # to inspect the cluster state. 
80 | 81 | # A node can have generic attributes associated with it, which can later be used 82 | # for customized shard allocation filtering, or allocation awareness. An attribute 83 | # is a simple key value pair, similar to node.key: value, here is an example: 84 | # 85 | #node.rack: rack314 86 | 87 | # By default, multiple nodes are allowed to start from the same installation location 88 | # to disable it, set the following: 89 | #node.max_local_storage_nodes: 1 90 | 91 | 92 | #################################### Index #################################### 93 | 94 | # You can set a number of options (such as shard/replica options, mapping 95 | # or analyzer definitions, translog settings, ...) for indices globally, 96 | # in this file. 97 | # 98 | # Note, that it makes more sense to configure index settings specifically for 99 | # a certain index, either when creating it or by using the index templates API. 100 | # 101 | # See and 102 | # 103 | # for more information. 104 | 105 | # Set the number of shards (splits) of an index (5 by default): 106 | # 107 | #index.number_of_shards: 5 108 | 109 | # Set the number of replicas (additional copies) of an index (1 by default): 110 | # 111 | #index.number_of_replicas: 1 112 | 113 | # Note, that for development on a local machine, with small indices, it usually 114 | # makes sense to "disable" the distributed features: 115 | # 116 | #index.number_of_shards: 1 117 | #index.number_of_replicas: 0 118 | 119 | # These settings directly affect the performance of index and search operations 120 | # in your cluster. Assuming you have enough machines to hold shards and 121 | # replicas, the rule of thumb is: 122 | # 123 | # 1. Having more *shards* enhances the _indexing_ performance and allows to 124 | # _distribute_ a big index across machines. 125 | # 2. Having more *replicas* enhances the _search_ performance and improves the 126 | # cluster _availability_. 127 | # 128 | # The "number_of_shards" is a one-time setting for an index. 129 | # 130 | # The "number_of_replicas" can be increased or decreased anytime, 131 | # by using the Index Update Settings API. 132 | # 133 | # Elasticsearch takes care about load balancing, relocating, gathering the 134 | # results from nodes, etc. Experiment with different settings to fine-tune 135 | # your setup. 136 | 137 | # Use the Index Status API () to inspect 138 | # the index status. 139 | 140 | 141 | #################################### Paths #################################### 142 | 143 | # Path to directory containing configuration (this file and logging.yml): 144 | # 145 | #path.conf: /path/to/conf 146 | 147 | # Path to directory where to store index data allocated for this node. 148 | # 149 | #path.data: /path/to/data 150 | # 151 | # Can optionally include more than one location, causing data to be striped across 152 | # the locations (a la RAID 0) on a file level, favouring locations with most free 153 | # space on creation. For example: 154 | # 155 | #path.data: /path/to/data1,/path/to/data2 156 | 157 | # Path to temporary files: 158 | # 159 | #path.work: /path/to/work 160 | 161 | # Path to log files: 162 | # 163 | #path.logs: /path/to/logs 164 | 165 | # Path to where plugins are installed: 166 | # 167 | #path.plugins: /path/to/plugins 168 | 169 | 170 | #################################### Plugin ################################### 171 | 172 | # If a plugin listed here is not installed for current node, the node will not start. 
173 | # 174 | #plugin.mandatory: mapper-attachments,lang-groovy 175 | 176 | 177 | ################################### Memory #################################### 178 | 179 | # Elasticsearch performs poorly when JVM starts swapping: you should ensure that 180 | # it _never_ swaps. 181 | # 182 | # Set this property to true to lock the memory: 183 | # 184 | #bootstrap.mlockall: true 185 | 186 | # Make sure that the ES_MIN_MEM and ES_MAX_MEM environment variables are set 187 | # to the same value, and that the machine has enough memory to allocate 188 | # for Elasticsearch, leaving enough memory for the operating system itself. 189 | # 190 | # You should also make sure that the Elasticsearch process is allowed to lock 191 | # the memory, eg. by using `ulimit -l unlimited`. 192 | 193 | 194 | ############################## Network And HTTP ############################### 195 | 196 | # Elasticsearch, by default, binds itself to the 0.0.0.0 address, and listens 197 | # on port [9200-9300] for HTTP traffic and on port [9300-9400] for node-to-node 198 | # communication. (the range means that if the port is busy, it will automatically 199 | # try the next port). 200 | 201 | # Set the bind address specifically (IPv4 or IPv6): 202 | # 203 | #network.bind_host: 192.168.0.1 204 | 205 | # Set the address other nodes will use to communicate with this node. If not 206 | # set, it is automatically derived. It must point to an actual IP address. 207 | # 208 | #network.publish_host: 192.168.0.1 209 | 210 | # Set both 'bind_host' and 'publish_host': 211 | # 212 | #network.host: 192.168.0.1 213 | 214 | # Set a custom port for the node to node communication (9300 by default): 215 | # 216 | #transport.tcp.port: 9300 217 | 218 | # Enable compression for all communication between nodes (disabled by default): 219 | # 220 | #transport.tcp.compress: true 221 | 222 | # Set a custom port to listen for HTTP traffic: 223 | # 224 | #http.port: 9200 225 | 226 | # Set a custom allowed content length: 227 | # 228 | #http.max_content_length: 100mb 229 | 230 | # Disable HTTP completely: 231 | # 232 | #http.enabled: false 233 | 234 | 235 | ################################### Gateway ################################### 236 | 237 | # The gateway allows for persisting the cluster state between full cluster 238 | # restarts. Every change to the state (such as adding an index) will be stored 239 | # in the gateway, and when the cluster starts up for the first time, 240 | # it will read its state from the gateway. 241 | 242 | # There are several types of gateway implementations. For more information, see 243 | # . 244 | 245 | # The default gateway type is the "local" gateway (recommended): 246 | # 247 | #gateway.type: local 248 | 249 | # Settings below control how and when to start the initial recovery process on 250 | # a full cluster restart (to reuse as much local data as possible when using shared 251 | # gateway). 252 | 253 | # Allow recovery process after N nodes in a cluster are up: 254 | # 255 | #gateway.recover_after_nodes: 1 256 | 257 | # Set the timeout to initiate the recovery process, once the N nodes 258 | # from previous setting are up (accepts time value): 259 | # 260 | #gateway.recover_after_time: 5m 261 | 262 | # Set how many nodes are expected in this cluster. 
Once these N nodes 263 | # are up (and recover_after_nodes is met), begin recovery process immediately 264 | # (without waiting for recover_after_time to expire): 265 | # 266 | #gateway.expected_nodes: 2 267 | 268 | 269 | ############################# Recovery Throttling ############################# 270 | 271 | # These settings allow to control the process of shards allocation between 272 | # nodes during initial recovery, replica allocation, rebalancing, 273 | # or when adding and removing nodes. 274 | 275 | # Set the number of concurrent recoveries happening on a node: 276 | # 277 | # 1. During the initial recovery 278 | # 279 | #cluster.routing.allocation.node_initial_primaries_recoveries: 4 280 | # 281 | # 2. During adding/removing nodes, rebalancing, etc 282 | # 283 | #cluster.routing.allocation.node_concurrent_recoveries: 2 284 | 285 | # Set to throttle throughput when recovering (eg. 100mb, by default 20mb): 286 | # 287 | #indices.recovery.max_bytes_per_sec: 20mb 288 | 289 | # Set to limit the number of open concurrent streams when 290 | # recovering a shard from a peer: 291 | # 292 | #indices.recovery.concurrent_streams: 5 293 | 294 | 295 | ################################## Discovery ################################## 296 | 297 | # Discovery infrastructure ensures nodes can be found within a cluster 298 | # and master node is elected. Multicast discovery is the default. 299 | 300 | # Set to ensure a node sees N other master eligible nodes to be considered 301 | # operational within the cluster. This should be set to a quorum/majority of 302 | # the master-eligible nodes in the cluster. 303 | # 304 | #discovery.zen.minimum_master_nodes: 1 305 | 306 | # Set the time to wait for ping responses from other nodes when discovering. 307 | # Set this option to a higher value on a slow or congested network 308 | # to minimize discovery failures: 309 | # 310 | #discovery.zen.ping.timeout: 3s 311 | 312 | # For more information, see 313 | # 314 | 315 | # Unicast discovery allows to explicitly control which nodes will be used 316 | # to discover the cluster. It can be used when multicast is not present, 317 | # or to restrict the cluster communication-wise. 318 | # 319 | # 1. Disable multicast discovery (enabled by default): 320 | # 321 | #discovery.zen.ping.multicast.enabled: false 322 | # 323 | # 2. Configure an initial list of master nodes in the cluster 324 | # to perform discovery when new nodes (master or data) are started: 325 | # 326 | #discovery.zen.ping.unicast.hosts: ["host1", "host2:port"] 327 | 328 | # EC2 discovery allows to use AWS EC2 API in order to perform discovery. 329 | # 330 | # You have to install the cloud-aws plugin for enabling the EC2 discovery. 331 | # 332 | # For more information, see 333 | # 334 | # 335 | # See 336 | # for a step-by-step tutorial. 337 | 338 | # GCE discovery allows to use Google Compute Engine API in order to perform discovery. 339 | # 340 | # You have to install the cloud-gce plugin for enabling the GCE discovery. 341 | # 342 | # For more information, see . 343 | 344 | # Azure discovery allows to use Azure API in order to perform discovery. 345 | # 346 | # You have to install the cloud-azure plugin for enabling the Azure discovery. 347 | # 348 | # For more information, see . 349 | 350 | ################################## Slow Log ################################## 351 | 352 | # Shard level query and fetch threshold logging. 
353 | 354 | #index.search.slowlog.threshold.query.warn: 10s 355 | #index.search.slowlog.threshold.query.info: 5s 356 | #index.search.slowlog.threshold.query.debug: 2s 357 | #index.search.slowlog.threshold.query.trace: 500ms 358 | 359 | #index.search.slowlog.threshold.fetch.warn: 1s 360 | #index.search.slowlog.threshold.fetch.info: 800ms 361 | #index.search.slowlog.threshold.fetch.debug: 500ms 362 | #index.search.slowlog.threshold.fetch.trace: 200ms 363 | 364 | #index.indexing.slowlog.threshold.index.warn: 10s 365 | #index.indexing.slowlog.threshold.index.info: 5s 366 | #index.indexing.slowlog.threshold.index.debug: 2s 367 | #index.indexing.slowlog.threshold.index.trace: 500ms 368 | 369 | ################################## GC Logging ################################ 370 | 371 | #monitor.jvm.gc.young.warn: 1000ms 372 | #monitor.jvm.gc.young.info: 700ms 373 | #monitor.jvm.gc.young.debug: 400ms 374 | 375 | #monitor.jvm.gc.old.warn: 10s 376 | #monitor.jvm.gc.old.info: 5s 377 | #monitor.jvm.gc.old.debug: 2s 378 | 379 | ################################## Security ################################ 380 | 381 | # Uncomment if you want to enable JSONP as a valid return transport on the 382 | # http server. With this enabled, it may pose a security risk, so disabling 383 | # it unless you need it is recommended (it is disabled by default). 384 | # 385 | #http.jsonp.enable: true 386 | 387 | 388 | 389 | 390 | -------------------------------------------------------------------------------- /aws_config/configure/templates/kafka-server-start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | export JMX_PORT=${JMX_PORT:-9999} 19 | 20 | if [ $# -lt 1 ]; 21 | then 22 | echo "USAGE: $0 [-daemon] server.properties" 23 | exit 1 24 | fi 25 | base_dir=$(dirname $0) 26 | 27 | if [ "x$KAFKA_LOG4J_OPTS" = "x" ]; then 28 | export KAFKA_LOG4J_OPTS="-Dlog4j.configuration=file:$base_dir/../config/log4j.properties" 29 | fi 30 | 31 | if [ "x$KAFKA_HEAP_OPTS" = "x" ]; then 32 | export KAFKA_HEAP_OPTS="-Xmx1G -Xms1G" 33 | fi 34 | 35 | EXTRA_ARGS="-name kafkaServer -loggc" 36 | 37 | COMMAND=$1 38 | case $COMMAND in 39 | -daemon) 40 | EXTRA_ARGS="-daemon "$EXTRA_ARGS 41 | shift 42 | ;; 43 | *) 44 | ;; 45 | esac 46 | 47 | exec $base_dir/kafka-run-class.sh $EXTRA_ARGS kafka.Kafka $@ 48 | -------------------------------------------------------------------------------- /aws_config/configure/templates/kafka.server.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. 
See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # see kafka.server.KafkaConfig for additional details and defaults 16 | 17 | ############################# Server Basics ############################# 18 | # SEE END OF FILE 19 | # The id of the broker. This must be set to a unique integer for each broker. 20 | # broker.id=0 21 | 22 | ############################# Socket Server Settings ############################# 23 | 24 | # The port the socket server listens on 25 | port=9092 26 | 27 | # Hostname the broker will bind to. If not set, the server will bind to all interfaces 28 | #host.name=localhost 29 | 30 | # Hostname the broker will advertise to producers and consumers. If not set, it uses the 31 | # value for "host.name" if configured. Otherwise, it will use the value returned from 32 | # java.net.InetAddress.getCanonicalHostName(). 33 | #advertised.host.name= 34 | 35 | # The port to publish to ZooKeeper for clients to use. If this is not set, 36 | # it will publish the same port that the broker binds to. 37 | #advertised.port= 38 | 39 | # The number of threads handling network requests 40 | num.network.threads=3 41 | 42 | # The number of threads doing disk I/O 43 | num.io.threads=8 44 | 45 | # The send buffer (SO_SNDBUF) used by the socket server 46 | socket.send.buffer.bytes=102400 47 | 48 | # The receive buffer (SO_RCVBUF) used by the socket server 49 | socket.receive.buffer.bytes=102400 50 | 51 | # The maximum size of a request that the socket server will accept (protection against OOM) 52 | socket.request.max.bytes=104857600 53 | 54 | 55 | ############################# Log Basics ############################# 56 | 57 | # A comma seperated list of directories under which to store log files 58 | log.dirs=/tmp/kafka-logs 59 | 60 | # The default number of log partitions per topic. More partitions allow greater 61 | # parallelism for consumption, but this will also result in more files across 62 | # the brokers. 63 | num.partitions=1 64 | 65 | # The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. 66 | # This value is recommended to be increased for installations with data dirs located in RAID array. 67 | num.recovery.threads.per.data.dir=1 68 | 69 | ############################# Log Flush Policy ############################# 70 | 71 | # Messages are immediately written to the filesystem but by default we only fsync() to sync 72 | # the OS cache lazily. The following configurations control the flush of data to disk. 73 | # There are a few important trade-offs here: 74 | # 1. Durability: Unflushed data may be lost if you are not using replication. 75 | # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. 76 | # 3. 
Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. 77 | # The settings below allow one to configure the flush policy to flush data after a period of time or 78 | # every N messages (or both). This can be done globally and overridden on a per-topic basis. 79 | 80 | # The number of messages to accept before forcing a flush of data to disk 81 | #log.flush.interval.messages=10000 82 | 83 | # The maximum amount of time a message can sit in a log before we force a flush 84 | log.flush.interval.ms=500 85 | 86 | ############################# Log Retention Policy ############################# 87 | 88 | # The following configurations control the disposal of log segments. The policy can 89 | # be set to delete segments after a period of time, or after a given size has accumulated. 90 | # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens 91 | # from the end of the log. 92 | 93 | # The minimum age of a log file to be eligible for deletion 94 | log.retention.hours=1 95 | 96 | # A size-based retention policy for logs. Segments are pruned from the log as long as the remaining 97 | # segments don't drop below log.retention.bytes. 98 | #log.retention.bytes=1073741824 99 | 100 | # The maximum size of a log segment file. When this size is reached a new log segment will be created. 101 | log.segment.bytes=1073741824 102 | 103 | # The interval at which log segments are checked to see if they can be deleted according 104 | # to the retention policies 105 | log.retention.check.interval.ms=300000 106 | 107 | # By default the log cleaner is disabled and the log retention policy will default to just delete segments after their retention expires. 108 | # If log.cleaner.enable=true is set the cleaner will be enabled and individual logs can then be marked for log compaction. 109 | log.cleaner.enable=false 110 | 111 | ############################# Zookeeper ############################# 112 | 113 | # Zookeeper connection string (see zookeeper docs for details). 114 | # This is a comma separated host:port pairs, each corresponding to a zk 115 | # server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002". 116 | # You can also append an optional chroot string to the urls to specify the 117 | # root directory for all kafka znodes. 118 | 119 | # Timeout in ms for connecting to zookeeper 120 | zookeeper.connection.timeout.ms=6000 121 | 122 | # 123 | # INJECTED CONFIGURATION [SEE ABOVE FOR PARAMETER DETAILS] 124 | # 125 | 126 | 127 | -------------------------------------------------------------------------------- /aws_config/configure/templates/spark-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This file is sourced when running various Spark programs. 4 | # Copy it as spark-env.sh and edit that to configure Spark for your site. 
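# As an illustration only (these values are not part of the template and would
# need to be tuned per site), a customised copy might pin the bind address and
# worker memory:
#   export SPARK_LOCAL_IP=10.0.0.12
#   export SPARK_MASTER_IP=10.0.0.12
#   export SPARK_WORKER_MEMORY=4g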
5 | 6 | # Options read when launching programs locally with 7 | # ./bin/run-example or ./bin/spark-submit 8 | # - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files 9 | # - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node 10 | # - SPARK_PUBLIC_DNS, to set the public dns name of the driver program 11 | # - SPARK_CLASSPATH, default classpath entries to append 12 | 13 | # Options read by executors and drivers running inside the cluster 14 | # - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node 15 | # - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program 16 | # - SPARK_CLASSPATH, default classpath entries to append 17 | # - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data 18 | # - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos 19 | 20 | # Options read in YARN client mode 21 | # - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files 22 | # - SPARK_EXECUTOR_INSTANCES, Number of workers to start (Default: 2) 23 | # - SPARK_EXECUTOR_CORES, Number of cores for the workers (Default: 1). 24 | # - SPARK_EXECUTOR_MEMORY, Memory per Worker (e.g. 1000M, 2G) (Default: 1G) 25 | # - SPARK_DRIVER_MEMORY, Memory for Master (e.g. 1000M, 2G) (Default: 512 Mb) 26 | # - SPARK_YARN_APP_NAME, The name of your application (Default: Spark) 27 | # - SPARK_YARN_QUEUE, The hadoop queue to use for allocation requests (Default: ‘default’) 28 | # - SPARK_YARN_DIST_FILES, Comma separated list of files to be distributed with the job. 29 | # - SPARK_YARN_DIST_ARCHIVES, Comma separated list of archives to be distributed with the job. 30 | 31 | # Options for the daemons used in the standalone deploy mode 32 | # - SPARK_MASTER_IP, to bind the master to a different IP address or hostname 33 | # - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master 34 | # - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y") 35 | # - SPARK_WORKER_CORES, to set the number of cores to use on this machine 36 | # - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g) 37 | # - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker 38 | # - SPARK_WORKER_INSTANCES, to set the number of worker processes per node 39 | # - SPARK_WORKER_DIR, to set the working directory of worker processes 40 | # - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y") 41 | # - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y") 42 | # - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y") 43 | # - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y") 44 | # - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers 45 | 46 | # Generic options for the daemons used in the standalone deploy mode 47 | # - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf) 48 | # - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs) 49 | # - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp) 50 | # - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER) 51 | # - SPARK_NICENESS The scheduling priority for daemons. 
(Default: 0) 52 | 53 | export JAVA_HOME=/usr 54 | export SPARK_WORKER_CORES=$(echo $(nproc)*3 | bc) 55 | -------------------------------------------------------------------------------- /aws_config/configure/templates/storm.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | ########### These MUST be filled in for a storm configuration 18 | # storm.zookeeper.servers: 19 | # - "server1" 20 | # - "server2" 21 | # 22 | # nimbus.host: "nimbus" 23 | # 24 | # 25 | # ##### These may optionally be filled in: 26 | # 27 | ## List of custom serializations 28 | # topology.kryo.register: 29 | # - org.mycompany.MyType 30 | # - org.mycompany.MyType2: org.mycompany.MyType2Serializer 31 | # 32 | ## List of custom kryo decorators 33 | # topology.kryo.decorators: 34 | # - org.mycompany.MyDecorator 35 | # 36 | ## Locations of the drpc servers 37 | # drpc.servers: 38 | # - "server1" 39 | # - "server2" 40 | 41 | ## Metrics Consumers 42 | # topology.metrics.consumer.register: 43 | # - class: "backtype.storm.metric.LoggingMetricsConsumer" 44 | # parallelism.hint: 1 45 | # - class: "org.mycompany.MyMetricsConsumer" 46 | # parallelism.hint: 1 47 | # argument: 48 | # - endpoint: "metrics-collector.mycompany.org" 49 | -------------------------------------------------------------------------------- /aws_config/configure/templates/zoo.cfg: -------------------------------------------------------------------------------- 1 | # http://hadoop.apache.org/zookeeper/docs/current/zookeeperAdmin.html 2 | 3 | # The number of milliseconds of each tick 4 | tickTime=2000 5 | # The number of ticks that the initial 6 | # synchronization phase can take 7 | initLimit=10 8 | # The number of ticks that can pass between 9 | # sending a request and getting an acknowledgement 10 | syncLimit=5 11 | # the directory where the snapshot is stored. 12 | dataDir=/var/lib/zookeeper 13 | # Place the dataLogDir to a separate physical disc for better performance 14 | # dataLogDir=/disk2/zookeeper 15 | 16 | # the port at which the clients will connect 17 | clientPort=2181 18 | 19 | # To avoid seeks ZooKeeper allocates space in the transaction log file in 20 | # blocks of preAllocSize kilobytes. The default block size is 64M. One reason 21 | # for changing the size of the blocks is to reduce the block size if snapshots 22 | # are taken more often. (Also, see snapCount). 23 | #preAllocSize=65536 24 | 25 | # Clients can submit requests faster than ZooKeeper can process them, 26 | # especially if there are a lot of clients. 
To prevent ZooKeeper from running 27 | # out of memory due to queued requests, ZooKeeper will throttle clients so that 28 | # there is no more than globalOutstandingLimit outstanding requests in the 29 | # system. The default limit is 1,000.ZooKeeper logs transactions to a 30 | # transaction log. After snapCount transactions are written to a log file a 31 | # snapshot is started and a new transaction log file is started. The default 32 | # snapCount is 10,000. 33 | #snapCount=1000 34 | 35 | # If this option is defined, requests will be will logged to a trace file named 36 | # traceFile.year.month.day. 37 | #traceFile= 38 | 39 | # Leader accepts client connections. Default value is "yes". The leader machine 40 | # coordinates updates. For higher update throughput at thes slight expense of 41 | # read throughput the leader can be configured to not accept clients and focus 42 | # on coordination. 43 | #leaderServes=yes 44 | 45 | # specify all zookeeper servers 46 | # The fist port is used by followers to connect to the leader 47 | # The second one is used for leader election 48 | 49 | 50 | -------------------------------------------------------------------------------- /aws_config/create_clusters.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # 3 | # Create the resources for Straw cluster on AWS 4 | # 5 | # RUN aws configure prior to executing this script. 6 | # 7 | # 8 | import boto3, os, argparse 9 | from time import sleep 10 | 11 | ############################# 12 | # CONFIG 13 | ############################# 14 | try: 15 | keyfile = os.environ["AWS_PEM_FILE"] 16 | pemkey=os.environ["PEM_KEY"] 17 | tag_prefix = os.environ["TAG_PREFIX"] 18 | except KeyError as e: 19 | print("Can't find PEM and/or tag ENV variable. You must export values for AWS_PEM FILE and TAG_PREFIX.") 20 | raise e 21 | 22 | # network settings -- only single subnet right now 23 | vpc_cidr = "10.0.0.0/27" 24 | subnet_cidr = "10.0.0.0/27" 25 | 26 | # node settings 27 | kafka_instances=5 28 | elasticsearch_instances=3 29 | storm_instances=7 30 | 31 | # initializtion files 32 | path = "host_install_scripts" 33 | kafka_initfile = os.path.join(path, "kafka_install.sh") 34 | elasticsearch_initfile = os.path.join(path, "elasticsearch_install.sh") 35 | storm_initfile = os.path.join(path, "storm_install.sh") 36 | flask_initfile = os.path.join(path, "flask_install.sh") 37 | spark_initfile = os.path.join(path, "spark_install.sh") 38 | 39 | # base AWS settings 40 | base_aws_image = 'ami-5189a661' 41 | 42 | # services 43 | services = ['kafka', 'elasticsearch', 'storm', 'flask', 'spark'] 44 | 45 | ############################### 46 | # helper methods 47 | def get_tag(name): 48 | # all service tags will be prefixed with the "tag_prefix" value 49 | return (tag_prefix + "-" + name) 50 | 51 | 52 | ############################### 53 | if __name__=="__main__": 54 | 55 | # argument help 56 | parser = argparse.ArgumentParser(description='Launch AWS EC2 instances for the straw cluster.') 57 | parser.add_argument('service', help='Name of service to start one of {0}. 
Specify \'all\' to launch all services.'.format(services)) 58 | args = parser.parse_args() 59 | 60 | # boto3 api 61 | ec2 = boto3.resource('ec2') 62 | 63 | 64 | ############################################################ 65 | # 66 | # NETWORKING -- common to all services 67 | # 68 | ############################################################ 69 | # check if vpc already exists 70 | vpcid = None 71 | for v in ec2.vpcs.filter(Filters=[{'Name':'tag-value','Values':[get_tag('vpc')]}]): 72 | vpcid = v.id 73 | 74 | # create the vpc 75 | if vpcid is None: 76 | my_vpc = ec2.create_vpc(CidrBlock=vpc_cidr) 77 | vpc = ec2.Vpc(my_vpc.id) 78 | vpc.modify_attribute(VpcId=my_vpc.id, EnableDnsSupport={'Value':True}) 79 | vpc.modify_attribute(VpcId=my_vpc.id, EnableDnsHostnames={'Value':True}) 80 | vpc.create_tags(Tags=[{'Key':'Name', 'Value':get_tag('vpc')}]) 81 | else: 82 | vpc = ec2.Vpc(vpcid) 83 | 84 | # 85 | # Create a single subnet in vpc 86 | # 87 | # subnets 88 | subnetid = None 89 | for v in vpc.subnets.filter(Filters=[{'Name':'tag-value','Values':[get_tag('subnet')]}]): 90 | subnetid = v.id 91 | if subnetid is None: 92 | subnet = vpc.create_subnet(CidrBlock=subnet_cidr) 93 | subnet.create_tags(Tags=[{'Key':'Name', 'Value':get_tag('subnet')}]) 94 | else: 95 | subnet = ec2.Subnet(subnetid) 96 | 97 | # Find the gateway id; gateway is automatically created with the subnet? 98 | gatewayid = None 99 | for v in vpc.internet_gateways.filter(Filters=[{'Name':'tag-value','Values':[get_tag('gateway')]}]): 100 | gatewayid = v.id 101 | if gatewayid is None: 102 | gateway = ec2.create_internet_gateway() 103 | gateway.create_tags(Tags=[{'Key':'Name', 'Value':get_tag('gateway')}]) 104 | gateway.attach_to_vpc(VpcId=vpc.id) 105 | 106 | # 107 | # Create a route table 108 | # 109 | rtid = None 110 | for v in vpc.route_tables.filter(Filters=[{'Name':'tag-value','Values':[get_tag('route_table')]}]): 111 | rtid = v.id 112 | break 113 | if rtid is None: 114 | rt = ec2.create_route_table(VpcId=vpc.id) 115 | rt.associate_with_subnet(SubnetId=subnet.id) 116 | rt.create_route(GatewayId=gateway.id, DestinationCidrBlock='0.0.0.0/0') 117 | rt.create_tags(Tags=[{'Key':'Name', 'Value':get_tag('route_table')}]) 118 | 119 | # 120 | # Create a security group -- just one for the vpc right now. 121 | # 122 | tag = get_tag('security-group') 123 | description = 'A security group for kafka clusters.' 
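    # The IpPermissions below are applied to both ingress and egress: they allow
    # every TCP port between hosts in 10.0.0.0/16 (a superset of the 10.0.0.0/27
    # VPC CIDR defined above) plus SSH on port 22 from anywhere for administration.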
124 | sgid = None 125 | for v in vpc.security_groups.filter(Filters=[{'Name':'group-name','Values':[tag]}]): 126 | sgid = v.id 127 | if sgid is None: 128 | security_group = ec2.create_security_group(GroupName=tag, Description=description, VpcId=vpc.id) 129 | # permissions 130 | IpPermissions=[ 131 | { 132 | 'IpProtocol': 'tcp', 133 | 'FromPort': 0, 134 | 'ToPort': 65535, 135 | 'IpRanges': [ 136 | { 137 | 'CidrIp': '10.0.0.0/16' 138 | }, 139 | ], 140 | }, 141 | { 142 | 'IpProtocol': 'tcp', 143 | 'FromPort': 22, 144 | 'ToPort': 22, 145 | 'IpRanges': [ 146 | { 147 | 'CidrIp': '0.0.0.0/0' 148 | }, 149 | ], 150 | } 151 | ] 152 | security_group.authorize_egress(IpPermissions=IpPermissions) 153 | security_group.authorize_ingress(IpPermissions=IpPermissions) 154 | else: 155 | security_group = ec2.SecurityGroup(sgid) 156 | 157 | ################################################################ 158 | # 159 | # Services 160 | # 161 | ################################################################ 162 | 163 | if args.service.lower() in ['all','kafka']: 164 | ######################################### 165 | # KAFKA CLUSTER 166 | ######################################### 167 | print("Creating a Kafka cluster...") 168 | # 169 | # EC2 Instances 170 | # 171 | shellcodefile=os.path.abspath(kafka_initfile) 172 | shellfile = open(shellcodefile,'r').read() 173 | pemfile =os.path.abspath(keyfile) 174 | instances = ec2.create_instances( 175 | MinCount=kafka_instances, 176 | MaxCount=kafka_instances, 177 | UserData=shellfile, 178 | KeyName=pemkey, 179 | ImageId=base_aws_image, 180 | InstanceType='m4.large', 181 | NetworkInterfaces=[{'SubnetId': subnet.id, 'DeviceIndex':0, 'Groups':[security_group.id], 'AssociatePublicIpAddress':True}], 182 | BlockDeviceMappings=[ 183 | { 184 | 'VirtualName': 'ephemeral0', 185 | 'DeviceName': '/dev/sda1', 186 | 'Ebs': { 187 | 'VolumeSize': 128, 188 | 'VolumeType': 'gp2' # standard for magnetic, gp2 for SSD 189 | } 190 | } 191 | ] 192 | ) 193 | 194 | # tag instances and assign a public ip 195 | tag='kafka-node' 196 | print("Sleep 60 seconds to give instances time to configure...") 197 | sleep(60) 198 | for v in instances: 199 | v.create_tags(Tags=[{'Key':'Name', 'Value':get_tag(tag)}]) 200 | # elastic ip assignment 201 | #address = client.allocate_address() 202 | #client.associate_address(InstanceId=v.instance_id, PublicIp=address['PublicIp']) 203 | print("SERVICE: {0:<15}\tID: {1:<15}\tIP: {2:<15}\tDNS: {3:<15}".format(tag, v.instance_id, v.public_ip_address, v.public_dns_name)) 204 | 205 | if args.service.lower() in ['all', 'elasticsearch']: 206 | ######################################### 207 | # ELASTICSEARCH CLUSTER 208 | ######################################### 209 | print("Creating an Elasticsearch cluster...") 210 | # 211 | # Create a security group for elasticsearch 212 | # world access to 9200,9300 should modify for production 213 | # 214 | sgid = None 215 | tag = get_tag('elasticsearch-security-group') 216 | description = 'A security group for elasticsearch clusters.' 
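        # Besides the intra-VPC and SSH rules used for the other services, this
        # group opens 9200 (the HTTP/REST API) and 9300 (the transport port that
        # matches elasticsearch_port=9300 in config.properties) to 0.0.0.0/0;
        # convenient for a demo, but worth restricting in production.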
217 | for v in ec2.security_groups.filter(Filters=[{'Name':'group-name','Values':[tag]}]): 218 | sgid = v.id 219 | if sgid is None: 220 | security_group = ec2.create_security_group(GroupName=tag, Description=description, VpcId=vpc.id) 221 | 222 | # permissions 223 | IpPermissions=[ 224 | { 225 | 'IpProtocol': 'tcp', 226 | 'FromPort': 0, 227 | 'ToPort': 65535, 228 | 'IpRanges': [ 229 | { 230 | 'CidrIp': '10.0.0.0/16' 231 | }, 232 | ], 233 | }, 234 | { 235 | 'IpProtocol': 'tcp', 236 | 'FromPort': 22, 237 | 'ToPort': 22, 238 | 'IpRanges': [ 239 | { 240 | 'CidrIp': '0.0.0.0/0' 241 | } 242 | ] 243 | }, 244 | { 245 | 'IpProtocol': 'tcp', 246 | 'FromPort': 9200, 247 | 'ToPort': 9200, 248 | 'IpRanges': [ 249 | { 250 | 'CidrIp': '0.0.0.0/0' 251 | } 252 | ] 253 | }, 254 | { 255 | 'IpProtocol': 'tcp', 256 | 'FromPort': 9300, 257 | 'ToPort': 9300, 258 | 'IpRanges': [ 259 | { 260 | 'CidrIp': '0.0.0.0/0' 261 | } 262 | ] 263 | } 264 | ] 265 | security_group.authorize_egress(IpPermissions=IpPermissions) 266 | security_group.authorize_ingress(IpPermissions=IpPermissions) 267 | else: 268 | security_group = ec2.SecurityGroup(sgid) 269 | 270 | # 271 | # EC2 Instances 272 | # 273 | shellcodefile=os.path.abspath(elasticsearch_initfile) 274 | shellfile = open(shellcodefile,'r').read() 275 | pemfile =os.path.abspath(keyfile) 276 | instances = ec2.create_instances( 277 | MinCount=elasticsearch_instances, 278 | MaxCount=elasticsearch_instances, 279 | UserData=shellfile, 280 | KeyName=pemkey, 281 | ImageId=base_aws_image, 282 | InstanceType='m4.large', 283 | NetworkInterfaces=[{'SubnetId': subnet.id, 'DeviceIndex':0, 'Groups':[security_group.id], 'AssociatePublicIpAddress':True}] 284 | ) 285 | 286 | # tag instances and assign a public ip 287 | tag='elasticsearch-node' 288 | print("Sleep 60 seconds to give instances time to configure...") 289 | sleep(60) 290 | for v in instances: 291 | v.create_tags(Tags=[{'Key':'Name', 'Value':get_tag(tag)}]) 292 | print("SERVICE: {0:<15}\tID: {1:<15}\tIP: {2:<15}\tDNS: {3:<15}".format(tag, v.instance_id, v.public_ip_address, v.public_dns_name)) 293 | 294 | if args.service.lower() in ['all', 'storm']: 295 | ######################################### 296 | # STORM CLUSTER 297 | ######################################### 298 | print("Creating a Storm cluster...") 299 | # 300 | # EC2 Instances 301 | # 302 | shellcodefile=os.path.abspath(storm_initfile) 303 | shellfile = open(shellcodefile,'r').read() 304 | pemfile =os.path.abspath(keyfile) 305 | instances = ec2.create_instances( 306 | MinCount=storm_instances, 307 | MaxCount=storm_instances, 308 | UserData=shellfile, 309 | KeyName=pemkey, 310 | ImageId=base_aws_image, 311 | InstanceType='m4.xlarge', 312 | NetworkInterfaces=[{'SubnetId': subnet.id, 'DeviceIndex':0, 'Groups':[security_group.id], 'AssociatePublicIpAddress':True}], 313 | BlockDeviceMappings=[ 314 | { 315 | 'VirtualName': 'ephemeral0', 316 | 'DeviceName': '/dev/sda1', 317 | 'Ebs': { 318 | 'VolumeSize': 64, 319 | 'VolumeType': 'gp2' # standard for magnetic, gp2 for SSD 320 | } 321 | } 322 | ] 323 | ) 324 | 325 | # tag instances and assign a public ip 326 | tag='storm-node' 327 | print("Sleep 60 seconds to give instances time to configure...") 328 | sleep(60) 329 | for v in instances: 330 | v.create_tags(Tags=[{'Key':'Name', 'Value':get_tag(tag)}]) 331 | print("SERVICE: {0:<15}\tID: {1:<15}\tIP: {2:<15}\tDNS: {3:<15}".format(tag, v.instance_id, v.public_ip_address, v.public_dns_name)) 332 | 333 | if args.service.lower() in ['all', 'spark']: 334 | 
######################################### 335 | # Spark 336 | ######################################### 337 | print("Creating Spark Cluster...") 338 | # 339 | # EC2 Instances 340 | # 341 | shellcodefile=os.path.abspath(spark_initfile) 342 | shellfile = open(shellcodefile,'r').read() 343 | pemfile =os.path.abspath(keyfile) 344 | instances = ec2.create_instances( 345 | MinCount=4, 346 | MaxCount=4, 347 | UserData=shellfile, 348 | KeyName=pemkey, 349 | ImageId=base_aws_image, 350 | InstanceType='m4.large', 351 | NetworkInterfaces=[{'SubnetId': subnet.id, 'DeviceIndex':0, 'Groups':[security_group.id], 'AssociatePublicIpAddress':True}], 352 | BlockDeviceMappings=[ 353 | { 354 | 'VirtualName': 'ephemeral0', 355 | 'DeviceName': '/dev/sda1', 356 | 'Ebs': { 357 | 'VolumeSize': 32, 358 | 'VolumeType': 'gp2' # standard for magnetic, gp2 for SSD 359 | } 360 | } 361 | ] 362 | ) 363 | 364 | # tag instances and assign a public ip 365 | tag='spark-node' 366 | print("Sleep 60 seconds to give instances time to configure...") 367 | sleep(60) 368 | for v in instances: 369 | v.create_tags(Tags=[{'Key':'Name', 'Value':get_tag(tag)}]) 370 | print("SERVICE: {0:<15}\tID: {1:<15}\tIP: {2:<15}\tDNS: {3:<15}".format(tag, v.instance_id, v.public_ip_address, v.public_dns_name)) 371 | 372 | if args.service.lower() in ['all', 'storm']: 373 | ######################################### 374 | # STORM CLUSTER 375 | ######################################### 376 | print("Creating a Storm cluster...") 377 | # 378 | # EC2 Instances 379 | # 380 | shellcodefile=os.path.abspath(storm_initfile) 381 | shellfile = open(shellcodefile,'r').read() 382 | pemfile =os.path.abspath(keyfile) 383 | instances = ec2.create_instances( 384 | MinCount=storm_instances, 385 | MaxCount=storm_instances, 386 | UserData=shellfile, 387 | KeyName=pemkey, 388 | ImageId=base_aws_image, 389 | InstanceType='m4.xlarge', 390 | NetworkInterfaces=[{'SubnetId': subnet.id, 'DeviceIndex':0, 'Groups':[security_group.id], 'AssociatePublicIpAddress':True}], 391 | BlockDeviceMappings=[ 392 | { 393 | 'VirtualName': 'ephemeral0', 394 | 'DeviceName': '/dev/sda1', 395 | 'Ebs': { 396 | 'VolumeSize': 64, 397 | 'VolumeType': 'gp2' # standard for magnetic, gp2 for SSD 398 | } 399 | } 400 | ] 401 | ) 402 | 403 | # tag instances and assign a public ip 404 | tag='storm-node' 405 | print("Sleep 60 seconds to give instances time to configure...") 406 | sleep(60) 407 | for v in instances: 408 | v.create_tags(Tags=[{'Key':'Name', 'Value':get_tag(tag)}]) 409 | print("SERVICE: {0:<15}\tID: {1:<15}\tIP: {2:<15}\tDNS: {3:<15}".format(tag, v.instance_id, v.public_ip_address, v.public_dns_name)) 410 | 411 | 412 | -------------------------------------------------------------------------------- /aws_config/discover.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # 3 | # Discover straw cluster resources running on AWS 4 | # 5 | import boto3, argparse 6 | from create_clusters import services, get_tag, keyfile 7 | 8 | class ServicesList: 9 | '''Container class for AWS services info''' 10 | def __init__(self): 11 | ec2 = boto3.resource('ec2') 12 | client= boto3.client('ec2') 13 | filt=[{'Name': 'instance-state-name', 'Values': ['running']},{'Name':'tag-value','Values':[get_tag(s+'-node') for s in services]}] 14 | self.services = [] 15 | for v in ec2.instances.filter(Filters=filt): 16 | self.services.append(v) 17 | 18 | def print(self): 19 | for v in self.services: 20 | print("SERVICE: {0:<15}\tID: {1:<15}\tIP: {2:<15} PRIVATE IP: 
{3:<15}".format(v.tags[0]['Value'], 21 | v.instance_id, v.public_ip_address, v.private_ip_address)) 22 | 23 | def make_config_file(self, filename): 24 | '''create a straw config file for AWS''' 25 | 26 | def find_first_service(s): 27 | '''find the FIRST listed service with tag post-fix s 28 | NOTE: We implicitly assume that the list of AWS services is fixed. 29 | We should fix that by identifying the leader nodes among each service 30 | type. 31 | ''' 32 | for v in self.services: 33 | if v.tags[0]['Value']==get_tag(s): 34 | return(v.private_ip_address) 35 | 36 | with open(filename,"w") as f: 37 | header = """# 38 | # config for straw.storm application 39 | # 40 | 41 | """ 42 | f.write(header) 43 | elasticsearch = """ 44 | # elasticsearch settings 45 | elasticsearch_host={0} 46 | elasticsearch_port=9300 47 | elasticsearch_cluster_name={1} 48 | index_name=documents 49 | document_type=document 50 | """.format(find_first_service("elasticsearch-node"), get_tag("elasticsearch-cluster")) 51 | f.write(elasticsearch) 52 | 53 | kafka = """ 54 | # kafka settings 55 | zookeeper_host={0} 56 | zookeeper_port=2181 57 | kafka_query_topic=queries 58 | kafka_document_topic=documents 59 | """.format(find_first_service("kafka-node")) 60 | f.write(kafka) 61 | 62 | redis = """ 63 | # redis 64 | redis_host={0} 65 | redis_port=6379 66 | """.format(find_first_service("flask-node")) 67 | f.write(redis) 68 | print("Wrote config file {0}.".format(f.name)) 69 | 70 | if __name__=="__main__": 71 | 72 | # argument help 73 | parser = argparse.ArgumentParser(description='Discover AWS ec2 instances for the straw cluster.') 74 | args = parser.add_argument("--configure", help="Write a configuration file", action="store_true") 75 | args = parser.parse_args() 76 | 77 | # boto3 78 | S = ServicesList() 79 | S.print() 80 | 81 | if args.configure: 82 | S.make_config_file("config.properties.tmp") 83 | -------------------------------------------------------------------------------- /aws_config/host_install_scripts/elasticsearch_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Config/install elasticsearch 1.7 on Ubuntu 14.04 5 | # 6 | # Forked from https://gist.github.com/ricardo-rossi/8265589463915837429d 7 | # and modified by rwalker. 
8 | # 9 | # 10 | 11 | ### Agree to stupid oracle license nonsense 12 | ### See http://stackoverflow.com/questions/19275856/auto-yes-to-the-license-agreement-on-sudo-apt-get-y-install-oracle-java7-instal 13 | echo debconf shared/accepted-oracle-license-v1-1 select true | sudo debconf-set-selections 14 | echo debconf shared/accepted-oracle-license-v1-1 seen true | sudo debconf-set-selections 15 | 16 | ### Install Java 8 17 | apt-get install -y python-software-properties 18 | add-apt-repository -y ppa:webupd8team/java 19 | apt-get update 20 | apt-get install -y oracle-java8-installer 21 | 22 | ### Download and install the Public Signing Key 23 | wget -qO - https://packages.elastic.co/GPG-KEY-elasticsearch | apt-key add - 24 | 25 | ### Setup Repository 26 | echo "deb http://packages.elastic.co/elasticsearch/1.7/debian stable main" | tee -a /etc/apt/sources.list.d/elk.list 27 | 28 | ### Install Elasticsearch 29 | #apt-get purge elasticsearch -y 30 | apt-get update && sudo apt-get install elasticsearch -y 31 | 32 | ### node discovery plugin for AWS 33 | /usr/share/elasticsearch/bin/plugin install elasticsearch/elasticsearch-cloud-aws/2.5.0 34 | 35 | ### start elasticsearch: 36 | # service elasticsearch start 37 | ### To test: 38 | # curl :9200 39 | 40 | -------------------------------------------------------------------------------- /aws_config/host_install_scripts/flask_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ########################################################## 3 | # Flask Webserver setup 4 | ########################################################## 5 | 6 | # python3 discouraged: http://flask.pocoo.org/docs/0.10/python3/ 7 | sudo apt-get -y update 8 | sudo apt-get install -y python-pip python-dev build-essential 9 | sudo pip install flask 10 | sudo pip install flask-session 11 | 12 | # install redis 13 | sudo apt-get install -y redis-server 14 | sudo apt-get install -y supervisor 15 | -------------------------------------------------------------------------------- /aws_config/host_install_scripts/kafka_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # install java and zookeeper 4 | apt-get update 5 | apt-get install -y default-jre 6 | apt-get install -y zookeeperd 7 | 8 | # install kafka 9 | mkdir -p ~/Downloads 10 | wget "http://mirror.cc.columbia.edu/pub/software/apache/kafka/0.8.2.1/kafka_2.11-0.8.2.1.tgz" -P ~/Downloads 11 | tar zxvf ~/Downloads/kafka_2.11-0.8.2.1.tgz -C /usr/local 12 | mv /usr/local/kafka_2.11-0.8.2.1 /usr/local/kafka 13 | 14 | 15 | -------------------------------------------------------------------------------- /aws_config/host_install_scripts/spark_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ### Agree to stupid oracle license nonsense 4 | ### See http://stackoverflow.com/questions/19275856/auto-yes-to-the-license-agreement-on-sudo-apt-get-y-install-oracle-java7-instal 5 | echo debconf shared/accepted-oracle-license-v1-1 select true | sudo debconf-set-selections 6 | echo debconf shared/accepted-oracle-license-v1-1 seen true | sudo debconf-set-selections 7 | 8 | ### Install Java 8 9 | apt-get update 10 | apt-get install -y python-software-properties 11 | add-apt-repository -y ppa:webupd8team/java 12 | apt-get update 13 | apt-get install -y oracle-java8-installer 14 | 15 | ### 16 | apt-get install -y scala 17 | 18 | # Install sbt 19 | wget 
https://dl.bintray.com/sbt/debian/sbt-0.13.7.deb -P ~/Downloads 20 | dpkg -i ~/Downloads/sbt-0.13.7.deb 21 | apt-get install sbt 22 | 23 | # Install Spark 24 | wget http://apache.mirrors.tds.net/spark/spark-1.4.1/spark-1.4.1-bin-hadoop2.4.tgz -P ~/Downloads 25 | tar zxvf ~/Downloads/spark-1.4.1-bin-hadoop2.4.tgz -C /usr/local 26 | sudo mv /usr/local/spark-1.4.1-bin-hadoop2.4 /usr/local/spark 27 | sudo chown -R ubuntu /usr/local/spark 28 | 29 | 30 | -------------------------------------------------------------------------------- /aws_config/host_install_scripts/storm_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # install java and zookeeper 4 | apt-get update 5 | apt-get install -y default-jre 6 | apt-get install -y zookeeperd 7 | apt-get install -y supervisor 8 | 9 | # install storm 10 | wget "http://mirrors.gigenet.com/apache/storm/apache-storm-0.9.5/apache-storm-0.9.5.tar.gz" -P ~/Downloads 11 | tar zxvf ~/Downloads/apache-storm*.gz -C /usr/local 12 | mv /usr/local/apache-storm* /usr/local/storm 13 | 14 | # inject some new config 15 | echo "export STORM_HOME=/usr/local/storm" | tee -a /home/ubuntu/.profile /home/ubuntu/.bashrc 16 | echo "export PATH=$PATH:$STORM_HOME/bin" | tee -a /home/ubuntu/.profile /home/ubuntu/.bashrc 17 | 18 | 19 | # create space for local state 20 | mkdir /usr/local/storm/local_state 21 | chown ubuntu /usr/local/storm/local_state 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /aws_config/straw_service_config.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | #Source this file to set enviornment vars for config 3 | export AWS_PEM_FILE=/home/ryan/projects/insight/accounts/rwalker.pem 4 | export PEM_KEY=rwalker 5 | export TAG_PREFIX=rwalker 6 | 7 | -------------------------------------------------------------------------------- /config/config.properties: -------------------------------------------------------------------------------- 1 | # 2 | # config for straw.storm application 3 | # 4 | 5 | # elasticsearch settings 6 | elasticsearch_host=localhost 7 | elasticsearch_port=9300 8 | elasticsearch_cluster_name=elasticsearch 9 | index_name=documents 10 | document_type=document 11 | 12 | # kafka settings 13 | zookeeper_host=127.0.0.1 14 | zookeeper_port=2181 15 | kafka_query_topic=queries 16 | kafka_document_topic=documents 17 | 18 | # redis 19 | redis_host=127.0.0.1 20 | redis_port=6379 21 | 22 | # redis_analytics db 23 | redis_analytics_host=127.0.0.1 24 | redis_analytics_port=6379 25 | 26 | # storm settings 27 | search.bolt.number.tasks=1 28 | search.bolts=5 29 | document.spouts=5 30 | query.spouts=3 31 | workers=6 32 | 33 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | tweets.big.sample 2 | queries.bigrams 3 | -------------------------------------------------------------------------------- /data/queries.bigrams.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwalk/straw/1940d521538635d5eab394d0ed4c87caf366b0c7/data/queries.bigrams.gz -------------------------------------------------------------------------------- /data/queries.small: -------------------------------------------------------------------------------- 1 | { "type": "terms-query", "terms": [ "coffee", "toast"], 
"minimum-match": 1 } 2 | { "type": "terms-query", "terms": ["Keith Richards"], "minimum-match": 1 } 3 | { "type": "terms-query", "terms": [ "justin", "beiber"], "minimum-match": 2} 4 | { "type": "terms-query", "terms": ["bonsai","tree"], "minimum-match": 1} 5 | -------------------------------------------------------------------------------- /data/tweets.big.sample.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwalk/straw/1940d521538635d5eab394d0ed4c87caf366b0c7/data/tweets.big.sample.gz -------------------------------------------------------------------------------- /local_demo/launch_demo_ui.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Launch the local UI for running in DEMO mode 4 | # Visit http://localhost:5000 in a browser to see the results 5 | # 6 | # 7 | ( 8 | cd ../src/frontend && 9 | ./run.py -p 5000 --debug 10 | ) 11 | -------------------------------------------------------------------------------- /local_demo/launch_local_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This script helps run the demo mode with the Luwak search topology on a local cluster 4 | # The below commands stage system resources in docker containers and then builds 5 | # the main Luwak topology storm cluster. 6 | # 7 | # You'll want to launch the WEB UI in a seperate terminal/process. 8 | # For this: 9 | # cd src/frontend 10 | # ./run.py. 11 | # See run.py -h for help. 12 | # 13 | # To simulate the twitter firehose, run ./mock_firehose.sh 14 | # 15 | 16 | # startup local resources in docker containers 17 | ( cd ../util && 18 | ./stage_demo_mode.sh 19 | ) 20 | 21 | # build libraries 22 | (cd ../src/luwak_search && 23 | mvn clean && 24 | mvn package 25 | ) 26 | 27 | # launch local storm cluster with luwak topology 28 | (cd ../src/luwak_search && 29 | ./run_luwak_topology.sh 30 | ) 31 | 32 | 33 | -------------------------------------------------------------------------------- /local_demo/mock_firehose.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script simulates the twitter firehose by repeatedly 3 | # adding the collection of 100k tweets included with this repo into Kafka. 4 | # 5 | # data can be found in data/tweets.big.sample 6 | # 7 | ( 8 | cd ../util && 9 | while true ; do 10 | ./kafka_add_documents.sh 11 | echo "Sleeping 10 so I don't swamp your puny local cluster!" 12 | sleep 10 13 | done 14 | ) 15 | -------------------------------------------------------------------------------- /local_demo/prerequisites.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # This script attempts to install necassary dependencies for straw running on Ubuntu 14.04 5 | # 6 | # NOTE: You will need to install docker manually, using at least version 1.8.0. 
7 | # 8 | # DOCKER INSTALL INSTRUCTIONS from http://docs.docker.com/engine/installation/ubuntulinux/ 9 | # 10 | # sudo apt-key adv --keyserver hkp://pgp.mit.edu:80 --recv-keys 58118E89F3A912897C070ADBF76221572C52609D 11 | # sudo vi /etc/apt/sources.list.d/docker.list 12 | # Add the line "deb https://apt.dockerproject.org/repo ubuntu-trusty main" and save 13 | # sudo apt-get install docker-engine 14 | # sudo service docker start 15 | # sudo usermod -aG docker $USER 16 | # Log out and log in again 17 | 18 | 19 | # redis 20 | sudo apt-get update 21 | sudo apt-get install redis-server python3-pip python-pip maven openjdk-7-jdk 22 | 23 | # docker compose 24 | ( 25 | curl -L https://github.com/docker/compose/releases/download/1.5.0/docker-compose-`uname -s`-`uname -m` > tmp && 26 | sudo mv tmp /usr/local/bin/docker-compose && 27 | sudo chmod +x /usr/local/bin/docker-compose 28 | ) 29 | 30 | # python2 packages -- flask only recommended for python2 31 | sudo pip install redis flask flask-session kafka-python 32 | 33 | # python3 packages 34 | sudo pip3 install kafka-python 35 | -------------------------------------------------------------------------------- /src/frontend/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwalk/straw/1940d521538635d5eab394d0ed4c87caf366b0c7/src/frontend/app/__init__.py -------------------------------------------------------------------------------- /src/frontend/app/query_subscriber.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Consume matches from subscribed queries 4 | # 5 | import redis 6 | from time import sleep 7 | 8 | def message_handler(data, message): 9 | data.append((message['channel'], message['data'])) 10 | 11 | class QuerySubscriber: 12 | 13 | def __init__(self, host, port, msg_handler): 14 | ''' Query subscriber takes an arbitrary msg_handler which is a function 15 | of a single variable message.''' 16 | pool = redis.ConnectionPool(host='localhost', port=6379) 17 | r = redis.Redis(connection_pool=pool) 18 | self.connection = r.pubsub(ignore_subscribe_messages=True) 19 | self.queries = [] 20 | self._thread = None 21 | self.handler = msg_handler 22 | 23 | def add_query(self, query): 24 | # add query to list 25 | self.queries.append(query) 26 | 27 | # stop the existing thread and start new one with the full set of queries 28 | self._update() 29 | 30 | def start(self): 31 | queries = dict((k,v) for (k,v) in [(k,self.handler) for k in self.queries]) 32 | if len(queries)>0: 33 | self.connection.subscribe(**queries) 34 | self._thread = self.connection.run_in_thread(sleep_time=0.001) 35 | else: 36 | self._thread = None 37 | 38 | def _update(self): 39 | # WARNING: We might drop some messages here 40 | if self._thread is not None: 41 | self._thread.stop() 42 | self._thread=None 43 | self.start() 44 | 45 | def close(self): 46 | try: 47 | self._thread.stop() 48 | except: 49 | pass 50 | 51 | if __name__=="__main__": 52 | mydata = [] 53 | subscriber = QuerySubscriber("localhost", 6379, lambda x: message_handler(mydata, x) ) 54 | query = None 55 | while True: 56 | if query is None: 57 | query = raw_input("Please enter the topic you'd like to follow: ") 58 | else: 59 | subscriber.add_query(query) 60 | query = None 61 | print mydata 62 | -------------------------------------------------------------------------------- /src/frontend/app/static/assets/favicon.ico: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwalk/straw/1940d521538635d5eab394d0ed4c87caf366b0c7/src/frontend/app/static/assets/favicon.ico -------------------------------------------------------------------------------- /src/frontend/app/static/assets/straw.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwalk/straw/1940d521538635d5eab394d0ed4c87caf366b0c7/src/frontend/app/static/assets/straw.pdf -------------------------------------------------------------------------------- /src/frontend/app/static/css/theme.css: -------------------------------------------------------------------------------- 1 | body { 2 | padding-top: 70px; 3 | padding-bottom: 30px; 4 | } 5 | 6 | .theme-dropdown .dropdown-menu { 7 | position: static; 8 | display: block; 9 | margin-bottom: 20px; 10 | } 11 | 12 | .theme-showcase > p > .btn { 13 | margin: 5px 0; 14 | } 15 | 16 | .theme-showcase .navbar .container { 17 | width: auto; 18 | } 19 | 20 | .highlight { background-color: yellow } 21 | -------------------------------------------------------------------------------- /src/frontend/app/static/fonts/glyphicons-halflings-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwalk/straw/1940d521538635d5eab394d0ed4c87caf366b0c7/src/frontend/app/static/fonts/glyphicons-halflings-regular.eot -------------------------------------------------------------------------------- /src/frontend/app/static/fonts/glyphicons-halflings-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwalk/straw/1940d521538635d5eab394d0ed4c87caf366b0c7/src/frontend/app/static/fonts/glyphicons-halflings-regular.ttf -------------------------------------------------------------------------------- /src/frontend/app/static/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwalk/straw/1940d521538635d5eab394d0ed4c87caf366b0c7/src/frontend/app/static/fonts/glyphicons-halflings-regular.woff -------------------------------------------------------------------------------- /src/frontend/app/static/fonts/glyphicons-halflings-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwalk/straw/1940d521538635d5eab394d0ed4c87caf366b0c7/src/frontend/app/static/fonts/glyphicons-halflings-regular.woff2 -------------------------------------------------------------------------------- /src/frontend/app/static/js/ie10-viewport-bug-workaround.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * IE10 viewport hack for Surface/desktop Windows 8 bug 3 | * Copyright 2014-2015 Twitter, Inc. 
4 | * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) 5 | */ 6 | 7 | // See the Getting Started docs for more information: 8 | // http://getbootstrap.com/getting-started/#support-ie10-width 9 | 10 | (function () { 11 | 'use strict'; 12 | 13 | if (navigator.userAgent.match(/IEMobile\/10\.0/)) { 14 | var msViewportStyle = document.createElement('style') 15 | msViewportStyle.appendChild( 16 | document.createTextNode( 17 | '@-ms-viewport{width:auto!important}' 18 | ) 19 | ) 20 | document.querySelector('head').appendChild(msViewportStyle) 21 | } 22 | 23 | })(); 24 | -------------------------------------------------------------------------------- /src/frontend/app/static/js/npm.js: -------------------------------------------------------------------------------- 1 | // This file is autogenerated via the `commonjs` Grunt task. You can require() this file in a CommonJS environment. 2 | require('../../js/transition.js') 3 | require('../../js/alert.js') 4 | require('../../js/button.js') 5 | require('../../js/carousel.js') 6 | require('../../js/collapse.js') 7 | require('../../js/dropdown.js') 8 | require('../../js/modal.js') 9 | require('../../js/tooltip.js') 10 | require('../../js/popover.js') 11 | require('../../js/scrollspy.js') 12 | require('../../js/tab.js') 13 | require('../../js/affix.js') -------------------------------------------------------------------------------- /src/frontend/app/straw_app.py: -------------------------------------------------------------------------------- 1 | import thread, redis 2 | from kafka import SimpleProducer, KafkaClient 3 | from flask import Flask, session 4 | from flask.ext.session import Session 5 | from query_subscriber import QuerySubscriber 6 | from views import attach_views 7 | from datetime import datetime 8 | 9 | def highlight(word): 10 | return("{0}".format(word)) 11 | 12 | class StrawAppBase: 13 | 14 | def __init__(self, config): 15 | 16 | app = Flask(__name__) 17 | app.secret_key = 'i love to search full text in real time' 18 | 19 | # attach a redis connection pool 20 | app.pool = redis.ConnectionPool(host="localhost", port=6379) 21 | 22 | # user -> channels mapping 23 | app.user_channels = {} 24 | 25 | # how to handle messages that enter the stream from redis pub sub 26 | def redis_message_handler(msg): 27 | redis_connection = redis.Redis(connection_pool=app.pool) 28 | # get channel and content of incoming message 29 | channel = msg['channel'] 30 | data = msg['data'] 31 | 32 | # word highlighting -- TODO: this would be better to do in the search engine! 
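            # highlight() is intended to wrap a matched word in markup styled by the
            # .highlight rule in static/css/theme.css (yellow background). The
            # lower()/replace() pass below is a naive substring match, so it can also
            # hit fragments inside longer words; a word-boundary regex is one possible
            # alternative (illustration only, not what runs here):
            #   import re
            #   data = re.sub(r"\b" + re.escape(w) + r"\b", highlight(w), data, flags=re.IGNORECASE)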
33 | query = redis_connection.get(channel) 34 | words = list(set(query.split(" "))) 35 | for w in words: 36 | data=data.lower().replace(w.lower(), highlight(w.lower())) 37 | 38 | # find users subscribed to this channel 39 | if app.user_channels.get(channel) is not None: 40 | for user in app.user_channels.get(channel): 41 | redis_connection.lpush(user, data) 42 | else: 43 | # no more users for this channel, unsubscribe from it 44 | redis_connection.unsubscribe(channel) 45 | 46 | # Add Redis query subscriber to app 47 | app.disp = [] 48 | app.subscriber = QuerySubscriber("localhost", 6379, redis_message_handler) 49 | 50 | # setup kafka producer in the app 51 | kafka = KafkaClient("{0}:{1}".format(config["zookeeper_host"], 9092)) 52 | app.producer = SimpleProducer(kafka) 53 | 54 | # add the app 55 | self.app = app 56 | 57 | def clear_user(self, uid): 58 | redis_connection = redis.Redis(connection_pool=self.app.pool) 59 | # print("Trying to clean for user {0}".format(uid)) 60 | # find all the queries to which the user is subscribed 61 | # and remove them from the subscribers list for each query. 62 | for qid in redis_connection.lrange(uid+"-queries", 0, -1): 63 | try: 64 | self.app.user_channels[qid].remove(uid) 65 | except KeyError: 66 | pass 67 | 68 | # remove the user-queries 69 | redis_connection.delete(uid+"-queries") 70 | 71 | # remove the stored results 72 | redis_connection.delete(uid) 73 | 74 | def get_straw_app(config): 75 | base = StrawAppBase(config) 76 | app = base.app 77 | app.clear_user = base.clear_user 78 | attach_views(app) 79 | return app 80 | -------------------------------------------------------------------------------- /src/frontend/app/templates/about.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | Straw -- a platform for streaming search 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 49 | 50 |
[about.html: the template markup was not preserved in this export; the visible page text is the "Straw" heading and the tagline "Straw is a platform for real-time, full text search on streaming data."]
-------------------------------------------------------------------------------- /src/frontend/app/templates/index.html: --------------------------------------------------------------------------------
[index.html: the template markup was not preserved in this export; the page carries the title "Straw -- a platform for streaming search", the "Straw" heading, the tagline "Straw is a platform for real-time, full text search on streaming data.", the prompt "Subscribe to real-time alerts from the (simulated) Twitter firehose.", a query submission form, and a Jinja2 block ("{% if query_list|length>0 %} ... {% endif %}") that lists the user's active queries.]
108 | 109 | 110 | 112 | 113 | 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /src/frontend/app/views.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | ''' 3 | Define the views for the straw web app 4 | ''' 5 | from flask import render_template, session, request, render_template, jsonify, Flask, make_response 6 | from time import sleep 7 | from kafka.common import FailedPayloadsError, NotLeaderForPartitionError, KafkaUnavailableError 8 | import md5, redis 9 | import json, uuid 10 | 11 | MAX_RESULTS = 100 12 | EXPIRATION = 1 13 | def attach_views(app): 14 | 15 | @app.route('/_fetch_messages') 16 | def fetch_messages(): 17 | # get a redis connection 18 | redis_connection = redis.Redis(connection_pool=app.pool) 19 | 20 | # update the query list in the view 21 | if session.get('sid') is not None: 22 | matches = redis_connection.lrange(session.get('sid'), 0, MAX_RESULTS) 23 | return jsonify(result=matches) 24 | 25 | @app.route('/', methods=['GET']) 26 | def index(): 27 | if session.get('sid') is None: 28 | session['sid'] = uuid.uuid4().hex 29 | try: 30 | query_list = session['queries'] 31 | except KeyError: 32 | query_list = [] 33 | return render_template('index.html', query_list=query_list) 34 | 35 | @app.route('/', methods=['POST']) 36 | def search_box_control(): 37 | '''add to or clear the list of queries.''' 38 | 39 | # we need a session 40 | if session.get('sid') is None: 41 | raise RuntimeError("No session.") 42 | sid = session.get('sid') 43 | 44 | # get a redis connection 45 | redis_connection = redis.Redis(connection_pool=app.pool) 46 | 47 | # if clear button pressed: 48 | if 'clear' in request.form: 49 | app.clear_user(session.get('sid')) 50 | if session.has_key('queries'): 51 | del session['queries'] 52 | return render_template("index.html", query_list=[], session=session) 53 | 54 | # create a new query 55 | text = request.form['text'].lower().split(" ") 56 | 57 | # generate a unique query id 58 | msg = {"type":"terms-query","terms":text,"minimum-match":len(text)} 59 | data = json.dumps(msg) 60 | qid = md5.new(data).hexdigest() 61 | query_string = " ".join(text) 62 | 63 | # add the qid and value to the query lookup store 64 | try: 65 | session['queries'].append(query_string) 66 | except KeyError: 67 | # sanity: clear any queries stored for this user but not in the session. 68 | redis_connection.delete(sid+"-queries") 69 | session['queries'] = [query_string] 70 | 71 | # try three times to do the post to kafka. 72 | post_success = False 73 | for i in range(3): 74 | try: 75 | app.producer.send_messages("queries", data) 76 | except (FailedPayloadsError, NotLeaderForPartitionError, KafkaUnavailableError) as e: 77 | # wait a bit and try again 78 | print("Failed to post query {0} to kafka. 
Try #{1}".format(data, i)) 79 | sleep(0.25) 80 | continue 81 | post_success=True 82 | break 83 | 84 | if post_success==True: 85 | # subscribe the user to the query 86 | try: 87 | app.user_channels[qid].add(sid) 88 | except KeyError: 89 | app.user_channels[qid] = set([sid]) 90 | app.subscriber.add_query(qid) 91 | 92 | # link the id to the query text 93 | redis_connection.set(qid, " ".join(text)) 94 | 95 | # add query to the list of things the user has subscribed to 96 | redis_connection.lpush(sid +"-queries", qid) 97 | 98 | # update the query list in the view 99 | query_list = session["queries"] 100 | return render_template("index.html", query_list=query_list) 101 | 102 | @app.route('/about') 103 | def about(): 104 | return render_template('%s.html' % 'about') 105 | 106 | 107 | @app.route('/straw.pdf') 108 | def pdf(): 109 | return app.send_static_file('assets/straw.pdf') 110 | -------------------------------------------------------------------------------- /src/frontend/launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | (cd /home/ubuntu/straw/src/frontend && \ 3 | ./run.py -p 80 ) 4 | sleep 5 5 | 6 | -------------------------------------------------------------------------------- /src/frontend/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from app.straw_app import get_straw_app 3 | import redis 4 | import argparse 5 | 6 | if __name__=="__main__": 7 | 8 | # arg parsing 9 | parser = argparse.ArgumentParser(description="Launch straw webserver frontend") 10 | parser.add_argument("-p", "--port", default=5000, help="port, default 5000") 11 | parser.add_argument("--debug", help="Use flask debug mode, default False.", action="store_true") 12 | args = parser.parse_args() 13 | 14 | with open("../../config/config.properties", "r") as f: 15 | lines = f.readlines() 16 | 17 | config={} 18 | for line in lines: 19 | if line.find("=")!=-1: 20 | ls = line.split("=") 21 | config[ls[0]]=ls[1] 22 | config["debug"]=args.debug 23 | 24 | # get the app and clear the redis db 25 | app = get_straw_app(config) 26 | redis_connection = redis.Redis(connection_pool=app.pool) 27 | redis_connection.flushall() 28 | app.run(host='0.0.0.0', port=int(args.port), debug = args.debug) 29 | 30 | -------------------------------------------------------------------------------- /src/kafka_stream_eater/kafka_stream_consumer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # 3 | # Put documents from the stream into Kafka 4 | # 5 | import argparse 6 | from kafka import KafkaConsumer 7 | 8 | if __name__=="__main__": 9 | 10 | # arg parsing 11 | parser = argparse.ArgumentParser(description="Consume messages from kafka.") 12 | parser.add_argument("host", help="A Kafka host") 13 | parser.add_argument("topic", help="A topic to consume") 14 | parser.add_argument("-p","--port", default="9092", help="A Kafka port, default 9092.") 15 | args = parser.parse_args() 16 | 17 | # get a client 18 | #consumer = KafkaConsumer('documents', group_id='straw', bootstrap_servers=["{0}:{1}".format(args.host, args.port)]) 19 | print("Trying to get messages from topic {0} on {1}:{2}".format(args.topic, args.host, args.port)) 20 | consumer = KafkaConsumer(args.topic, bootstrap_servers=["{0}:{1}".format(args.host, args.port)], auto_offset_reset='smallest') 21 | 22 | # read through the file and send messages to Kafka in chunks 23 | for message in consumer: 24 | # message 
value is raw byte string -- decode if necessary! 25 | # e.g., for unicode: `message.value.decode('utf-8')` 26 | print("{0}:{1}:{2}: key={3} value={4}".format(message.topic, message.partition, 27 | message.offset, message.key, 28 | message.value.decode('utf-8'))) 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /src/kafka_stream_eater/kafka_stream_producer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # 3 | # Put documents from the stream into Kafka 4 | # 5 | 6 | import argparse 7 | from kafka import SimpleProducer, KafkaClient 8 | from time import sleep 9 | 10 | def chunk_iterable(A,n): 11 | '''An iterable that contains the iterates of A divided into lists of size n. 12 | A iterable 13 | n int, size of chunk 14 | ''' 15 | cnt = 0 16 | chunk = [] 17 | for v in A: 18 | if cnt0: 26 | yield(chunk) 27 | 28 | if __name__=="__main__": 29 | 30 | # arg parsing 31 | parser = argparse.ArgumentParser(description="Feed Kafka a stream from a file") 32 | parser.add_argument("file", help="A file of data, one datum per line") 33 | parser.add_argument("host", help="Public IP address of a Kafka node") 34 | parser.add_argument("topic", help="Kafka topic to feed") 35 | parser.add_argument("-p", "--port", default=9092, help="port for zookeeper, default 9092") 36 | parser.add_argument("-c", "--chunksize", default=100, help="Number of messages to send at one time, default 100") 37 | parser.add_argument("-d", "--delay", default=0, help="Delay in ms between shipment of chunks to Kafka, default 0") 38 | args = parser.parse_args() 39 | 40 | # get a client 41 | print("Connecting to Kafka node {0}:{1}".format(args.host, args.port)) 42 | kafka = KafkaClient("{0}:{1}".format(args.host, args.port)) 43 | producer = SimpleProducer(kafka) 44 | 45 | # read through the file and send messages to Kafka in chunks 46 | with open(args.file, "rb") as f: 47 | for chunk in chunk_iterable(f, args.chunksize): 48 | print("Sending {0} messages to topic {1} on {2}".format(len(chunk), args.topic, args.host)) 49 | producer.send_messages(args.topic, *chunk) 50 | sleep(1.0*int(args.delay)/1000.0) 51 | -------------------------------------------------------------------------------- /src/kafka_stream_eater/third_party/kafka-docker-master/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:trusty 2 | 3 | MAINTAINER Wurstmeister 4 | 5 | ENV KAFKA_VERSION="0.8.2.1" SCALA_VERSION="2.9.2" 6 | 7 | RUN apt-get update && apt-get install -y unzip openjdk-6-jdk wget curl git docker.io jq 8 | 9 | ADD download-kafka.sh /tmp/download-kafka.sh 10 | RUN /tmp/download-kafka.sh 11 | RUN tar xf /tmp/kafka_${SCALA_VERSION}-${KAFKA_VERSION}.tgz -C /opt 12 | 13 | VOLUME ["/kafka"] 14 | 15 | ENV KAFKA_HOME /opt/kafka_${SCALA_VERSION}-${KAFKA_VERSION} 16 | ADD start-kafka.sh /usr/bin/start-kafka.sh 17 | ADD broker-list.sh /usr/bin/broker-list.sh 18 | CMD start-kafka.sh 19 | -------------------------------------------------------------------------------- /src/kafka_stream_eater/third_party/kafka-docker-master/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /src/kafka_stream_eater/third_party/kafka-docker-master/README.md: -------------------------------------------------------------------------------- 1 | kafka-docker 2 | ============ 3 | 4 | Dockerfile for [Apache Kafka](http://kafka.apache.org/) 5 | 6 | The image is available directly from https://registry.hub.docker.com/ 7 | 8 | ##Pre-Requisites 9 | 10 | - install docker-compose [https://docs.docker.com/compose/install/](https://docs.docker.com/compose/install/) 11 | - modify the ```KAFKA_ADVERTISED_HOST_NAME``` in ```docker-compose.yml``` to match your docker host IP (Note: Do not use localhost or 127.0.0.1 as the host ip if you want to run multiple brokers.) 12 | - if you want to customise any Kafka parameters, simply add them as environment variables in ```docker-compose.yml```, e.g. in order to increase the ```message.max.bytes``` parameter set the environment to ```KAFKA_MESSAGE_MAX_BYTES: 2000000```. To turn off automatic topic creation set ```KAFKA_AUTO_CREATE_TOPICS_ENABLE: 'false'``` 13 | 14 | ##Usage 15 | 16 | Start a cluster: 17 | 18 | - ```docker-compose up -d ``` 19 | 20 | Add more brokers: 21 | 22 | - ```docker-compose scale kafka=3``` 23 | 24 | Destroy a cluster: 25 | 26 | - ```docker-compose stop``` 27 | 28 | ##Note 29 | 30 | The default ```docker-compose.yml``` should be seen as a starting point. By default each broker will get a new port number and broker id on restart. Depending on your use case this might not be desirable. If you need to use specific ports and broker ids, modify the docker-compose configuration accordingly, e.g. [docker-compose-single-broker.yml](https://github.com/wurstmeister/kafka-docker/blob/master/docker-compose-single-broker.yml): 31 | 32 | - ```docker-compose -f docker-compose-single-broker.yml up``` 33 | 34 | ##Broker IDs 35 | 36 | If you don't specify a broker id in your docker-compose file, it will automatically be generated based on the name that docker-compose gives the container. This allows scaling up and down. In this case it is recommended to use the ```--no-recreate``` option of docker-compose to ensure that containers are not re-created and thus keep their names and ids. 
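For example, scaling and then re-issuing `up` with that flag keeps the existing brokers' names and ids:

- ```docker-compose scale kafka=3```
- ```docker-compose up -d --no-recreate```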
37 | 38 | 39 | ##Automatically create topics 40 | 41 | If you want to have kafka-docker automatically create topics in Kafka during 42 | creation, a ```KAFKA_CREATE_TOPICS``` environment variable can be 43 | added in ```docker-compose.yml```. 44 | 45 | Here is an example snippet from ```docker-compose.yml```: 46 | 47 | environment: 48 | KAFKA_CREATE_TOPICS: "Topic1:1:3,Topic2:1:1" 49 | 50 | ```Topic 1``` will have 1 partition and 3 replicas, ```Topic 2``` will have 1 partition and 1 replica. 51 | 52 | ##Tutorial 53 | 54 | [http://wurstmeister.github.io/kafka-docker/](http://wurstmeister.github.io/kafka-docker/) 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /src/kafka_stream_eater/third_party/kafka-docker-master/broker-list.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CONTAINERS=$(docker ps | grep 9092 | awk '{print $1}') 4 | BROKERS=$(for CONTAINER in $CONTAINERS; do docker port $CONTAINER 9092 | sed -e "s/0.0.0.0:/$HOST_IP:/g"; done) 5 | echo $BROKERS | sed -e 's/ /,/g' 6 | -------------------------------------------------------------------------------- /src/kafka_stream_eater/third_party/kafka-docker-master/docker-compose-single-broker.yml: -------------------------------------------------------------------------------- 1 | zookeeper: 2 | image: wurstmeister/zookeeper 3 | ports: 4 | - "2181:2181" 5 | kafka: 6 | image: wurstmeister/kafka:0.8.2.0 7 | ports: 8 | - "9092:9092" 9 | links: 10 | - zookeeper:zk 11 | environment: 12 | KAFKA_ADVERTISED_HOST_NAME: 192.168.59.103 13 | volumes: 14 | - /var/run/docker.sock:/var/run/docker.sock 15 | -------------------------------------------------------------------------------- /src/kafka_stream_eater/third_party/kafka-docker-master/docker-compose.yml: -------------------------------------------------------------------------------- 1 | zookeeper: 2 | image: wurstmeister/zookeeper 3 | ports: 4 | - "localhost:2181:2181" 5 | kafka: 6 | build: . 
7 | ports: 8 | - "localhost:9092:9092" 9 | links: 10 | - zookeeper:zk 11 | environment: 12 | KAFKA_CREATE_TOPICS: "queries:1:1,documents:1:1" 13 | KAFKA_ADVERTISED_HOST_NAME: "0.0.0.0" 14 | volumes: 15 | - /var/run/docker.sock:/var/run/docker.sock 16 | -------------------------------------------------------------------------------- /src/kafka_stream_eater/third_party/kafka-docker-master/download-kafka.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mirror=$(curl --stderr /dev/null https://www.apache.org/dyn/closer.cgi\?as_json\=1 | jq -r '.preferred') 4 | url="${mirror}kafka/${KAFKA_VERSION}/kafka_${SCALA_VERSION}-${KAFKA_VERSION}.tgz" 5 | wget -q "${url}" -O "/tmp/kafka_${SCALA_VERSION}-${KAFKA_VERSION}.tgz" 6 | -------------------------------------------------------------------------------- /src/kafka_stream_eater/third_party/kafka-docker-master/start-kafka-shell.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker run --rm -v /var/run/docker.sock:/var/run/docker.sock -e HOST_IP=$1 -e ZK=$2 -i -t wurstmeister/kafka:0.8.2.0 /bin/bash 3 | -------------------------------------------------------------------------------- /src/kafka_stream_eater/third_party/kafka-docker-master/start-kafka.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ -z "$KAFKA_ADVERTISED_PORT" ]]; then 4 | export KAFKA_ADVERTISED_PORT=$(docker port `hostname` 9092 | sed -r "s/.*:(.*)/\1/g") 5 | fi 6 | if [[ -z "$KAFKA_BROKER_ID" ]]; then 7 | export KAFKA_BROKER_ID=$(docker inspect `hostname` | jq --raw-output '.[0] | .Name' | awk -F_ '{print $3}') 8 | fi 9 | if [[ -z "$KAFKA_LOG_DIRS" ]]; then 10 | export KAFKA_LOG_DIRS="/kafka/kafka-logs-$KAFKA_BROKER_ID" 11 | fi 12 | if [[ -z "$KAFKA_ZOOKEEPER_CONNECT" ]]; then 13 | export KAFKA_ZOOKEEPER_CONNECT=$(env | grep ZK.*PORT_2181_TCP= | sed -e 's|.*tcp://||' | paste -sd ,) 14 | fi 15 | 16 | if [[ -n "$KAFKA_HEAP_OPTS" ]]; then 17 | sed -r -i "s/(export KAFKA_HEAP_OPTS)=\"(.*)\"/\1=\"$KAFKA_HEAP_OPTS\"/g" $KAFKA_HOME/bin/kafka-server-start.sh 18 | unset KAFKA_HEAP_OPTS 19 | fi 20 | 21 | for VAR in `env` 22 | do 23 | if [[ $VAR =~ ^KAFKA_ && ! $VAR =~ ^KAFKA_HOME ]]; then 24 | kafka_name=`echo "$VAR" | sed -r "s/KAFKA_(.*)=.*/\1/g" | tr '[:upper:]' '[:lower:]' | tr _ .` 25 | env_var=`echo "$VAR" | sed -r "s/(.*)=.*/\1/g"` 26 | if egrep -q "(^|^#)$kafka_name=" $KAFKA_HOME/config/server.properties; then 27 | sed -r -i "s@(^|^#)($kafka_name)=(.*)@\2=${!env_var}@g" $KAFKA_HOME/config/server.properties #note that no config values may contain an '@' char 28 | else 29 | echo "$kafka_name=${!env_var}" >> $KAFKA_HOME/config/server.properties 30 | fi 31 | fi 32 | done 33 | 34 | 35 | $KAFKA_HOME/bin/kafka-server-start.sh $KAFKA_HOME/config/server.properties & 36 | KAFKA_SERVER_PID=$! 
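# Note: the loop below blocks until the broker is actually listening on port 9092;
# only then is each KAFKA_CREATE_TOPICS entry (README format "name:partitions:replicas")
# split on ':' and passed to kafka-topics.sh.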
37 | 38 | while netstat -lnt | awk '$4 ~ /:9092$/ {exit 1}'; do sleep 1; done 39 | 40 | if [[ -n $KAFKA_CREATE_TOPICS ]]; then 41 | IFS=','; for topicToCreate in $KAFKA_CREATE_TOPICS; do 42 | IFS=':' read -a topicConfig <<< "$topicToCreate" 43 | $KAFKA_HOME/bin/kafka-topics.sh --create --zookeeper $KAFKA_ZOOKEEPER_CONNECT --replication-factor ${topicConfig[2]} --partition ${topicConfig[1]} --topic "${topicConfig[0]}" 44 | done 45 | fi 46 | 47 | wait $KAFKA_SERVER_PID 48 | -------------------------------------------------------------------------------- /src/luwak_search/.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .settings/ 3 | .classpath 4 | .project 5 | 6 | /target 7 | -------------------------------------------------------------------------------- /src/luwak_search/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 12 | 14 | 4.0.0 15 | github.com.rwalk333 16 | storming-luwak-search 17 | jar 18 | 0.0.1 19 | storming-search 20 | 21 | 22 | org.apache.lucene 23 | 5.3.0 24 | UTF-8 25 | UTF-8 26 | 1.7 27 | 28 | 29 | 30 | 31 | 32 | 33 | ${lucene.group} 34 | lucene-core 35 | ${lucene.version} 36 | 37 | 38 | ${lucene.group} 39 | lucene-memory 40 | ${lucene.version} 41 | 42 | 43 | ${lucene.group} 44 | lucene-analyzers-common 45 | ${lucene.version} 46 | 47 | 48 | ${lucene.group} 49 | lucene-queries 50 | ${lucene.version} 51 | 52 | 53 | ${lucene.group} 54 | lucene-queryparser 55 | ${lucene.version} 56 | 57 | 58 | 59 | 60 | 61 | 62 | org.apache.storm 63 | storm-kafka 64 | 0.9.2-incubating 65 | 66 | 67 | org.apache.kafka 68 | kafka_2.10 69 | 0.8.2.1 70 | 72 | 73 | 74 | 75 | org.apache.zookeeper 76 | zookeeper 77 | 78 | 79 | log4j 80 | log4j 81 | 82 | 83 | 84 | 85 | org.apache.kafka 86 | kafka-clients 87 | 0.8.2.1 88 | 89 | 90 | 91 | org.apache.storm 92 | storm-core 93 | 0.9.3 94 | 95 | provided 96 | 97 | 98 | redis.clients 99 | jedis 100 | 2.7.2 101 | jar 102 | compile 103 | 104 | 105 | com.github.flaxsearch 106 | luwak 107 | 1.2.0 108 | 109 | 110 | org.json 111 | json 112 | 20131018 113 | 114 | 115 | 116 | src 117 | test/jvm 118 | 119 | 120 | org.apache.maven.plugins 121 | maven-compiler-plugin 122 | 2.5.1 123 | 124 | 1.7 125 | 1.7 126 | 127 | 128 | 129 | org.apache.maven.plugins 130 | maven-shade-plugin 131 | 1.4 132 | 133 | true 134 | 135 | 136 | 137 | package 138 | 139 | shade 140 | 141 | 142 | 143 | 145 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | org.codehaus.mojo 154 | exec-maven-plugin 155 | 1.2.1 156 | 157 | 158 | 159 | exec 160 | 161 | 162 | 163 | 164 | java 165 | true 166 | false 167 | compile 168 | ${storm.topology} 169 | 170 | 171 | 172 | 173 | 174 | 175 | central 176 | Maven Repository Switchboard 177 | default 178 | http://repo1.maven.org/maven2 179 | 180 | false 181 | 182 | 183 | 184 | clojars.org 185 | http://clojars.org/repo 186 | 187 | 188 | mvnrepo 189 | mvnrepo 190 | http://mvnrepository.com 191 | 192 | 193 | mvnrepo2 194 | mvnrepo2 195 | http://repo1.maven.org/maven2/ 196 | 197 | 198 | 199 | -------------------------------------------------------------------------------- /src/luwak_search/run_luwak_topology.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export STRAW_CONFIG=`pwd`/../../config/config.properties 3 | echo "USING CONFIG FILE: $STRAW_CONFIG" 4 | mvn compile exec:java -Dstorm.topology=straw.storm.LuwakSearchTopology 5 | -------------------------------------------------------------------------------- 
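run_luwak_topology.sh points STRAW_CONFIG at config/config.properties, and the topology in the next file looks up every setting it needs from that file by key. As a rough illustration only (key names are taken from the ConfigurationManager.put(...) calls below; all hosts, ports, topic names and counts are placeholders, not the repository's shipped config/config.properties), such a file might look like:

# illustrative sketch -- values are placeholders
zookeeper_host=localhost
zookeeper_port=2181
kafka_query_topic=queries
kafka_document_topic=documents
document_type=document
redis_host=localhost
redis_port=6379
redis_analytics_host=localhost
redis_analytics_port=6379
query.spouts=1
document.spouts=1
search.bolts=1
search.bolt.number.tasks=1
workers=1

The Elasticsearch-based topology under src/storming_search additionally expects elasticsearch_host, elasticsearch_port, elasticsearch_cluster_name and index_name keys.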
/src/luwak_search/src/straw/storm/LuwakSearchTopology.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package straw.storm; 19 | 20 | import backtype.storm.Config; 21 | import backtype.storm.LocalCluster; 22 | import backtype.storm.StormSubmitter; 23 | import backtype.storm.topology.TopologyBuilder; 24 | import backtype.storm.utils.Utils; 25 | 26 | import storm.kafka.*; 27 | import straw.storm.bolt.LuwakSearchBolt; 28 | import straw.storm.util.ConfigurationManager; 29 | 30 | /** 31 | * This is a basic example of a Storm topology, following the example 32 | * https://github.com/buildlackey/cep/tree/master/storm%2Bkafka 33 | * 34 | */ 35 | public class LuwakSearchTopology { 36 | 37 | public static void main(String[] args) throws Exception { 38 | /* 39 | * Define and packaged a topology to submit to a storm cluster 40 | */ 41 | 42 | 43 | /* 44 | * CONFIGURATION 45 | * TODO: Better config management; should throw meaningful errors 46 | * when a config value is called but not defined. 
47 | * 48 | */ 49 | ConfigurationManager config_manager = new ConfigurationManager(); 50 | config_manager.put("document_type", "document_type"); 51 | config_manager.put("kafka_query_topic", "kafka_query_topic"); 52 | config_manager.put("kafka_document_topic", "kafka_document_topic"); 53 | config_manager.put("zookeeper_host", "zookeeper_host"); 54 | config_manager.put("zookeeper_port", "zookeeper_port"); 55 | config_manager.put("redis_host", "redis_host"); 56 | config_manager.put("redis_port", "redis_port"); 57 | config_manager.put("redis_analytics_host", "redis_analytics_host"); 58 | config_manager.put("redis_analytics_port", "redis_analytics_port"); 59 | config_manager.put("search.bolts", "search.bolts"); 60 | config_manager.put("document.spouts", "document.spouts"); 61 | config_manager.put("query.spouts", "query.spouts"); 62 | config_manager.put("workers", "workers"); 63 | config_manager.put("search.bolt.number.tasks", "search.bolt.number.tasks"); 64 | Config config = config_manager.get(); 65 | 66 | /* 67 | * KafkaSpout configuration 68 | */ 69 | // offset management 70 | String zkroot = "/brokers"; // the root path in Zookeeper for the spout to store the consumer offsets 71 | String zkid = "ids"; // an id for this consumer for storing the consumer offsets in Zookeeper 72 | 73 | // set zookeeper host 74 | BrokerHosts brokerHosts = new ZkHosts( String.format("%s:%s", 75 | config.get("zookeeper_host").toString(), 76 | config.get("zookeeper_port")).toString(), zkroot); 77 | 78 | // kafka topics 79 | String query_topic = config.get("kafka_query_topic").toString(); 80 | String document_topic = config.get("kafka_document_topic").toString(); 81 | 82 | // define spouts 83 | SpoutConfig query_spout_config = new SpoutConfig(brokerHosts, query_topic, zkroot, zkid); 84 | query_spout_config.forceFromStart=true; 85 | SpoutConfig document_spout_config = new SpoutConfig(brokerHosts, document_topic, zkroot, zkid); 86 | document_spout_config.forceFromStart=true; 87 | 88 | // add a string scheme to the spouts 89 | document_spout_config.scheme = new KeyValueSchemeAsMultiScheme(new StringKeyValueScheme()); 90 | query_spout_config.scheme = new KeyValueSchemeAsMultiScheme(new StringKeyValueScheme()); 91 | 92 | // topology definition 93 | // distribute documents randomly to bolts; queries are localized in memory at the bolt so we need to broadcast them 94 | TopologyBuilder builder = new TopologyBuilder(); 95 | builder.setSpout("query-spout", new KafkaSpout(query_spout_config), Integer.parseInt(config.get("query.spouts").toString())); 96 | builder.setSpout("document-spout", new KafkaSpout(document_spout_config), Integer.parseInt(config.get("document.spouts").toString())); 97 | builder.setBolt("search-bolt", new LuwakSearchBolt(), Integer.parseInt(config.get("search.bolts").toString())) 98 | .allGrouping("query-spout") 99 | .shuffleGrouping("document-spout"); 100 | 101 | // topology submission 102 | if (args != null && args.length > 0) { 103 | config.setNumWorkers(Integer.parseInt(config.get("workers").toString())); 104 | StormSubmitter.submitTopologyWithProgressBar(args[0], config, builder.createTopology()); 105 | } 106 | else { 107 | LocalCluster cluster = new LocalCluster(); 108 | cluster.submitTopology("streaming-search-topology", config, builder.createTopology()); 109 | 110 | // run for a while then die 111 | Utils.sleep(50000000); 112 | cluster.killTopology("streaming-search-topology"); 113 | cluster.shutdown(); 114 | 115 | } 116 | } 117 | } 118 | 
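For reference, the query spout above consumes JSON strings shaped like the ones the Flask frontend posts to Kafka: {"type": "terms-query", "terms": [...], "minimum-match": N}. Below is a minimal sketch of publishing one such query by hand, reusing the legacy kafka-python SimpleProducer API already used in src/kafka_stream_eater/kafka_stream_producer.py; the broker address is a placeholder:

#!/usr/bin/python3
# Sketch only: publish a single terms-query to the "queries" topic.
import json
from kafka import SimpleProducer, KafkaClient

kafka = KafkaClient("localhost:9092")   # placeholder broker address
producer = SimpleProducer(kafka)
terms = ["big", "data"]
query = {"type": "terms-query", "terms": terms, "minimum-match": len(terms)}
producer.send_messages("queries", json.dumps(query).encode("utf-8"))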
-------------------------------------------------------------------------------- /src/luwak_search/src/straw/storm/bolt/LuwakSearchBolt.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package straw.storm.bolt; 19 | 20 | 21 | 22 | import java.io.IOException; 23 | import java.util.Map; 24 | import java.util.Timer; 25 | 26 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 27 | 28 | import redis.clients.jedis.Jedis; 29 | import redis.clients.jedis.JedisPool; 30 | import redis.clients.jedis.JedisPoolConfig; 31 | import straw.storm.util.Counter; 32 | import straw.storm.util.LuwakHelper; 33 | import straw.storm.util.RequestsHelper; 34 | import straw.storm.util.ScheduledMessageCounter; 35 | import uk.co.flax.luwak.InputDocument; 36 | import uk.co.flax.luwak.Matches; 37 | import uk.co.flax.luwak.Monitor; 38 | import uk.co.flax.luwak.MonitorQuery; 39 | import uk.co.flax.luwak.QueryMatch; 40 | import uk.co.flax.luwak.matchers.SimpleMatcher; 41 | import uk.co.flax.luwak.presearcher.TermFilteredPresearcher; 42 | import uk.co.flax.luwak.queryparsers.LuceneQueryParser; 43 | import backtype.storm.task.OutputCollector; 44 | import backtype.storm.task.TopologyContext; 45 | import backtype.storm.topology.OutputFieldsDeclarer; 46 | import backtype.storm.topology.base.BaseRichBolt; 47 | import backtype.storm.tuple.Fields; 48 | import backtype.storm.tuple.Tuple; 49 | import backtype.storm.tuple.Values; 50 | 51 | 52 | /** 53 | * This bolt aggregates counts from multiple upstream bolts. 
54 | */ 55 | public class LuwakSearchBolt extends BaseRichBolt { 56 | 57 | private OutputCollector collector; 58 | private Map conf; 59 | private static JedisPool pool; 60 | private Monitor monitor; 61 | private Counter counter; 62 | 63 | @SuppressWarnings("rawtypes") 64 | @Override 65 | public void prepare(Map conf, TopologyContext context, OutputCollector collector) { 66 | this.conf = conf; 67 | this.collector = collector; 68 | 69 | // prepare the redis client 70 | pool = new JedisPool(new JedisPoolConfig(), conf.get("redis_host").toString()); 71 | 72 | // count message throughput 73 | counter = new Counter(); 74 | ScheduledMessageCounter message_counter = new ScheduledMessageCounter(counter, conf); 75 | Timer time = new Timer(); // Instantiate Timer Object 76 | time.schedule(message_counter, 0, 10000); // Create Repetitively task for every 30 secs 77 | 78 | // luwak 79 | try { 80 | this.monitor = new Monitor(new LuceneQueryParser("text", new StandardAnalyzer()), new TermFilteredPresearcher()); 81 | } catch (IOException e) { 82 | // TODO Auto-generated catch block 83 | e.printStackTrace(); 84 | } 85 | 86 | } 87 | 88 | @Override 89 | public void execute(Tuple tuple) { 90 | 91 | // process the tuple 92 | String sourcename = tuple.getSourceComponent(); 93 | String data = tuple.getValue(0).toString(); 94 | 95 | // either we get a query and we need to add it to the index 96 | // or we get a document and we need to do a search 97 | // Values("query", request_id, user_id, query_id, query) 98 | // Values("document", source, document) 99 | if(sourcename.toLowerCase().contains("query")){ 100 | // add queries 101 | MonitorQuery query = LuwakHelper.make_query(data); 102 | 103 | //register the query 104 | try { 105 | // System.out.println(query.toString()); 106 | monitor.update(query); 107 | } catch (IOException e) { 108 | // TODO Auto-generated catch block 109 | e.printStackTrace(); 110 | } 111 | } 112 | else if (sourcename.toLowerCase().contains("document")){ 113 | // try to parse as document 114 | String text = LuwakHelper.extract_text(data); 115 | 116 | //Build a document to check against the percolator 117 | InputDocument doc = null; 118 | if (text != null){ 119 | doc = InputDocument.builder(RequestsHelper.generate_unique_identifier(data)) 120 | .addField("text", text, new StandardAnalyzer()) 121 | .build(); 122 | } 123 | 124 | // pass the document through Luwak 125 | if (doc != null) { 126 | try { 127 | Matches matches = monitor.match(doc, SimpleMatcher.FACTORY); 128 | 129 | //Handle the result which is the set of queries in the percolator 130 | for(QueryMatch match : matches) { 131 | // System.out.println("Query: " + match.toString() + " matched document " + text); 132 | // emit results 133 | // collector.emit(new Values(data)); 134 | //System.out.println(match.toString()); 135 | 136 | // publish the result to jedis 137 | try (Jedis jedis_client = pool.getResource()) { 138 | jedis_client.publish(match.getQueryId(), text); 139 | } 140 | } 141 | } catch (IOException e) { 142 | e.printStackTrace(); 143 | } 144 | } 145 | } 146 | 147 | // acknowledge 148 | collector.ack(tuple); 149 | 150 | // we completed a search, so we need to update the counter 151 | counter.count+=1; 152 | } 153 | 154 | 155 | @Override 156 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 157 | declarer.declare(new Fields("document")); 158 | } 159 | 160 | @Override 161 | public void cleanup() { 162 | pool.destroy(); 163 | } 164 | 165 | 166 | } 167 | 
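Matches found by this bolt are published to Redis on a channel named by the query id, which the frontend derives as the md5 hex digest of the exact query JSON string it posted to Kafka (see views.py). Below is a minimal subscriber sketch, assuming a locally reachable Redis; the host, port, and query are placeholders, and the digest only lines up if the JSON string is byte-for-byte identical to the one the frontend sent:

#!/usr/bin/python3
# Sketch only: print documents matching one registered query.
import hashlib, json, redis

# must be the exact JSON string that was posted to Kafka for the ids to agree
query_json = json.dumps({"type": "terms-query", "terms": ["big", "data"], "minimum-match": 2})
qid = hashlib.md5(query_json.encode("utf-8")).hexdigest()

r = redis.StrictRedis(host="localhost", port=6379, db=0)   # placeholder host/port
p = r.pubsub(ignore_subscribe_messages=True)
p.subscribe(qid)
for message in p.listen():
    print(message["data"])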
-------------------------------------------------------------------------------- /src/luwak_search/src/straw/storm/util/ConfigurationManager.java: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * 4 | * This class is a simple wrapper around the storm configuration object. 5 | * For straw, we store all configuration in a config file whose location is 6 | * given by the value of the enviornment variable STRAW_CONFIG. 7 | * 8 | * The purpose of this class is to allow for a simple interface where 9 | * we can ask storm to set a configuration value "storm_property" based on 10 | * the value of "system_name" in the STRAW_CONFIG file. An error is thrown 11 | * when the "system_name" is not found in the config file. 12 | * 13 | */ 14 | package straw.storm.util; 15 | 16 | import java.io.FileInputStream; 17 | import java.io.IOException; 18 | import java.io.InputStream; 19 | import java.util.Map; 20 | import java.util.Properties; 21 | import backtype.storm.Config; 22 | 23 | public class ConfigurationManager { 24 | 25 | private Map env = System.getenv(); 26 | private String config_filename; 27 | private Properties prop = new Properties(); 28 | private Config config = new Config(); 29 | 30 | public ConfigurationManager() { 31 | // read config file location from sys 32 | config_filename = env.get("STRAW_CONFIG"); 33 | if(config_filename==null) 34 | { 35 | throw new RuntimeException("Couldn't access config file, did you set STRAW_CONFIG in enviornment?"); 36 | } 37 | 38 | // load the properties 39 | InputStream input = null; 40 | try { 41 | input = new FileInputStream(config_filename); 42 | prop.load(input); 43 | } catch (IOException ex) { 44 | ex.printStackTrace(); 45 | } finally { 46 | if (input != null) { 47 | try { 48 | input.close(); 49 | } catch (IOException e) { 50 | e.printStackTrace(); 51 | } 52 | } 53 | } 54 | 55 | } 56 | 57 | // add a setting to the config 58 | public void put(String storm_property, String system_name){ 59 | if (prop.getProperty(system_name)==null){ 60 | throw new RuntimeException("Property "+system_name+" not found in config file " + config_filename +"."); 61 | } 62 | config.put(storm_property, prop.getProperty(system_name)); 63 | } 64 | 65 | // return the storm config object 66 | public Config get(){ 67 | return config; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/luwak_search/src/straw/storm/util/Counter.java: -------------------------------------------------------------------------------- 1 | package straw.storm.util; 2 | 3 | // wrapper class to hold bolt's throughput count 4 | public class Counter { 5 | public int count=0; 6 | } -------------------------------------------------------------------------------- /src/luwak_search/src/straw/storm/util/LuwakHelper.java: -------------------------------------------------------------------------------- 1 | package straw.storm.util; 2 | 3 | 4 | 5 | import org.apache.commons.lang.StringUtils; 6 | 7 | import org.json.JSONArray; 8 | import org.json.JSONObject; 9 | 10 | import uk.co.flax.luwak.MonitorQuery; 11 | 12 | public class LuwakHelper { 13 | 14 | public static String extract_text(String data) { 15 | // parse input JSON 16 | JSONObject obj; 17 | String text = null; 18 | try { 19 | obj = new JSONObject(data); 20 | text = obj.getString("text"); 21 | } 22 | catch (org.json.JSONException e) { 23 | // TODO: Bad json passed 24 | // System.out.println("JSON PARSER FAILED TO HANDLE: " + data); 25 | //e.printStackTrace(); 26 | } 27 
| 28 | return text; 29 | } 30 | 31 | public static MonitorQuery make_query(String data){ 32 | // build a query out of the data JSON string 33 | MonitorQuery qb = null; 34 | JSONObject obj = null; 35 | try { 36 | obj = new JSONObject(data); 37 | } 38 | catch (org.json.JSONException e) { 39 | System.out.println("JSON PARSER FAILED TO HANDLE: " + data); 40 | e.printStackTrace(); 41 | } 42 | 43 | String type = obj.getString("type"); 44 | 45 | // terms query parser 46 | if(type.equalsIgnoreCase("terms-query")) { 47 | Integer minimum_match = obj.getInt("minimum-match"); 48 | JSONArray arr = obj.getJSONArray("terms"); 49 | 50 | if (arr!=null){ 51 | String[] string_arry = new String[arr.length()]; 52 | // use length of array if minimum match not provided 53 | if (minimum_match==null){ 54 | minimum_match=arr.length(); 55 | } 56 | for(int i=0; i 2 | 12 | 14 | 4.0.0 15 | github.com.rwalk333 16 | storming-search 17 | jar 18 | 0.0.1 19 | storming-search 20 | 21 | 22 | 23 | org.elasticsearch 24 | elasticsearch 25 | 1.7.0 26 | 27 | 28 | 29 | org.apache.storm 30 | storm-kafka 31 | 0.9.2-incubating 32 | 33 | 34 | org.apache.kafka 35 | kafka_2.10 36 | 0.8.2.1 37 | 39 | 40 | 41 | 42 | org.apache.zookeeper 43 | zookeeper 44 | 45 | 46 | log4j 47 | log4j 48 | 49 | 50 | 51 | 52 | org.apache.kafka 53 | kafka-clients 54 | 0.8.2.1 55 | 56 | 57 | 58 | org.apache.storm 59 | storm-core 60 | 0.9.3 61 | 62 | provided 63 | 64 | 65 | redis.clients 66 | jedis 67 | 2.7.2 68 | jar 69 | compile 70 | 71 | 72 | commons-collections 73 | commons-collections 74 | 3.2.1 75 | 76 | 77 | 78 | org.apache.httpcomponents 79 | httpclient 80 | 4.5.1 81 | 82 | 83 | org.apache.httpcomponents 84 | httpclient-cache 85 | 4.5.1 86 | 87 | 88 | org.apache.httpcomponents 89 | httpmime 90 | 4.5.1 91 | 92 | 93 | org.apache.httpcomponents 94 | fluent-hc 95 | 4.5.1 96 | 97 | 98 | org.json 99 | json 100 | 20131018 101 | 102 | 103 | 104 | src 105 | test/jvm 106 | 107 | 108 | org.apache.maven.plugins 109 | maven-compiler-plugin 110 | 2.5.1 111 | 112 | 1.7 113 | 1.7 114 | 115 | 116 | 117 | org.apache.maven.plugins 118 | maven-shade-plugin 119 | 1.4 120 | 121 | true 122 | 123 | 124 | 125 | package 126 | 127 | shade 128 | 129 | 130 | 131 | 133 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | org.codehaus.mojo 142 | exec-maven-plugin 143 | 1.2.1 144 | 145 | 146 | 147 | exec 148 | 149 | 150 | 151 | 152 | java 153 | true 154 | false 155 | compile 156 | ${storm.topology} 157 | 158 | 159 | 160 | 161 | 162 | 163 | central 164 | Maven Repository Switchboard 165 | default 166 | http://repo1.maven.org/maven2 167 | 168 | false 169 | 170 | 171 | 172 | clojars.org 173 | http://clojars.org/repo 174 | 175 | 176 | mvnrepo 177 | mvnrepo 178 | http://mvnrepository.com 179 | 180 | 181 | mvnrepo2 182 | mvnrepo2 183 | http://repo1.maven.org/maven2/ 184 | 185 | 186 | 187 | -------------------------------------------------------------------------------- /src/storming_search/run_search_topology.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export STRAW_CONFIG=`pwd`/../../config/config.properties 3 | echo "USING CONFIG FILE: $STRAW_CONFIG" 4 | mvn compile exec:java -Dstorm.topology=straw.storm.StreamingSearchTopology 5 | -------------------------------------------------------------------------------- /src/storming_search/src/straw/storm/StreamingSearchTopology.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or 
more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package straw.storm; 19 | 20 | import backtype.storm.Config; 21 | import backtype.storm.LocalCluster; 22 | import backtype.storm.StormSubmitter; 23 | import backtype.storm.topology.TopologyBuilder; 24 | import backtype.storm.utils.Utils; 25 | import storm.kafka.*; 26 | import straw.storm.bolt.SearchBolt; 27 | import straw.storm.util.ConfigurationManager; 28 | 29 | /** 30 | * This is the Topology for Streaming Search 31 | */ 32 | public class StreamingSearchTopology { 33 | 34 | public static void main(String[] args) throws Exception { 35 | 36 | /* 37 | * CONFIGURATION 38 | * TODO: Better config management; should throw meaningful errors 39 | * when a config value is called but not defined. 40 | * 41 | */ 42 | ConfigurationManager config_manager = new ConfigurationManager(); 43 | config_manager.put("elasticsearch_host", "elasticsearch_host"); 44 | config_manager.put("elasticsearch_cluster_name", "elasticsearch_cluster_name"); 45 | config_manager.put("elasticsearch_port", "elasticsearch_port"); 46 | config_manager.put("index_name", "index_name"); 47 | config_manager.put("document_type", "document_type"); 48 | config_manager.put("kafka_query_topic", "kafka_query_topic"); 49 | config_manager.put("kafka_document_topic", "kafka_document_topic"); 50 | config_manager.put("zookeeper_host", "zookeeper_host"); 51 | config_manager.put("zookeeper_port", "zookeeper_port"); 52 | config_manager.put("redis_host", "redis_host"); 53 | config_manager.put("redis_port", "redis_port"); 54 | config_manager.put("redis_analytics_host", "redis_analytics_host"); 55 | config_manager.put("redis_analytics_port", "redis_analytics_port"); 56 | config_manager.put("search.bolts", "search.bolts"); 57 | config_manager.put("document.spouts", "document.spouts"); 58 | config_manager.put("query.spouts", "query.spouts"); 59 | config_manager.put("workers", "workers"); 60 | config_manager.put("search.bolt.number.tasks", "search.bolt.number.tasks"); 61 | Config config = config_manager.get(); 62 | 63 | /* 64 | * KafkaSpout configuration 65 | */ 66 | // offset management 67 | String zkroot = "/brokers"; // the root path in Zookeeper for the spout to store the consumer offsets 68 | String zkid = "ids"; // an id for this consumer for storing the consumer offsets in Zookeeper 69 | 70 | // set zookeeper host 71 | BrokerHosts brokerHosts = new ZkHosts( String.format("%s:%s", 72 | config.get("zookeeper_host").toString(), 73 | config.get("zookeeper_port")).toString(), zkroot); 74 | 75 | // kafka topics 76 | String query_topic = config.get("kafka_query_topic").toString(); 77 | String document_topic = config.get("kafka_document_topic").toString(); 78 | 79 | // define spouts 80 | SpoutConfig query_spout_config = new SpoutConfig(brokerHosts, query_topic, zkroot, zkid); 81 | 
query_spout_config.forceFromStart=true; 82 | SpoutConfig document_spout_config = new SpoutConfig(brokerHosts, document_topic, zkroot, zkid); 83 | document_spout_config.forceFromStart=true; 84 | 85 | // add a string scheme to the spouts 86 | document_spout_config.scheme = new KeyValueSchemeAsMultiScheme(new StringKeyValueScheme()); 87 | query_spout_config.scheme = new KeyValueSchemeAsMultiScheme(new StringKeyValueScheme()); 88 | 89 | 90 | // distribute queries and documents randomly to bolts (since Elasticsearch is centralized, we don't need to broadcast queries). 91 | TopologyBuilder builder = new TopologyBuilder(); 92 | builder.setSpout("query-spout", new KafkaSpout(query_spout_config), Integer.parseInt(config.get("query.spouts").toString())); 93 | builder.setSpout("document-spout", new KafkaSpout(document_spout_config), Integer.parseInt(config.get("document.spouts").toString())); 94 | builder.setBolt("search-bolt", new SearchBolt(), Integer.parseInt(config.get("search.bolts").toString())) 95 | .setNumTasks(Integer.parseInt(config.get("search.bolt.number.tasks").toString())) 96 | .shuffleGrouping("query-spout") 97 | .shuffleGrouping("document-spout"); 98 | 99 | // topology submission 100 | if (args != null && args.length > 0) { 101 | config.setNumWorkers(Integer.parseInt(config.get("workers").toString())); 102 | StormSubmitter.submitTopologyWithProgressBar(args[0], config, builder.createTopology()); 103 | } 104 | else { 105 | LocalCluster cluster = new LocalCluster(); 106 | cluster.submitTopology("streaming-search-topology", config, builder.createTopology()); 107 | 108 | // run for a while then die 109 | Utils.sleep(50000000); 110 | cluster.killTopology("streaming-search-topology"); 111 | cluster.shutdown(); 112 | 113 | } 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/storming_search/src/straw/storm/bolt/SearchBolt.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package straw.storm.bolt; 19 | 20 | import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; 21 | 22 | import java.io.IOException; 23 | import java.util.Map; 24 | import java.util.Timer; 25 | 26 | import org.elasticsearch.ElasticsearchException; 27 | import org.elasticsearch.action.percolate.PercolateResponse; 28 | import org.elasticsearch.client.transport.TransportClient; 29 | import org.elasticsearch.common.settings.ImmutableSettings; 30 | import org.elasticsearch.common.settings.Settings; 31 | import org.elasticsearch.common.transport.InetSocketTransportAddress; 32 | import org.elasticsearch.common.xcontent.XContentBuilder; 33 | import org.elasticsearch.common.xcontent.XContentFactory; 34 | import org.elasticsearch.index.query.QueryBuilder; 35 | 36 | import redis.clients.jedis.Jedis; 37 | import redis.clients.jedis.JedisPool; 38 | import redis.clients.jedis.JedisPoolConfig; 39 | import straw.storm.util.Counter; 40 | import straw.storm.util.PercolatorHelper; 41 | import straw.storm.util.RequestsHelper; 42 | import straw.storm.util.ScheduledMessageCounter; 43 | import backtype.storm.task.OutputCollector; 44 | import backtype.storm.task.TopologyContext; 45 | import backtype.storm.topology.OutputFieldsDeclarer; 46 | import backtype.storm.topology.base.BaseRichBolt; 47 | import backtype.storm.tuple.Fields; 48 | import backtype.storm.tuple.Tuple; 49 | import backtype.storm.tuple.Values; 50 | 51 | 52 | /** 53 | * This bolt aggregates counts from multiple upstream bolts. 54 | */ 55 | public class SearchBolt extends BaseRichBolt { 56 | 57 | private OutputCollector collector; 58 | private Map conf; 59 | private TransportClient client; 60 | private static JedisPool pool; 61 | private Counter counter; 62 | 63 | @SuppressWarnings("rawtypes") 64 | @Override 65 | public void prepare(Map conf, TopologyContext context, OutputCollector collector) { 66 | this.conf = conf; 67 | this.collector = collector; 68 | SearchBolt.pool = new JedisPool(new JedisPoolConfig(), conf.get("redis_host").toString()); 69 | 70 | // prepare the search engine 71 | String host = conf.get("elasticsearch_host").toString(); 72 | String cluster_name = conf.get("elasticsearch_cluster_name").toString(); 73 | int port = Integer.parseInt(conf.get("elasticsearch_port").toString()); 74 | Settings settings = ImmutableSettings.settingsBuilder().put("cluster.name", cluster_name).build(); 75 | client = new TransportClient(settings) 76 | .addTransportAddress(new InetSocketTransportAddress(host, port)); 77 | counter = new Counter(); 78 | 79 | // count messages periodically 80 | ScheduledMessageCounter message_counter = new ScheduledMessageCounter(counter, conf); 81 | Timer time = new Timer(); // Instantiate Timer Object 82 | time.schedule(message_counter, 0, 10000); // Create Repetitively task for every 30 secs 83 | 84 | } 85 | 86 | @Override 87 | public void execute(Tuple tuple) { 88 | 89 | // process the tuple recieved from kafka 90 | String sourcename = tuple.getSourceComponent(); 91 | String data = tuple.getValue(0).toString(); 92 | 93 | // either we get a query and we need to add it to the index 94 | // or we get a document and we need to do a search 95 | // Values("query", request_id, user_id, query_id, query) 96 | // Values("document", source, document) 97 | if(sourcename.toLowerCase().contains("query")){ 98 | // add queries 99 | QueryBuilder query = PercolatorHelper.make_query(data); 100 | 101 | //register the query in the percolator 102 | if (query != null ) { 103 | try { 104 | 
client.prepareIndex(conf.get("index_name").toString(), ".percolator", RequestsHelper.generate_unique_identifier(data)) 105 | .setSource(jsonBuilder() 106 | .startObject() 107 | .field("query", query) // Register the query 108 | .field("format", "objects") 109 | .endObject()) 110 | .setRefresh(true) // Needed when the query shall be available immediately 111 | .execute().actionGet(); 112 | } catch (ElasticsearchException e) { 113 | // TODO Auto-generated catch block 114 | e.printStackTrace(); 115 | } catch (IOException e) { 116 | // TODO Auto-generated catch block 117 | e.printStackTrace(); 118 | } 119 | } 120 | } 121 | else if (sourcename.toLowerCase().contains("document")){ 122 | // try to parse as document 123 | String text = PercolatorHelper.extract_text(data); 124 | 125 | //Build a document to check against the percolator 126 | XContentBuilder docBuilder = null; 127 | if (text != null){ 128 | try { 129 | docBuilder = XContentFactory.jsonBuilder().startObject(); 130 | docBuilder.field("doc").startObject(); //This is needed to designate the document 131 | docBuilder.field("text", text); 132 | docBuilder.endObject(); //End of the doc field 133 | docBuilder.endObject(); //End of the JSON root object 134 | } catch (IOException e) { 135 | // TODO Auto-generated catch block 136 | e.printStackTrace(); 137 | } 138 | } 139 | 140 | if (docBuilder != null) { 141 | //Percolate 142 | PercolateResponse response = client.preparePercolate() 143 | .setIndices(conf.get("index_name").toString()) 144 | .setDocumentType(conf.get("document_type").toString()) 145 | .setSource(docBuilder).execute().actionGet(); 146 | 147 | //Handle the result which is the set of queries in the percolator 148 | for(PercolateResponse.Match match : response) { 149 | // emit results 150 | collector.emit(new Values(data)); 151 | 152 | // publish the result to jedis 153 | try (Jedis jedis_client = pool.getResource()) { 154 | jedis_client.publish(match.getId().toString(), text); 155 | } 156 | } 157 | } 158 | } 159 | 160 | // acknowledge 161 | collector.ack(tuple); 162 | 163 | // update the counter 164 | counter.count+=1; 165 | } 166 | 167 | @Override 168 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 169 | declarer.declare(new Fields("document")); 170 | } 171 | 172 | @Override 173 | public void cleanup() { 174 | client.close(); 175 | pool.destroy(); 176 | } 177 | 178 | } 179 | -------------------------------------------------------------------------------- /src/storming_search/src/straw/storm/util/ConfigurationManager.java: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * 4 | * This class is a simple wrapper around the storm configuration object. 5 | * For straw, we store all configuration in a config file whose location is 6 | * given by the value of the enviornment variable STRAW_CONFIG. 7 | * 8 | * The purpose of this class is to allow for a simple interface where 9 | * we can ask storm to set a configuration value "storm_property" based on 10 | * the value of "system_name" in the STRAW_CONFIG file. An error is thrown 11 | * when the "system_name" is not found in the config file. 
12 | * 13 | */ 14 | package straw.storm.util; 15 | 16 | import java.io.FileInputStream; 17 | import java.io.IOException; 18 | import java.io.InputStream; 19 | import java.util.Map; 20 | import java.util.Properties; 21 | import backtype.storm.Config; 22 | 23 | public class ConfigurationManager { 24 | 25 | private Map env = System.getenv(); 26 | private String config_filename; 27 | private Properties prop = new Properties(); 28 | private Config config = new Config(); 29 | 30 | public ConfigurationManager() { 31 | // read config file location from sys 32 | config_filename = env.get("STRAW_CONFIG"); 33 | if(config_filename==null) 34 | { 35 | throw new RuntimeException("Couldn't access config file, did you set STRAW_CONFIG in enviornment?"); 36 | } 37 | 38 | // load the properties 39 | InputStream input = null; 40 | try { 41 | input = new FileInputStream(config_filename); 42 | prop.load(input); 43 | } catch (IOException ex) { 44 | ex.printStackTrace(); 45 | } finally { 46 | if (input != null) { 47 | try { 48 | input.close(); 49 | } catch (IOException e) { 50 | e.printStackTrace(); 51 | } 52 | } 53 | } 54 | 55 | } 56 | 57 | // add a setting to the config 58 | public void put(String storm_property, String system_name){ 59 | if (prop.getProperty(system_name)==null){ 60 | throw new RuntimeException("Property "+system_name+" not found in config file " + config_filename +"."); 61 | } 62 | config.put(storm_property, prop.getProperty(system_name)); 63 | } 64 | 65 | // return the storm config object 66 | public Config get(){ 67 | return config; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/storming_search/src/straw/storm/util/Counter.java: -------------------------------------------------------------------------------- 1 | package straw.storm.util; 2 | 3 | // wrapper class to hold bolt's throughput count 4 | public class Counter { 5 | public int count=0; 6 | } -------------------------------------------------------------------------------- /src/storming_search/src/straw/storm/util/PercolatorHelper.java: -------------------------------------------------------------------------------- 1 | package straw.storm.util; 2 | 3 | import static org.elasticsearch.index.query.QueryBuilders.termsQuery; 4 | 5 | import java.io.IOException; 6 | 7 | import org.elasticsearch.common.xcontent.XContentBuilder; 8 | import org.elasticsearch.common.xcontent.XContentFactory; 9 | import org.elasticsearch.index.query.QueryBuilder; 10 | import org.json.JSONArray; 11 | import org.json.JSONObject; 12 | 13 | public class PercolatorHelper { 14 | 15 | public static String extract_text(String data) { 16 | // parse input JSON 17 | JSONObject obj = null; 18 | String text = null; 19 | try { 20 | obj = new JSONObject(data); 21 | text = obj.getString("text"); 22 | } 23 | catch (org.json.JSONException e) { 24 | // TODO: Bad json passed 25 | // System.out.println("JSON PARSER FAILED TO HANDLE: " + data); 26 | //e.printStackTrace(); 27 | } 28 | 29 | return text; 30 | } 31 | 32 | public static QueryBuilder make_query(String data){ 33 | // build a query out of the data JSON string 34 | QueryBuilder qb = null; 35 | JSONObject obj = null; 36 | try { 37 | obj = new JSONObject(data); 38 | } 39 | catch (org.json.JSONException e) { 40 | System.out.println("JSON PARSER FAILED TO HANDLE: " + data); 41 | e.printStackTrace(); 42 | } 43 | 44 | String type = obj.getString("type"); 45 | 46 | // terms query parser 47 | if(type.equalsIgnoreCase("terms-query")) { 48 | Integer minimum_match = 
obj.getInt("minimum-match"); 49 | JSONArray arr = obj.getJSONArray("terms"); 50 | 51 | if (arr!=null){ 52 | String[] string_arry = new String[arr.length()]; 53 | // use length of array if minimum match not provided 54 | if (minimum_match==null){ 55 | minimum_match=arr.length(); 56 | } 57 | for(int i=0; i0: 16 | return([int(r['_id']) for r in result['matches']][0]) 17 | 18 | if __name__=="__main__": 19 | 20 | # argument help 21 | parser = argparse.ArgumentParser(description='Create and test Elasticsearch percolators.') 22 | parser.add_argument('file', help='File of tweets, one json doc per line.') 23 | parser.add_argument('host', help='Elasticsearch host.') 24 | parser.add_argument('-p','--port', default=9200, help='port, default is 9200') 25 | args = parser.parse_args() 26 | 27 | # index and document type constants 28 | INDEX_NAME = "documents" 29 | TYPE = "document" 30 | 31 | # get a client 32 | es = Elasticsearch(hosts=[{"host":args.host, "port":args.port}]) 33 | 34 | # create an index, ignore if it exists already 35 | es.indices.delete(index='documents', ignore=400) 36 | es.indices.create(index='documents', ignore=400, body={ 37 | "mappings": { 38 | "document": { 39 | "properties": { 40 | "message": { 41 | "type": "string" 42 | } 43 | } 44 | } 45 | } 46 | } 47 | ) 48 | 49 | ########################### 50 | # add some percolators 51 | ########################### 52 | query_table=[] 53 | queries = ['new york', 'facebook', 'cheese', 'mountain', 'zoology', 'artist', 'tech', 'big data'] 54 | for q in queries: 55 | es.create(index='documents', doc_type='.percolator', body={'query': {'match': {'message': q}}}, id=len(query_table)) 56 | query_table.append(q) 57 | 58 | # now we can do some stream searches. 59 | counter = 0 60 | with open(args.file, 'rb') as f: 61 | for line in f: 62 | counter+=1 63 | try: 64 | tweet=json.loads(line.decode('utf-8').strip()) 65 | msg = tweet['text'] 66 | perco_match = perco_parse(es.percolate(index='documents', doc_type='document', body={'doc':{'message':msg}})) 67 | if perco_match is not None: 68 | print("{0}:{1}:{2}".format(counter, query_table[perco_match], msg)) 69 | except(ValueError) as e: 70 | print("BAD VALUE") 71 | 72 | 73 | -------------------------------------------------------------------------------- /util/kafka_add_documents.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # add data to kafka 4 | (cd ../src/kafka_stream_eater \ 5 | && ./kafka_stream_producer.py ../../data/tweets.big.sample localhost documents --delay 2 \ 6 | ) 7 | 8 | -------------------------------------------------------------------------------- /util/kafka_add_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -ne 1 ]; then 4 | echo "USAGE: $0 [number of queries]" 5 | else 6 | echo "Posting $1 queries to Kafka" 7 | 8 | # take top n from bigrams file 9 | head -n $1 ../data/queries.bigrams > queries.tmp 10 | 11 | # add data to kafka 12 | ../src/kafka_stream_eater/kafka_stream_producer.py queries.tmp localhost queries 13 | 14 | rm queries.tmp 15 | fi 16 | -------------------------------------------------------------------------------- /util/query_maker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | """ 3 | This script suggests lucene queries based on bigram collocations in a sample tweet file 4 | """ 5 | import nltk, argparse, re, json 6 | from nltk.collocations import * 7 | from 
nltk.corpus import stopwords 8 | from nltk.tokenize import word_tokenize 9 | 10 | # not a robust RE, but it will probably work fine here 11 | link_re = re.compile(r"http[s]*://.*?($|\s)") 12 | p_re = re.compile(r"[!.?;,]") 13 | 14 | if __name__=="__main__": 15 | 16 | # arg parsing 17 | parser = argparse.ArgumentParser(description="Generate lucene queries from collocation data in a sample file") 18 | parser.add_argument("input", help="File containing text corpus.") 19 | parser.add_argument("output", help="An output file.") 20 | parser.add_argument("-n","--number", default=100, help="Size of nbest list.") 21 | args = parser.parse_args() 22 | 23 | # extraction 24 | count=0 25 | docs = [] 26 | mystopwords = stopwords.words("english") 27 | mystopwords.extend(['rt']) 28 | with open(args.input, "rb") as f: 29 | for l in f: 30 | count+=1 31 | try: 32 | tweet = json.loads(l.decode("utf-8").strip()) 33 | except ValueError: 34 | print("Bad tweet found at line {0}.".format(count)) 35 | tweet = {} 36 | 37 | if tweet.get("text"): 38 | tweet = tweet["text"] 39 | tweet = tweet.lower() 40 | tweet = re.sub(link_re, "", tweet).strip().replace("\"","") 41 | tweet = re.sub(p_re, "", tweet) 42 | # the word tokenizer generally expects single-sentence input; tweets may not satisfy that, 43 | # but it probably doesn't matter for what we are doing here. 44 | tokens = [t.replace("'","") for t in tweet.split() if t not in mystopwords] 45 | docs.append(tokens) 46 | 47 | # get the nbest list 48 | bigram_measures = nltk.collocations.BigramAssocMeasures() 49 | finder = BigramCollocationFinder.from_documents(docs) 50 | finder.apply_freq_filter(5) 51 | nbest = finder.nbest(bigram_measures.raw_freq, int(args.number)) 52 | 53 | # create a file of queries 54 | #for k,v in finder.ngram_fd.items(): 55 | # print(k,v) 56 | 57 | with open(args.output, "w") as f: 58 | for q in nbest: 59 | query = { "type":"terms-query", "terms":list(q), "minimum-match":len(q) } 60 | f.write(json.dumps(query) + "\n") 61 | -------------------------------------------------------------------------------- /util/redis_pub_sub_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # 3 | # Simple example of redis pubsub management in Python. 4 | # 5 | import redis 6 | from time import sleep 7 | 8 | if __name__=="__main__": 9 | 10 | # open the connection 11 | pool = redis.ConnectionPool(host='localhost', port=6379, db=0) 12 | r = redis.StrictRedis(connection_pool=pool) 13 | p = r.pubsub(ignore_subscribe_messages=True) 14 | 15 | # define what to do with incoming messages
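# Note: in the full straw pipeline the search bolts publish matching tweet text to a
# Redis channel named after the matching query's id (see SearchBolt above), and the
# frontend subscribes to those channels. This standalone demo simply subscribes to
# whatever channel names you type and prints the messages it receives.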
16 | def message_handler(message): 17 | print('MSG:', message['data']) 18 | 19 | query = input("Please enter the topic you'd like to follow: ") 20 | 21 | # subscribe to the first topic in a background thread 22 | queries = {query: message_handler} 23 | p.subscribe(**queries) 24 | thread = p.run_in_thread(sleep_time=0.001) 25 | query = None 26 | 27 | # listen for a new query; when we get one, start a new thread with the updated 28 | # set of subscriptions and then stop the old one 29 | # warning: we could receive duplicate messages in the time it takes to bring up the new thread 30 | while True: 31 | if query is None: 32 | query = input("Please enter the topic you'd like to follow: ") 33 | else: 34 | # the old thread is now out of date (since it doesn't have all our subscriptions) 35 | thread_stale = thread 36 | 37 | # start a new thread with the full set of subscriptions 38 | queries[query] = message_handler 39 | query = None 40 | p.subscribe(**queries) 41 | thread = p.run_in_thread(sleep_time=0.001) 42 | 43 | # now kill off the old thread. 44 | thread_stale.stop() 45 | -------------------------------------------------------------------------------- /util/stage_demo_mode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # unpack local data 4 | ( cd ../data/ && 5 | gunzip tweets.big.sample.gz -f -k 6 | ) 7 | 8 | 9 | # start kafka 10 | (cd ../src/kafka_stream_eater/third_party/kafka-docker-master && docker-compose stop && docker-compose rm -f && docker-compose up -d --force-recreate) 11 | 12 | # start elasticsearch 13 | ./docker_elasticsearch.sh 14 | 15 | -------------------------------------------------------------------------------- /util/tweet_sampler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | ''' 3 | Sample from the twitter API and post results to a file or to Kafka. 4 | 5 | To use, set credentials as environment variables, e.g. 6 | 7 | export TWITTER_ACCESS_TOKEN=...
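export TWITTER_SECRET_TOKEN=...
export TWITTER_CONSUMER_TOKEN=...
export TWITTER_CONSUMER_SECRET=...

(these are the four variables read from os.environ below)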
8 | 9 | or 10 | 11 | source myfile 12 | 13 | where myfile exports the authorization variables 14 | ''' 15 | 16 | import twython, json, re, argparse, subprocess, os, sys, time 17 | from socket import timeout 18 | from kafka import SimpleProducer, KafkaClient 19 | 20 | #################### 21 | # Constants 22 | #################### 23 | access_token = os.environ["TWITTER_ACCESS_TOKEN"] 24 | access_token_secret = os.environ["TWITTER_SECRET_TOKEN"] 25 | consumer_key = os.environ["TWITTER_CONSUMER_TOKEN"] 26 | consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"] 27 | 28 | class StrawStreamer(twython.TwythonStreamer): 29 | 30 | def __init__(self, APP_KEY, APP_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET, outfile): 31 | super(StrawStreamer, self).__init__(APP_KEY, APP_SECRET,OAUTH_TOKEN, OAUTH_TOKEN_SECRET) 32 | self.outfile=outfile 33 | 34 | def on_success(self, data): 35 | if 'text' in data: 36 | self.outfile.write((json.dumps(data)+u'\n').encode('utf-8')) 37 | 38 | def on_error(self, status_code, data): 39 | print(status_code) 40 | 41 | class KafkaStrawStreamer(twython.TwythonStreamer): 42 | def __init__(self, APP_KEY, APP_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET, host, port): 43 | super(KafkaStrawStreamer, self).__init__(APP_KEY, APP_SECRET,OAUTH_TOKEN, OAUTH_TOKEN_SECRET) 44 | 45 | # connect to Kafka 46 | print("Connecting to Kafka node {0}:{1}".format(host, port)) 47 | kafka = KafkaClient("{0}:{1}".format(host, port)) 48 | self.producer = BufferedSimpleProducer(kafka, 100) 49 | 50 | def on_success(self, data): 51 | # TODO: add message queue so we can pass messages in bulk 52 | if 'text' in data: 53 | msg = (json.dumps(data)+u'\n').encode('utf-8') 54 | self.producer.send_messages(args.topic, msg) 55 | 56 | def on_error(self, status_code, data): 57 | print(status_code) 58 | 59 | class BufferedSimpleProducer: 60 | def __init__(self, kafka, chunk_size): 61 | self.producer = SimpleProducer(kafka) 62 | self.queues = {} 63 | self.chunk_size = chunk_size 64 | 65 | def send_messages(self, topic, msg): 66 | if topic not in self.queues: 67 | self.queues[topic]=[] 68 | if len(self.queues[topic])