├── .gitignore ├── LICENSE ├── README.md ├── autoscale.py ├── deployment └── to-go │ ├── README.md │ ├── Vagrantfile │ └── provision.sh ├── doc ├── architecture and flow.graffle │ ├── data.plist │ ├── image1.tiff │ ├── image2.tiff │ ├── image4.tiff │ └── image5.tiff ├── architecture.png ├── elsa-marathon-deploy.png ├── flow.png └── sa-logo.jpg ├── elsa.conf.example ├── launch-elsa.sh ├── pom.xml └── src └── main └── scala └── spark └── elsa ├── ElsaHelper.scala └── OnlineSA.scala /.gitignore: -------------------------------------------------------------------------------- 1 | # Vagrant 2 | .vagrant/ 3 | 4 | 5 | *.conf 6 | *.class 7 | *.log 8 | 9 | # sbt specific 10 | .cache/ 11 | .history/ 12 | .lib/ 13 | dist/* 14 | target/ 15 | lib_managed/ 16 | src_managed/ 17 | project/boot/ 18 | project/plugins/project/ 19 | 20 | # Scala-IDE specific 21 | .scala_dependencies 22 | .worksheet 23 | .idea/ 24 | *.iml 25 | 26 | # Python 27 | __pycache__/ 28 | *.py[cod] 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Elastic Sentiment Analysis (ElSA) 2 | 3 | The *Elastic Sentiment Analysis* (ElSA) is a Spark Streaming-based application written for the [DCOS](http://mesosphere.com/product/). It derives public opinions/sentiments on specified Twitter topics and is able to elastically scale its processing capacity, based on the volume of the topics' traffic, leveraging Apache Mesos and Marathon. 4 | 5 | ElSA works as follows: 6 | 7 | * It takes a list of words (called topics in the following), such as *Mesos*, *Docker*, *DCOS*, etc., as input and—using the Twitter firehose—pulls tweets containing these topics for processing. 8 | * Based on the tweet content it performs a simple sentiment analysis in an ongoing fashion. This operation is implemented via [Spark Streaming](https://spark.apache.org/docs/latest/streaming-programming-guide.html). 
9 | * Last but not least, based on the activity in a certain topic the app scales elastically through leveraging the [Marathon REST API](https://mesosphere.github.io/marathon/docs/rest-api.html). This means that if, for example, a rapid increase of mentions of the topic *DCOS* is detected (tweets per time unit), then more instances are launched. 10 | 11 | See below for the architecture and data flow as well as [deployment](#deployment) and [usage](#usage) instructions. 12 | 13 | ## Architecture 14 | 15 | | components | flow | 16 | | ------------------------------------- | -------------------- | 17 | | ![Architecture](doc/architecture.png) |![Flow](doc/flow.png) | 18 | 19 | Description TBD. 20 | 21 | ## Deployment 22 | 23 | In the following, an Ubuntu 14.04 environment is assumed and in addition, ElSA depends on: 24 | 25 | * Apache [Mesos 0.21.x](http://archive.apache.org/dist/mesos/0.21.0/) with [Marathon 0.7.6](https://github.com/mesosphere/marathon/releases/tag/v0.7.6) 26 | * [marathon-python](https://github.com/thefactory/marathon-python) 27 | * Apache [Spark 1.2.x](https://spark.apache.org/downloads.html) 28 | * A Twitter account and an [app](https://apps.twitter.com/) that can be used for accessing the Twitter firehose 29 | 30 | ### ElSA to-go: Vagrant deployment 31 | 32 | **ElSA to-go** is a single-node Vagrant deployment based on the ingenious [Playa Mesos](https://github.com/mesosphere/playa-mesos). 
See details in [deployment/to-go](deployment/to-go) … 33 | 34 | 35 | ### ElSA Docker 36 | 37 | 38 | ### Digital Ocean deployment 39 | 40 | IaaS deployment on [DO](https://cloud.digitalocean.com/) 41 | 42 | ### Google Compute deployment 43 | 44 | IaaS deployment on [GCE](https://cloud.google.com/) 45 | 46 | ### AWS EC2 deployment 47 | 48 | IaaS deployment on [EC2](https://console.aws.amazon.com/) 49 | 50 | 51 | ### Manual single node deployment 52 | 53 | For Python packages we need `pip` so before anything else do: 54 | 55 | $ sudo apt-get install python-pip 56 | 57 | Now we can start with the setup. 58 | 59 | **Install Mesos**: simply use [Playa Mesos](https://github.com/mesosphere/playa-mesos) which contains a Marathon installation or follow the [step-by-step instructions](http://mesos.apache.org/gettingstarted/) from the Apache Mesos site and install Marathon on top of it. 60 | 61 | Further, as a preparation for the ElSA app, we need a [Python package](https://github.com/thefactory/marathon-python) wrapping the Marathon [REST API](https://mesosphere.github.io/marathon/docs/rest-api.html) so let's do that right away: 62 | 63 | $ pip install marathon 64 | 65 | **Install Spark**: 66 | 67 | First we download the Spark source and make sure the Java environment is set up correctly: 68 | 69 | $ cd 70 | $ wget http://d3kbcqa49mib13.cloudfront.net/spark-1.2.0.tgz 71 | $ tar xzf spark-1.2.0.tgz && cd spark-1.2.0/ 72 | $ sudo apt-get install default-jdk 73 | $ export JAVA_HOME=$(readlink -f /usr/bin/javac | sed "s:bin/javac::") 74 | 75 | Now make sure the correct version of Maven (3.0.4 or higher) is available: 76 | 77 | $ sudo apt-get update 78 | $ sudo apt-get install maven 79 | $ mvn -version 80 | Apache Maven 3.0.5 81 | Maven home: /usr/share/maven 82 | Java version: 1.7.0_65, vendor: Oracle Corporation 83 | Java home: /usr/lib/jvm/java-7-openjdk-amd64/jre 84 | 85 | OK, ready to build Spark. Note: right now is a good time to get a cup of tea or coffee, whatever floats your boat. 
As usual, Maven is downloading half of the Internet for the following and that might take, um, a while: 86 | 87 | $ export MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m" 88 | $ sudo mvn -DskipTests clean package 89 | 90 | So, here we are. Next we package our newly built Spark distribution for the Mesos slaves to use: 91 | 92 | $ ./make-distribution.sh 93 | $ mv dist spark-1.2.0 94 | $ tar czf spark-1.2.0.tgz spark-1.2.0/ 95 | $ cd conf/ 96 | $ cp spark-env.sh.template spark-env.sh 97 | 98 | Now open `../spark-1.2.0/conf/spark-env.sh` in your favorite editor and add the following at the end of the file: 99 | 100 | export MESOS_NATIVE_LIBRARY=/usr/local/lib/libmesos.so 101 | export SPARK_EXECUTOR_URI=file:///home/vagrant/spark-1.2.0/spark-1.2.0.tgz 102 | export MASTER=mesos://127.0.1.1:5050 103 | 104 | Note that if you've built Spark in a different directory (I did it in `/home/vagrant/`) then you'll have to change the setting for the `SPARK_EXECUTOR_URI` to point to the resulting `tgz` file from the previous step. 105 | 106 | Then, finally, we're ready to launch Spark: 107 | 108 | $ cd .. 109 | $ bin/spark-shell 110 | 111 | **Install ElSA**: 112 | 113 | $ cd 114 | $ git clone https://github.com/mhausenblas/elsa.git 115 | $ cd elsa 116 | $ mvn clean package 117 | 118 | ## Usage 119 | 120 | Assuming you've installed ElSA using one of the options above, you should now be in a position to launch it as described below. 121 | 122 | Note: in order for ElSA to run you'll need to supply your Twitter credentials, that is, you `cp elsa.conf.example elsa.conf` and replace the `YOUR STUFF HERE` sections with the details you obtain from creating a Twitter application and generating the access token via the [app](https://apps.twitter.com/) interface. 
123 | 124 | Before you start, you might want to quickly check out this 3-minute walkthrough of ElSA op: 125 | 126 | ElSA op video walkthrough on YouTube 127 | 128 | 129 | ### Launching ElSA manually 130 | 131 | To launch ElSA manually (without elasticity, directly on Mesos), do the following: 132 | 133 | $ cd elsa 134 | $ ./launch-elsa.sh 135 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 136 | 15/03/06 19:02:32 INFO ElsaHelper: Setting log level to [ERROR]. 137 | 138 | In the past 5 seconds I found 0 tweet(s) containing your topics: Datacenter DCOS Docker Mesos Mesosphere devop microservice 139 | 140 | In the past 5 seconds I found 1 tweet(s) containing your topics: Datacenter DCOS Docker Mesos Mesosphere devop microservice 141 | === 142 | RT @SoftLayer: Let’s talk software, specifically how to create a private @Docker registry on SoftLayer. ≡ http://t.co/UVpX1Anl4s http://t.c… 143 | === 144 | 145 | ## Launching ElSA through Marathon 146 | 147 | To launch the ElSA app (through Marathon) and automatically scale the number of instances used, depending on the increase/decrease of traffic detected for the specified topics, do the following (hint: stop app by hitting `CTRL+C`): 148 | 149 | $ cd elsa 150 | $ ./autoscale.py http://localhost:8080 elsa.conf 151 | Using /tmp/elsa/stats to monitor topic traffic 152 | Using traffic increase threshold of 10 and scale factor 5 153 | ElSA is deployed and running, waiting now 5 sec before starting auto-scale ... 154 | Difference in traffic in the past 10 seconds: 9 155 | Difference in traffic in the past 10 seconds: -9 156 | Resetting number of instances to 1 157 | Difference in traffic in the past 10 seconds: 11 158 | Increasing number of instances to 2 159 | Difference in traffic in the past 10 seconds: -14 160 | Resetting number of instances to 1 161 | ^CElSA has been stopped by user, halting app and rolling back deployment. Thanks and bye! 
162 | 163 | You should then see something like the following in [Marathon](http://10.141.141.10:8080/): 164 | 165 | ![ElSA Marathon deployment](doc/elsa-marathon-deploy.png) 166 | 167 | ## To do 168 | 169 | - [x] Core business logic 170 | - [x] Single node deployment and launch 171 | - [x] Single node elastic 172 | - [x] Make all auto-scale parameters configurable via config 173 | - [x] Improve SA (positive negative) 174 | - [x] Video walkthrough 175 | - [x] Vagrant file 176 | - [ ] Docker image 177 | - [ ] Cluster deployment DO 178 | - [ ] Cluster deployment GCE 179 | - [ ] Cluster deployment EC2 180 | - [ ] Architecture and flow explanation 181 | 182 | ## Notes 183 | 184 | Kudos to the Spark team for providing [the basis](https://github.com/apache/spark/blob/master/examples/src/main/scala/org/apache/spark/examples/streaming/TwitterPopularTags.scala) for the SA part and to Alexandre Rodrigues for helping me out with the initial development. 185 | 186 | If you want to learn how to run Spark on Mesos, I suggest you try out the great [step-by-step tutorial](https://mesosphere.com/docs/tutorials/run-spark-on-mesos/) provided by the Mesosphere folks. 187 | 188 | Lastly, apologies to all [Frozen](http://www.imdb.com/title/tt2294629/) fans, especially our kids, for hijacking the Elsa label in this context. I thought it was funny … 189 | -------------------------------------------------------------------------------- /autoscale.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Launches ElSA app using Marathon and scales it depending of topic traffic. 
4 | 5 | Usage: 6 | 7 | ./autoscale.py $MARATHON_URL $ELSA_CONFIG_FILE 8 | 9 | Example: 10 | 11 | ./autoscale.py http://localhost:8080 ./elsa.conf 12 | 13 | 14 | @author: Michael Hausenblas, http://mhausenblas.info/#i 15 | @since: 2015-03-06 16 | @status: init 17 | """ 18 | 19 | import logging 20 | import os 21 | import sys 22 | import time 23 | 24 | from marathon import MarathonClient 25 | from marathon.models import MarathonApp 26 | 27 | ################################################################################ 28 | # Defaults 29 | # 30 | 31 | DEBUG = True 32 | 33 | if DEBUG: 34 | FORMAT = '%(asctime)-0s %(levelname)s %(message)s [at line %(lineno)d]' 35 | logging.basicConfig(level=logging.DEBUG, format=FORMAT, datefmt='%Y-%m-%dT%I:%M:%S') 36 | else: 37 | FORMAT = '%(asctime)-0s %(message)s' 38 | logging.basicConfig(level=logging.INFO, format=FORMAT, datefmt='%Y-%m-%dT%I:%M:%S') 39 | 40 | TRAFFIC_INCREASE_THRESHOLD = 6 # difference between previous and current traffic 41 | SCALE_FACTOR = 2 # part of threshold number of instances should be scaled 42 | 43 | ################################################################################ 44 | # Scaling example: 45 | # 46 | # If TRAFFIC_INCREASE_THRESHOLD == 10 and SCALE_FACTOR == 10 and there has been 47 | # a traffic increase of 25, then this means that (because 25 > 10) the number of 48 | # instances will be increased by a factor of int(25/10) == 2, that is doubled. 
################################################################################
# Helper
#

def get_config_params(elsa_config):
    """
    Reads the auto-scaling settings from the ElSA config file.

    The file is scanned line by line for `key = value` entries. Blank lines,
    lines starting with `#` and unknown keys are ignored.

    Recognized keys:
      stats-file                  path of the traffic stats file (double
                                  quotes are removed from the value)
      batch-window                integer, used by the caller as the number
                                  of seconds between two traffic checks
      traffic-increase-threshold  integer
      scale-factor                integer

    Returns the tuple
    (stats_file_path, batch_window, traffic_increase_threshold, scale_factor).
    A key that is not set, or a config file that does not exist, yields the
    defaults '', 10, 6 and 2 respectively.

    Raises ValueError if one of the integer settings cannot be parsed.
    """
    stats_file_path = ''
    # NOTE(review): 10 mirrors the 10 second window shown in the README sample
    # run; confirm the intended default against elsa.conf.example.
    batch_window = 10
    traffic_increase_threshold = 6
    scale_factor = 2
    if os.path.exists(elsa_config):
        logging.info('Using %s as config file' %(elsa_config))
        with open(elsa_config, 'r') as config_file:
            for line in config_file:
                l = line.strip()
                if not l or l.startswith('#'): # empty or comment line
                    continue
                # split on the first '=' only, so values may contain '='
                cfg_param, _, value = l.partition('=')
                cfg_param = cfg_param.strip()
                value = value.strip()
                if cfg_param == 'stats-file':
                    stats_file_path = value.replace('"', '')
                elif cfg_param == 'batch-window':
                    batch_window = int(value)
                elif cfg_param == 'traffic-increase-threshold':
                    traffic_increase_threshold = int(value)
                elif cfg_param == 'scale-factor':
                    scale_factor = int(value)
    else:
        logging.info('No config file provided.')
    logging.debug('[%s]' %(stats_file_path))
    return (stats_file_path, batch_window, traffic_increase_threshold, scale_factor)


def _read_topic_traffic(stats_file, fallback):
    """
    Returns the integer counter stored in stats_file, or fallback when the
    content is not an integer.
    """
    with open(stats_file, 'r') as elsa_file:
        content = elsa_file.read().strip()
    try:
        return int(content)
    except ValueError:
        # presumably the file is rewritten by the ElSA job while we read it;
        # keep the previous reading instead of aborting the monitor loop
        logging.debug('Ignoring unreadable stats value [%s]' %(content))
        return fallback


def launch_elsa(marathon, stats_file, scale_window):
    """
    Deploys the ElSA app via Marathon and scales it based on topic traffic.

    Creates the Marathon app `elsa`, waits 5 seconds and then, every
    `scale_window` seconds, reads the traffic counter from `stats_file` and
    compares it with the previous reading:

      * an increase above TRAFFIC_INCREASE_THRESHOLD multiplies the number of
        instances by (increase // SCALE_FACTOR), but never by less than 1;
      * a decrease halves the number of instances, but never below 1.

    The loop runs until it is interrupted with CTRL+C, at which point the
    app is deleted from Marathon.

    marathon      -- Marathon URL, e.g. http://localhost:8080
    stats_file    -- path of the file holding the traffic counter; it is
                     created with the content `0` if it does not exist
    scale_window  -- seconds to sleep between two traffic checks
    """
    logging.info('Start monitoring the inbound traffic on topics using %s' %(stats_file))
    # make sure the stats file is properly initialized:
    if not os.path.exists(stats_file):
        with open(stats_file, 'w') as f:
            f.write('0')

    # launch the Elsa app via Marathon
    c = MarathonClient(marathon)
    c.create_app('elsa', MarathonApp(cmd='/home/vagrant/elsa/launch-elsa.sh', mem=200, cpus=1, user='vagrant'))

    print('ElSA is deployed and running, waiting now 5 sec before starting auto-scale ...')
    time.sleep(5) # allow time to deploy before autoscaling sets in

    # kick off traffic monitoring and trigger autoscaling:
    previous_topic_traffic = 0
    try:
        while True:
            topic_traffic = _read_topic_traffic(stats_file, previous_topic_traffic)
            topic_traffic_diff = topic_traffic - previous_topic_traffic
            print('Difference in traffic in the past %d seconds: %d' %(scale_window, topic_traffic_diff))
            previous_topic_traffic = topic_traffic

            current_instance_num = c.get_app('elsa').instances

            if topic_traffic_diff > TRAFFIC_INCREASE_THRESHOLD: # we see a surge of traffic above threshold ...
                # ... increase number of instances; the multiplier is clamped
                # to 1 because a scale factor larger than the traffic increase
                # would otherwise scale the app down to 0 instances
                instance_multiplier = max(topic_traffic_diff // SCALE_FACTOR, 1)
                target_instance_num = current_instance_num * instance_multiplier
                c.scale_app('elsa', target_instance_num)
                print('Increasing number of instances to %d' %(target_instance_num))
            elif topic_traffic_diff < 0: # negative, back off exponentially
                target_instance_num = current_instance_num // 2
                if target_instance_num > 1:
                    c.scale_app('elsa', target_instance_num)
                    print('Decreasing number of instances to %d' %(target_instance_num))
                else:
                    c.scale_app('elsa', 1)
                    print('Resetting number of instances to 1')
            time.sleep(scale_window)
    except KeyboardInterrupt:
        print('ElSA has been stopped by user, halting app and rolling back deployment. Thanks and bye!')
        c.delete_app('elsa', force=True)

################################################################################
# Main script
#
if __name__ == '__main__':
    try:
        marathon = sys.argv[1] # Marathon URL to use
        (stats_file_path, batch_window, traffic_increase_threshold, scale_factor) = get_config_params(sys.argv[2])
        if not stats_file_path:
            # fail early with a clear message instead of a failing open('')
            raise ValueError('No stats-file configured in %s' %(sys.argv[2]))
        print('Using %s to monitor topic traffic' %(stats_file_path))
        # a value of 0 keeps the module default
        if traffic_increase_threshold:
            TRAFFIC_INCREASE_THRESHOLD = traffic_increase_threshold
        if scale_factor:
            SCALE_FACTOR = scale_factor
        print('Using traffic increase threshold of %d and scale factor %d' %(TRAFFIC_INCREASE_THRESHOLD, SCALE_FACTOR))
        launch_elsa(marathon, stats_file_path, batch_window)
    except Exception as e:
        # any failure, including missing command line arguments, prints the
        # error followed by the usage text from the module docstring
        print(e)
        print(__doc__)
        sys.exit(2)
SSH to the VM 33 | 34 | ```bash 35 | vagrant ssh 36 | ls -al 37 | exit 38 | ``` 39 | 40 | At this point in time you can [launch ElSA via the autoscale](https://github.com/mhausenblas/elsa#launching-elsa-through-marathon) script. 41 | 42 | Once you're done experimenting, you can shut down the VM like so: 43 | 44 | ```bash 45 | vagrant halt 46 | ``` 47 | 48 | … or, for faster start-up but larger disk footprint: 49 | 50 | ```bash 51 | vagrant suspend 52 | ``` 53 | 54 | When you want to get rid of the VM, do the following: 55 | 56 | ```bash 57 | vagrant destroy 58 | ``` 59 | 60 | 61 | ## Kudos 62 | 63 | Kudos to the original [Playa Mesos][1] authors: 64 | 65 | * [Jeremy Lingmann](https://github.com/lingmann) ([@lingmann](https://twitter.com/lingmann)) 66 | * [Jason Dusek](https://github.com/solidsnack) ([@solidsnack](https://twitter.com/solidsnack)) 67 | 68 | [1]: https://github.com/mesosphere/playa-mesos "Playa Mesos" 69 | [2]: http://www.virtualbox.org/ "VirtualBox" 70 | [3]: http://www.vagrantup.com/ "Vagrant" 71 | -------------------------------------------------------------------------------- /deployment/to-go/Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | 4 | # Vagrantfile API/syntax version. Don't touch unless you know what you're doing 5 | VAGRANTFILE_API_VERSION = '2' 6 | 7 | PM_ROOT = File.dirname(__FILE__) 8 | 9 | box_url = "http://downloads.mesosphere.io/playa-mesos/playa_mesos_ubuntu_14.04-virtualbox.box" 10 | 11 | 12 | # ############################################################################# 13 | # Vagrant VM Definitions 14 | # ############################################################################# 15 | 16 | 17 | Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| 18 | 19 | # Create a private network, which allows host-only access to the machine 20 | # using a specific IP. 
21 | config.vm.network :private_network, ip: "10.141.141.10" 22 | 23 | # If true, then any SSH connections made will enable agent forwarding. 24 | # Default value: false 25 | config.ssh.forward_agent = true 26 | 27 | # Every Vagrant virtual environment requires a box to build off of. 28 | config.vm.box = "playa_mesos_ubuntu_14.04" 29 | 30 | # Hardcoded the Playa Mesos box 31 | config.vm.box_url = box_url 32 | 33 | # Only VirtualBox provider 34 | config.vm.provider :virtualbox do |vb| 35 | vb.name = "elsa" 36 | vb.customize ['modifyvm', :id, '--memory', "2048"] 37 | vb.customize ['modifyvm', :id, '--cpus', "2"] 38 | end 39 | 40 | # Make the project root available to the guest VM. 41 | # config.vm.synced_folder '.', '/vagrant' 42 | 43 | # Provision the ElSA app 44 | config.vm.provision :shell do |shell| 45 | shell.path = 'provision.sh' 46 | end 47 | 48 | end 49 | -------------------------------------------------------------------------------- /deployment/to-go/provision.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ############################################################################### 4 | # Provisioning script for deploying ElSA on Ubuntu 14.04 5 | # 6 | # Usage: 7 | # 8 | # ./provision.sh [INSTALL_DIR] 9 | # 10 | # Examples: 11 | # 12 | # ... without INSTALL_DIR provided installs it in `/home/vagrant`: 13 | # $ ./provision.sh 14 | # 15 | # ... 
install it in `/home/mhausenblas`: 16 | # $ ./provision.sh /home/mhausenblas 17 | # 18 | # Author: Michael Hausenblas 19 | # Init: 2015-03-16 20 | 21 | 22 | set -e # exit on error immediately, just to keep things sane 23 | 24 | 25 | ############################################################################### 26 | # Global variables 27 | 28 | SCRIPT_PATH=`dirname $0` 29 | 30 | BASE_INSTALL=${1:-"/home/vagrant"} 31 | 32 | SPARK_CONF_TEMPLATE=$(cat <> spark-env.sh 104 | 105 | cd $BASE_INSTALL 106 | rm -f spark-1.2.0.tgz 107 | 108 | ############################################################################### 109 | # Building ElSA 110 | 111 | echo Phase 3: Building ElSA 112 | 113 | cd $BASE_INSTALL # back to the base install dir to set up ElSA 114 | git clone https://github.com/mhausenblas/elsa.git 115 | cd elsa 116 | mvn clean package 117 | 118 | echo Done provisioning ElSA into $BASE_INSTALL 119 | 120 | popd # restore and change back to where we started 121 | 122 | exit 0 -------------------------------------------------------------------------------- /doc/architecture and flow.graffle/data.plist: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mhausenblas/elsa/6c11ea3d4f03cbefbca9f83021645786d6f252a1/doc/architecture and flow.graffle/data.plist -------------------------------------------------------------------------------- /doc/architecture and flow.graffle/image1.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mhausenblas/elsa/6c11ea3d4f03cbefbca9f83021645786d6f252a1/doc/architecture and flow.graffle/image1.tiff -------------------------------------------------------------------------------- /doc/architecture and flow.graffle/image2.tiff: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mhausenblas/elsa/6c11ea3d4f03cbefbca9f83021645786d6f252a1/doc/architecture and flow.graffle/image2.tiff -------------------------------------------------------------------------------- /doc/architecture and flow.graffle/image4.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mhausenblas/elsa/6c11ea3d4f03cbefbca9f83021645786d6f252a1/doc/architecture and flow.graffle/image4.tiff -------------------------------------------------------------------------------- /doc/architecture and flow.graffle/image5.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mhausenblas/elsa/6c11ea3d4f03cbefbca9f83021645786d6f252a1/doc/architecture and flow.graffle/image5.tiff -------------------------------------------------------------------------------- /doc/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mhausenblas/elsa/6c11ea3d4f03cbefbca9f83021645786d6f252a1/doc/architecture.png -------------------------------------------------------------------------------- /doc/elsa-marathon-deploy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mhausenblas/elsa/6c11ea3d4f03cbefbca9f83021645786d6f252a1/doc/elsa-marathon-deploy.png -------------------------------------------------------------------------------- /doc/flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mhausenblas/elsa/6c11ea3d4f03cbefbca9f83021645786d6f252a1/doc/flow.png -------------------------------------------------------------------------------- /doc/sa-logo.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mhausenblas/elsa/6c11ea3d4f03cbefbca9f83021645786d6f252a1/doc/sa-logo.jpg -------------------------------------------------------------------------------- /elsa.conf.example: -------------------------------------------------------------------------------- 1 | deployment = "production" 2 | master = "local[*]" 3 | checkpoint-dir = "file:///tmp/elsa/checkpoints" 4 | stats-file = "/tmp/elsa/stats" 5 | batch-window = 5 6 | traffic-increase-threshold = 6 7 | scale-factor = 2 8 | topics = "Datacenter,DCOS,Docker,Mesos,Mesosphere,devop,microservice" 9 | consumer-key = "YOUR STUFF HERE" 10 | consumer-secret = "YOUR STUFF HERE" 11 | access-token = "YOUR STUFF HERE" 12 | access-token-secret = "YOUR STUFF HERE" -------------------------------------------------------------------------------- /launch-elsa.sh: -------------------------------------------------------------------------------- 1 | /home/vagrant/spark-1.2.0/bin/spark-submit \ 2 | --class spark.elsa.OnlineSA \ 3 | --master mesos://127.0.1.1:5050 \ 4 | /home/vagrant/elsa/target/elsa-1.0-SNAPSHOT-jar-with-dependencies.jar \ 5 | /home/vagrant/elsa/elsa.conf -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | info.mhausenblas.dev 5 | elsa 6 | Elastic Sentiment Analysis (ElSA) 7 | https://github.com/mhausenblas/elsa 8 | 1.0-SNAPSHOT 9 | 10 | 11 | org.apache.spark 12 | spark-core_2.10 13 | 1.2.0 14 | 15 | 16 | org.apache.spark 17 | spark-streaming_2.10 18 | 1.2.0 19 | 20 | 21 | org.apache.spark 22 | spark-streaming-twitter_2.10 23 | 1.2.0 24 | 25 | 26 | org.streum 27 | configrity-core_2.10 28 | 1.0.0 29 | 30 | 31 | org.scala-lang 32 | scala-library 33 | 2.10.4 34 | 35 | 36 | 37 | 38 | 39 | maven-assembly-plugin 40 | 41 | 42 | jar-with-dependencies 43 | 44 | 45 | 46 | spark.elsa.OnlineSA 47 | 48 | 49 | 50 | 51 | 52 | make-assembly 53 | package 54 | 55 | 
single 56 | 57 | 58 | 59 | 60 | 61 | org.scala-tools 62 | maven-scala-plugin 63 | 64 | 65 | 66 | compile 67 | 68 | 69 | 70 | 71 | 72 | -Xms64m 73 | -Xmx1024m 74 | 75 | 76 | 77 | 78 | org.apache.maven.plugins 79 | maven-compiler-plugin 80 | 3.1 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /src/main/scala/spark/elsa/ElsaHelper.scala: -------------------------------------------------------------------------------- 1 | package spark.elsa 2 | 3 | import org.apache.spark.Logging 4 | 5 | import org.apache.log4j.{Level, Logger} 6 | 7 | object ElsaHelper extends Logging { 8 | 9 | def setLogLevel() { 10 | val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements 11 | if (!log4jInitialized) { 12 | logInfo("Setting log level to [ERROR].") 13 | Logger.getRootLogger.setLevel(Level.ERROR) 14 | } 15 | } 16 | } -------------------------------------------------------------------------------- /src/main/scala/spark/elsa/OnlineSA.scala: -------------------------------------------------------------------------------- 1 | package spark.elsa 2 | 3 | import java.nio.file.{Paths, Files} 4 | import java.nio.charset.StandardCharsets 5 | 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.streaming.{Seconds, StreamingContext} 8 | import org.apache.spark.streaming.twitter._ 9 | 10 | import org.streum.configrity._ 11 | 12 | object OnlineSA { 13 | 14 | def runAnalysis(elsaConf: Configuration): Unit = { 15 | 16 | // setting up the Spark configuration: 17 | val conf = new SparkConf().setAppName("ElSA Online").setMaster(elsaConf[String]("master")) 18 | // setting up the filename where to log the stats to: 19 | val stats = elsaConf[String]("stats-file") 20 | // setting up list of topics to be monitored by ElSA: 21 | val topics: Array[String] = elsaConf[String]("topics").split(",").distinct 22 | // setting up the Spark Streaming context: 23 | val ssc = new StreamingContext(conf, 
Seconds(elsaConf[Int]("batch-window")))

    // Batch length in seconds, read once for the per-batch report below
    // instead of being looked up in the configuration on every batch.
    val batchWindow = elsaConf[Int]("batch-window")

    // setting up system properties for Twitter4j lib OAuth credentials:
    System.setProperty("twitter4j.oauth.consumerKey", elsaConf[String]("consumer-key"))
    System.setProperty("twitter4j.oauth.consumerSecret", elsaConf[String]("consumer-secret"))
    System.setProperty("twitter4j.oauth.accessToken", elsaConf[String]("access-token"))
    System.setProperty("twitter4j.oauth.accessTokenSecret", elsaConf[String]("access-token-secret"))

    // Sentiment trigger words. They are compared against whole words of a
    // tweet, not substrings: with substring matching every tweet containing
    // "dislike" was also reported as positive, because "like" is part of it.
    // Trade-off: inflected forms such as "likes" or "loved" no longer match.
    val posSen: Set[String] = Set("like", "cool", "awesome", "nice", "good", "love")
    val negSen: Set[String] = Set("dislike", "meh", "bad", "sad", "hate", "mad")

    // The stats file is rewritten after every batch. Make sure its parent
    // directory exists before the stream starts; otherwise the first
    // Files.write fails with NoSuchFileException inside the batch callback.
    // getParent is null for a bare file name, hence the Option wrapper.
    Option(Paths.get(stats).getParent).foreach(dir => Files.createDirectories(dir))

    // Subscribe to the Twitter stream, filtered to the topics of interest.
    // Passing None for the authorization makes the receiver use the
    // twitter4j.oauth.* system properties set above.
    val twitterFirehose = TwitterUtils.createStream(ssc, None, topics)

    twitterFirehose.foreachRDD(rdd => {
      val tweetCount = rdd.count()

      // overall stats:
      print("\n\nIn the past " + batchWindow + " seconds " +
        "I found " + tweetCount + " tweet(s) " +
        "containing your topics: "
      )
      for (topic <- topics) print(topic + " ")
      println("\n**********************")

      // Display tweet details and determine sentiment.
      // NOTE: rdd.foreach runs on the executors, so when not running with a
      // local master this output ends up in the executors' stdout, not the
      // driver's.
      rdd.foreach { tweet =>
        val tweetText = tweet.getText.toLowerCase // normalize for comparison with sentiments

        println("\n===\n" + tweetText + "\n===")

        // Split on runs of non-word characters so that punctuation and
        // hashtag/mention markers do not stick to the words.
        val words = tweetText.split("\\W+").toSet

        // here comes the *very* simplistic sentiment analysis (just check if certain words are present):
        if (posSen.exists(words.contains)) { print("SA: positive sentiment") }
        if (negSen.exists(words.contains)) { print("SA: negative sentiment") }
      }

      // write out the tweet count as primary input for the auto-scale process:
      Files.write(Paths.get(stats), tweetCount.toString.getBytes(StandardCharsets.UTF_8))
    })

    // kick off the ongoing stream processing (checkpointing is currently disabled):
    //ssc.checkpoint(elsaConf[String]("checkpoint-dir"))
    ssc.start()
    ssc.awaitTermination()
  }

  /**
   * Entry point: loads the ElSA configuration file given as the first
   * argument and runs the streaming analysis until the stream terminates.
   *
   * @param args command line arguments; `args(0)` is the path to the ElSA
   *             configuration file
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 1) {
      System.err.println("Usage: OnlineSA <config-file>")
      System.exit(1)
    }
    // setting up configuration:
    val elsaConf = Configuration.load(args(0))

    // Only in a production deployment do we ask the helper to reduce log
    // verbosity; any other deployment keeps the default logging.
    if (elsaConf[String]("deployment") == "production") {
      ElsaHelper.setLogLevel()
    }

    runAnalysis(elsaConf)
    System.exit(0)
  }
}