├── .gitignore ├── LICENSE ├── README.md ├── dist ├── helloSpark-assembly-2.1.jar └── streaming-twitter-assembly-1.6.jar ├── docs ├── Twitter Sentiment with Watson TA and PI architecture diagram.orig.png └── Twitter Sentiment with Watson TA and PI architecture diagram.png ├── helloGraphx ├── build.sbt ├── project │ └── assembly.sbt ├── readme.md └── src │ └── main │ └── scala │ └── com │ └── ibm │ └── cds │ └── spark │ └── samples │ └── HelloGraphx.scala ├── helloSpark ├── .settings │ └── org.scala-ide.sdt.core.prefs ├── build.sbt ├── project │ └── assembly.sbt ├── python │ ├── helloSpark.py │ ├── helloSpark │ │ └── __init__.py │ └── setup.py ├── readme.md └── src │ └── main │ └── scala │ └── com │ └── ibm │ └── cds │ └── spark │ └── samples │ ├── HelloSpark.scala │ └── package-info.java ├── notebook ├── DashDB Twitter Car 2015 Python Notebook.ipynb ├── Get Service Credentials for Twitter Sentiment with Watson TA and PI.md ├── PYCON 2016 spark tutorial quick links.txt ├── README.md └── Twitter Sentiment with Watson TA and PI.ipynb └── streaming-twitter ├── .classpath ├── .gitignore ├── .project ├── build.sbt ├── lib ├── couchdb-scala │ └── com │ │ └── ibm │ │ └── couchdb-scala_2.10 │ │ └── 0.5.3 │ │ ├── couchdb-scala_2.10-0.5.3-javadoc.jar │ │ ├── couchdb-scala_2.10-0.5.3-javadoc.jar.md5 │ │ ├── couchdb-scala_2.10-0.5.3-javadoc.jar.sha1 │ │ ├── couchdb-scala_2.10-0.5.3-sources.jar │ │ ├── couchdb-scala_2.10-0.5.3-sources.jar.md5 │ │ ├── couchdb-scala_2.10-0.5.3-sources.jar.sha1 │ │ ├── couchdb-scala_2.10-0.5.3.jar │ │ ├── couchdb-scala_2.10-0.5.3.jar.md5 │ │ ├── couchdb-scala_2.10-0.5.3.jar.sha1 │ │ ├── couchdb-scala_2.10-0.5.3.pom │ │ ├── couchdb-scala_2.10-0.5.3.pom.md5 │ │ └── couchdb-scala_2.10-0.5.3.pom.sha1 ├── messagehub.login-1.0.0.jar └── pixiedust.jar ├── notebook ├── Spark Streaming Twitter-Watson-MessageHub.ipynb ├── Twitter + Watson Tone Analyzer Part 1.ipynb ├── Twitter + Watson Tone Analyzer Part 2.ipynb └── Twitter Sentiment with Pixiedust.ipynb ├── project └── assembly.sbt ├── readme.md ├── sampleConfig └── sampleconf.properties └── src └── main └── scala └── com └── ibm └── cds └── spark └── samples ├── KafkaProducerTest.scala ├── MessageHubStreamingTwitter.scala ├── PixiedustStreamingTwitter.scala ├── StatusSerializer.scala ├── StreamingListener.scala ├── StreamingTwitter.scala ├── ToneAnalyzer.scala ├── TwitterAdapter.scala ├── config ├── DemoConfig.scala ├── MessageHubConfig.scala └── jaas.conf ├── dstream └── KafkaInputDStream.scala ├── package-info.java └── package.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache 6 | .history 7 | .lib/ 8 | target/ 9 | lib_managed/ 10 | src_managed/ 11 | project/boot/ 12 | project/plugins/project/ 13 | 14 | # Scala-IDE specific 15 | .scala_dependencies 16 | .worksheet 17 | 18 | helloSpark/.cache-main 19 | 20 | helloSpark/.classpath 21 | 22 | helloSpark/.project 23 | 24 | streaming-twitter/.cache-main 25 | 26 | streaming-twitter/.settings/org.scala-ide.sdt.core.prefs 27 | 28 | streaming-twitter/config/MessageHubYP.properties 29 | 30 | *.pyc 31 | 32 | pixiedust/pixiedust.egg-info 33 | 34 | pixiedust/dist 35 | 36 | .DS_Store 37 | 38 | streaming-twitter/conf/log4j.properties 39 | 40 | streaming-twitter/conf/log4j.properties.template 41 | 42 | streaming-twitter/src/main/scala/resources/log4j.properties 43 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #Spark Tutorials 2 | 3 | This repository contains tutorials and samples that show you how get the most out of IBM Analytics for Apache Spark. 4 | 5 | Watch this repo for new content. Meanwhile, try these tutorials: 6 | 7 | - [Start Developing with Spark](https://developer.ibm.com/clouddataservices/start-developing-with-spark-and-notebooks/) 8 | 9 | - [Sentiment Analysis of Twitter Hashtags](https://developer.ibm.com/clouddataservices/sentiment-analysis-of-twitter-hashtags/) 10 | 11 | - [Real-time Sentiment Analysis of Twitter Hashtags with Spark](https://developer.ibm.com/clouddataservices/2016/01/15/real-time-sentiment-analysis-of-twitter-hashtags-with-spark/) 12 | 13 | - [Getting started with GraphFrames in Apache Spark](https://developer.ibm.com/clouddataservices/2016/07/15/intro-to-apache-spark-graphframes/) 14 | 15 | - [Predict Flight Delays with Apache Spark MLLib, FlightStats, and Weather Data](https://developer.ibm.com/clouddataservices/2016/08/04/predict-flight-delays-with-apache-spark-mllib-flightstats-and-weather-data/) 16 | 17 | - [Analyze Market Trends in Twitter Using Apache Spark, Python, and dashDB](https://developer.ibm.com/clouddataservices/2016/06/13/analyze-market-trends-in-twitter-using-apache-spark-python-and-dashdb/) 18 | 19 | - [PixieDust: Magic for Your Python Notebook](https://developer.ibm.com/clouddataservices/2016/10/11/pixiedust-magic-for-python-notebook/) 20 | 21 | 22 | -------------------------------------------------------------------------------- /dist/helloSpark-assembly-2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/dist/helloSpark-assembly-2.1.jar -------------------------------------------------------------------------------- /dist/streaming-twitter-assembly-1.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/dist/streaming-twitter-assembly-1.6.jar 
-------------------------------------------------------------------------------- /docs/Twitter Sentiment with Watson TA and PI architecture diagram.orig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/docs/Twitter Sentiment with Watson TA and PI architecture diagram.orig.png -------------------------------------------------------------------------------- /docs/Twitter Sentiment with Watson TA and PI architecture diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/docs/Twitter Sentiment with Watson TA and PI architecture diagram.png -------------------------------------------------------------------------------- /helloGraphx/build.sbt: -------------------------------------------------------------------------------- 1 | name := "helloGraphx" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies ++= { 8 | val sparkVersion = "1.6.0" 9 | Seq( 10 | "org.apache.spark" %% "spark-core" % sparkVersion % "provided", 11 | "org.apache.spark" %% "spark-sql" % sparkVersion % "provided", 12 | "org.apache.spark" %% "spark-graphx" % sparkVersion % "provided", 13 | "org.apache.spark" %% "spark-repl" % sparkVersion % "provided", 14 | "org.http4s" %% "http4s-core" % "0.8.2", 15 | "org.http4s" %% "http4s-client" % "0.8.2", 16 | "org.http4s" %% "http4s-blazeclient" % "0.8.2" 17 | ) 18 | } 19 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false) 20 | -------------------------------------------------------------------------------- /helloGraphx/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0") 2 | -------------------------------------------------------------------------------- /helloGraphx/readme.md: -------------------------------------------------------------------------------- 1 | # Start Developing with GraphX 2 | 3 | -------------------------------------------------------------------------------- /helloGraphx/src/main/scala/com/ibm/cds/spark/samples/HelloGraphx.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.ibm.cds.spark.samples 19 | 20 | import org.apache.spark._ 21 | import scalaz._ 22 | import java.net.URL 23 | import java.util.Calendar 24 | import java.net.URLEncoder 25 | import java.text.SimpleDateFormat 26 | import org.apache.spark.sql.SQLContext 27 | import scala.collection.immutable.Map 28 | import org.apache.spark.rdd.RDD 29 | import org.apache.spark.graphx.VertexId 30 | import org.apache.spark.sql.Row 31 | import org.apache.spark.graphx.Edge 32 | import org.apache.spark.graphx.Graph 33 | import org.http4s.EntityEncoder 34 | import org.codehaus.jettison.json.JSONObject 35 | import org.http4s.Uri 36 | import org.http4s.Request 37 | import org.http4s.BasicCredentials 38 | import org.http4s.headers.Authorization 39 | import org.http4s.Header 40 | import org.http4s.Headers 41 | import org.http4s.Method 42 | import org.http4s.client.blaze.PooledHttp1Client 43 | import org.http4s.client.Client 44 | import org.http4s.EntityDecoder 45 | import org.apache.spark.graphx.EdgeTriplet 46 | 47 | class Node(val properties: Map[String, String]) extends Serializable 48 | case class Airport(override val properties: Map[String,String]) extends Node(properties) 49 | case class Country(override val properties: Map[String,String]) extends Node(properties) 50 | case class Continent(override val properties: Map[String,String]) extends Node(properties) 51 | case class Route(override val properties: Map[String, String]) extends Node(properties) 52 | 53 | object HelloGraphx { 54 | 55 | //main method invoked when running as a standalone Spark Application 56 | def main(args: Array[String]) { 57 | lazy val client = PooledHttp1Client() 58 | val conf = new SparkConf().setAppName("Hello Graphx") 59 | val sc = new SparkContext(conf) 60 | 61 | println("Hello Graphx Demo. 
Load/Save a graph to/from Graphx RDDs") 62 | 63 | val sqlContext = new SQLContext(sc); 64 | 65 | //Load airports 66 | val airportsDF = sqlContext.read.format("com.databricks.spark.xml") 67 | .option("rowTag","node") 68 | .option("rootTag","graphml/graph") 69 | .load("/Users/dtaieb/Downloads/air-routes-graph/air-routes.graphml") 70 | airportsDF.printSchema() 71 | println(airportsDF.count()) 72 | 73 | val airportsRdd: RDD[(VertexId, Node with Product)] = 74 | airportsDF.map { x => { 75 | val propertiesMap:Map[String,String] = x.getAs[Seq[Row]]("data") 76 | .map { row => row.getAs[String]("@key")->row.getAs[String]("#VALUE") }.toMap 77 | val id = x.getAs[Long]("@id") 78 | val nodeType:String = propertiesMap.get("type").getOrElse("") 79 | nodeType match { 80 | case "airport" => (id, Airport(propertiesMap)) 81 | case "country" => (id, Country(propertiesMap)) 82 | case "continent" => (id, Continent(propertiesMap)) 83 | case _ => println("Skip node with type " + nodeType); (id, null) 84 | } 85 | }}.filter( f => f._2 !=null ) 86 | println(airportsRdd.take(5).deep.mkString("\n")) 87 | 88 | //Load routes 89 | val routesDF = sqlContext.read.format("com.databricks.spark.xml") 90 | .option("rowTag","edge") 91 | .option("rootTag","graphml/graph") 92 | .load("/Users/dtaieb/Downloads/air-routes-graph/air-routes.graphml") 93 | routesDF.printSchema() 94 | println(routesDF.count()) 95 | 96 | val routesRdd: RDD[(Edge[Route])] = 97 | routesDF.map { x => { 98 | val propertiesMap:Map[String,String] = x.getAs[Seq[Row]]("data") 99 | .map { row => row.getAs[String]("@key")->row.getAs[String]("#VALUE") }.toMap + 100 | ("id" -> x.getAs[Long]("@id").toString) 101 | Edge(x.getAs[Long]("@source"), x.getAs[Long]("@target"),Route(propertiesMap)) 102 | }} 103 | println(routesRdd.take(5).deep.mkString("\n")) 104 | 105 | val graph = Graph( airportsRdd, routesRdd ) 106 | 107 | //Iterate over the graph and send the vertices/edges to Gremlin Server 108 | graph.triplets.foreach( f => { 109 | addTriplet(client, f ); 110 | }) 111 | 112 | //Traverse all nodes and all vertices, send them to the graphdb service via gremlin 113 | sc.stop() 114 | } 115 | 116 | def escape(s:String):String={ 117 | s.replace("'", "\\'") 118 | } 119 | 120 | def addTriplet(client: Client, f: EdgeTriplet[Node with Product, Route] ){ 121 | val sb = new StringBuilder() 122 | 123 | //Add the source vertex if necessary 124 | sb.append( "v1=graph.traversal().V(" + f.srcId + ").tryNext().orElse(null);") 125 | sb.append(" if(!v1) v1=graph.addVertex(id, " + f.srcId) 126 | f.srcAttr.properties.foreach { case(k,v) => sb.append(",'" + escape(k) + "','" + escape(v) + "'" ) } 127 | sb.append(");") 128 | 129 | //Add the target vertex if necessary 130 | sb.append( "v2=graph.traversal().V(" + f.dstId + ").tryNext().orElse(null);") 131 | sb.append(" if(!v2) v2=graph.addVertex(id, " + f.dstId) 132 | f.dstAttr.properties.foreach { case(k,v) => sb.append(",'" + escape(k) + "','" + escape(v) + "'") } 133 | sb.append(");") 134 | 135 | //Add the edge 136 | sb.append("v1.addEdge('edge', v2") 137 | f.attr.properties.foreach { f => sb.append(",'" + escape(f._1) + "','" + escape(f._2) + "'") } 138 | sb.append(");") 139 | 140 | runScript(client, sb.toString ) 141 | } 142 | 143 | def addVertex(client: Client, id: Long, keyValues: Seq[(String,String)]){ 144 | val sb = new StringBuilder(); 145 | sb.append( "if(!graph.traversal().V(" + id + ")) graph.addVertex(id, " + id); 146 | keyValues.foreach { case(k,v) => sb.append("," + k + "," + v) } 147 | sb.append(")") 148 | runScript(client, 
sb.toString() ) 149 | } 150 | 151 | def runScript(client: Client, script: String){ 152 | //println("{\"gremlin\":" + JSONObject.quote( script ) + "}") 153 | val results = EntityEncoder[String].toEntity("{\"gremlin\":" + JSONObject.quote( script ) + "}" ).flatMap { 154 | entity => 155 | val gremlinUri = Uri.fromString( "http://localhost:8182" ).getOrElse( null ) 156 | client( 157 | Request( 158 | method = Method.POST, 159 | uri = gremlinUri, 160 | headers = Headers( 161 | Header("Accept", "application/json"), 162 | Header("Content-Type", "application/json") 163 | ), 164 | body = entity.body 165 | ) 166 | ).flatMap { response => 167 | val res = response.as[String] 168 | if (response.status.code == 200 ) { 169 | res 170 | } else { 171 | println( "Error received from Gremlin. Code : " + response.status.code + " reason: " + response.status.reason ) 172 | res 173 | } 174 | } 175 | }.attemptRun match { 176 | case -\/(e) => //Ignore 177 | case \/-(a) => println(a) 178 | } 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /helloSpark/.settings/org.scala-ide.sdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | scala.compiler.additionalParams=\ -Xsource\:2.10 -Ymacro-expand\:none 3 | scala.compiler.installation=78943290 4 | scala.compiler.sourceLevel=2.10 5 | scala.compiler.useProjectSettings=true 6 | -------------------------------------------------------------------------------- /helloSpark/build.sbt: -------------------------------------------------------------------------------- 1 | name := "helloSpark" 2 | 3 | version := "2.1" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies ++= { 8 | val sparkVersion = "1.6.0" 9 | Seq( 10 | "org.apache.spark" %% "spark-core" % sparkVersion % "provided", 11 | "org.apache.spark" %% "spark-sql" % sparkVersion % "provided", 12 | "org.apache.spark" %% "spark-repl" % sparkVersion % "provided" 13 | ) 14 | } 15 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false) 16 | -------------------------------------------------------------------------------- /helloSpark/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0") 2 | -------------------------------------------------------------------------------- /helloSpark/python/helloSpark.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark import SparkContext 3 | 4 | def computeStatsForCollection(sc,countPerPartitions=100000,partitions=5): 5 | totalNumber = min( countPerPartitions * partitions, sys.maxsize) 6 | rdd = sc.parallelize( range(totalNumber),partitions) 7 | return (rdd.mean(), rdd.variance()) 8 | 9 | if __name__ == "__main__": 10 | sc = SparkContext(appName="Hello Spark") 11 | print("Hello Spark Demo. 
Compute the mean and variance of a collection") 12 | stats = computeStatsForCollection(sc); 13 | print(">>> Results: ") 14 | print(">>>>>>>Mean: " + str(stats[0])); 15 | print(">>>>>>>Variance: " + str(stats[1])); 16 | sc.stop() -------------------------------------------------------------------------------- /helloSpark/python/helloSpark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/helloSpark/python/helloSpark/__init__.py -------------------------------------------------------------------------------- /helloSpark/python/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/helloSpark/python/setup.py -------------------------------------------------------------------------------- /helloSpark/readme.md: -------------------------------------------------------------------------------- 1 | # Start Developing with Spark 2 | 3 | ####Build a custom library for Apache® Spark™ and deploy it to a Jupyter Notebook. 4 | 5 | If you're new to developing Spark applications you've come to the right place. Our [**Start Developing with Spark** tutorial](https://developer.ibm.com/clouddataservices/start-developing-with-spark-and-notebooks/) provides detailed end-to-end steps that show you how to build a simple custom library for Spark (written in scala) and how to deploy it on IBM Analytics for Apache Spark for Bluemix. 6 | 7 | These steps are the foundation for building real-life production applications. You'll also learn how to manage your project with the import, test, and debug features of Scala IDE for Eclipse. 8 | 9 | [Get started](https://developer.ibm.com/clouddataservices/start-developing-with-spark-and-notebooks/) 10 | -------------------------------------------------------------------------------- /helloSpark/src/main/scala/com/ibm/cds/spark/samples/HelloSpark.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.ibm.cds.spark.samples 19 | 20 | import org.apache.spark._ 21 | 22 | object HelloSpark { 23 | 24 | //main method invoked when running as a standalone Spark Application 25 | def main(args: Array[String]) { 26 | val conf = new SparkConf().setAppName("Hello Spark") 27 | val spark = new SparkContext(conf) 28 | 29 | println("Hello Spark Demo. 
Compute the mean and variance of a collection") 30 | val stats = computeStatsForCollection(spark); 31 | println(">>> Results: ") 32 | println(">>>>>>>Mean: " + stats._1 ); 33 | println(">>>>>>>Variance: " + stats._2); 34 | spark.stop() 35 | } 36 | 37 | //Library method that can be invoked from Jupyter Notebook 38 | def computeStatsForCollection( spark: SparkContext, countPerPartitions: Int = 100000, partitions: Int=5): (Double, Double) = { 39 | val totalNumber = math.min( countPerPartitions * partitions, Long.MaxValue).toInt; 40 | val rdd = spark.parallelize( 1 until totalNumber,partitions); 41 | (rdd.mean(), rdd.variance()) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /helloSpark/src/main/scala/com/ibm/cds/spark/samples/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | /** 18 | * Spark Sample Applications 19 | * 20 | */ 21 | package com.ibm.cds.spark.samples; -------------------------------------------------------------------------------- /notebook/Get Service Credentials for Twitter Sentiment with Watson TA and PI.md: -------------------------------------------------------------------------------- 1 | # Set Up Services and Get Credentials 2 | 3 | These instructions accompany the [Twitter Sentiment analysis with Watson Tone Analyzer and Watson Personality Insights Notebook](https://github.com/ibm-watson-data-lab/spark.samples/tree/master/notebook). This sample notebook requires a connection to the following online services: 4 | 5 | - Twitter 6 | - Watson Tone Analyzer 7 | - Watson Personality Insights 8 | 9 | Follow these steps to set up, retrieve, and enter credentials for all 3 services: 10 | 11 | ## Get OAuth Credentials for Twitter 12 | 13 | 14 | Create a new app on your Twitter account and configure the OAuth credentials. 15 | 16 |
17 | 1. Go to https://apps.twitter.com/. Sign in and click the Create New App button
18 | 2. Complete the required fields.
19 | 3. Below the developer agreement, turn on the Yes, I agree check box and click Create your Twitter application.
20 | 4. Click the Keys and Access Tokens tab.
21 | 5. Scroll to the bottom of the page and click the Create My Access Tokens button.
22 | 6. Copy the Consumer Key, Consumer Secret, Access Token, and Access Token Secret. You will need them in a few minutes.
23 | 
24 |     twitter_keys (screenshot)
25 | 
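Optionally, sanity-check these four values before moving on. A minimal check with the python-twitter module (the same module the notebook installs) could look like the sketch below; the XXXX placeholders stand in for your own keys and tokens:

```
import twitter  # pip install --user python-twitter

# Replace the XXXX placeholders with the values copied above
api = twitter.Api(consumer_key="XXXX",
                  consumer_secret="XXXX",
                  access_token_key="XXXX",
                  access_token_secret="XXXX")

# Prints the account details when the credentials are valid;
# raises a TwitterError if any of the four values is wrong.
print(api.VerifyCredentials())
```
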
31 | 32 | ## Get Watson Personality Insights Credentials 33 | 34 | Provision the service and grab your credentials: 35 | 36 | 1. Still in Bluemix, go to the top menu, and click Catalog. 37 | 2. In the search box, type Personality Insights. 38 | 3. Click the Personality Insights service tile, then click Create. 39 | 4. On the left side of the screen, click Service Credentials and open or create credentials. 40 | 41 | ![creds](http://developer.ibm.com/clouddataservices/wp-content/uploads/sites/85/2016/10/pi_creds.png) 42 | 43 | 5. Copy the `username` and `password` values. 44 | 45 | 46 | ## Get Watson Tone Analyzer Credentials 47 | 48 | Provision the service and grab your credentials: 49 | 50 | 1. In a new browser tab or window, open Bluemix, go to the top menu, and click Catalog. 51 | 2. In the search box, type Tone Analyzer. 52 | 3. Click the Tone Analyzer tile, then click Create. 53 | 4. On the left side of the screen, click Service Credentials and open or create credentials. 54 | 5. Copy the `username` and `password` values. 55 | 56 | 57 | 58 | ## Paste Credentials into the Notebook 59 | 60 | 1. Return to your version of the [Twitter Sentiment analysis with Watson Tone Analyzer and Watson Personality Insights Notebook](https://github.com/ibm-watson-data-lab/spark.samples/tree/master/notebook) 61 | 62 | 2. Paste all the credentials you just collected into the notebook, replacing the XXXXs for each item: 63 | 64 | ``` 65 | sqlContext=SQLContext(sc) 66 | 67 | #Set up the twitter credentials, they will be used both in scala and python cells below 68 | consumerKey = "XXXX" 69 | consumerSecret = "XXXX" 70 | accessToken = "XXXX" 71 | accessTokenSecret = "XXXX" 72 | 73 | #Set up the Watson Personality insight credentials 74 | piUserName = "XXXX" 75 | piPassword = "XXXX" 76 | 77 | #Set up the Watson Tone Analyzer credentials 78 | taUserName = "XXXX" 79 | taPassword = "XXXX" 80 | ``` 81 | 82 | 83 | -------------------------------------------------------------------------------- /notebook/PYCON 2016 spark tutorial quick links.txt: -------------------------------------------------------------------------------- 1 | Bluemix: 2 | https://console.ng.bluemix.net 3 | 4 | FlightStats: 5 | https://developer.flightstats.com/signup 6 | https://developer.flightstats.com/admin/applications 7 | 8 | Simple Data Pipe: 9 | https://github.com/ibm-watson-data-lab/simple-data-pipe 10 | 11 | Flight Predict Notebook, Slides 36 & 37: 12 | https://github.com/ibm-watson-data-lab/simple-data-pipe-connector-flightstats/raw/master/notebook/Flight%20Predict%20PyCon%202016.ipynb 13 | 14 | Car Notebook, Slide 21: 15 | https://github.com/ibm-watson-data-lab/spark.samples/raw/master/notebook/DashDB%20Twitter%20Car%202015%20Python%20Notebook.ipynb 16 | 17 | 18 | SIMPLE DATA PIPE package.json: 19 | "simple-data-pipe-connector-flightstats":"git://github.com/ibm-watson-data-lab/simple-data-pipe-connector-flightstats.git" 20 | -------------------------------------------------------------------------------- /notebook/README.md: -------------------------------------------------------------------------------- 1 | # Sample Notebooks 2 | 3 | This repository contains sample notebooks that show you how to get the most out of IBM Analytics for Apache Spark. You may run these notebooks in a locally set up notebook environment (i.e., [Jupyter Notebook](https://jupyter.readthedocs.io/en/latest/install.html)) or through the [IBM Data Science Experience (DSX)](http://datascience.ibm.com/). 
4 | 5 | ## Service Credentials 6 | 7 | Some of the notebooks require credentials to various services (e.g., Twitter API, Watson Tone Analyzer, etc.). Instructions for provisioning these services and getting credentials are outlined here: [Set Up Services and Get Credentials](https://github.com/ibm-watson-data-lab/spark.samples/blob/master/notebook/Get%20Service%20Credentials%20for%20Twitter%20Sentiment%20with%20Watson%20TA%20and%20PI.md) 8 | 9 | 10 | ## Running a notebook in DSX 11 | 12 | More info and detailed instructions for DSX can be found in its [documentation](http://datascience.ibm.com/docs/content/getting-started/get-started.html). 13 | 14 | 1. Log into DSX 15 | 2. Go to __My Projects__ 16 | 3. Select an existing project or create a new project 17 | 18 | ##### To set up a new project 19 | 1. Click __create project__ 20 | 2. Enter a __Name__ 21 | 3. Select an existing or create a new __Spark Service__ to associate with the project 22 | 4. Select an existing or create a new __Target Object Storage Instance__ to associate with the project 23 | 5. Click __Create__ 24 | 25 | 4. Create a new notebook 26 | 27 | ##### To set up a new notebook 28 | 1. Click __add notebooks__ 29 | 2. Click __From URL__ 30 | 3. Enter a __Name__ 31 | 4. Enter the __Notebook URL__ 32 | 5. Select an existing __Spark Service__ to associate with the notebook 33 | 6. Click __Create Notebook__ 34 | 35 | 5. Once in the notebook, follow its instructions for running the notebook 36 | -------------------------------------------------------------------------------- /notebook/Twitter Sentiment with Watson TA and PI.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Twitter Sentiment analysis with Watson Tone Analyzer and Watson Personality Insights\n", 8 | "\n", 9 | "\n", 10 | "\n", 11 | "In this notebook, we perform the following steps: \n", 12 | "1. Install python-twitter and watson-developer-cloud modules\n", 13 | "2. Install the streaming Twitter jar using PixieDust packageManager\n", 14 | "3. Invoke the streaming Twitter app using the PixieDust Scala Bridge to get a DataFrame containing all the tweets enriched with Watson Tone Analyzer scores\n", 15 | "4. Create a new RDD that groups the tweets by author and concatenates all the associated tweets into one blob\n", 16 | "5. For each author and aggregated text, invoke the Watson Personality Insights to get the scores\n", 17 | "6. 
Visualize results using PixieDust display \n", 18 | "\n", 19 | "## Learn more \n", 20 | "* [Watson Tone Analyzer](http://www.ibm.com/watson/developercloud/tone-analyzer.html) \n", 21 | "* [Watson Personality Insights](http://www.ibm.com/watson/developercloud/personality-insights.html) \n", 22 | "* [python-twitter](https://github.com/bear/python-twitter) \n", 23 | "* [watson-developer-cloud](https://github.com/watson-developer-cloud) \n", 24 | "* [PixieDust](https://github.com/ibm-watson-data-lab/pixiedust)\n", 25 | "* [Realtime Sentiment Analysis of Twitter Hashtags with Spark](https://developer.ibm.com/clouddataservices/2016/01/15/real-time-sentiment-analysis-of-twitter-hashtags-with-spark)" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "# Install python-twitter and watson-developer-cloud\n", 33 | "If you haven't already installed the following modules, run these 2 cells:" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "!pip install --user python-twitter" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "!pip install --user watson-developer-cloud" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "# Install latest pixiedust\n", 63 | "Make sure you are running the latest pixiedust version. After upgrading restart the kernel before continuing to the next cells." 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": { 70 | "collapsed": true 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "!pip install --upgrade --user pixiedust" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## Install the streaming Twitter jar in the notebook from the Github repo\n", 82 | "This jar file contains the Spark Streaming application (written in Scala) that connects to Twitter to fetch the tweets and send them to Watson Tone Analyzer for analysis. The resulting scores are then added to the tweets dataframe as separate columns." 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "import pixiedust\n", 94 | "jarPath = \"https://github.com/ibm-watson-data-lab/spark.samples/raw/master/dist/streaming-twitter-assembly-1.6.jar\"\n", 95 | "pixiedust.installPackage(jarPath)\n", 96 | "print(\"done\")" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "

If PixieDust or the streaming Twitter jar were just installed or upgraded, restart the kernel before continuing.

" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "## Use Scala Bridge to run the command line version of the app\n", 111 | "Insert your credentials for Twitter, Watson Tone Analyzer, and Watson Personality Insights. Then run the following cell. \n", 112 | "[Read how to provision these services and get credentials](https://github.com/ibm-watson-data-lab/spark.samples/blob/master/notebook/Get%20Service%20Credentials%20for%20Twitter%20Sentiment%20with%20Watson%20TA%20and%20PI.md). " 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "collapsed": true 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "import pixiedust\n", 124 | "\n", 125 | "sqlContext=SQLContext(sc)\n", 126 | "\n", 127 | "#Set up the twitter credentials, they will be used both in scala and python cells below\n", 128 | "consumerKey = \"XXXX\"\n", 129 | "consumerSecret = \"XXXX\"\n", 130 | "accessToken = \"XXXX\"\n", 131 | "accessTokenSecret = \"XXXX\"\n", 132 | "\n", 133 | "#Set up the Watson Personality insight credentials\n", 134 | "piUserName = \"XXXX\"\n", 135 | "piPassword = \"XXXX\"\n", 136 | "\n", 137 | "#Set up the Watson Tone Analyzer credentials\n", 138 | "taUserName = \"XXXX\"\n", 139 | "taPassword = \"XXXX\"" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": { 146 | "collapsed": false, 147 | "scrolled": false 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "%%scala\n", 152 | "val demo = com.ibm.cds.spark.samples.StreamingTwitter\n", 153 | "demo.setConfig(\"twitter4j.oauth.consumerKey\",consumerKey)\n", 154 | "demo.setConfig(\"twitter4j.oauth.consumerSecret\",consumerSecret)\n", 155 | "demo.setConfig(\"twitter4j.oauth.accessToken\",accessToken)\n", 156 | "demo.setConfig(\"twitter4j.oauth.accessTokenSecret\",accessTokenSecret)\n", 157 | "demo.setConfig(\"watson.tone.url\",\"https://gateway.watsonplatform.net/tone-analyzer/api\")\n", 158 | "demo.setConfig(\"watson.tone.password\",taPassword)\n", 159 | "demo.setConfig(\"watson.tone.username\",taUserName)\n", 160 | "\n", 161 | "import org.apache.spark.streaming._\n", 162 | "demo.startTwitterStreaming(sc, Seconds(30)) //Run the application for a limited time" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "# Create a tweets dataframe from the data fetched above and transfer it to Python\n", 170 | "Notice the __ prefix for each variable which is used to signal PixieDust that the variable needs to be transfered back to Python" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "collapsed": false 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "%%scala\n", 182 | "val demo = com.ibm.cds.spark.samples.StreamingTwitter\n", 183 | "val (__sqlContext, __df) = demo.createTwitterDataFrames(sc)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "## Group the tweets by author and userid\n", 191 | "This will be used later to fetch the last 200 tweets for each author" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": false 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "import pyspark.sql.functions as F\n", 203 | "usersDF = __df.groupby(\"author\", \"userid\").agg(F.avg(\"Anger\").alias(\"Anger\"), F.avg(\"Disgust\").alias(\"Disgust\"))\n", 204 | "usersDF.show()" 205 | ] 
206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "# Set up the Twitter API from python-twitter module" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": { 218 | "collapsed": false 219 | }, 220 | "outputs": [], 221 | "source": [ 222 | "import twitter\n", 223 | "api = twitter.Api(consumer_key=consumerKey,\n", 224 | " consumer_secret=consumerSecret,\n", 225 | " access_token_key=accessToken,\n", 226 | " access_token_secret=accessTokenSecret)\n", 227 | "\n", 228 | "#print(api.VerifyCredentials())" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "# For each author, fetch the last 200 tweets\n", 236 | "use flatMap to return a new RDD that contains a list of tuples composed of userid and tweets text: (userid, tweetText)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": { 243 | "collapsed": false 244 | }, 245 | "outputs": [], 246 | "source": [ 247 | "def getTweets(screenName):\n", 248 | " statuses = api.GetUserTimeline(screen_name=screenName,\n", 249 | " since_id=None,\n", 250 | " max_id=None,\n", 251 | " count=200,\n", 252 | " include_rts=False,\n", 253 | " trim_user=False,\n", 254 | " exclude_replies=True)\n", 255 | " return statuses\n", 256 | "\n", 257 | "usersWithTweetsRDD = usersDF.flatMap(lambda s: [(s.user.screen_name, s.text.encode('ascii', 'ignore')) for s in getTweets(s['userid'])])\n", 258 | "print(usersWithTweetsRDD.count())" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "# Concatenate all the tweets for each user so we have enough words to send to Watson Personality Insights\n", 266 | "* Use map to create an RDD of key, value pair composed of userId and tweets \n", 267 | "* Use reduceByKey to group all record with same author and concatenate the tweets" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": { 274 | "collapsed": false, 275 | "scrolled": true 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "import re\n", 280 | "usersWithTweetsRDD2 = usersWithTweetsRDD.map(lambda s: (s[0], s[1])).reduceByKey(lambda s,t: s + '\\n' + t)\\\n", 281 | " .filter(lambda s: len(re.findall(r'\\w+', s[1])) > 100 )\n", 282 | "print(usersWithTweetsRDD2.count())\n", 283 | "#usersWithTweetsRDD2.take(2)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "# Call Watson Personality Insights on the text for each author\n", 291 | "Watson Personality Insights requires at least 100 words from its lexicon to be available, which may not exist for each user. This is why the getPersonlityInsight helper function guards against exceptions from calling Watson PI. If an exception occurs, then an empty array is returned. 
Each record with empty array is filtered out of the resulting RDD.\n", 292 | "\n", 293 | "Note also that we use broadcast variables to propagate the userName and password to the cluster" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "collapsed": false, 301 | "scrolled": true 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "from pyspark.sql.types import *\n", 306 | "from watson_developer_cloud import PersonalityInsightsV3\n", 307 | "broadCastPIUsername = sc.broadcast(piUserName)\n", 308 | "broadCastPIPassword = sc.broadcast(piPassword)\n", 309 | "def getPersonalityInsight(text, schema=False):\n", 310 | " personality_insights = PersonalityInsightsV3(\n", 311 | " version='2016-10-20',\n", 312 | " username=broadCastPIUsername.value,\n", 313 | " password=broadCastPIPassword.value)\n", 314 | " try:\n", 315 | " p = personality_insights.profile(\n", 316 | " text, content_type='text/plain',\n", 317 | " raw_scores=True, consumption_preferences=True)\n", 318 | "\n", 319 | " if schema:\n", 320 | " return \\\n", 321 | " [StructField(t['name'], FloatType()) for t in p[\"needs\"]] + \\\n", 322 | " [StructField(t['name'], FloatType()) for t in p[\"values\"]] + \\\n", 323 | " [StructField(t['name'], FloatType()) for t in p['personality' ]]\n", 324 | " else:\n", 325 | " return \\\n", 326 | " [t['raw_score'] for t in p[\"needs\"]] + \\\n", 327 | " [t['raw_score'] for t in p[\"values\"]] + \\\n", 328 | " [t['raw_score'] for t in p['personality']] \n", 329 | " except:\n", 330 | " return []\n", 331 | "\n", 332 | "usersWithPIRDD = usersWithTweetsRDD2.map(lambda s: [s[0]] + getPersonalityInsight(s[1])).filter(lambda s: len(s)>1)\n", 333 | "print(usersWithPIRDD.count())\n", 334 | "#usersWithPIRDD.take(2)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "# Convert the RDD back to a DataFrame and call PixieDust display to visualize the results\n", 342 | "The schema is automatically created from introspecting a sample payload result from Watson Personality Insights" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": { 349 | "collapsed": false, 350 | "pixiedust": { 351 | "displayParams": { 352 | "aggregation": "SUM", 353 | "handlerId": "barChart", 354 | "keyFields": "userid", 355 | "showLegend": "true", 356 | "stacked": "false", 357 | "staticFigure": "false", 358 | "title": "Personality Insights", 359 | "valueFields": "Challenge,Closeness,Curiosity,Excitement" 360 | } 361 | }, 362 | "scrolled": false 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "#convert to dataframe\n", 367 | "schema = StructType(\n", 368 | " [StructField('userid',StringType())] + getPersonalityInsight(usersWithTweetsRDD2.take(1)[0][1], schema=True)\n", 369 | ")\n", 370 | "\n", 371 | "usersWithPIDF = sqlContext.createDataFrame(\n", 372 | " usersWithPIRDD, schema\n", 373 | ")\n", 374 | "\n", 375 | "usersWithPIDF.cache()\n", 376 | "display(usersWithPIDF)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "# Compare Twitter users Personality Insights scores with this year presidential candidates\n", 384 | "\n", 385 | "For a quick look on the difference in Personality Insights scores Spark provides a describe() function that computes stddev and mean values off the dataframe. Compare differences in the scores of twitter users and presidential candidates." 
386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": { 392 | "collapsed": true 393 | }, 394 | "outputs": [], 395 | "source": [ 396 | "candidates = \"realDonaldTrump HillaryClinton\".split(\" \")\n", 397 | "candidatesRDD = sc.parallelize(candidates)\\\n", 398 | " .flatMap(lambda s: [(t.user.screen_name, t.text.encode('ascii', 'ignore')) for t in getTweets(s)])\\\n", 399 | " .map(lambda s: (s[0], s[1]))\\\n", 400 | " .reduceByKey(lambda s,t: s + '\\n' + t)\\\n", 401 | " .filter(lambda s: len(re.findall(r'\\w+', s[1])) > 100 )\\\n", 402 | " .map(lambda s: [s[0]] + getPersonalityInsight(s[1]))\n", 403 | "\n", 404 | "candidatesPIDF = sqlContext.createDataFrame(\n", 405 | " candidatesRDD, schema\n", 406 | ")" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "metadata": { 413 | "collapsed": true 414 | }, 415 | "outputs": [], 416 | "source": [ 417 | "c = candidatesPIDF.collect()\n", 418 | "broadCastTrumpPI = sc.broadcast(c[0][1:])\n", 419 | "broadCastHillaryPI = sc.broadcast(c[1][1:])" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": { 426 | "collapsed": false, 427 | "pixiedust": { 428 | "displayParams": { 429 | "handlerId": "dataframe" 430 | } 431 | } 432 | }, 433 | "outputs": [], 434 | "source": [ 435 | "display(candidatesPIDF)" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": { 442 | "collapsed": false 443 | }, 444 | "outputs": [], 445 | "source": [ 446 | "candidatesPIDF.select('userid','Emotional range','Agreeableness', 'Extraversion','Conscientiousness', 'Openness').show()\n", 447 | "\n", 448 | "usersWithPIDF.describe(['Emotional range']).show()\n", 449 | "usersWithPIDF.describe(['Agreeableness']).show()\n", 450 | "usersWithPIDF.describe(['Extraversion']).show()\n", 451 | "usersWithPIDF.describe(['Conscientiousness']).show()\n", 452 | "usersWithPIDF.describe(['Openness']).show()" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": {}, 458 | "source": [ 459 | "# Calculate Euclidean distance (norm) between each Twitter user and the presidential candidates using the Personality Insights scores\n", 460 | "\n", 461 | "Add the distances into 2 extra columns and display the results" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": { 468 | "collapsed": false, 469 | "pixiedust": { 470 | "displayParams": { 471 | "aggregation": "COUNT", 472 | "handlerId": "barChart", 473 | "keyFields": "closerHillary", 474 | "showLegend": "true", 475 | "stacked": "true", 476 | "staticFigure": "false", 477 | "valueFields": "closerHillary" 478 | } 479 | } 480 | }, 481 | "outputs": [], 482 | "source": [ 483 | "import numpy as np\n", 484 | "from pyspark.sql.types import Row\n", 485 | "def addEuclideanDistance(s):\n", 486 | " dict = s.asDict()\n", 487 | " def getEuclideanDistance(a,b):\n", 488 | " return np.linalg.norm(np.array(a) - np.array(b)).item()\n", 489 | " dict[\"distDonaldTrump\"]=getEuclideanDistance(s[1:], broadCastTrumpPI.value)\n", 490 | " dict[\"distHillary\"]=getEuclideanDistance(s[1:], broadCastHillaryPI.value)\n", 491 | " dict[\"closerHillary\"] = \"Yes\" if dict[\"distHillary\"] < dict[\"distDonaldTrump\"] else \"No\"\n", 492 | " return Row(**dict)\n", 493 | "\n", 494 | "#add euclidean distances to Trump and Hillary\n", 495 | "euclideanDF = sqlContext.createDataFrame(usersWithPIDF.map(lambda s: addEuclideanDistance(s)))\n", 496 | "\n", 497 | 
"#Reorder columns to have userid and distances first\n", 498 | "cols = euclideanDF.columns\n", 499 | "reorderCols = [\"userid\",\"distHillary\",\"distDonaldTrump\", \"closerHillary\"]\n", 500 | "euclideanDF = euclideanDF.select(reorderCols + [x for x in cols if x not in reorderCols])\n", 501 | "\n", 502 | "#PixieDust display. \n", 503 | "#To visualize the distribution, select the bar chart display, use closerHillary as key and value and aggregation=count\n", 504 | "display(euclideanDF)" 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "metadata": {}, 510 | "source": [ 511 | "# Optional: do some extra data science on the tweets" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": { 518 | "collapsed": false, 519 | "pixiedust": { 520 | "displayParams": { 521 | "aggregation": "COUNT", 522 | "handlerId": "barChart", 523 | "keyFields": "Anger", 524 | "showLegend": "true", 525 | "stacked": "true", 526 | "staticFigure": "false", 527 | "valueFields": "Openness" 528 | } 529 | } 530 | }, 531 | "outputs": [], 532 | "source": [ 533 | "tweets=__df\n", 534 | "tweets.count()\n", 535 | "display(tweets)" 536 | ] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "metadata": {}, 541 | "source": [ 542 | "# Compute the sentiment distributions for tweets with scores greater than 60% and create matplotlib chart visualization" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "metadata": { 549 | "collapsed": false 550 | }, 551 | "outputs": [], 552 | "source": [ 553 | "#create an array that will hold the count for each sentiment\n", 554 | "sentimentDistribution=[0] * 13\n", 555 | "#For each sentiment, run a sql query that counts the number of tweets for which the sentiment score is greater than 60%\n", 556 | "#Store the data in the array\n", 557 | "for i, sentiment in enumerate(tweets.columns[-13:]):\n", 558 | " sentimentDistribution[i]=__sqlContext.sql(\"SELECT count(*) as sentCount FROM tweets where \" + sentiment + \" > 60\")\\\n", 559 | " .collect()[0].sentCount" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "metadata": { 566 | "collapsed": false 567 | }, 568 | "outputs": [], 569 | "source": [ 570 | "%matplotlib inline\n", 571 | "import matplotlib\n", 572 | "import numpy as np\n", 573 | "import matplotlib.pyplot as plt\n", 574 | "\n", 575 | "ind=np.arange(13)\n", 576 | "width = 0.35\n", 577 | "bar = plt.bar(ind, sentimentDistribution, width, color='g', label = \"distributions\")\n", 578 | "\n", 579 | "params = plt.gcf()\n", 580 | "plSize = params.get_size_inches()\n", 581 | "params.set_size_inches( (plSize[0]*2.5, plSize[1]*2) )\n", 582 | "plt.ylabel('Tweet count')\n", 583 | "plt.xlabel('Tone')\n", 584 | "plt.title('Distribution of tweets by sentiments > 60%')\n", 585 | "plt.xticks(ind+width, tweets.columns[-13:])\n", 586 | "plt.legend()\n", 587 | "\n", 588 | "plt.show()" 589 | ] 590 | }, 591 | { 592 | "cell_type": "markdown", 593 | "metadata": {}, 594 | "source": [ 595 | "# Compute the top hashtags used in each tweet" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": null, 601 | "metadata": { 602 | "collapsed": true 603 | }, 604 | "outputs": [], 605 | "source": [ 606 | "from operator import add\n", 607 | "import re\n", 608 | "tagsRDD = tweets.flatMap( lambda t: re.split(\"\\s\", t.text))\\\n", 609 | " .filter( lambda word: word.startswith(\"#\") )\\\n", 610 | " .map( lambda word : (word, 1 ))\\\n", 611 | " .reduceByKey(add, 10).map(lambda 
(a,b): (b,a)).sortByKey(False).map(lambda (a,b):(b,a))\n", 612 | "top10tags = tagsRDD.take(10)" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": null, 618 | "metadata": { 619 | "collapsed": false 620 | }, 621 | "outputs": [], 622 | "source": [ 623 | "%matplotlib inline\n", 624 | "import matplotlib\n", 625 | "import matplotlib.pyplot as plt\n", 626 | "\n", 627 | "params = plt.gcf()\n", 628 | "plSize = params.get_size_inches()\n", 629 | "params.set_size_inches( (plSize[0]*2, plSize[1]*2) )\n", 630 | "\n", 631 | "labels = [i[0] for i in top10tags]\n", 632 | "sizes = [int(i[1]) for i in top10tags]\n", 633 | "colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral', \"beige\", \"paleturquoise\", \"pink\", \"lightyellow\", \"coral\"]\n", 634 | "\n", 635 | "plt.pie(sizes, labels=labels, colors=colors,autopct='%1.1f%%', shadow=True, startangle=90)\n", 636 | "\n", 637 | "plt.axis('equal')\n", 638 | "plt.show()" 639 | ] 640 | }, 641 | { 642 | "cell_type": "markdown", 643 | "metadata": {}, 644 | "source": [ 645 | "# Compute the aggregate sentiment distribution for all the tweets that contain the top hashtags" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": null, 651 | "metadata": { 652 | "collapsed": true 653 | }, 654 | "outputs": [], 655 | "source": [ 656 | "cols = tweets.columns[-13:]\n", 657 | "def expand( t ):\n", 658 | " ret = []\n", 659 | " for s in [i[0] for i in top10tags]:\n", 660 | " if ( s in t.text ):\n", 661 | " for tone in cols:\n", 662 | " ret += [s.replace(':','').replace('-','') + u\"-\" + unicode(tone) + \":\" + unicode(getattr(t, tone))]\n", 663 | " return ret \n", 664 | "def makeList(l):\n", 665 | " return l if isinstance(l, list) else [l]\n", 666 | "\n", 667 | "#Create RDD from tweets dataframe\n", 668 | "tagsRDD = tweets.map(lambda t: t )\n", 669 | "\n", 670 | "#Filter to only keep the entries that are in top10tags\n", 671 | "tagsRDD = tagsRDD.filter( lambda t: any(s in t.text for s in [i[0] for i in top10tags] ) )\n", 672 | "\n", 673 | "#Create a flatMap using the expand function defined above, this will be used to collect all the scores \n", 674 | "#for a particular tag with the following format: Tag-Tone-ToneScore\n", 675 | "tagsRDD = tagsRDD.flatMap( expand )\n", 676 | "\n", 677 | "#Create a map indexed by Tag-Tone keys \n", 678 | "tagsRDD = tagsRDD.map( lambda fullTag : (fullTag.split(\":\")[0], float( fullTag.split(\":\")[1]) ))\n", 679 | "\n", 680 | "#Call combineByKey to format the data as follow\n", 681 | "#Key=Tag-Tone\n", 682 | "#Value=(count, sum_of_all_score_for_this_tone)\n", 683 | "tagsRDD = tagsRDD.combineByKey((lambda x: (x,1)),\n", 684 | " (lambda x, y: (x[0] + y, x[1] + 1)),\n", 685 | " (lambda x, y: (x[0] + y[0], x[1] + y[1])))\n", 686 | "\n", 687 | "#ReIndex the map to have the key be the Tag and value be (Tone, Average_score) tuple\n", 688 | "#Key=Tag\n", 689 | "#Value=(Tone, average_score)\n", 690 | "tagsRDD = tagsRDD.map(lambda (key, ab): (key.split(\"-\")[0], (key.split(\"-\")[1], round(ab[0]/ab[1], 2))))\n", 691 | "\n", 692 | "#Reduce the map on the Tag key, value becomes a list of (Tone,average_score) tuples\n", 693 | "tagsRDD = tagsRDD.reduceByKey( lambda x, y : makeList(x) + makeList(y) )\n", 694 | "\n", 695 | "#Sort the (Tone,average_score) tuples alphabetically by Tone\n", 696 | "tagsRDD = tagsRDD.mapValues( lambda x : sorted(x) )\n", 697 | "\n", 698 | "#Format the data as expected by the plotting code in the next cell. 
\n", 699 | "#map the Values to a tuple as follow: ([list of tone], [list of average score])\n", 700 | "#e.g. #someTag:([u'Agreeableness', u'Analytical', u'Anger', u'Cheerfulness', u'Confident', u'Conscientiousness', u'Negative', u'Openness', u'Tentative'], [1.0, 0.0, 0.0, 1.0, 0.0, 0.48, 0.0, 0.02, 0.0])\n", 701 | "tagsRDD = tagsRDD.mapValues( lambda x : ([elt[0] for elt in x],[elt[1] for elt in x]) )\n", 702 | "\n", 703 | "#Use custom sort function to sort the entries by order of appearance in top10tags\n", 704 | "def customCompare( key ):\n", 705 | " for (k,v) in top10tags:\n", 706 | " if k == key:\n", 707 | " return v\n", 708 | " return 0\n", 709 | "tagsRDD = tagsRDD.sortByKey(ascending=False, numPartitions=None, keyfunc = customCompare)\n", 710 | "\n", 711 | "#Take the mean tone scores for the top 10 tags\n", 712 | "top10tagsMeanScores = tagsRDD.take(10)" 713 | ] 714 | }, 715 | { 716 | "cell_type": "code", 717 | "execution_count": null, 718 | "metadata": { 719 | "collapsed": false 720 | }, 721 | "outputs": [], 722 | "source": [ 723 | "%matplotlib inline\n", 724 | "import matplotlib\n", 725 | "import numpy as np\n", 726 | "import matplotlib.pyplot as plt\n", 727 | "\n", 728 | "params = plt.gcf()\n", 729 | "plSize = params.get_size_inches()\n", 730 | "params.set_size_inches( (plSize[0]*3, plSize[1]*2) )\n", 731 | "\n", 732 | "top5tagsMeanScores = top10tagsMeanScores[:5]\n", 733 | "width = 0\n", 734 | "ind=np.arange(13)\n", 735 | "(a,b) = top5tagsMeanScores[0]\n", 736 | "labels=b[0]\n", 737 | "colors = [\"beige\", \"paleturquoise\", \"pink\", \"lightyellow\", \"coral\", \"lightgreen\", \"gainsboro\", \"aquamarine\",\"c\"]\n", 738 | "idx=0\n", 739 | "for key, value in top5tagsMeanScores:\n", 740 | " plt.bar(ind + width, value[1], 0.15, color=colors[idx], label=key)\n", 741 | " width += 0.15\n", 742 | " idx += 1\n", 743 | "plt.xticks(ind+0.3, labels)\n", 744 | "plt.ylabel('AVERAGE SCORE')\n", 745 | "plt.xlabel('TONES')\n", 746 | "plt.title('Breakdown of top hashtags by sentiment tones')\n", 747 | "\n", 748 | "plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc='center',ncol=5, mode=\"expand\", borderaxespad=0.)\n", 749 | "\n", 750 | "plt.show()" 751 | ] 752 | }, 753 | { 754 | "cell_type": "markdown", 755 | "metadata": {}, 756 | "source": [ 757 | "# Optional: Use Twitter demo embedded app to run the same app with a UI" 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": null, 763 | "metadata": { 764 | "collapsed": false 765 | }, 766 | "outputs": [], 767 | "source": [ 768 | "%%scala\n", 769 | "val demo = com.ibm.cds.spark.samples.PixiedustStreamingTwitter\n", 770 | "demo.setConfig(\"twitter4j.oauth.consumerKey\",consumerKey)\n", 771 | "demo.setConfig(\"twitter4j.oauth.consumerSecret\",consumerSecret)\n", 772 | "demo.setConfig(\"twitter4j.oauth.accessToken\",accessToken)\n", 773 | "demo.setConfig(\"twitter4j.oauth.accessTokenSecret\",accessTokenSecret)\n", 774 | "demo.setConfig(\"watson.tone.url\",\"https://gateway.watsonplatform.net/tone-analyzer/api\")\n", 775 | "demo.setConfig(\"watson.tone.password\",taPassword)\n", 776 | "demo.setConfig(\"watson.tone.username\",taUserName)\n", 777 | "demo.setConfig(\"checkpointDir\", System.getProperty(\"user.home\") + \"/pixiedust/ssc\")" 778 | ] 779 | }, 780 | { 781 | "cell_type": "code", 782 | "execution_count": null, 783 | "metadata": { 784 | "collapsed": true 785 | }, 786 | "outputs": [], 787 | "source": [ 788 | "!pip install --upgrade --user pixiedust-twitterdemo" 789 | ] 790 | }, 791 | { 792 | "cell_type": "code", 793 | 
"execution_count": null, 794 | "metadata": { 795 | "collapsed": false, 796 | "pixiedust": { 797 | "displayParams": { 798 | "handlerId": "twitterdemo" 799 | } 800 | } 801 | }, 802 | "outputs": [], 803 | "source": [ 804 | "from pixiedust_twitterdemo import *\n", 805 | "twitterDemo()" 806 | ] 807 | }, 808 | { 809 | "cell_type": "markdown", 810 | "metadata": {}, 811 | "source": [ 812 | "## The embedded app has generated a DataFrame called __tweets. Let's use it to do some data science" 813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": null, 818 | "metadata": { 819 | "collapsed": false, 820 | "pixiedust": { 821 | "displayParams": { 822 | "handlerId": "dataframe" 823 | } 824 | } 825 | }, 826 | "outputs": [], 827 | "source": [ 828 | "display(__tweets)" 829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": null, 834 | "metadata": { 835 | "collapsed": false, 836 | "pixiedust": { 837 | "displayParams": { 838 | "aggregation": "COUNT", 839 | "handlerId": "barChart", 840 | "keyFields": "emotion", 841 | "showLegend": "true", 842 | "stacked": "true", 843 | "valueFields": "score" 844 | } 845 | } 846 | }, 847 | "outputs": [], 848 | "source": [ 849 | "from pyspark.sql import Row\n", 850 | "from pyspark.sql.types import *\n", 851 | "emotions=__tweets.columns[-13:]\n", 852 | "distrib = __tweets.flatMap(lambda t: [(x,t[x]) for x in emotions]).filter(lambda t: t[1]>60)\\\n", 853 | " .toDF(StructType([StructField('emotion',StringType()),StructField('score',DoubleType())]))\n", 854 | "display(distrib)" 855 | ] 856 | }, 857 | { 858 | "cell_type": "code", 859 | "execution_count": null, 860 | "metadata": { 861 | "collapsed": false 862 | }, 863 | "outputs": [], 864 | "source": [ 865 | "__tweets.registerTempTable(\"pixiedust_tweets\")\n", 866 | "#create an array that will hold the count for each sentiment\n", 867 | "sentimentDistribution=[0] * 13\n", 868 | "#For each sentiment, run a sql query that counts the number of tweets for which the sentiment score is greater than 60%\n", 869 | "#Store the data in the array\n", 870 | "for i, sentiment in enumerate(__tweets.columns[-13:]):\n", 871 | " sentimentDistribution[i]=sqlContext.sql(\"SELECT count(*) as sentCount FROM pixiedust_tweets where \" + sentiment + \" > 60\")\\\n", 872 | " .collect()[0].sentCount" 873 | ] 874 | }, 875 | { 876 | "cell_type": "code", 877 | "execution_count": null, 878 | "metadata": { 879 | "collapsed": false 880 | }, 881 | "outputs": [], 882 | "source": [ 883 | "%matplotlib inline\n", 884 | "import matplotlib\n", 885 | "import numpy as np\n", 886 | "import matplotlib.pyplot as plt\n", 887 | "\n", 888 | "ind=np.arange(13)\n", 889 | "width = 0.35\n", 890 | "bar = plt.bar(ind, sentimentDistribution, width, color='g', label = \"distributions\")\n", 891 | "\n", 892 | "params = plt.gcf()\n", 893 | "plSize = params.get_size_inches()\n", 894 | "params.set_size_inches( (plSize[0]*2.5, plSize[1]*2) )\n", 895 | "plt.ylabel('Tweet count')\n", 896 | "plt.xlabel('Tone')\n", 897 | "plt.title('Distribution of tweets by sentiments > 60%')\n", 898 | "plt.xticks(ind+width, __tweets.columns[-13:])\n", 899 | "plt.legend()\n", 900 | "\n", 901 | "plt.show()" 902 | ] 903 | }, 904 | { 905 | "cell_type": "code", 906 | "execution_count": null, 907 | "metadata": { 908 | "collapsed": true 909 | }, 910 | "outputs": [], 911 | "source": [ 912 | "from operator import add\n", 913 | "import re\n", 914 | "tagsRDD = __tweets.flatMap( lambda t: re.split(\"\\s\", t.text))\\\n", 915 | " .filter( lambda word: word.startswith(\"#\") 
)\\\n", 916 | " .map( lambda word : (word, 1 ))\\\n", 917 | " .reduceByKey(add, 10).map(lambda (a,b): (b,a)).sortByKey(False).map(lambda (a,b):(b,a))\n", 918 | "top10tags = tagsRDD.take(10)" 919 | ] 920 | }, 921 | { 922 | "cell_type": "code", 923 | "execution_count": null, 924 | "metadata": { 925 | "collapsed": false 926 | }, 927 | "outputs": [], 928 | "source": [ 929 | "%matplotlib inline\n", 930 | "import matplotlib\n", 931 | "import matplotlib.pyplot as plt\n", 932 | "\n", 933 | "params = plt.gcf()\n", 934 | "plSize = params.get_size_inches()\n", 935 | "params.set_size_inches( (plSize[0]*2, plSize[1]*2) )\n", 936 | "\n", 937 | "labels = [i[0] for i in top10tags]\n", 938 | "sizes = [int(i[1]) for i in top10tags]\n", 939 | "colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral', \"beige\", \"paleturquoise\", \"pink\", \"lightyellow\", \"coral\"]\n", 940 | "\n", 941 | "plt.pie(sizes, labels=labels, colors=colors,autopct='%1.1f%%', shadow=True, startangle=90)\n", 942 | "\n", 943 | "plt.axis('equal')\n", 944 | "plt.show()" 945 | ] 946 | }, 947 | { 948 | "cell_type": "code", 949 | "execution_count": null, 950 | "metadata": { 951 | "collapsed": true 952 | }, 953 | "outputs": [], 954 | "source": [ 955 | "cols = __tweets.columns[-13:]\n", 956 | "def expand( t ):\n", 957 | " ret = []\n", 958 | " for s in [i[0] for i in top10tags]:\n", 959 | " if ( s in t.text ):\n", 960 | " for tone in cols:\n", 961 | " ret += [s.replace(':','').replace('-','') + u\"-\" + unicode(tone) + \":\" + unicode(getattr(t, tone))]\n", 962 | " return ret \n", 963 | "def makeList(l):\n", 964 | " return l if isinstance(l, list) else [l]\n", 965 | "\n", 966 | "#Create RDD from tweets dataframe\n", 967 | "tagsRDD = __tweets.map(lambda t: t )\n", 968 | "\n", 969 | "#Filter to only keep the entries that are in top10tags\n", 970 | "tagsRDD = tagsRDD.filter( lambda t: any(s in t.text for s in [i[0] for i in top10tags] ) )\n", 971 | "\n", 972 | "#Create a flatMap using the expand function defined above, this will be used to collect all the scores \n", 973 | "#for a particular tag with the following format: Tag-Tone-ToneScore\n", 974 | "tagsRDD = tagsRDD.flatMap( expand )\n", 975 | "\n", 976 | "#Create a map indexed by Tag-Tone keys \n", 977 | "tagsRDD = tagsRDD.map( lambda fullTag : (fullTag.split(\":\")[0], float( fullTag.split(\":\")[1]) ))\n", 978 | "\n", 979 | "#Call combineByKey to format the data as follow\n", 980 | "#Key=Tag-Tone\n", 981 | "#Value=(count, sum_of_all_score_for_this_tone)\n", 982 | "tagsRDD = tagsRDD.combineByKey((lambda x: (x,1)),\n", 983 | " (lambda x, y: (x[0] + y, x[1] + 1)),\n", 984 | " (lambda x, y: (x[0] + y[0], x[1] + y[1])))\n", 985 | "\n", 986 | "#ReIndex the map to have the key be the Tag and value be (Tone, Average_score) tuple\n", 987 | "#Key=Tag\n", 988 | "#Value=(Tone, average_score)\n", 989 | "tagsRDD = tagsRDD.map(lambda (key, ab): (key.split(\"-\")[0], (key.split(\"-\")[1], round(ab[0]/ab[1], 2))))\n", 990 | "\n", 991 | "#Reduce the map on the Tag key, value becomes a list of (Tone,average_score) tuples\n", 992 | "tagsRDD = tagsRDD.reduceByKey( lambda x, y : makeList(x) + makeList(y) )\n", 993 | "\n", 994 | "#Sort the (Tone,average_score) tuples alphabetically by Tone\n", 995 | "tagsRDD = tagsRDD.mapValues( lambda x : sorted(x) )\n", 996 | "\n", 997 | "#Format the data as expected by the plotting code in the next cell. \n", 998 | "#map the Values to a tuple as follow: ([list of tone], [list of average score])\n", 999 | "#e.g. 
#someTag:([u'Agreeableness', u'Analytical', u'Anger', u'Cheerfulness', u'Confident', u'Conscientiousness', u'Negative', u'Openness', u'Tentative'], [1.0, 0.0, 0.0, 1.0, 0.0, 0.48, 0.0, 0.02, 0.0])\n", 1000 | "tagsRDD = tagsRDD.mapValues( lambda x : ([elt[0] for elt in x],[elt[1] for elt in x]) )\n", 1001 | "\n", 1002 | "#Use custom sort function to sort the entries by order of appearance in top10tags\n", 1003 | "def customCompare( key ):\n", 1004 | " for (k,v) in top10tags:\n", 1005 | " if k == key:\n", 1006 | " return v\n", 1007 | " return 0\n", 1008 | "tagsRDD = tagsRDD.sortByKey(ascending=False, numPartitions=None, keyfunc = customCompare)\n", 1009 | "\n", 1010 | "#Take the mean tone scores for the top 10 tags\n", 1011 | "top10tagsMeanScores = tagsRDD.take(10)" 1012 | ] 1013 | }, 1014 | { 1015 | "cell_type": "code", 1016 | "execution_count": null, 1017 | "metadata": { 1018 | "collapsed": false 1019 | }, 1020 | "outputs": [], 1021 | "source": [ 1022 | "%matplotlib inline\n", 1023 | "import matplotlib\n", 1024 | "import numpy as np\n", 1025 | "import matplotlib.pyplot as plt\n", 1026 | "\n", 1027 | "params = plt.gcf()\n", 1028 | "plSize = params.get_size_inches()\n", 1029 | "params.set_size_inches( (plSize[0]*3, plSize[1]*2) )\n", 1030 | "\n", 1031 | "top5tagsMeanScores = top10tagsMeanScores[:5]\n", 1032 | "width = 0\n", 1033 | "ind=np.arange(13)\n", 1034 | "(a,b) = top5tagsMeanScores[0]\n", 1035 | "labels=b[0]\n", 1036 | "colors = [\"beige\", \"paleturquoise\", \"pink\", \"lightyellow\", \"coral\", \"lightgreen\", \"gainsboro\", \"aquamarine\",\"c\"]\n", 1037 | "idx=0\n", 1038 | "for key, value in top5tagsMeanScores:\n", 1039 | " plt.bar(ind + width, value[1], 0.15, color=colors[idx], label=key)\n", 1040 | " width += 0.15\n", 1041 | " idx += 1\n", 1042 | "plt.xticks(ind+0.3, labels)\n", 1043 | "plt.ylabel('AVERAGE SCORE')\n", 1044 | "plt.xlabel('TONES')\n", 1045 | "plt.title('Breakdown of top hashtags by sentiment tones')\n", 1046 | "\n", 1047 | "plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc='center',ncol=5, mode=\"expand\", borderaxespad=0.)\n", 1048 | "\n", 1049 | "plt.show()" 1050 | ] 1051 | } 1052 | ], 1053 | "metadata": { 1054 | "anaconda-cloud": {}, 1055 | "kernelspec": { 1056 | "display_name": "pySpark (Spark 1.6.0) Python 2", 1057 | "language": "python", 1058 | "name": "pyspark1.6python2" 1059 | }, 1060 | "language_info": { 1061 | "codemirror_mode": { 1062 | "name": "ipython", 1063 | "version": 2 1064 | }, 1065 | "file_extension": ".py", 1066 | "mimetype": "text/x-python", 1067 | "name": "python", 1068 | "nbconvert_exporter": "python", 1069 | "pygments_lexer": "ipython2", 1070 | "version": "2.7.11" 1071 | } 1072 | }, 1073 | "nbformat": 4, 1074 | "nbformat_minor": 0 1075 | } 1076 | -------------------------------------------------------------------------------- /streaming-twitter/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 
| 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | -------------------------------------------------------------------------------- /streaming-twitter/.gitignore: -------------------------------------------------------------------------------- 1 | /bin/ 2 | /config 3 | -------------------------------------------------------------------------------- /streaming-twitter/.project: -------------------------------------------------------------------------------- 1 | 2 | streaming-twitter 3 | 4 | 5 | org.scala-ide.sdt.core.scalabuilder 6 | 7 | 8 | 9 | org.scala-ide.sdt.core.scalanature 10 | org.eclipse.jdt.core.javanature 11 | 12 | 13 | -------------------------------------------------------------------------------- /streaming-twitter/build.sbt: -------------------------------------------------------------------------------- 1 | name := "streaming-twitter" 2 | 3 | version := "1.6" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies ++= { 8 | val sparkVersion = "1.6.0" 9 | Seq( 10 | "org.apache.spark" %% "spark-core" % sparkVersion % "provided", 11 | "org.apache.spark" %% "spark-sql" % sparkVersion % "provided", 12 | "org.apache.spark" %% "spark-streaming" % sparkVersion % "provided", 13 | "org.apache.spark" %% "spark-streaming-twitter" % sparkVersion, 14 | "org.apache.spark" %% "spark-repl" % sparkVersion % "provided", 15 | "com.ibm" %% "couchdb-scala" % "0.5.3", 16 | "org.apache.kafka" % "kafka-log4j-appender" % "0.9.0.0", 17 | "org.apache.kafka" % "kafka-clients" % "0.9.0.0", 18 | "org.apache.kafka" %% "kafka" % "0.9.0.0", 19 | "com.google.guava" % "guava" % "14.0.1" 20 | ) 21 | } 22 | 23 | assemblyMergeStrategy in assembly := { 24 | case PathList("org", "apache", "spark", xs @ _*) => MergeStrategy.first 25 | case PathList("scala", xs @ _*) => MergeStrategy.discard 26 | case PathList("com", "ibm", "pixiedust", xs @ _*) => MergeStrategy.discard 27 | case PathList("META-INF", "maven", "org.slf4j", xs @ _* ) => MergeStrategy.first 28 | case x => 29 | val oldStrategy = (assemblyMergeStrategy in assembly).value 30 | oldStrategy(x) 31 | } 32 | 33 | unmanagedBase <<= baseDirectory { base => base / "lib" } 34 | 35 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false) 36 | 37 | resolvers += "scalaz-bintray" at "https://dl.bintray.com/scalaz/releases" 38 | resolvers += "Local couchdb-scala repo" at (baseDirectory.value / "lib/couchdb-scala").toURI.toString 39 | -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-javadoc.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-javadoc.jar -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-javadoc.jar.md5: 
-------------------------------------------------------------------------------- 1 | e5ee6d0be04b3b9fc6f2f9c7dabc2497 -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-javadoc.jar.sha1: -------------------------------------------------------------------------------- 1 | ba8a2e725a4aae35185cbc0862f93fb86dc50138 -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-sources.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-sources.jar -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-sources.jar.md5: -------------------------------------------------------------------------------- 1 | be140baa91495e6a161eb95b3415b48d -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-sources.jar.sha1: -------------------------------------------------------------------------------- 1 | eda716f52436863b442564400ebcecc09662d8f7 -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3.jar -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3.jar.md5: -------------------------------------------------------------------------------- 1 | 554911d3e139c8ba42957989e4f76428 -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3.jar.sha1: -------------------------------------------------------------------------------- 1 | 6c25040548743c9ae0bb2cf4636ec9da9d55068c -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3.pom: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | com.ibm 5 | couchdb-scala_2.10 6 | jar 7 | A purely functional Scala client for CouchDB 8 | https://github.com/beloglazov/couchdb-scala 9 | 0.5.3 10 | 11 | 12 | The Apache Software License, Version 2.0 13 | http://www.apache.org/licenses/LICENSE-2.0.txt 14 | repo 15 | 16 | 17 | couchdb-scala 18 | 19 | com.ibm 20 | https://github.com/beloglazov/couchdb-scala 21 | 22 | 23 | scm:git:git@github.com:beloglazov/couchdb-scala.git 24 | scm:git:git@github.com:beloglazov/couchdb-scala.git 25 | https://github.com/beloglazov/couchdb-scala 26 | 27 | 28 | 29 | beloglazov 30 | Anton Beloglazov 31 | anton.beloglazov@gmail.com 32 | 
http://beloglazov.info 33 | 34 | 35 | 36 | 37 | org.scala-lang 38 | scala-library 39 | 2.10.4 40 | 41 | 42 | org.scalaz 43 | scalaz-core_2.10 44 | 7.1.0 45 | 46 | 47 | org.scalaz 48 | scalaz-effect_2.10 49 | 7.1.0 50 | 51 | 52 | org.http4s 53 | http4s-core_2.10 54 | 0.8.2 55 | 56 | 57 | org.http4s 58 | http4s-client_2.10 59 | 0.8.2 60 | 61 | 62 | org.http4s 63 | http4s-blazeclient_2.10 64 | 0.8.2 65 | 66 | 67 | com.lihaoyi 68 | upickle_2.10 69 | 0.2.6 70 | 71 | 72 | com.github.julien-truffaut 73 | monocle-core_2.10 74 | 1.0.1 75 | 76 | 77 | com.github.julien-truffaut 78 | monocle-macro_2.10 79 | 1.0.1 80 | 81 | 82 | org.log4s 83 | log4s_2.10 84 | 1.1.3 85 | 86 | 87 | org.specs2 88 | specs2_2.10 89 | 2.4.16 90 | test 91 | 92 | 93 | org.typelevel 94 | scalaz-specs2_2.10 95 | 0.3.0 96 | test 97 | 98 | 99 | org.scalacheck 100 | scalacheck_2.10 101 | 1.12.1 102 | test 103 | 104 | 105 | org.scalaz 106 | scalaz-scalacheck-binding_2.10 107 | 7.1.0 108 | test 109 | 110 | 111 | ch.qos.logback 112 | logback-classic 113 | 1.1.2 114 | test 115 | 116 | 117 | -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3.pom.md5: -------------------------------------------------------------------------------- 1 | c19ebb91556b46c2e2a7ff027b351e15 -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3.pom.sha1: -------------------------------------------------------------------------------- 1 | 342d29d046750084aabf94c85081f54e19bbcaa6 -------------------------------------------------------------------------------- /streaming-twitter/lib/messagehub.login-1.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/streaming-twitter/lib/messagehub.login-1.0.0.jar -------------------------------------------------------------------------------- /streaming-twitter/lib/pixiedust.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/streaming-twitter/lib/pixiedust.jar -------------------------------------------------------------------------------- /streaming-twitter/notebook/Spark Streaming Twitter-Watson-MessageHub.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#Spark Streaming sample application using Twitter, Watson Tone Analyzer, Event Hub and Message Hub\n", 8 | "In this Notebook, we show how to run a Spark Streaming application using a Notebook. There are multiple limitations to be aware of: \n", 9 | "1. The application will stop when the page is refreshed or closed\n", 10 | "2. As events are being processed, the application generates lots of console output which may cause memory to build up in the browser. Therefore it is not recommended to run the application for too long \n", 11 | "\n", 12 | "The code can be found here: https://github.com/ibm-watson-data-lab/spark.samples/tree/master/streaming-twitter \n", 13 | "The following code is using a pre-built jar that has been posted on the Github project, but you can replace with your own url if needed." 
14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "%AddJar https://github.com/DTAIEB/demos/raw/master/streaming-twitter-assembly-1.6.jar -f" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "##Set up the credentials for the different services\n", 32 | "Please refer to the tutorial for details on how to find the credentials for all the services, then add the value in the placeholders specified in the code below" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "val demo = com.ibm.cds.spark.samples.MessageHubStreamingTwitter\n", 44 | "val config = demo.getConfig()\n", 45 | "\n", 46 | "//Watson Tone Analyzer service\n", 47 | "config.setConfig(\"watson.tone.url\",\"https://gateway.watsonplatform.net/tone-analyzer-beta/api\")\n", 48 | "config.setConfig(\"watson.tone.password\",\"XXXX\")\n", 49 | "config.setConfig(\"watson.tone.username\",\"XXXX\")\n", 50 | "\n", 51 | "//Message Hub/Kafka service\n", 52 | "config.setConfig(\"bootstrap.servers\",\"kafka01-prod01.messagehub.services.us-south.bluemix.net:9093,kafka02-prod01.messagehub.services.us-south.bluemix.net:9093,kafka03-prod01.messagehub.services.us-south.bluemix.net:9093,kafka04-prod01.messagehub.services.us-south.bluemix.net:9093,kafka05-prod01.messagehub.services.us-south.bluemix.net:9093\")\n", 53 | "config.setConfig(\"api_key\",\"XXXX\")\n", 54 | "config.setConfig(\"kafka.topic.tweet\",\"twitter-spark\")\n", 55 | "config.setConfig(\"kafka.user.name\",\"XXXX\")\n", 56 | "config.setConfig(\"kafka.user.password\",\"XXXX\")\n", 57 | "config.setConfig(\"kafka_rest_url\",\"https://kafka-rest-prod01.messagehub.services.us-south.bluemix.net:443\")\n", 58 | "\n", 59 | "//Spark Streaming checkpointing configuration with Object Storage Swift container\n", 60 | "config.setConfig(\"name\",\"spark\");\n", 61 | "config.setConfig(\"auth_url\",\"https://identity.open.softlayer.com\");\n", 62 | "config.setConfig(\"project_id\",\"XXXX\");\n", 63 | "config.setConfig(\"region\",\"dallas\");\n", 64 | "config.setConfig(\"user_id\",\"XXXX\");\n", 65 | "config.setConfig(\"password\",\"XXXX\");\n", 66 | "config.setConfig(\"checkpointDir\", \"swift://notebooks.spark/ssc\")" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "##Producing tweets directly from Twitter\n", 74 | "Optional: The following cell is to be used only if your MessageConnect service doesn't work. 
\n", 75 | "In the next cell, you configure your Twitter credentials and call the code that will connect to Twitter, fetch the tweets and send them to MessageHub for consumption (Please refer to the tutorial for more information)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "config.setConfig(\"twitter4j.oauth.consumerKey\",\"XXXX\")\n", 87 | "config.setConfig(\"twitter4j.oauth.consumerSecret\",\"XXXX\")\n", 88 | "config.setConfig(\"twitter4j.oauth.accessToken\",\"XXXX\")\n", 89 | "config.setConfig(\"twitter4j.oauth.accessTokenSecret\",\"XXXX\")\n", 90 | "val twitterStream = com.ibm.cds.spark.samples.KafkaProducerTest.createTwitterStream(config)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "##Start the Spark Stream to collect tweets from Message Hub\n", 98 | "Start a new Twitter Stream that collects the live tweets and enrich them with Sentiment Analysis scores. The stream is run for a duration specified in the second argument of the **startTwitterStreaming** method.\n", 99 | "Note: if no duration is specified then the stream will run until the **stopTwitterStreaming** method is called." 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": false, 107 | "scrolled": false 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "demo.startTwitterStreaming(sc)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "##Close the Tweet producer\n", 119 | "Optional: To be used only if you have started it" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "com.ibm.cds.spark.samples.KafkaProducerTest.closeTwitterStream" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "##Close the Spark Streaming" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": { 144 | "collapsed": false 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "demo.stopTwitterStreaming" 149 | ] 150 | } 151 | ], 152 | "metadata": { 153 | "kernelspec": { 154 | "display_name": "Scala 2.10", 155 | "language": "scala", 156 | "name": "spark" 157 | }, 158 | "language_info": { 159 | "name": "scala" 160 | }, 161 | "name": "Twitter + Watson Tone Analyzer Part 1.ipynb" 162 | }, 163 | "nbformat": 4, 164 | "nbformat_minor": 0 165 | } -------------------------------------------------------------------------------- /streaming-twitter/notebook/Twitter + Watson Tone Analyzer Part 1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#Twitter + Watson Tone Analyzer sample Notebook Part 1: Loading the data\n", 8 | "In this Notebook, we show how to load the custom library generate as part of the Twitter + Watson Tone Analyzer streaming application. Code can be found here: https://github.com/ibm-watson-data-lab/spark.samples/tree/master/streaming-twitter.\n", 9 | "The following code is using a pre-built jar has been posted on the Github project, but you can replace with your own url if needed." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "collapsed": false 17 | }, 18 | "outputs": [ 19 | { 20 | "name": "stdout", 21 | "output_type": "stream", 22 | "text": [ 23 | "Starting download from https://github.com/ibm-watson-data-lab/spark.samples/raw/master/dist/streaming-twitter-assembly-1.6.jar\n", 24 | "Finished download of streaming-twitter-assembly-1.6.jar\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "%AddJar https://github.com/ibm-watson-data-lab/spark.samples/raw/master/dist/streaming-twitter-assembly-1.6.jar -f" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "##Set up the Twitter and Watson credentials\n", 37 | "Please refer to the tutorial for details on how to find the Twitter and Watson credentials, then add the value in the placeholders specified in the code below" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "val demo = com.ibm.cds.spark.samples.StreamingTwitter\n", 49 | "demo.setConfig(\"twitter4j.oauth.consumerKey\",\"XXXX\")\n", 50 | "demo.setConfig(\"twitter4j.oauth.consumerSecret\",\"XXXX\")\n", 51 | "demo.setConfig(\"twitter4j.oauth.accessToken\",\"XXXX\")\n", 52 | "demo.setConfig(\"twitter4j.oauth.accessTokenSecret\",\"XXXX\")\n", 53 | "demo.setConfig(\"watson.tone.url\",\"https://gateway.watsonplatform.net/tone-analyzer-beta/api\")\n", 54 | "demo.setConfig(\"watson.tone.password\",\"XXXX\")\n", 55 | "demo.setConfig(\"watson.tone.username\",\"XXXX\")" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "##Start the Spark Stream to collect live tweets\n", 63 | "Start a new Twitter Stream that collects the live tweets and enrich them with Sentiment Analysis scores. The stream is run for a duration specified in the second argument of the **startTwitterStreaming** method.\n", 64 | "Note: if no duration is specified then the stream will run until the **stopTwitterStreaming** method is called." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "Twitter stream started\n", 79 | "Tweets are collected real-time and analyzed\n", 80 | "To stop the streaming and start interacting with the data use: StreamingTwitter.stopTwitterStreaming\n", 81 | "Receiver Started: TwitterReceiver-0\n", 82 | "Batch started with 139 records\n", 83 | "Batch completed with 139 records\n", 84 | "Batch started with 270 records\n", 85 | "Stopping Twitter stream. Please wait this may take a while\n", 86 | "Receiver Stopped: TwitterReceiver-0\n", 87 | "Reason: : Stopped by driver\n", 88 | "Batch completed with 270 records\n", 89 | "Twitter stream stopped\n", 90 | "You can now create a sqlContext and DataFrame with 38 Tweets created. 
Sample usage: \n", 91 | "val (sqlContext, df) = com.ibm.cds.spark.samples.StreamingTwitter.createTwitterDataFrames(sc)\n", 92 | "df.printSchema\n", 93 | "sqlContext.sql(\"select author, text from tweets\").show\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "import org.apache.spark.streaming._\n", 99 | "demo.startTwitterStreaming(sc, Seconds(40))" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "##Create a SQLContext and a dataframe with all the tweets\n", 107 | "Note: this method will register a SparkSQL table called tweets" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 4, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "A new table named tweets with 38 records has been correctly created and can be accessed through the SQLContext variable\n", 122 | "Here's the schema for tweets\n", 123 | "root\n", 124 | " |-- author: string (nullable = true)\n", 125 | " |-- date: string (nullable = true)\n", 126 | " |-- lang: string (nullable = true)\n", 127 | " |-- text: string (nullable = true)\n", 128 | " |-- lat: double (nullable = true)\n", 129 | " |-- long: double (nullable = true)\n", 130 | " |-- Anger: double (nullable = true)\n", 131 | " |-- Disgust: double (nullable = true)\n", 132 | " |-- Fear: double (nullable = true)\n", 133 | " |-- Joy: double (nullable = true)\n", 134 | " |-- Sadness: double (nullable = true)\n", 135 | " |-- Analytical: double (nullable = true)\n", 136 | " |-- Confident: double (nullable = true)\n", 137 | " |-- Tentative: double (nullable = true)\n", 138 | " |-- Openness: double (nullable = true)\n", 139 | " |-- Conscientiousness: double (nullable = true)\n", 140 | " |-- Extraversion: double (nullable = true)\n", 141 | " |-- Agreeableness: double (nullable = true)\n", 142 | " |-- EmotionalRange: double (nullable = true)\n", 143 | "\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "val (sqlContext, df) = demo.createTwitterDataFrames(sc)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "##Execute a SparkSQL query that contains all the data" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 5, 161 | "metadata": { 162 | "collapsed": false 163 | }, 164 | "outputs": [ 165 | { 166 | "name": "stdout", 167 | "output_type": "stream", 168 | "text": [ 169 | "+--------------------+--------------------+-----+--------------------+---+----+------------------+------------------+------------------+-----------------+------------------+----------+---------+-----------------+-----------------+------------------+-----------------+-----------------+-----------------+\n", 170 | "| author| date| lang| text|lat|long| Anger| Disgust| Fear| Joy| Sadness|Analytical|Confident| Tentative| Openness| Conscientiousness| Extraversion| Agreeableness| EmotionalRange|\n", 171 | "+--------------------+--------------------+-----+--------------------+---+----+------------------+------------------+------------------+-----------------+------------------+----------+---------+-----------------+-----------------+------------------+-----------------+-----------------+-----------------+\n", 172 | "|Three Words o Wisdom|Sun Mar 06 13:00:...|en-gb|wildebeest rebuff...|0.0| 0.0| 11.0| 20.0| 19.0| 44.0| 22.0| 0.0| 0.0| 0.0| 80.0| 56.00000000000001| 15.0| 1.0| 39.0|\n", 173 | "| Jonny P|Sun Mar 06 13:00:...| en|Getting a pizza i...|0.0| 0.0| 8.0| 5.0| 
13.0|56.00000000000001| 5.0| 0.0| 0.0|56.99999999999999| 24.0| 23.0| 83.0|56.99999999999999| 82.0|\n", 174 | "| Kayla|Sun Mar 06 13:00:...| en|RT @ebhoniogarro:...|0.0| 0.0| 2.0| 0.0| 1.0| 99.0| 2.0| 0.0| 0.0| 0.0| 30.0| 56.00000000000001| 85.0| 66.0| 39.0|\n", 175 | "| Adamlbr|Sun Mar 06 13:00:...| en|New Event now on....|0.0| 0.0| 24.0| 10.0| 11.0| 46.0| 4.0| 0.0| 0.0| 0.0| 11.0| 98.0| 46.0| 49.0| 6.0|\n", 176 | "|Lexa deserved better|Sun Mar 06 13:00:...| en|RT @canoodleclexa...|0.0| 0.0| 8.0| 7.000000000000001| 9.0| 80.0| 7.000000000000001| 84.0| 0.0| 0.0| 12.0|28.000000000000004| 73.0| 59.0| 51.0|\n", 177 | "| LoveBakesGoodCakes|Sun Mar 06 13:00:...| en|Yum, yum! Honey B...|0.0| 0.0| 41.0| 2.0| 6.0| 62.0| 7.000000000000001| 0.0| 0.0| 0.0| 60.0| 69.0| 64.0| 18.0| 11.0|\n", 178 | "| High Tech Planet|Sun Mar 06 13:00:...| en|Google is testing...|0.0| 0.0| 11.0| 5.0| 32.0| 37.0| 5.0| 78.0| 0.0| 0.0|56.99999999999999| 30.0| 6.0| 13.0|57.99999999999999|\n", 179 | "| Kael|Sun Mar 06 13:00:...| en|RT @mgiseelle: Ha...|0.0| 0.0| 16.0| 4.0|14.000000000000002| 23.0| 13.0| 0.0| 0.0| 0.0| 68.0| 85.0|57.99999999999999| 35.0| 6.0|\n", 180 | "| Ryan|Sun Mar 06 13:00:...| en|ALL THAT EFFORT T...|0.0| 0.0| 19.0|14.000000000000002| 24.0| 12.0| 24.0| 61.0| 79.0| 0.0| 78.0| 3.0| 49.0| 1.0| 91.0|\n", 181 | "| princesss|Sun Mar 06 13:00:...| en|RT @SexualGif: Be...|0.0| 0.0| 13.0| 7.000000000000001| 13.0| 34.0| 15.0| 0.0| 0.0| 0.0|56.00000000000001| 93.0| 62.0| 38.0| 39.0|\n", 182 | "| Fadi Nasser|Sun Mar 06 13:00:...| en|#USA missiles cha...|0.0| 0.0| 7.000000000000001| 10.0| 8.0| 30.0| 13.0| 0.0| 0.0| 0.0| 94.0| 75.0| 27.0| 23.0| 20.0|\n", 183 | "| Briyon?e|Sun Mar 06 13:00:...| en|RT @tonestradamus...|0.0| 0.0| 52.0| 19.0| 5.0| 1.0|14.000000000000002| 23.0| 0.0| 75.0| 21.0| 6.0| 84.0| 44.0| 59.0|\n", 184 | "| BarnBurnerBBQ|Sun Mar 06 13:00:...| en|Presenting sponso...|0.0| 0.0| 10.0| 18.0| 10.0| 26.0| 8.0| 67.0| 0.0| 0.0| 36.0| 91.0| 71.0| 91.0| 2.0|\n", 185 | "| Majid Navabi|Sun Mar 06 13:00:...| en| Download|0.0| 0.0| 12.0| 9.0| 18.0|56.99999999999999|14.000000000000002| 0.0| 0.0| 0.0| 52.0| 56.00000000000001| 15.0| 100.0| 0.0|\n", 186 | "| ?????? 
?????|Sun Mar 06 13:00:...| en|RT @Adel__Almalki...|0.0| 0.0| 43.0| 6.0| 20.0| 3.0| 2.0| 0.0| 0.0| 0.0| 90.0| 56.00000000000001| 15.0| 1.0| 39.0|\n", 187 | "| liv|Sun Mar 06 13:00:...| en|RT @iamjojo: You ...|0.0| 0.0| 5.0| 2.0| 9.0| 89.0| 9.0| 0.0| 0.0| 0.0| 2.0| 2.0| 100.0| 85.0| 2.0|\n", 188 | "| LADY GAGA|Sun Mar 06 13:00:...| en|Miek_tweet #TilIt...|0.0| 0.0| 16.0| 16.0| 8.0| 23.0| 21.0| 0.0| 0.0| 0.0| 80.0| 56.00000000000001| 15.0| 1.0| 39.0|\n", 189 | "| donatello ;)|Sun Mar 06 13:00:...| en|RT @__trillgawdd:...|0.0| 0.0|14.000000000000002| 3.0| 13.0| 66.0| 9.0| 0.0| 0.0| 0.0| 30.0| 56.00000000000001| 53.0| 69.0| 20.0|\n", 190 | "| Liz|Sun Mar 06 13:00:...| en|RT @Samantha_Evel...|0.0| 0.0| 12.0| 8.0| 24.0| 10.0| 33.0| 43.0| 72.0| 91.0| 5.0| 12.0| 34.0| 61.0| 97.0|\n", 191 | "| Chrystal Johnson|Sun Mar 06 13:00:...| en|Take Aromatherapy...|0.0| 0.0| 16.0| 12.0| 44.0| 8.0| 8.0| 0.0| 0.0| 0.0| 71.0| 96.0| 40.0| 60.0| 2.0|\n", 192 | "+--------------------+--------------------+-----+--------------------+---+----+------------------+------------------+------------------+-----------------+------------------+----------+---------+-----------------+-----------------+------------------+-----------------+-----------------+-----------------+\n", 193 | "only showing top 20 rows\n", 194 | "\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "val fullSet = sqlContext.sql(\"select * from tweets\") //Select all columns\n", 200 | "fullSet.show" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "##SparkSQL query example on the data.\n", 208 | "Select all the tweets that have Anger score greated than 70%" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 6, 214 | "metadata": { 215 | "collapsed": false 216 | }, 217 | "outputs": [ 218 | { 219 | "name": "stdout", 220 | "output_type": "stream", 221 | "text": [ 222 | "0\n", 223 | "+----+\n", 224 | "|text|\n", 225 | "+----+\n", 226 | "+----+\n", 227 | "\n" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "val set = sqlContext.sql(\"select text from tweets where Anger > 60\")\n", 233 | "println(set.count)\n", 234 | "set.show" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "##Persist the dataset into a parquet file on Object Storage service\n", 242 | "The parquet file will be reloaded in IPython Part 2 Notebook\n", 243 | "Note: you can disregard the warning messages related to SLF4J" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 7, 249 | "metadata": { 250 | "collapsed": false 251 | }, 252 | "outputs": [ 253 | { 254 | "name": "stdout", 255 | "output_type": "stream", 256 | "text": [ 257 | "SLF4J: Failed to load class \"org.slf4j.impl.StaticLoggerBinder\".\n", 258 | "SLF4J: Defaulting to no-operation (NOP) logger implementation\n", 259 | "SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.\n" 260 | ] 261 | } 262 | ], 263 | "source": [ 264 | "fullSet.repartition(1).saveAsParquetFile(\"swift://notebooks.spark/tweetsFull.parquet\")" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": { 271 | "collapsed": true 272 | }, 273 | "outputs": [], 274 | "source": [] 275 | } 276 | ], 277 | "metadata": { 278 | "kernelspec": { 279 | "display_name": "Scala 2.10", 280 | "language": "scala", 281 | "name": "spark" 282 | }, 283 | "language_info": { 284 | "name": "scala" 285 | }, 286 | "name": "Twitter + Watson Tone Analyzer Part 1.ipynb" 287 
| }, 288 | "nbformat": 4, 289 | "nbformat_minor": 0 290 | } -------------------------------------------------------------------------------- /streaming-twitter/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0") 2 | -------------------------------------------------------------------------------- /streaming-twitter/readme.md: -------------------------------------------------------------------------------- 1 | #Sentiment Analysis of Twitter Hashtags 2 | 3 | ####Use Spark Streaming in combination with IBM Watson to perform sentiment analysis showing how a conversation is trending on Twitter. 4 | 5 | Track how consumers feel about you based on their tweets. To get real-time sentiment analysis, deploy our sample **Spark Streaming with Twitter and Watson** app on Bluemix and use its Notebook to analyze public opinion. 6 | 7 | 8 | This sample app uses Spark Streaming to create a feed that captures live tweets from Twitter. You can filter the tweets that contain the hashtag(s) of your choice. The tweet data is enriched in real time with various sentiment scores provided by the Watson Tone Analyzer service (available on Bluemix). This service provides insight into sentiment, or how the author feels. Then use Spark SQL to load the data into a DataFrame for further analysis. Here's the basic architecture of this app: 9 | ![Twitter + Watson high level architecture](https://i2.wp.com/developer.ibm.com/clouddataservices/wp-content/uploads/sites/47/2015/10/Spark-Streaming-Twitter-architecture.png) 10 | 11 | Follow the full tutorial to understand how it works and create your own stream. 12 | 13 | [Get started](https://developer.ibm.com/clouddataservices/sentiment-analysis-of-twitter-hashtags/) 14 | -------------------------------------------------------------------------------- /streaming-twitter/sampleConfig/sampleconf.properties: -------------------------------------------------------------------------------- 1 | #Twitter credentials 2 | twitter4j.oauth.consumerKey=XXXX 3 | twitter4j.oauth.consumerSecret=XXXX 4 | twitter4j.oauth.accessToken=XXXX 5 | twitter4j.oauth.accessTokenSecret=XXXX 6 | 7 | #MessageHub 8 | kafka.topic.tweet=twitter-spark 9 | kafka.user.name=XXXX 10 | kafka.user.password=XXXX 11 | bootstrap.servers=kafka01-prod01.messagehub.services.us-south.bluemix.net:9093,\ 12 | kafka02-prod01.messagehub.services.us-south.bluemix.net:9093,\ 13 | kafka03-prod01.messagehub.services.us-south.bluemix.net:9093,\ 14 | kafka04-prod01.messagehub.services.us-south.bluemix.net:9093,\ 15 | kafka05-prod01.messagehub.services.us-south.bluemix.net:9093 16 | api_key=XXXX 17 | kafka_rest_url=https://kafka-rest-prod01.messagehub.services.us-south.bluemix.net:443 18 | 19 | #Watson Tone Analyzer 20 | watson.tone.url=https://gateway.watsonplatform.net/tone-analyzer-experimental/api 21 | watson.tone.password=XXXX 22 | watson.tone.username=XXXX 23 | 24 | #Checkpoint directory 25 | checkpointDir=XXXX 26 | -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/KafkaProducerTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package com.ibm.cds.spark.samples 18 | 19 | import java.io.ByteArrayInputStream 20 | import java.io.ByteArrayOutputStream 21 | import java.io.ObjectInputStream 22 | import java.io.ObjectOutputStream 23 | import java.util.concurrent.TimeUnit 24 | import scala.collection.JavaConversions.mapAsJavaMap 25 | import scala.collection.JavaConversions.seqAsJavaList 26 | import org.apache.kafka.clients.consumer.KafkaConsumer 27 | import org.apache.kafka.clients.producer.ProducerRecord 28 | import org.apache.kafka.common.serialization.Deserializer 29 | import org.apache.kafka.common.serialization.Serializer 30 | import org.apache.kafka.common.serialization.StringDeserializer 31 | import org.apache.log4j.Level 32 | import org.apache.log4j.Logger 33 | import com.ibm.cds.spark.samples.config.MessageHubConfig 34 | import twitter4j.StallWarning 35 | import twitter4j.Status 36 | import twitter4j.StatusDeletionNotice 37 | import twitter4j.StatusListener 38 | import twitter4j.TwitterStreamFactory 39 | import scala.util.parsing.json.JSON 40 | import java.io.InputStream 41 | import twitter4j.TwitterStream 42 | import com.ibm.cds.spark.samples.config.DemoConfig 43 | import org.apache.spark.Logging 44 | 45 | 46 | /** 47 | * @author dtaieb 48 | */ 49 | object KafkaProducerTest extends Logging{ 50 | //Very verbose, enable only if necessary 51 | //Logger.getLogger("org.apache.kafka").setLevel(Level.ALL) 52 | //Logger.getLogger("kafka").setLevel(Level.ALL) 53 | 54 | var twitterStream : TwitterStream = _; 55 | 56 | def main(args: Array[String]): Unit = { 57 | createTwitterStream(); 58 | } 59 | 60 | def createTwitterStream(props: DemoConfig=null):TwitterStream = { 61 | if( twitterStream != null){ 62 | println("Twitter Stream already running. 
Please call closeTwitterStream first"); 63 | return twitterStream; 64 | } 65 | var kafkaProps:MessageHubConfig = null; 66 | if ( props == null ){ 67 | kafkaProps = new MessageHubConfig 68 | }else{ 69 | kafkaProps = props.cloneConfig 70 | } 71 | kafkaProps.setValueSerializer[StatusSerializer] 72 | kafkaProps.validateConfiguration("watson.tone.") 73 | kafkaProps.createTopicsIfNecessary( kafkaProps.getConfig(MessageHubConfig.KAFKA_TOPIC_TWEETS ) ) 74 | val kafkaProducer = new org.apache.kafka.clients.producer.KafkaProducer[java.lang.String, Status]( kafkaProps.toImmutableMap() ); 75 | 76 | twitterStream = new TwitterStreamFactory().getInstance(); 77 | twitterStream.addListener( new StatusListener(){ 78 | var lastSent:Long = 0; 79 | def onStatus(status: Status){ 80 | if ( lastSent == 0 || System.currentTimeMillis() - lastSent > 200L){ 81 | lastSent = System.currentTimeMillis() 82 | logInfo("Got a status " + status.getText ) 83 | val producerRecord = new ProducerRecord(kafkaProps.getConfig(MessageHubConfig.KAFKA_TOPIC_TWEETS ), "tweet", status ) 84 | try{ 85 | val metadata = kafkaProducer.send( producerRecord ).get(2000, TimeUnit.SECONDS); 86 | logInfo("Successfully sent record: Topic: " + metadata.topic + " Offset: " + metadata.offset ) 87 | }catch{ 88 | case e:Throwable => e.printStackTrace 89 | } 90 | } 91 | } 92 | def onDeletionNotice( notice: StatusDeletionNotice){ 93 | 94 | } 95 | def onTrackLimitationNotice( numLimitation : Int){ 96 | println("Received track limitation notice from Twitter: " + numLimitation) 97 | } 98 | 99 | def onException( e: Exception){ 100 | println("Unexpected error from twitterStream: " + e.getMessage); 101 | logError(e.getMessage, e) 102 | } 103 | 104 | def onScrubGeo(lat: Long, long: Long ){ 105 | 106 | } 107 | 108 | def onStallWarning(warning: StallWarning ){ 109 | 110 | } 111 | }) 112 | 113 | //Start twitter stream sampling 114 | twitterStream.sample(); 115 | 116 | println("Twitter stream started. Tweets will flow to MessageHub instance. Please call closeTwitterStream to stop the stream") 117 | twitterStream 118 | } 119 | 120 | def closeTwitterStream(){ 121 | if ( twitterStream==null){ 122 | println("Nothing to close. Twitter stream has not been started") 123 | }else{ 124 | println("Stopping twitter stream"); 125 | twitterStream.shutdown() 126 | twitterStream=null 127 | println("Twitter Stream stopped") 128 | } 129 | } 130 | } 131 | 132 | object KafkaConsumerTest { 133 | def main(args: Array[String]): Unit = { 134 | val kafkaProps = new MessageHubConfig 135 | kafkaProps.validateConfiguration("watson.tone.") 136 | val kafkaConsumer = new KafkaConsumer[java.lang.String, StatusAdapter](kafkaProps.toImmutableMap, new StringDeserializer(), new StatusDeserializer()) 137 | 138 | kafkaConsumer.subscribe( List(kafkaProps.getConfig(MessageHubConfig.KAFKA_TOPIC_TWEETS )) ) 139 | new Thread( new Runnable { 140 | def run(){ 141 | while( true ){ 142 | Thread.sleep( 1000L ) 143 | val it = kafkaConsumer.poll(1000L).iterator 144 | while( it.hasNext() ){ 145 | val record = it.next(); 146 | println( record.value ); 147 | } 148 | } 149 | } 150 | }).start 151 | } 152 | } -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/MessageHubStreamingTwitter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.ibm.cds.spark.samples 19 | 20 | import scala.BigDecimal 21 | import scala.collection.JavaConversions.mapAsJavaMap 22 | import scala.collection.immutable.Seq.canBuildFrom 23 | import scala.collection.mutable.ListBuffer 24 | import scala.collection.mutable.Map 25 | import scala.reflect.ClassTag 26 | import org.apache.kafka.clients.producer.ProducerRecord 27 | import org.apache.kafka.common.serialization.StringDeserializer 28 | import org.apache.kafka.common.serialization.StringSerializer 29 | import org.apache.spark.HashPartitioner 30 | import org.apache.spark.SparkConf 31 | import org.apache.spark.SparkContext 32 | import org.apache.spark.rdd.RDD 33 | import org.apache.spark.sql.Row 34 | import org.apache.spark.streaming.Duration 35 | import org.apache.spark.streaming.Seconds 36 | import org.apache.spark.streaming.StreamingContext 37 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 38 | import org.http4s.client.blaze.PooledHttp1Client 39 | import com.google.common.base.CharMatcher 40 | import com.ibm.cds.spark.samples.config.MessageHubConfig 41 | import com.ibm.cds.spark.samples.dstream.KafkaStreaming.KafkaStreamingContextAdapter 42 | import twitter4j.Status 43 | import org.apache.spark.streaming.scheduler.StreamingListenerBatchStarted 44 | import org.apache.spark.streaming.scheduler.StreamingListenerBatchCompleted 45 | import com.ibm.cds.spark.samples.config.DemoConfig 46 | import org.apache.log4j.Level 47 | import org.apache.log4j.Logger 48 | import org.apache.spark.streaming.dstream.DStream 49 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStopped 50 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverError 51 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStarted 52 | import org.apache.spark.broadcast.Broadcast 53 | import org.apache.spark.Logging 54 | import java.util.Arrays 55 | 56 | /** 57 | * @author dtaieb 58 | * Twitter+Watson sample app with MessageHub/Kafka 59 | */ 60 | object MessageHubStreamingTwitter extends Logging{ 61 | 62 | var ssc: StreamingContext = null 63 | val reuseCheckpoint = false; 64 | 65 | val queue = new scala.collection.mutable.Queue[(String, String)] 66 | 67 | final val KAFKA_TOPIC_TOP_HASHTAGS = "topHashTags" 68 | final val KAFKA_TOPIC_TONE_SCORES = "topHashTags.toneScores" 69 | final val KAFKA_TOPIC_TOTAL_TWEETS_PROCESSED = "total_tweets" 70 | 71 | //Logger.getLogger("org.apache.kafka").setLevel(Level.ALL) 72 | //Logger.getLogger("kafka").setLevel(Level.ALL) 73 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN) 74 | 75 | def main(args: Array[String]): Unit = { 76 | println("Printing arguments: "); 77 | args.foreach { println } 78 | 79 | if(args.length>0 && System.getProperty("DEMO_CONFIG_PATH") == null ){ 80 | //On 
Spark Service, input files are passed as parameters, if available, we assume first parameter is config file 81 | System.setProperty("DEMO_CONFIG_PATH", args(0)) 82 | } 83 | val conf = new SparkConf().setAppName("Spark Streaming Twitter + Watson with MessageHub/Kafka Demo") 84 | val sc = new SparkContext(conf) 85 | startTwitterStreaming(sc); 86 | 87 | if(ssc!=null){ 88 | //When running as stand alone app, we call awaitTermination to make sure the JVM doesn't exit prematurely due to the fact 89 | //that all non-daemon threads have terminated. Note: Don't call awaitTermination directly from startTwitterStreaming as it can be run 90 | //From Notebook 91 | ssc.awaitTermination() 92 | } 93 | } 94 | 95 | //Hold configuration key/value pairs 96 | lazy val kafkaProps = new MessageHubConfig 97 | 98 | //Wrapper api for Notebook access 99 | def getConfig():DemoConfig={ 100 | kafkaProps 101 | } 102 | 103 | def startTwitterStreaming( sc: SparkContext, stopAfter: Duration = Seconds(0) ){ 104 | if ( ssc != null ){ 105 | println("Twitter Stream already running"); 106 | return; 107 | } 108 | 109 | kafkaProps.setValueSerializer[StringSerializer]; 110 | 111 | if ( !kafkaProps.validateConfiguration("twitter4j.oauth") ){ 112 | return; 113 | } 114 | 115 | //Set the hadoop configuration if needed 116 | val checkpointDir = kafkaProps.getConfig( MessageHubConfig.CHECKPOINT_DIR_KEY ); 117 | if ( checkpointDir.startsWith("swift") ){ 118 | println("Setting hadoop configuration for swift container") 119 | kafkaProps.set_hadoop_config(sc) 120 | } 121 | 122 | //Make sure the topics are already created 123 | kafkaProps.createTopicsIfNecessary( KAFKA_TOPIC_TONE_SCORES, KAFKA_TOPIC_TOP_HASHTAGS, KAFKA_TOPIC_TOTAL_TWEETS_PROCESSED ) 124 | 125 | val kafkaProducer = new org.apache.kafka.clients.producer.KafkaProducer[String, String]( kafkaProps.toImmutableMap ); 126 | 127 | if ( !reuseCheckpoint ){ 128 | createStreamingContextAndRunAnalytics(sc); 129 | }else{ 130 | ssc = StreamingContext.getOrCreate( 131 | kafkaProps.getConfig( MessageHubConfig.CHECKPOINT_DIR_KEY ), 132 | () => { 133 | createStreamingContextAndRunAnalytics(sc); 134 | }, 135 | sc.hadoopConfiguration, 136 | true 137 | ); 138 | } 139 | 140 | ssc.addStreamingListener( new StreamingListener() ) 141 | 142 | new Thread( new Runnable() { 143 | def run(){ 144 | while(ssc!=null){ 145 | while(!queue.isEmpty ){ 146 | try{ 147 | var task:(String,String) = null; 148 | queue.synchronized{ 149 | task = queue.dequeue(); 150 | } 151 | if ( task != null ){ 152 | val producerRecord = new ProducerRecord[String,String](task._1, "tweet", task._2 ) 153 | val metadata = kafkaProducer.send( producerRecord ).get; 154 | logInfo("Sent record " + metadata.offset() + " Topic " + task._1) 155 | } 156 | }catch{ 157 | case e:Throwable => logError(e.getMessage, e) 158 | } 159 | } 160 | queue.synchronized{ 161 | queue.wait(); 162 | } 163 | } 164 | } 165 | },"Message Hub producer").start 166 | 167 | ssc.start 168 | 169 | println("Twitter stream started"); 170 | println("Tweets are collected real-time and analyzed") 171 | println("To stop the streaming and start interacting with the data use: StreamingTwitter.stopTwitterStreaming") 172 | 173 | if ( !stopAfter.isZero ){ 174 | //Automatically stop it after 10s 175 | new Thread( new Runnable { 176 | def run(){ 177 | Thread.sleep( stopAfter.milliseconds ) 178 | stopTwitterStreaming 179 | } 180 | }).start 181 | } 182 | } 183 | 184 | def createStreamingContextAndRunAnalytics(sc:SparkContext):StreamingContext={ 185 | //Broadcast the config to each worker 
node 186 | val broadcastVar = sc.broadcast( kafkaProps.toImmutableMap ) 187 | ssc = new StreamingContext( sc, Seconds(5) ) 188 | ssc.checkpoint(kafkaProps.getConfig( MessageHubConfig.CHECKPOINT_DIR_KEY )); 189 | val stream = ssc.createKafkaStream[String, StatusAdapter,StringDeserializer, StatusDeserializer]( 190 | kafkaProps, 191 | List(kafkaProps.getConfig(MessageHubConfig.KAFKA_TOPIC_TWEETS )) 192 | ); 193 | runAnalytics(sc, broadcastVar, stream) 194 | ssc; 195 | } 196 | 197 | def runAnalytics(sc:SparkContext, broadcastVar: Broadcast[scala.collection.immutable.Map[String,String]], stream:DStream[(String,StatusAdapter)]){ 198 | val keys = broadcastVar.value.get("tweets.key").get.split(","); 199 | val tweets = stream.map( t => t._2) 200 | .filter { status => 201 | status.userLang.startsWith("en") && CharMatcher.ASCII.matchesAllOf(status.text) && ( keys.isEmpty || keys.exists{status.text.contains(_)}) 202 | } 203 | 204 | val rowTweets = tweets.map(status=> { 205 | lazy val client = PooledHttp1Client() 206 | val sentiment = ToneAnalyzer.computeSentiment( client, status, broadcastVar ) 207 | var scoreMap : Map[String, Double] = Map() 208 | if ( sentiment != null ){ 209 | for( toneCategory <- Option(sentiment.tone_categories).getOrElse( Seq() )){ 210 | for ( tone <- Option( toneCategory.tones ).getOrElse( Seq() ) ){ 211 | scoreMap.put( tone.tone_id, (BigDecimal(tone.score).setScale(2, BigDecimal.RoundingMode.HALF_UP).toDouble) * 100.0 ) 212 | } 213 | } 214 | } 215 | 216 | EnrichedTweet( 217 | status.userName, 218 | status.userId, 219 | status.createdAt, 220 | status.userLang, 221 | status.text, 222 | status.long, 223 | status.lat, 224 | scoreMap 225 | ) 226 | }) 227 | 228 | val delimTagTone = "-%!" 229 | val delimToneScore = ":%@" 230 | val statsStream = rowTweets.map { eTweet => ("total_tweets", 1L) } 231 | .reduceByKey( _+_ ) 232 | .updateStateByKey( (a:Seq[Long], b:Option[Long] ) => { 233 | var runningCount=b.getOrElse(0L) 234 | a.foreach { v => runningCount=runningCount+v } 235 | Some(runningCount) 236 | }) 237 | statsStream.foreachRDD( rdd =>{ 238 | queue.synchronized{ 239 | queue+=((KAFKA_TOPIC_TOTAL_TWEETS_PROCESSED, TweetsMetricJsonSerializer.serialize(rdd.collect()))) 240 | try{ 241 | queue.notify 242 | }catch{ 243 | case e:Throwable=>logError(e.getMessage, e) 244 | } 245 | } 246 | }) 247 | 248 | val metricsStream = rowTweets.flatMap { eTweet => { 249 | val retList = ListBuffer[String]() 250 | for ( tag <- eTweet.text.split("\\s+") ){ 251 | if ( tag.startsWith( "#") && tag.length > 1 ){ 252 | for ( tone <- Option( eTweet.sentimentScores.keys ).getOrElse( Seq() ) ){ 253 | retList += (tag + delimTagTone + tone + delimToneScore + eTweet.sentimentScores.getOrElse( tone, 0.0)) 254 | } 255 | } 256 | } 257 | retList.toList 258 | }} 259 | .map { fullTag => { 260 | val split = fullTag.split(delimToneScore); 261 | (split(0), split(1).toFloat) 262 | }} 263 | .combineByKey( 264 | (x:Float) => (x,1), 265 | (x:(Float,Int), y:Float) => (x._1 + y, x._2+1), 266 | (x:(Float,Int),y:(Float,Int)) => (x._1 + y._1, x._2 + y._2), 267 | new HashPartitioner(sc.defaultParallelism) 268 | ) 269 | .map[(String,(Long/*count*/, List[(String, Double)]))]{ t => { 270 | val key = t._1; 271 | val ab = t._2; 272 | val split = key.split(delimTagTone) 273 | (split(0), (ab._2, List((split(1), BigDecimal(ab._1/ab._2).setScale(2, BigDecimal.RoundingMode.HALF_UP).toDouble )))) 274 | }} 275 | .reduceByKey( (t,u) => (t._1+u._1, (t._2 ::: u._2).sortWith( (l,r) => l._1.compareTo( r._1 ) < 0 ))) 276 | .mapValues( (item:(Long, 
List[(String,Double)])) => { 277 | val unzip = item._2.unzip 278 | (item._1/(item._2.size), unzip._1, unzip._2) 279 | }) 280 | .updateStateByKey( (a:scala.collection.Seq[(Long, List[String], List[Double])], b: Option[(Long, List[String], List[Double])]) => { 281 | val safeB = b.getOrElse( (0L, List(), List() ) ) 282 | var listTones = safeB._2 283 | var listScores = safeB._3 284 | var count = safeB._1 285 | for( item <- a ){ 286 | count += item._1 287 | listScores = listScores.zipAll( item._3, 0.0, 0.0).map{ case(a,b)=>(a+b)/2 }.toList 288 | listTones = item._2 289 | } 290 | 291 | Some( (count, listTones, listScores) ) 292 | }) 293 | 294 | metricsStream.print 295 | 296 | metricsStream.foreachRDD( rdd =>{ 297 | val topHashTags = rdd.sortBy( f => f._2._1, false ).take(5) 298 | if ( !topHashTags.isEmpty){ 299 | queue.synchronized{ 300 | queue += ((KAFKA_TOPIC_TOP_HASHTAGS, TweetsMetricJsonSerializer.serialize(topHashTags.map( f => (f._1, f._2._1 ))))) 301 | queue += ((KAFKA_TOPIC_TONE_SCORES, ToneScoreJsonSerializer.serialize(topHashTags))) 302 | try{ 303 | queue.notify 304 | }catch{ 305 | case e:Throwable=>logError(e.getMessage, e) 306 | } 307 | } 308 | } 309 | }) 310 | } 311 | 312 | def stopTwitterStreaming(){ 313 | if ( ssc == null){ 314 | println("No Twitter stream to stop"); 315 | return; 316 | } 317 | 318 | println("Stopping Twitter stream. Please wait this may take a while") 319 | ssc.stop(stopSparkContext = false, stopGracefully = true) 320 | ssc = null 321 | println("Twitter stream stopped"); 322 | } 323 | } 324 | 325 | object TweetsMetricJsonSerializer extends Logging{ 326 | def serialize(value: Seq[(String,Long)] ): String = { 327 | val sb = new StringBuilder("[") 328 | var comma = "" 329 | value.foreach( item => { 330 | sb.append( comma + "[\"" + item._1.replaceAll("\"", "") + "\"," + item._2 + "]") 331 | comma="," 332 | }) 333 | sb.append("]") 334 | logInfo("Serialized json: " + sb) 335 | sb.toString() 336 | } 337 | } 338 | 339 | object ToneScoreJsonSerializer extends Logging{ 340 | def serializeList[U:ClassTag]( label: String, value: List[U] ):String = { 341 | val sb = new StringBuilder("[\"" + label.replaceAll("\"", "") + "\"") 342 | value.foreach { item => { 343 | if ( item.isInstanceOf[String] ) { 344 | val s = ",\"" + item.toString().replaceAll("\"", "") + "\""; 345 | sb.append( s.replaceAll("\"\"", "\"") ) 346 | }else if ( item.isInstanceOf[Double] ){ 347 | sb.append("," + item ) 348 | } 349 | }} 350 | sb.append("]") 351 | sb.toString 352 | } 353 | def serialize(value:Seq[(String, (Long, List[String], List[Double]))]):String={ 354 | val sb = new StringBuilder("[") 355 | var comma = "" 356 | var appendToneData = true; 357 | value.foreach( item => { 358 | if ( appendToneData ){ 359 | sb.append( comma + serializeList( "x", item._2._2 ) ) 360 | appendToneData = false 361 | comma = "," 362 | } 363 | sb.append( comma + serializeList( item._1, item._2._3 ) ) 364 | comma="," 365 | }) 366 | sb.append("]") 367 | logInfo("Serialized size: " + value.size + ". Tone json: " + sb) 368 | sb.toString() 369 | } 370 | } -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/PixiedustStreamingTwitter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.ibm.cds.spark.samples 19 | 20 | import scala.collection.mutable._ 21 | import com.ibm.pixiedust.ChannelReceiver 22 | import org.apache.spark.Logging 23 | import org.apache.log4j.Logger 24 | import org.apache.log4j.Level 25 | import org.apache.spark.SparkContext 26 | import org.apache.spark.streaming.StreamingContext 27 | import org.apache.spark.rdd.RDD 28 | import org.apache.spark.sql.types.StructType 29 | import org.apache.spark.sql.Row 30 | import com.ibm.cds.spark.samples.config.DemoConfig 31 | import org.apache.spark.streaming.Seconds 32 | import org.apache.spark.sql.types.IntegerType 33 | import org.apache.spark.sql.types.DoubleType 34 | import org.http4s.client.blaze.PooledHttp1Client 35 | import org.apache.spark.sql.types.StructField 36 | import org.apache.spark.sql.types.StringType 37 | import com.google.common.base.CharMatcher 38 | import com.ibm.couchdb.CouchDb 39 | import com.ibm.couchdb.TypeMapping 40 | import com.ibm.couchdb.CouchDbApi 41 | import org.apache.spark.sql.SQLContext 42 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverError 43 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStopped 44 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStarted 45 | import org.apache.spark.streaming.scheduler.StreamingListenerBatchCompleted 46 | import org.apache.spark.streaming.scheduler.StreamingListenerBatchStarted 47 | import org.apache.spark.SparkConf 48 | import org.apache.spark.streaming.dstream.DStream 49 | import org.apache.spark.broadcast.Broadcast 50 | import org.apache.spark.HashPartitioner 51 | import twitter4j.Status 52 | import org.codehaus.jettison.json.JSONObject 53 | import org.apache.spark.AccumulableParam 54 | import org.apache.spark.streaming.StreamingContextState 55 | import org.apache.spark.sql.DataFrame 56 | 57 | /* @author dtaieb 58 | * Twitter+Watson sentiment analysis app powered by Pixiedust 59 | */ 60 | object PixiedustStreamingTwitter extends ChannelReceiver() with Logging{ 61 | var ssc: StreamingContext = null 62 | var workingRDD: RDD[Row] = null 63 | //Hold configuration key/value pairs 64 | lazy val config = new DemoConfig 65 | lazy val logger: Logger = Logger.getLogger( "com.ibm.cds.spark.samples.PixiedustStreamingTwitter" ) 66 | 67 | val BEGINSTREAM = "@BEGINSTREAM@" 68 | val ENDSTREAM = "@ENDSTREAM@" 69 | 70 | def sendLog(s:String){ 71 | send("log", s) 72 | } 73 | 74 | //Wrapper api for Notebook access 75 | def setConfig(key:String, value:String){ 76 | config.setConfig(key, value) 77 | } 78 | 79 | //main method invoked when running as a standalone Spark Application 80 | def main(args: Array[String]) { 81 | val conf = new SparkConf().setAppName("Pixiedust Spark Streaming Twitter Demo") 82 | val sc = new SparkContext(conf) 83 | startStreaming(); 
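//Note: startStreaming() picks up the SparkContext created above via SparkContext.getOrCreate, so no context handle is passed in explicitly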
84 | } 85 | 86 | def createTwitterDataFrames(sqlContext: SQLContext) : DataFrame = { 87 | if ( workingRDD == null || workingRDD.count <= 0 ){ 88 | println("No data receive. Please start the Twitter stream again to collect data") 89 | return null 90 | } 91 | 92 | sqlContext.createDataFrame( workingRDD, schemaTweets ) 93 | } 94 | 95 | class PixiedustStreamingListener extends org.apache.spark.streaming.scheduler.StreamingListener { 96 | override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted) { 97 | sendLog("Receiver Started: " + receiverStarted.receiverInfo.name ) 98 | //Signal the frontend that we started streaming 99 | sendLog(BEGINSTREAM) 100 | } 101 | 102 | override def onReceiverError(receiverError: StreamingListenerReceiverError) { 103 | sendLog("Receiver Error: " + receiverError.receiverInfo.lastError) 104 | } 105 | 106 | override def onReceiverStopped(receiverStopped: StreamingListenerReceiverStopped) { 107 | sendLog("Receiver Stopped: " + receiverStopped.receiverInfo.name) 108 | sendLog("Reason: " + receiverStopped.receiverInfo.lastError + " : " + receiverStopped.receiverInfo.lastErrorMessage) 109 | //signal the front end that we're done streaming 110 | sendLog(ENDSTREAM) 111 | } 112 | 113 | override def onBatchStarted(batchStarted: StreamingListenerBatchStarted){ 114 | sendLog("Batch started with " + batchStarted.batchInfo.numRecords + " records") 115 | } 116 | 117 | override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted){ 118 | sendLog("Batch completed with " + batchCompleted.batchInfo.numRecords + " records"); 119 | } 120 | } 121 | 122 | val reuseCheckpoint = false; 123 | 124 | def startStreaming(){ 125 | val sc = SparkContext.getOrCreate 126 | sendLog("Starting twitter stream"); 127 | if ( ssc != null ){ 128 | sendLog("Twitter Stream already running"); 129 | sendLog("Please use stopTwitterStreaming() first and try again"); 130 | return; 131 | } 132 | 133 | if ( !config.validateConfiguration() ){ 134 | sendLog("Unable to validate config") 135 | sendLog(ENDSTREAM) 136 | return; 137 | } 138 | 139 | Logger.getLogger("org.apache.spark").setLevel(Level.OFF) 140 | 141 | //Set the hadoop configuration if needed 142 | val checkpointDir = config.getConfig( DemoConfig.CHECKPOINT_DIR_KEY ); 143 | if ( checkpointDir.startsWith("swift") ){ 144 | println("Setting hadoop configuration for swift container") 145 | config.set_hadoop_config(sc) 146 | } 147 | 148 | workingRDD = sc.emptyRDD 149 | 150 | if ( !reuseCheckpoint ){ 151 | ssc = createStreamingContextAndRunAnalytics(sc); 152 | }else{ 153 | ssc = StreamingContext.getOrCreate( 154 | config.getConfig( DemoConfig.CHECKPOINT_DIR_KEY ), 155 | () => { 156 | createStreamingContextAndRunAnalytics(sc); 157 | }, 158 | sc.hadoopConfiguration, 159 | true 160 | ); 161 | } 162 | 163 | ssc.addStreamingListener( new PixiedustStreamingListener ) 164 | 165 | ssc.start() 166 | 167 | sendLog("Twitter stream started"); 168 | } 169 | 170 | def stopStreaming(){ 171 | if ( ssc == null){ 172 | sendLog("No Twitter stream to stop"); 173 | return; 174 | } 175 | 176 | sendLog("Stopping Twitter stream. 
Please wait this may take a while") 177 | ssc.stop(stopSparkContext = false, stopGracefully = false) 178 | ssc = null 179 | sendLog("Twitter stream stopped"); 180 | } 181 | 182 | def createStreamingContextAndRunAnalytics(sc:SparkContext):StreamingContext={ 183 | //Broadcast the config to each worker node 184 | val broadcastVar = sc.broadcast( config.toImmutableMap ) 185 | ssc = new StreamingContext( sc, Seconds(5) ) 186 | ssc.checkpoint(config.getConfig( DemoConfig.CHECKPOINT_DIR_KEY )); 187 | val stream = org.apache.spark.streaming.twitter.TwitterUtils.createStream( ssc, None ); 188 | runAnalytics(sc, broadcastVar, stream) 189 | ssc; 190 | } 191 | 192 | def runAnalytics(sc:SparkContext, broadcastVar: Broadcast[scala.collection.immutable.Map[String,String]], stream:DStream[Status]){ 193 | val keys = broadcastVar.value.get("tweets.key").get.split(","); 194 | val tweets = stream.filter { status => 195 | Option(status.getUser).flatMap[String] { 196 | u => Option(u.getLang) 197 | }.getOrElse("").startsWith("en") && CharMatcher.ASCII.matchesAllOf(status.getText) && ( keys.isEmpty || keys.exists{key => status.getText.toLowerCase.contains(key.toLowerCase)}) 198 | } 199 | 200 | val tweetAccumulator = sc.accumulable(Array[(String,String)]())(TweetsAccumulatorParam) 201 | 202 | new Thread( new Runnable() { 203 | def run(){ 204 | try{ 205 | while(ssc!=null && ssc.getState() != StreamingContextState.STOPPED ){ 206 | val accuValue = tweetAccumulator.value 207 | if ( accuValue.size > 0 ){ 208 | tweetAccumulator.setValue(Array[(String,String)]() ) 209 | accuValue.foreach( v => send(v._1, v._2) ) 210 | } 211 | Thread.sleep( 1000L ) 212 | } 213 | System.out.println("Stopping the accumulator thread") 214 | }catch{ 215 | case e:Throwable => e.printStackTrace() 216 | } 217 | } 218 | },"Accumulator").start 219 | 220 | val rowTweets = tweets.map(status=> { 221 | lazy val client = PooledHttp1Client() 222 | val sentiment = ToneAnalyzer.computeSentiment( client, status, broadcastVar ) 223 | var scoreMap : Map[String, Double] = Map() 224 | if ( sentiment != null ){ 225 | for( toneCategory <- Option(sentiment.tone_categories).getOrElse( Seq() )){ 226 | for ( tone <- Option( toneCategory.tones ).getOrElse( Seq() ) ){ 227 | scoreMap.put( tone.tone_id, (BigDecimal(tone.score).setScale(2, BigDecimal.RoundingMode.HALF_UP).toDouble) * 100.0 ) 228 | } 229 | } 230 | } 231 | 232 | var jsonSentiment="{"; 233 | scoreMap.foreach( t => jsonSentiment = jsonSentiment + (if (jsonSentiment.length() == 1) "" else ",") + "\"" + t._1 + "\":" + t._2) 234 | jsonSentiment += "}"; 235 | val sendValue:String = "{\"author\": \"" + 236 | status.getUser.getName + 237 | "\", \"userid\":\"" + status.getUser.getScreenName + 238 | "\", \"pic\":\"" + status.getUser.getOriginalProfileImageURLHttps + 239 | "\",\"text\":" + JSONObject.quote( status.getText ) + ", \"sentiment\": " + jsonSentiment + "}" 240 | 241 | tweetAccumulator+=("tweets",sendValue) 242 | 243 | EnrichedTweet( 244 | status.getUser.getName, 245 | status.getUser.getScreenName, 246 | status.getCreatedAt.toString, 247 | status.getUser.getLang, 248 | status.getText, 249 | Option(status.getGeoLocation).map{ _.getLatitude}.getOrElse(0.0), 250 | Option(status.getGeoLocation).map{ _.getLongitude}.getOrElse(0.0), 251 | scoreMap 252 | ) 253 | }) 254 | 255 | rowTweets.foreachRDD( rdd => { 256 | if( rdd.count > 0 ){ 257 | workingRDD = SparkContext.getOrCreate().parallelize( rdd.map( t => t.toRow() ).collect()).union( workingRDD ) 258 | } 259 | }) 260 | 261 | val delimTagTone = "-%!" 
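//These two delimiters encode each (hashtag, tone, score) triple as a single string,
//e.g. "#spark" + delimTagTone + "joy" + delimToneScore + "73.0" => "#spark-%!joy:%@73.0".
//The metrics pipeline below splits first on delimToneScore and then on delimTagTone to
//recover the hashtag and tone before averaging tone scores per hashtag.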
262 | val delimToneScore = ":%@" 263 | val statsStream = rowTweets.map { eTweet => ("total_tweets", 1L) } 264 | .reduceByKey( _+_ ) 265 | .updateStateByKey( (a:scala.collection.Seq[Long], b:Option[Long] ) => { 266 | var runningCount=b.getOrElse(0L) 267 | a.foreach { v => runningCount=runningCount+v } 268 | Some(runningCount) 269 | }) 270 | statsStream.foreachRDD( rdd =>{ 271 | send("TweetProcessed", TweetsMetricJsonSerializer.serialize(rdd.collect())) 272 | }) 273 | 274 | val metricsStream = rowTweets.flatMap { eTweet => { 275 | val retList = ListBuffer[String]() 276 | for ( tag <- eTweet.text.split("\\s+") ){ 277 | if ( tag.startsWith( "#") && tag.length > 1 ){ 278 | for ( tone <- Option( eTweet.sentimentScores.keys ).getOrElse( Seq() ) ){ 279 | retList += (tag + delimTagTone + tone + delimToneScore + eTweet.sentimentScores.getOrElse( tone, 0.0)) 280 | } 281 | } 282 | } 283 | retList.toList 284 | }} 285 | .map { fullTag => { 286 | val split = fullTag.split(delimToneScore); 287 | (split(0), split(1).toFloat) 288 | }} 289 | .combineByKey( 290 | (x:Float) => (x,1), 291 | (x:(Float,Int), y:Float) => (x._1 + y, x._2+1), 292 | (x:(Float,Int),y:(Float,Int)) => (x._1 + y._1, x._2 + y._2), 293 | new HashPartitioner(sc.defaultParallelism) 294 | ) 295 | .map[(String,(Long/*count*/, List[(String, Double)]))]{ t => { 296 | val key = t._1; 297 | val ab = t._2; 298 | val split = key.split(delimTagTone) 299 | (split(0), (ab._2, List((split(1), BigDecimal(ab._1/ab._2).setScale(2, BigDecimal.RoundingMode.HALF_UP).toDouble )))) 300 | }} 301 | .reduceByKey( (t,u) => (t._1+u._1, (t._2 ::: u._2).sortWith( (l,r) => l._1.compareTo( r._1 ) < 0 ))) 302 | .mapValues( (item:(Long, List[(String,Double)])) => { 303 | val unzip = item._2.unzip 304 | (item._1/(item._2.size), unzip._1, unzip._2) 305 | }) 306 | .updateStateByKey( (a:scala.collection.Seq[(Long, List[String], List[Double])], b: Option[(Long, List[String], List[Double])]) => { 307 | val safeB = b.getOrElse( (0L, List(), List() ) ) 308 | var listTones = safeB._2 309 | var listScores = safeB._3 310 | var count = safeB._1 311 | for( item <- a ){ 312 | count += item._1 313 | listScores = listScores.zipAll( item._3, 0.0, 0.0).map{ case(a,b)=>(a+b)/2 }.toList 314 | listTones = item._2 315 | } 316 | 317 | Some( (count, listTones, listScores) ) 318 | }) 319 | 320 | metricsStream.print 321 | 322 | metricsStream.foreachRDD( rdd =>{ 323 | val topHashTags = rdd.sortBy( f => f._2._1, false ).take(5) 324 | if ( !topHashTags.isEmpty){ 325 | tweetAccumulator+=("topHashtags", TweetsMetricJsonSerializer.serialize(topHashTags.map( f => (f._1, f._2._1 )))) 326 | tweetAccumulator+=("toneScores", ToneScoreJsonSerializer.serialize(topHashTags)) 327 | } 328 | }) 329 | 330 | } 331 | } 332 | 333 | object TweetsAccumulatorParam extends AccumulableParam[Array[(String,String)], (String,String)]{ 334 | def zero(initialValue:Array[(String,String)]):Array[(String,String)] = { 335 | Array() 336 | } 337 | 338 | def addInPlace(s1:Array[(String,String)], s2:Array[(String,String)]):Array[(String,String)] = { 339 | s1 ++ s2 340 | } 341 | 342 | def addAccumulator(current:Array[(String,String)], s:(String,String)):Array[(String,String)] = { 343 | current :+ s 344 | } 345 | } -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/StatusSerializer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 
| * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.ibm.cds.spark.samples 19 | 20 | import java.io.ObjectOutputStream 21 | import java.io.ByteArrayOutputStream 22 | import org.apache.kafka.common.serialization.Serializer 23 | import twitter4j.Status 24 | 25 | /** 26 | * @author dtaieb 27 | */ 28 | class StatusSerializer extends Serializer[Status]{ 29 | def configure( props: java.util.Map[String, _], isKey: Boolean) = { 30 | 31 | } 32 | 33 | def close(){ 34 | 35 | } 36 | 37 | def serialize(topic: String, value: Status ): Array[Byte] = { 38 | val baos = new ByteArrayOutputStream(1024) 39 | val oos = new ObjectOutputStream(baos) 40 | oos.writeObject( value ) 41 | oos.close 42 | baos.toByteArray() 43 | } 44 | } -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/StreamingListener.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.ibm.cds.spark.samples 19 | 20 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverError 21 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStopped 22 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStarted 23 | import org.apache.spark.streaming.scheduler.StreamingListenerBatchCompleted 24 | import org.apache.spark.streaming.scheduler.StreamingListenerBatchStarted 25 | 26 | /** 27 | * @author dtaieb 28 | */ 29 | class StreamingListener extends org.apache.spark.streaming.scheduler.StreamingListener { 30 | override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted) { 31 | println("Receiver Started: " + receiverStarted.receiverInfo.name ) 32 | } 33 | 34 | override def onReceiverError(receiverError: StreamingListenerReceiverError) { 35 | println("Receiver Error: " + receiverError.receiverInfo.lastError) 36 | } 37 | 38 | override def onReceiverStopped(receiverStopped: StreamingListenerReceiverStopped) { 39 | println("Receiver Stopped: " + receiverStopped.receiverInfo.name) 40 | println("Reason: " + receiverStopped.receiverInfo.lastError + " : " + receiverStopped.receiverInfo.lastErrorMessage) 41 | } 42 | 43 | override def onBatchStarted(batchStarted: StreamingListenerBatchStarted){ 44 | println("Batch started with " + batchStarted.batchInfo.numRecords + " records") 45 | } 46 | 47 | override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted){ 48 | println("Batch completed with " + batchCompleted.batchInfo.numRecords + " records"); 49 | } 50 | } -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/StreamingTwitter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.ibm.cds.spark.samples 19 | 20 | import scala.collection.mutable._ 21 | import org.apache.commons.lang3.StringEscapeUtils 22 | import org.apache.log4j.Level 23 | import org.apache.log4j.Logger 24 | import org.apache.spark.Accumulator 25 | import org.apache.spark.SparkConf 26 | import org.apache.spark.SparkContext 27 | import org.apache.spark.streaming._ 28 | import org.apache.spark.streaming.dstream._ 29 | import org.http4s._ 30 | import org.http4s.Http4s._ 31 | import org.http4s.Status._ 32 | import org.http4s.client.Client 33 | import org.http4s.client.blaze.PooledHttp1Client 34 | import org.http4s.headers.Authorization 35 | import com.ibm.couchdb._ 36 | import scalaz._ 37 | import scalaz.concurrent.Task 38 | import twitter4j.Status 39 | import org.apache.spark.sql.SQLContext 40 | import org.apache.spark.sql.Row 41 | import org.apache.spark.sql.types._ 42 | import org.apache.spark.sql.DataFrame 43 | import org.apache.spark.rdd.RDD 44 | import org.apache.spark.rdd.EmptyRDD 45 | import com.google.common.base.CharMatcher 46 | import scala.math.BigDecimal 47 | import com.ibm.cds.spark.samples.config.DemoConfig 48 | import com.ibm.cds.spark.samples.ToneAnalyzer.ToneCategory 49 | import org.apache.spark.Logging 50 | 51 | 52 | 53 | 54 | /** 55 | * @author dtaieb 56 | */ 57 | object StreamingTwitter extends Logging{ 58 | var ssc: StreamingContext = null 59 | var sqlContext: SQLContext = null 60 | var workingRDD: RDD[Row] = null 61 | var schemaTweets : StructType = null 62 | val logger: Logger = Logger.getLogger( "com.ibm.cds.spark.samples.StreamingTwitter" ) 63 | 64 | //main method invoked when running as a standalone Spark Application 65 | def main(args: Array[String]) { 66 | 67 | val conf = new SparkConf().setAppName("Spark Streaming Twitter Demo") 68 | val sc = new SparkContext(conf) 69 | startTwitterStreaming(sc, Seconds(10)); 70 | } 71 | 72 | //Hold configuration key/value pairs 73 | val config = new DemoConfig 74 | 75 | //Wrapper api for Notebook access 76 | def setConfig(key:String, value:String){ 77 | config.setConfig(key, value) 78 | } 79 | 80 | def startTwitterStreaming( sc: SparkContext, stopAfter: Duration = Seconds(0) ){ 81 | println("Starting twitter stream"); 82 | if ( ssc != null ){ 83 | println("Twitter Stream already running"); 84 | println("Please use stopTwitterStreaming() first and try again"); 85 | return; 86 | } 87 | 88 | if ( !config.validateConfiguration(DemoConfig.CHECKPOINT_DIR_KEY) ){ 89 | println("Unable to validate config") 90 | return; 91 | } 92 | 93 | Logger.getLogger("org.apache.spark").setLevel(Level.OFF) 94 | 95 | workingRDD = sc.emptyRDD 96 | //Broadcast the config to each worker node 97 | val broadcastVar = sc.broadcast(config.toImmutableMap) 98 | 99 | var canStopTwitterStream = true 100 | var batchesProcessed=0 101 | 102 | ssc = new StreamingContext( sc, Seconds(5) ) 103 | 104 | ssc.addStreamingListener( new StreamingListener ) 105 | 106 | try{ 107 | sqlContext = new SQLContext(sc) 108 | val keys = config.getConfig("tweets.key").split(","); 109 | val stream = org.apache.spark.streaming.twitter.TwitterUtils.createStream( ssc, None ); 110 | 111 | if ( schemaTweets == null ){ 112 | val schemaString = "author userid date lang text lat:double long:double" 113 | schemaTweets = 114 | StructType( 115 | schemaString.split(" ").map( 116 | fieldName => { 117 | val ar = fieldName.split(":") 118 | StructField( 119 | ar.lift(0).get, 120 | ar.lift(1).getOrElse("string") match{ 121 | case "int" => IntegerType 122 | case "double" => DoubleType 
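//Fields in schemaString without an explicit ":type" suffix (or with an unrecognized one) default to StringType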
123 | case _ => StringType 124 | }, 125 | true) 126 | } 127 | ).union( 128 | ToneAnalyzer.sentimentFactors.map( f => StructField( f._1, DoubleType )).toArray[StructField] 129 | ) 130 | ) 131 | } 132 | val tweets = stream.filter { status => 133 | Option(status.getUser).flatMap[String] { 134 | u => Option(u.getLang) 135 | }.getOrElse("").startsWith("en") && CharMatcher.ASCII.matchesAllOf(status.getText) && ( keys.isEmpty || keys.exists{status.getText.contains(_)}) 136 | } 137 | 138 | lazy val client = PooledHttp1Client() 139 | val rowTweets = tweets.map(status=> { 140 | val sentiment = ToneAnalyzer.computeSentiment( client, status, broadcastVar ) 141 | 142 | var colValues = Array[Any]( 143 | status.getUser.getName, //author 144 | status.getUser.getScreenName, //Userid 145 | status.getCreatedAt.toString, //date 146 | status.getUser.getLang, //Lang 147 | status.getText, //text 148 | Option(status.getGeoLocation).map{ _.getLatitude}.getOrElse(0.0), //lat 149 | Option(status.getGeoLocation).map{_.getLongitude}.getOrElse(0.0) //long 150 | //exception 151 | ) 152 | 153 | var scoreMap : Map[String, Double] = Map() 154 | if ( sentiment != null ){ 155 | for( toneCategory <- Option(sentiment.tone_categories).getOrElse( Seq() )){ 156 | for ( tone <- Option( toneCategory.tones ).getOrElse( Seq() ) ){ 157 | scoreMap.put( tone.tone_id, tone.score ) 158 | } 159 | } 160 | } 161 | 162 | colValues = colValues ++ ToneAnalyzer.sentimentFactors.map { f => (BigDecimal(scoreMap.get(f._2).getOrElse(0.0)).setScale(2, BigDecimal.RoundingMode.HALF_UP).toDouble) * 100.0 } 163 | //Return [Row, (sentiment, status)] 164 | (Row(colValues.toArray:_*),(sentiment, status)) 165 | }) 166 | 167 | rowTweets.foreachRDD( rdd => { 168 | if(batchesProcessed==0){ 169 | canStopTwitterStream=false 170 | } 171 | try{ 172 | if( rdd.count > 0 ){ 173 | batchesProcessed += 1 174 | workingRDD = sc.parallelize( rdd.map( t => t._1 ).collect()).union( workingRDD ) 175 | 176 | val saveToCloudant = broadcastVar.value.get("cloudant.save").get.toBoolean 177 | if ( saveToCloudant ){ 178 | rdd.foreachPartition { iterator => 179 | var db: CouchDbApi = null; 180 | val couch = CouchDb( broadcastVar.value.get("cloudant.hostName").get, 181 | broadcastVar.value.get("cloudant.port").get.toInt, 182 | broadcastVar.value.get("cloudant.https").get.toBoolean, 183 | broadcastVar.value.get("cloudant.username").get, 184 | broadcastVar.value.get("cloudant.password").get 185 | ); 186 | val dbName = "spark-streaming-twitter" 187 | couch.dbs.get(dbName).attemptRun match{ 188 | case -\/(e) => logger.trace("Couch Database does not exist, creating it now"); couch.dbs.create(dbName).run 189 | case \/-(a) => println("Connected to cloudant db " + dbName ) 190 | } 191 | val typeMapping = TypeMapping(classOf[ToneAnalyzer.Tweet] -> "Tweet") 192 | db = couch.db(dbName, typeMapping) 193 | iterator.foreach( t => { 194 | saveTweetToCloudant( client, db, t._2._2, t._2._1 ) 195 | } 196 | ) 197 | } 198 | } 199 | } 200 | }catch{ 201 | case e: InterruptedException=>//Ignore 202 | case e: Exception => logError(e.getMessage, e ) 203 | }finally{ 204 | canStopTwitterStream = true 205 | } 206 | }) 207 | 208 | }catch{ 209 | case e : Exception => logError(e.getMessage, e ) 210 | return 211 | } 212 | ssc.start() 213 | 214 | println("Twitter stream started"); 215 | println("Tweets are collected real-time and analyzed") 216 | println("To stop the streaming and start interacting with the data use: StreamingTwitter.stopTwitterStreaming") 217 | 218 | if ( !stopAfter.isZero ){ 219 | //Automatically 
stop it after 10s 220 | new Thread( new Runnable { 221 | var displayMessage = true; 222 | def run(){ 223 | Thread.sleep( stopAfter.milliseconds ) 224 | var loop = true 225 | while(loop){ 226 | if (canStopTwitterStream){ 227 | stopTwitterStreaming 228 | loop = false 229 | }else{ 230 | if ( displayMessage ){ 231 | displayMessage = false 232 | println("Received directive to stop twitter Stream: Waiting for already received tweets to be processed...") 233 | } 234 | Thread.sleep(5000L) 235 | } 236 | } 237 | } 238 | }).start 239 | } 240 | } 241 | 242 | def saveTweetToCloudant(client: Client, db: CouchDbApi, status:Status, sentiment: ToneAnalyzer.Sentiment) : Status = { 243 | if ( db != null){ 244 | logger.trace("Creating new Tweet in Couch Database " + status.getText()) 245 | val task:Task[Res.DocOk] = db.docs.create( 246 | ToneAnalyzer.Tweet( 247 | status.getUser().getName, 248 | status.getCreatedAt().toString(), 249 | status.getUser().getLang(), 250 | status.getText(), 251 | ToneAnalyzer.Geo( 252 | Option(status.getGeoLocation).map{ _.getLatitude}.getOrElse(0.0), 253 | Option(status.getGeoLocation).map{_.getLongitude}.getOrElse(0.0) 254 | ), 255 | sentiment 256 | ) 257 | ) 258 | 259 | // Execute the actions and process the result 260 | task.attemptRun match { 261 | case -\/(e) => logError(e.getMessage, e ); 262 | case \/-(a) => logger.trace("Successfully create new Tweet in Couch Database " + status.getText() ) 263 | } 264 | } 265 | 266 | status 267 | } 268 | 269 | def createTwitterDataFrames(sc: SparkContext) : (SQLContext, DataFrame) = { 270 | if ( workingRDD.count <= 0 ){ 271 | println("No data receive. Please start the Twitter stream again to collect data") 272 | return null 273 | } 274 | 275 | try{ 276 | val df = sqlContext.createDataFrame( workingRDD, schemaTweets ) 277 | df.registerTempTable("tweets") 278 | 279 | println("A new table named tweets with " + df.count() + " records has been correctly created and can be accessed through the SQLContext variable") 280 | println("Here's the schema for tweets") 281 | df.printSchema() 282 | 283 | (sqlContext, df) 284 | }catch{ 285 | case e: Exception => {logError(e.getMessage, e ); return null} 286 | } 287 | } 288 | 289 | def stopTwitterStreaming(){ 290 | if ( ssc == null){ 291 | println("No Twitter stream to stop"); 292 | return; 293 | } 294 | 295 | println("Stopping Twitter stream. Please wait this may take a while") 296 | ssc.stop(stopSparkContext = false, stopGracefully = false) 297 | ssc = null 298 | println("Twitter stream stopped"); 299 | 300 | println( "You can now create a sqlContext and DataFrame with " + workingRDD.count + " Tweets created. 
Sample usage: ") 301 | println("val (sqlContext, df) = com.ibm.cds.spark.samples.StreamingTwitter.createTwitterDataFrames(sc)") 302 | println("df.printSchema") 303 | println("sqlContext.sql(\"select author, text from tweets\").show") 304 | } 305 | } -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/ToneAnalyzer.scala: -------------------------------------------------------------------------------- 1 | package com.ibm.cds.spark.samples 2 | 3 | import org.http4s.EntityEncoder 4 | import org.http4s.Uri 5 | import org.http4s.client.Client 6 | import org.http4s.Request 7 | import org.http4s.BasicCredentials 8 | import org.http4s.Header 9 | import org.http4s.Headers 10 | import org.http4s.Method 11 | import org.http4s.headers.Authorization 12 | import org.apache.log4j.Logger 13 | import org.apache.spark.broadcast.Broadcast 14 | import org.apache.spark.Logging 15 | import scala.util.parsing.json.JSON 16 | import org.codehaus.jettison.json.JSONObject 17 | 18 | /** 19 | * @author dtaieb 20 | */ 21 | 22 | object ToneAnalyzer extends Logging{ 23 | 24 | val sentimentFactors = Array( 25 | ("Anger","anger"), 26 | ("Disgust","disgust"), 27 | ("Fear","fear"), 28 | ("Joy","joy"), 29 | ("Sadness","sadness"), 30 | ("Analytical","analytical"), 31 | ("Confident","confident"), 32 | ("Tentative","tentative"), 33 | ("Openness","openness_big5"), 34 | ("Conscientiousness","conscientiousness_big5"), 35 | ("Extraversion","extraversion_big5"), 36 | ("Agreeableness","agreeableness_big5"), 37 | ("EmotionalRange","neuroticism_big5") 38 | ) 39 | 40 | //Class models for Sentiment JSON 41 | case class DocumentTone( document_tone: Sentiment ) 42 | case class Sentiment(tone_categories: Seq[ToneCategory]); 43 | case class ToneCategory(category_id: String, category_name: String, tones: Seq[Tone]); 44 | case class Tone(score: Double, tone_id: String, tone_name: String) 45 | // case class Sentiment( scorecard: String, children: Seq[Tone] ) 46 | // case class Tone( name: String, id: String, children: Seq[ToneResult]) 47 | // case class ToneResult(name: String, id: String, word_count: Double, normalized_score: Double, raw_score: Double, linguistic_evidence: Seq[LinguisticEvidence] ) 48 | // case class LinguisticEvidence( evidence_score: Double, word_count: Double, correlation: String, words : Seq[String]) 49 | 50 | case class Geo( lat: Double, long: Double ) 51 | case class Tweet(author: String, date: String, language: String, text: String, geo : Geo, sentiment : Sentiment ) 52 | 53 | def computeSentiment( client: Client, status:StatusAdapter, broadcastVar: Broadcast[Map[String,String]] ) : Sentiment = { 54 | logTrace("Calling sentiment from Watson Tone Analyzer: " + status.text) 55 | try{ 56 | //Get Sentiment on the tweet 57 | val sentimentResults: String = 58 | EntityEncoder[String].toEntity("{\"text\": " + JSONObject.quote( status.text ) + "}" ).flatMap { 59 | entity => 60 | val s = broadcastVar.value.get("watson.tone.url").get + "/v3/tone?version=" + broadcastVar.value.get("watson.api.version").get 61 | val toneuri: Uri = Uri.fromString( s ).getOrElse( null ) 62 | client( 63 | Request( 64 | method = Method.POST, 65 | uri = toneuri, 66 | headers = Headers( 67 | Authorization( 68 | BasicCredentials(broadcastVar.value.get("watson.tone.username").get, broadcastVar.value.get("watson.tone.password").get) 69 | ), 70 | Header("Accept", "application/json"), 71 | Header("Content-Type", "application/json; charset=utf-8") 72 | ), 73 | body = 
entity.body 74 | ) 75 | ).flatMap { response => 76 | if (response.status.code == 200 ) { 77 | response.as[String] 78 | } else { 79 | println( "Error received from Watson Tone Analyzer. Code : " + response.status.code + " reason: " + response.status.reason ) 80 | null 81 | } 82 | } 83 | }.run 84 | 85 | upickle.read[DocumentTone](sentimentResults).document_tone 86 | }catch{ 87 | case e:Throwable => { 88 | e.printStackTrace() 89 | null 90 | } 91 | } 92 | } 93 | } -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/TwitterAdapter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.ibm.cds.spark.samples 19 | 20 | import java.io.ObjectInputStream 21 | import java.io.ByteArrayInputStream 22 | import scala.util.parsing.json.JSON 23 | import org.apache.kafka.common.serialization.Deserializer 24 | import twitter4j.Status 25 | 26 | /** 27 | * @author dtaieb 28 | * Deserialization adapters for Twitter4J Status 29 | */ 30 | 31 | case class StatusAdapter(userName:String, userId: String, userLang: String,createdAt:String,text:String, long:Double, lat:Double); 32 | 33 | object StatusAdapter{ 34 | implicit def statusAdapterWrapper(status: Status) = 35 | StatusAdapter( 36 | status.getUser.getName, 37 | status.getUser.getScreenName, 38 | status.getUser.getLang, 39 | status.getCreatedAt.toString, 40 | status.getText, 41 | Option(status.getGeoLocation).map{ _.getLongitude}.getOrElse(0.0), 42 | Option(status.getGeoLocation).map{ _.getLatitude}.getOrElse(0.0) 43 | ) 44 | } 45 | 46 | class StatusDeserializer extends Deserializer[StatusAdapter]{ 47 | def configure( props: java.util.Map[String, _], isKey: Boolean) = { 48 | 49 | } 50 | 51 | def close(){ 52 | 53 | } 54 | 55 | def deserialize(topic: String, data: Array[Byte] ): StatusAdapter = { 56 | try{ 57 | val bais = new ByteArrayInputStream( data ) 58 | var ois:ObjectInputStream = null 59 | try{ 60 | ois = new ObjectInputStream( bais ) 61 | ois.readObject().asInstanceOf[Status] 62 | }finally{ 63 | if (bais != null ){ 64 | bais.close 65 | } 66 | if ( ois != null ){ 67 | ois.close 68 | } 69 | } 70 | }catch{ 71 | case e:Throwable=>{ 72 | val jsonObject = JSON.parseFull( new String(data) ).getOrElse(Map.empty).asInstanceOf[Map[String, Any]] 73 | val user=jsonObject.get("user").getOrElse( Map.empty ).asInstanceOf[Map[String,Any]] 74 | val geo = Option(jsonObject.get("geo").orNull).getOrElse(Map.empty).asInstanceOf[Map[String,Any]] 75 | StatusAdapter( 76 | user.get("name").getOrElse("").asInstanceOf[String], 77 | user.get("userid").getOrElse("").asInstanceOf[String], 78 | 
user.get("lang").getOrElse("").asInstanceOf[String], 79 | jsonObject.get("created_at").getOrElse("").asInstanceOf[String], 80 | jsonObject.get("text").getOrElse("").asInstanceOf[String], 81 | geo.get("long").getOrElse(0.0).asInstanceOf[Double], 82 | geo.get("lat").getOrElse(0.0).asInstanceOf[Double] 83 | ) 84 | } 85 | } 86 | } 87 | } -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/config/DemoConfig.scala: -------------------------------------------------------------------------------- 1 | package com.ibm.cds.spark.samples.config 2 | 3 | import org.apache.kafka.clients.CommonClientConfigs 4 | import java.io.FileInputStream 5 | import java.io.InputStream 6 | import scala.collection.JavaConversions._ 7 | import org.apache.spark.SparkContext 8 | 9 | 10 | /** 11 | * @author dtaieb 12 | */ 13 | 14 | class DemoConfig() extends Serializable{ 15 | 16 | //Hold configuration key/value pairs 17 | var config = scala.collection.mutable.Map[String, String]( 18 | registerConfigKey("twitter4j.oauth.consumerKey" ), 19 | registerConfigKey("twitter4j.oauth.consumerSecret" ), 20 | registerConfigKey("twitter4j.oauth.accessToken" ), 21 | registerConfigKey("twitter4j.oauth.accessTokenSecret"), 22 | registerConfigKey("tweets.key",""), 23 | registerConfigKey("cloudant.hostName" ), 24 | registerConfigKey("cloudant.https", "true"), 25 | registerConfigKey("cloudant.port" ), 26 | registerConfigKey("cloudant.username" ), 27 | registerConfigKey("cloudant.password" ), 28 | registerConfigKey("watson.tone.url" ), 29 | registerConfigKey("watson.tone.username" ), 30 | registerConfigKey("watson.tone.password" ), 31 | registerConfigKey("watson.api.version", "2016-05-19"), 32 | registerConfigKey("cloudant.save", "false" ), 33 | registerConfigKey(DemoConfig.CHECKPOINT_DIR_KEY) 34 | ) 35 | 36 | private def getKeyOrFail(key:String):String={ 37 | config.get(key).getOrElse( { 38 | throw new IllegalStateException("Missing key: " + key) 39 | }) 40 | } 41 | 42 | def cloneConfig():MessageHubConfig={ 43 | val props = new MessageHubConfig 44 | config.foreach{ entry => props.setConfig(entry._1, entry._2)} 45 | props 46 | } 47 | 48 | def set_hadoop_config(sc:SparkContext){ 49 | val prefix = "fs.swift.service." 
+ getKeyOrFail("name") 50 | val hconf = sc.hadoopConfiguration 51 | hconf.set(prefix + ".auth.url", getKeyOrFail("auth_url")+"/v3/auth/tokens") 52 | hconf.set(prefix + ".auth.endpoint.prefix", "endpoints") 53 | hconf.set(prefix + ".tenant", getKeyOrFail("project_id")) 54 | hconf.set(prefix + ".username", getKeyOrFail("user_id")) 55 | hconf.set(prefix + ".password", getKeyOrFail("password")) 56 | hconf.setInt(prefix + ".http.port", 8080) 57 | hconf.set(prefix + ".region", getKeyOrFail("region")) 58 | hconf.setBoolean(prefix + ".public", true) 59 | } 60 | 61 | def initConfigKeys(){ 62 | //Overridable by subclasses 63 | } 64 | 65 | //Give a chance to subclasses to init the keys 66 | initConfigKeys; 67 | 68 | { 69 | //Load config from property file if specified 70 | val configPath = Option(System.getProperty("DEMO_CONFIG_PATH") ).orElse( Option(System.getenv("DEMO_CONFIG_PATH"))) 71 | .orElse( Option(System.getProperty("spark.service.user.DEMO_CONFIG_PATH") )).orElse(Option(System.getenv("spark.service.user.DEMO_CONFIG_PATH") )) 72 | .getOrElse(null) 73 | if (configPath != null ){ 74 | println("ConfigPath is: " + configPath ) 75 | } 76 | if ( configPath != null ){ 77 | println("Loading config from DEMO_CONFIG_PATH env variable: " + configPath) 78 | val props = new java.util.Properties 79 | var fis:InputStream = null 80 | try{ 81 | fis = new FileInputStream(configPath) 82 | props.load(fis) 83 | for( key <- props.keysIterator ){ 84 | setConfig( key, props.getProperty(key)) 85 | } 86 | }catch{ 87 | case e:Throwable => e.printStackTrace 88 | }finally{ 89 | if ( fis != null ){ 90 | fis.close 91 | } 92 | } 93 | } 94 | } 95 | 96 | private[config] def registerConfigKey( key: String, default: String = null ) : (String,String) = { 97 | if ( default == null ){ 98 | (key, Option(System.getProperty(key)).orNull ) 99 | } 100 | (key, Option(System.getProperty(key)) getOrElse default ) 101 | } 102 | 103 | def setConfig(key:String, value:String){ 104 | config.put( key, value ) 105 | } 106 | 107 | def getConfig(key:String):String={ 108 | config.get(key).getOrElse("") 109 | } 110 | 111 | implicit def toImmutableMap(): Map[String,String]= { 112 | Map( config.toList: _* ) 113 | } 114 | 115 | //Validate configuration settings 116 | def validateConfiguration(ignorePrefix:String*) : Boolean = { 117 | def ignoreKey( key: String ): Boolean = { 118 | var o = ignorePrefix.find { p => p.startsWith( key ) }; 119 | o.isDefined 120 | } 121 | var ret: Boolean = true; 122 | val saveToCloudant = config.get("cloudant.save").get.toBoolean 123 | config.foreach( (t:(String, Any)) => 124 | if ( t._2 == null ){ 125 | if ( saveToCloudant || !t._1.startsWith("cloudant") ){ 126 | if ( !ignoreKey( t._1) ){ 127 | println(t._1 + " configuration not set. 
Use setConfig(\"" + t._1 + "\",)"); 128 | ret = false; 129 | } 130 | } 131 | } 132 | ) 133 | 134 | if ( ret ){ 135 | config.foreach( (t:(String,Any)) => 136 | try{ 137 | if ( t._1.startsWith( "twitter4j") && t._2 != null && !ignoreKey(t._1) ) { 138 | System.setProperty( t._1, t._2.asInstanceOf[String] ) 139 | } 140 | }catch{ 141 | case e:Throwable => println("error" + t) 142 | } 143 | ) 144 | } 145 | ret 146 | } 147 | } 148 | 149 | object DemoConfig extends DemoConfig{ 150 | final val CHECKPOINT_DIR_KEY = "checkpointDir" 151 | } 152 | -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/config/MessageHubConfig.scala: -------------------------------------------------------------------------------- 1 | package com.ibm.cds.spark.samples.config 2 | 3 | import scala.collection.mutable.ListBuffer 4 | import scala.reflect.ClassTag 5 | import org.apache.kafka.clients.CommonClientConfigs 6 | import org.apache.kafka.common.config.SslConfigs 7 | import org.apache.kafka.common.security.JaasUtils 8 | import scala.io.Source 9 | import java.io.InputStream 10 | import java.io.FileWriter 11 | import java.io.File 12 | import org.http4s.EntityEncoder 13 | import org.http4s.Uri 14 | import org.http4s.client.blaze.PooledHttp1Client 15 | import org.http4s.Request 16 | import org.http4s.Method 17 | import org.http4s.Headers 18 | import org.http4s.headers.Authorization 19 | import org.http4s.BasicCredentials 20 | import org.http4s.Header 21 | import javax.net.ssl.SSLContext 22 | import org.codehaus.jettison.json.JSONObject 23 | 24 | 25 | /** 26 | * @author dtaieb 27 | */ 28 | class MessageHubConfig extends DemoConfig{ 29 | lazy val kafkaOptionKeys = ListBuffer[String]() 30 | override def initConfigKeys(){ 31 | config = config ++ Map[String,String]( 32 | registerConfigKey(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG), 33 | registerConfigKey(CommonClientConfigs.CLIENT_ID_CONFIG, "demo.watson.twitter.messagehub"), 34 | registerConfigKey("auto.offset.reset", "latest"), 35 | registerConfigKey("acks", "-1"), 36 | registerConfigKey("retries", "0"), 37 | registerConfigKey("batch.size", "16384"), 38 | registerConfigKey("linger.ms", "1"), 39 | registerConfigKey("buffer.memory", "33554432"), 40 | registerConfigKey("key.serializer", "org.apache.kafka.common.serialization.StringSerializer"), 41 | registerConfigKey("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer"), 42 | registerConfigKey(SslConfigs.SSL_PROTOCOL_CONFIG, "TLSv1.2"), 43 | registerConfigKey(SslConfigs.SSL_ENABLED_PROTOCOLS_CONFIG, "TLSv1.2"), 44 | registerConfigKey(SslConfigs.SSL_TRUSTSTORE_TYPE_CONFIG, "JKS"), 45 | registerConfigKey(SslConfigs.SSL_TRUSTSTORE_LOCATION_CONFIG, getDefaultSSLTrustStoreLocation), 46 | registerConfigKey(SslConfigs.SSL_TRUSTSTORE_PASSWORD_CONFIG, "changeit"), 47 | registerConfigKey(SslConfigs.SSL_ENDPOINT_IDENTIFICATION_ALGORITHM_CONFIG, "HTTPS"), 48 | registerConfigKey(CommonClientConfigs.SECURITY_PROTOCOL_CONFIG, "SASL_SSL" ), 49 | 50 | registerConfigKey(MessageHubConfig.CHECKPOINT_DIR_KEY), 51 | registerConfigKey(MessageHubConfig.KAFKA_TOPIC_TWEETS, "demo.tweets.watson.topic"), 52 | registerConfigKey(MessageHubConfig.KAFKA_USER_NAME), 53 | registerConfigKey(MessageHubConfig.KAFKA_USER_PASSWORD), 54 | registerConfigKey(MessageHubConfig.MESSAGEHUB_API_KEY), 55 | registerConfigKey(MessageHubConfig.MESSAGEHUB_REST_URL) 56 | ) 57 | } 58 | 59 | private def getDefaultSSLTrustStoreLocation():String={ 60 | val javaHome = 
System.getProperty("java.home") + File.separator + "lib" + File.separator + "security" + File.separator + "cacerts" 61 | println("default location of ssl Trust store is: " + javaHome) 62 | javaHome 63 | } 64 | 65 | override private[config] def registerConfigKey( key: String, default: String = null ) : (String,String) = { 66 | kafkaOptionKeys += key 67 | super.registerConfigKey(key,default) 68 | } 69 | 70 | override def validateConfiguration(ignorePrefix:String*) : Boolean = { 71 | val ret = super.validateConfiguration(ignorePrefix:_*) 72 | if ( ret ){ 73 | //Create the jaas configuration 74 | MessageHubConfig.createJaasConfiguration(getConfig(MessageHubConfig.KAFKA_USER_NAME ), getConfig(MessageHubConfig.KAFKA_USER_PASSWORD) ) 75 | } 76 | ret 77 | } 78 | 79 | def copyKafkaOptionKeys(other:MessageHubConfig){ 80 | kafkaOptionKeys.foreach { key => other.setConfig(key, getConfig(key) ) } 81 | } 82 | 83 | def setValueSerializer[U]()(implicit c: ClassTag[U]){ 84 | setConfig("value.serializer", c.runtimeClass.getName); 85 | } 86 | 87 | def setValueDeserializer[U]()(implicit c: ClassTag[U]){ 88 | setConfig("value.deserializer", c.runtimeClass.getName); 89 | } 90 | 91 | def createTopicsIfNecessary( topics: String* ){ 92 | val sslContext = SSLContext.getInstance("TLSv1.2") 93 | sslContext.init(null, null, null) 94 | lazy val client = PooledHttp1Client(sslContext=Option(sslContext)) 95 | for( topic <- topics ){ 96 | EntityEncoder[String].toEntity("{\"name\":" + JSONObject.quote( topic ) + "}" ).flatMap { 97 | entity => 98 | val topicUri: Uri = Uri.fromString( getConfig(MessageHubConfig.MESSAGEHUB_REST_URL) + "/admin/topics" ).getOrElse( null ) 99 | println(topicUri) 100 | client( 101 | Request( 102 | method = Method.POST, 103 | uri = topicUri, 104 | headers = Headers( 105 | Header("Content-Type", "application/json"), 106 | Header("X-Auth-Token", getConfig(MessageHubConfig.MESSAGEHUB_API_KEY)) 107 | ), 108 | body = entity.body 109 | ) 110 | ).flatMap { response => 111 | response.status.code match { 112 | case 200 | 202 => println("Successfully created topic: " + topic) 113 | case 422 | 403 => println("Topic already exists in the server: " + topic) 114 | case _ => throw new IllegalStateException("Error when trying to create topic: " + response.status.code + " Reason: " + response.status.reason) 115 | } 116 | response.as[String] 117 | } 118 | }.run 119 | } 120 | } 121 | } 122 | 123 | object MessageHubConfig{ 124 | final val CHECKPOINT_DIR_KEY = DemoConfig.CHECKPOINT_DIR_KEY 125 | final val KAFKA_TOPIC_TWEETS = "kafka.topic.tweet" //Key for name of the kafka topic holding used for publishing the tweets 126 | final val KAFKA_USER_NAME = "kafka.user.name" 127 | final val KAFKA_USER_PASSWORD = "kafka.user.password" 128 | 129 | final val MESSAGEHUB_API_KEY = "api_key" 130 | final val MESSAGEHUB_REST_URL = "kafka_rest_url" 131 | 132 | private def fixPath(path: String):String = { 133 | path.replaceAll("\\ / : * ? 
\" < > |,", "_") 134 | } 135 | 136 | def createJaasConfiguration( userName: String, password: String){ 137 | //Create the jaas configuration 138 | var is:InputStream = null 139 | try{ 140 | val packageName = MessageHubConfig.getClass.getPackage.getName.replace('.', File.separatorChar) 141 | is = MessageHubConfig.getClass.getClassLoader.getResourceAsStream(packageName + "/jaas.conf"); 142 | val confString = Source.fromInputStream( is ).mkString 143 | .replace( "$USERNAME", userName) 144 | .replace( "$PASSWORD", password ) 145 | 146 | val confDir= new File( System.getProperty("java.io.tmpdir") + File.separator + 147 | fixPath( userName ) ) 148 | confDir.mkdirs 149 | val confFile = new File( confDir, "jaas.conf"); 150 | val fw = new FileWriter( confFile ); 151 | fw.write( confString ) 152 | fw.close 153 | 154 | //Set the jaas login config property 155 | println("Registering JaasConfiguration: " + confFile.getAbsolutePath) 156 | System.setProperty(JaasUtils.JAVA_LOGIN_CONFIG_PARAM, confFile.getAbsolutePath ) 157 | }catch{ 158 | case e:Throwable => { 159 | e.printStackTrace 160 | throw e 161 | } 162 | }finally{ 163 | if ( is != null ) is.close 164 | } 165 | } 166 | } -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/config/jaas.conf: -------------------------------------------------------------------------------- 1 | KafkaClient { 2 | com.ibm.messagehub.login.MessageHubLoginModule required 3 | serviceName="kafka" 4 | username="$USERNAME" 5 | password="$PASSWORD"; 6 | }; -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/dstream/KafkaInputDStream.scala: -------------------------------------------------------------------------------- 1 | package com.ibm.cds.spark.samples.dstream 2 | 3 | import scala.collection.JavaConversions._ 4 | import scala.collection.Map 5 | import scala.reflect.ClassTag 6 | import scala.reflect.classTag 7 | import org.apache.kafka.clients.consumer.ConsumerRecord 8 | import org.apache.kafka.clients.consumer.KafkaConsumer 9 | import org.apache.kafka.common.serialization.Deserializer 10 | import org.apache.spark.Logging 11 | import org.apache.spark.storage.StorageLevel 12 | import org.apache.spark.streaming.StreamingContext 13 | import org.apache.spark.streaming.dstream._ 14 | import org.apache.spark.streaming.receiver.Receiver 15 | import org.apache.log4j.Level 16 | import org.apache.log4j.Logger 17 | import java.util.Properties 18 | import com.ibm.cds.spark.samples.config.MessageHubConfig 19 | import org.apache.kafka.common.security.JaasUtils 20 | 21 | class KafkaInputDStream[ 22 | K: ClassTag, 23 | V: ClassTag, 24 | U <: Deserializer[_]: ClassTag, 25 | T <: Deserializer[_]: ClassTag]( 26 | ssc : StreamingContext, 27 | kafkaParams: Map[String, String], 28 | topics: List[String], 29 | storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK 30 | ) extends ReceiverInputDStream[(K, V)](ssc) with Logging { 31 | 32 | def getReceiver(): Receiver[(K, V)] = { 33 | new KafkaReceiver[K, V, U, T](kafkaParams, topics, storageLevel) 34 | } 35 | } 36 | 37 | object KafkaStreaming{ 38 | implicit class KafkaStreamingContextAdapter( val ssc : StreamingContext ){ 39 | def createKafkaStream[K: ClassTag, V: ClassTag, U <: Deserializer[_]: ClassTag, T <: Deserializer[_]: ClassTag]( 40 | bootStrapKafkaConfig: MessageHubConfig, 41 | topics: List[String] 42 | ): ReceiverInputDStream[(K, V)] = { 43 | val kafkaProps = new 
MessageHubConfig; 44 | bootStrapKafkaConfig.copyKafkaOptionKeys( kafkaProps) 45 | kafkaProps.setValueDeserializer[T]; 46 | new KafkaInputDStream[K, V, U, T](ssc, kafkaProps.toImmutableMap, topics) 47 | } 48 | } 49 | } 50 | 51 | class KafkaReceiver[ 52 | K: ClassTag, 53 | V: ClassTag, 54 | U <: Deserializer[_]: ClassTag, 55 | T <: Deserializer[_]: ClassTag]( 56 | kafkaParams: Map[String,String], 57 | topics: List[String], 58 | storageLevel: StorageLevel 59 | ) extends Receiver[(K, V)](storageLevel) with Logging { 60 | 61 | // Connection to Kafka 62 | var kafkaConsumer: KafkaConsumer[K,V] = null 63 | 64 | def onStop() { 65 | if (kafkaConsumer != null) { 66 | kafkaConsumer.synchronized { 67 | print("Stopping kafkaConsumer") 68 | kafkaConsumer.close() 69 | kafkaConsumer = null 70 | } 71 | } 72 | } 73 | 74 | def onStart() { 75 | logInfo("Starting Kafka Consumer Stream") 76 | 77 | //Make sure the Jaas Login config param is set 78 | val jaasLoginParam = System.getProperty(JaasUtils.JAVA_LOGIN_CONFIG_PARAM); 79 | if ( jaasLoginParam == null ){ 80 | MessageHubConfig.createJaasConfiguration( kafkaParams.get(MessageHubConfig.KAFKA_USER_NAME).get, kafkaParams.get(MessageHubConfig.KAFKA_USER_PASSWORD).get) 81 | } 82 | 83 | 84 | val keyDeserializer = classTag[U].runtimeClass.getConstructor().newInstance().asInstanceOf[Deserializer[K]] 85 | val valueDeserializer = classTag[T].runtimeClass.getConstructor().newInstance().asInstanceOf[Deserializer[V]] 86 | 87 | //Create a new kafka consumer and subscribe to the relevant topics 88 | kafkaConsumer = new KafkaConsumer[K, V](kafkaParams) 89 | kafkaConsumer.subscribe( topics ) 90 | 91 | new Thread( new Runnable { 92 | def run(){ 93 | try{ 94 | while( kafkaConsumer != null ){ 95 | var it:Iterator[ConsumerRecord[K, V]] = null; 96 | 97 | if ( kafkaConsumer != null ){ 98 | kafkaConsumer.synchronized{ 99 | //Poll for new events 100 | it = kafkaConsumer.poll(1000L).iterator 101 | while( it != null && it.hasNext() ){ 102 | //Get the record and store it 103 | val record = it.next(); 104 | store( (record.key, record.value) ) 105 | } 106 | kafkaConsumer.commitSync 107 | } 108 | } 109 | 110 | Thread.sleep( 1000L ) 111 | } 112 | println("Exiting Thread") 113 | }catch{ 114 | case e:Throwable => { 115 | reportError( "Error in KafkaConsumer thread", e); 116 | e.printStackTrace() 117 | } 118 | } 119 | } 120 | }).start 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | /** 18 | * Spark Streaming sample application 19 | * 20 | */ 21 | package com.ibm.cds.spark.samples; -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.ibm.cds.spark 19 | 20 | /** 21 | * @author dtaieb 22 | */ 23 | import scala.collection.mutable._ 24 | import org.apache.spark.sql.types.IntegerType 25 | import org.apache.spark.sql.types.DoubleType 26 | import org.apache.spark.sql.types.StructField 27 | import org.apache.spark.sql.types.StringType 28 | import org.apache.spark.sql.types.StructField 29 | import org.apache.spark.sql.types.StructType 30 | import org.apache.spark.sql.Row 31 | 32 | package object samples { 33 | 34 | case class EnrichedTweet( author:String="", userid: String="", date: String, lang: String, text: String, lat: Double, long: Double, sentimentScores: Map[String, Double]){ 35 | def toRow():Row={ 36 | var colValues = Array[Any](author,userid,date,lang,text,lat,long) 37 | val scores = for { 38 | (_,emotion)<-ToneAnalyzer.sentimentFactors 39 | score=sentimentScores.getOrElse(emotion, 0.0) 40 | }yield score 41 | colValues = colValues ++ scores 42 | Row(colValues.toArray:_*) 43 | } 44 | } 45 | 46 | val schemaString = "author userid date lang text lat:double long:double" 47 | val schemaTweets = 48 | StructType( 49 | schemaString.split(" ").map( 50 | fieldName => { 51 | val ar = fieldName.split(":"); 52 | StructField( 53 | ar.lift(0).get, 54 | ar.lift(1).getOrElse("string") match{ 55 | case "int" => IntegerType 56 | case "double" => DoubleType 57 | case _ => StringType 58 | }, 59 | true 60 | ) 61 | } 62 | ).union( 63 | ToneAnalyzer.sentimentFactors.map( f => StructField( f._1, DoubleType )).toArray[StructField] 64 | ) 65 | ) 66 | } --------------------------------------------------------------------------------
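Usage note: the listings above define the configuration classes and the custom Kafka receiver, but not a driver that wires them together. The following is a minimal, hypothetical sketch (not a file from this repository) of how MessageHubConfig and the KafkaStreaming.createKafkaStream adapter might be combined; the object name, placeholder credentials, and batch interval are illustrative assumptions only.

// Hypothetical driver sketch, assuming MessageHub credentials are supplied by hand
// rather than through a DEMO_CONFIG_PATH properties file.
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.kafka.common.serialization.StringDeserializer
import com.ibm.cds.spark.samples.config.MessageHubConfig
import com.ibm.cds.spark.samples.dstream.KafkaStreaming._

object TwitterStreamSketch {
  def main(args: Array[String]): Unit = {
    val kafkaConfig = new MessageHubConfig
    // These keys are registered by initConfigKeys; set the ones with no defaults
    kafkaConfig.setConfig("bootstrap.servers", "<messagehub-broker-list>")
    kafkaConfig.setConfig(MessageHubConfig.KAFKA_USER_NAME, "<username>")
    kafkaConfig.setConfig(MessageHubConfig.KAFKA_USER_PASSWORD, "<password>")
    kafkaConfig.validateConfiguration() // prints any keys that are still unset

    val ssc = new StreamingContext(new SparkConf().setAppName("sketch"), Seconds(5))
    // The implicit KafkaStreamingContextAdapter adds createKafkaStream to the context
    val tweets = ssc.createKafkaStream[String, String, StringDeserializer, StringDeserializer](
      kafkaConfig, List(kafkaConfig.getConfig(MessageHubConfig.KAFKA_TOPIC_TWEETS)))
    tweets.map(_._2).print()
    ssc.start()
    ssc.awaitTermination()
  }
}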