├── .gitignore
├── LICENSE
├── README.md
├── dist
│   ├── helloSpark-assembly-2.1.jar
│   └── streaming-twitter-assembly-1.6.jar
├── docs
│   ├── Twitter Sentiment with Watson TA and PI architecture diagram.orig.png
│   └── Twitter Sentiment with Watson TA and PI architecture diagram.png
├── helloGraphx
│   ├── build.sbt
│   ├── project
│   │   └── assembly.sbt
│   ├── readme.md
│   └── src
│       └── main
│           └── scala
│               └── com
│                   └── ibm
│                       └── cds
│                           └── spark
│                               └── samples
│                                   └── HelloGraphx.scala
├── helloSpark
│   ├── .settings
│   │   └── org.scala-ide.sdt.core.prefs
│   ├── build.sbt
│   ├── project
│   │   └── assembly.sbt
│   ├── python
│   │   ├── helloSpark.py
│   │   ├── helloSpark
│   │   │   └── __init__.py
│   │   └── setup.py
│   ├── readme.md
│   └── src
│       └── main
│           └── scala
│               └── com
│                   └── ibm
│                       └── cds
│                           └── spark
│                               └── samples
│                                   ├── HelloSpark.scala
│                                   └── package-info.java
├── notebook
│   ├── DashDB Twitter Car 2015 Python Notebook.ipynb
│   ├── Get Service Credentials for Twitter Sentiment with Watson TA and PI.md
│   ├── PYCON 2016 spark tutorial quick links.txt
│   ├── README.md
│   └── Twitter Sentiment with Watson TA and PI.ipynb
└── streaming-twitter
    ├── .classpath
    ├── .gitignore
    ├── .project
    ├── build.sbt
    ├── lib
    │   ├── couchdb-scala
    │   │   └── com
    │   │       └── ibm
    │   │           └── couchdb-scala_2.10
    │   │               └── 0.5.3
    │   │                   ├── couchdb-scala_2.10-0.5.3-javadoc.jar
    │   │                   ├── couchdb-scala_2.10-0.5.3-javadoc.jar.md5
    │   │                   ├── couchdb-scala_2.10-0.5.3-javadoc.jar.sha1
    │   │                   ├── couchdb-scala_2.10-0.5.3-sources.jar
    │   │                   ├── couchdb-scala_2.10-0.5.3-sources.jar.md5
    │   │                   ├── couchdb-scala_2.10-0.5.3-sources.jar.sha1
    │   │                   ├── couchdb-scala_2.10-0.5.3.jar
    │   │                   ├── couchdb-scala_2.10-0.5.3.jar.md5
    │   │                   ├── couchdb-scala_2.10-0.5.3.jar.sha1
    │   │                   ├── couchdb-scala_2.10-0.5.3.pom
    │   │                   ├── couchdb-scala_2.10-0.5.3.pom.md5
    │   │                   └── couchdb-scala_2.10-0.5.3.pom.sha1
    │   ├── messagehub.login-1.0.0.jar
    │   └── pixiedust.jar
    ├── notebook
    │   ├── Spark Streaming Twitter-Watson-MessageHub.ipynb
    │   ├── Twitter + Watson Tone Analyzer Part 1.ipynb
    │   ├── Twitter + Watson Tone Analyzer Part 2.ipynb
    │   └── Twitter Sentiment with Pixiedust.ipynb
    ├── project
    │   └── assembly.sbt
    ├── readme.md
    ├── sampleConfig
    │   └── sampleconf.properties
    └── src
        └── main
            └── scala
                └── com
                    └── ibm
                        └── cds
                            └── spark
                                └── samples
                                    ├── KafkaProducerTest.scala
                                    ├── MessageHubStreamingTwitter.scala
                                    ├── PixiedustStreamingTwitter.scala
                                    ├── StatusSerializer.scala
                                    ├── StreamingListener.scala
                                    ├── StreamingTwitter.scala
                                    ├── ToneAnalyzer.scala
                                    ├── TwitterAdapter.scala
                                    ├── config
                                    │   ├── DemoConfig.scala
                                    │   ├── MessageHubConfig.scala
                                    │   └── jaas.conf
                                    ├── dstream
                                    │   └── KafkaInputDStream.scala
                                    ├── package-info.java
                                    └── package.scala
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 | *.log
3 |
4 | # sbt specific
5 | .cache
6 | .history
7 | .lib/
8 | target/
9 | lib_managed/
10 | src_managed/
11 | project/boot/
12 | project/plugins/project/
13 |
14 | # Scala-IDE specific
15 | .scala_dependencies
16 | .worksheet
17 |
18 | helloSpark/.cache-main
19 |
20 | helloSpark/.classpath
21 |
22 | helloSpark/.project
23 |
24 | streaming-twitter/.cache-main
25 |
26 | streaming-twitter/.settings/org.scala-ide.sdt.core.prefs
27 |
28 | streaming-twitter/config/MessageHubYP.properties
29 |
30 | *.pyc
31 |
32 | pixiedust/pixiedust.egg-info
33 |
34 | pixiedust/dist
35 |
36 | .DS_Store
37 |
38 | streaming-twitter/conf/log4j.properties
39 |
40 | streaming-twitter/conf/log4j.properties.template
41 |
42 | streaming-twitter/src/main/scala/resources/log4j.properties
43 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
203 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Spark Tutorials
2 |
3 | This repository contains tutorials and samples that show you how to get the most out of IBM Analytics for Apache Spark.
4 |
5 | Watch this repo for new content. Meanwhile, try these tutorials:
6 |
7 | - [Start Developing with Spark](https://developer.ibm.com/clouddataservices/start-developing-with-spark-and-notebooks/)
8 |
9 | - [Sentiment Analysis of Twitter Hashtags](https://developer.ibm.com/clouddataservices/sentiment-analysis-of-twitter-hashtags/)
10 |
11 | - [Real-time Sentiment Analysis of Twitter Hashtags with Spark](https://developer.ibm.com/clouddataservices/2016/01/15/real-time-sentiment-analysis-of-twitter-hashtags-with-spark/)
12 |
13 | - [Getting started with GraphFrames in Apache Spark](https://developer.ibm.com/clouddataservices/2016/07/15/intro-to-apache-spark-graphframes/)
14 |
15 | - [Predict Flight Delays with Apache Spark MLLib, FlightStats, and Weather Data](https://developer.ibm.com/clouddataservices/2016/08/04/predict-flight-delays-with-apache-spark-mllib-flightstats-and-weather-data/)
16 |
17 | - [Analyze Market Trends in Twitter Using Apache Spark, Python, and dashDB](https://developer.ibm.com/clouddataservices/2016/06/13/analyze-market-trends-in-twitter-using-apache-spark-python-and-dashdb/)
18 |
19 | - [PixieDust: Magic for Your Python Notebook](https://developer.ibm.com/clouddataservices/2016/10/11/pixiedust-magic-for-python-notebook/)
20 |
21 |
22 |
--------------------------------------------------------------------------------
/dist/helloSpark-assembly-2.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/dist/helloSpark-assembly-2.1.jar
--------------------------------------------------------------------------------
/dist/streaming-twitter-assembly-1.6.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/dist/streaming-twitter-assembly-1.6.jar
--------------------------------------------------------------------------------
/docs/Twitter Sentiment with Watson TA and PI architecture diagram.orig.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/docs/Twitter Sentiment with Watson TA and PI architecture diagram.orig.png
--------------------------------------------------------------------------------
/docs/Twitter Sentiment with Watson TA and PI architecture diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/docs/Twitter Sentiment with Watson TA and PI architecture diagram.png
--------------------------------------------------------------------------------
/helloGraphx/build.sbt:
--------------------------------------------------------------------------------
1 | name := "helloGraphx"
2 |
3 | version := "1.0"
4 |
5 | scalaVersion := "2.10.4"
6 |
7 | libraryDependencies ++= {
8 | val sparkVersion = "1.6.0"
9 | Seq(
10 | "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
11 | "org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
12 | "org.apache.spark" %% "spark-graphx" % sparkVersion % "provided",
13 | "org.apache.spark" %% "spark-repl" % sparkVersion % "provided",
14 | "org.http4s" %% "http4s-core" % "0.8.2",
15 | "org.http4s" %% "http4s-client" % "0.8.2",
16 | "org.http4s" %% "http4s-blazeclient" % "0.8.2"
17 | )
18 | }
19 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false)
20 |
--------------------------------------------------------------------------------
/helloGraphx/project/assembly.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0")
2 |
--------------------------------------------------------------------------------
/helloGraphx/readme.md:
--------------------------------------------------------------------------------
1 | # Start Developing with GraphX
2 |
3 |
--------------------------------------------------------------------------------
/helloGraphx/src/main/scala/com/ibm/cds/spark/samples/HelloGraphx.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.ibm.cds.spark.samples
19 |
20 | import org.apache.spark._
21 | import scalaz._
22 | import java.net.URL
23 | import java.util.Calendar
24 | import java.net.URLEncoder
25 | import java.text.SimpleDateFormat
26 | import org.apache.spark.sql.SQLContext
27 | import scala.collection.immutable.Map
28 | import org.apache.spark.rdd.RDD
29 | import org.apache.spark.graphx.VertexId
30 | import org.apache.spark.sql.Row
31 | import org.apache.spark.graphx.Edge
32 | import org.apache.spark.graphx.Graph
33 | import org.http4s.EntityEncoder
34 | import org.codehaus.jettison.json.JSONObject
35 | import org.http4s.Uri
36 | import org.http4s.Request
37 | import org.http4s.BasicCredentials
38 | import org.http4s.headers.Authorization
39 | import org.http4s.Header
40 | import org.http4s.Headers
41 | import org.http4s.Method
42 | import org.http4s.client.blaze.PooledHttp1Client
43 | import org.http4s.client.Client
44 | import org.http4s.EntityDecoder
45 | import org.apache.spark.graphx.EdgeTriplet
46 |
47 | class Node(val properties: Map[String, String]) extends Serializable
48 | case class Airport(override val properties: Map[String,String]) extends Node(properties)
49 | case class Country(override val properties: Map[String,String]) extends Node(properties)
50 | case class Continent(override val properties: Map[String,String]) extends Node(properties)
51 | case class Route(override val properties: Map[String, String]) extends Node(properties)
52 |
53 | object HelloGraphx {
54 |
55 | //main method invoked when running as a standalone Spark Application
56 | def main(args: Array[String]) {
57 | lazy val client = PooledHttp1Client()
58 | val conf = new SparkConf().setAppName("Hello Graphx")
59 | val sc = new SparkContext(conf)
60 |
61 | println("Hello Graphx Demo. Load/Save a graph to/from Graphx RDDs")
62 |
63 | val sqlContext = new SQLContext(sc);
64 |
65 | //Load airports
66 | val airportsDF = sqlContext.read.format("com.databricks.spark.xml")
67 | .option("rowTag","node")
68 | .option("rootTag","graphml/graph")
69 | .load("/Users/dtaieb/Downloads/air-routes-graph/air-routes.graphml")
70 | airportsDF.printSchema()
71 | println(airportsDF.count())
72 |
73 | val airportsRdd: RDD[(VertexId, Node with Product)] =
74 | airportsDF.map { x => {
75 | val propertiesMap:Map[String,String] = x.getAs[Seq[Row]]("data")
76 | .map { row => row.getAs[String]("@key")->row.getAs[String]("#VALUE") }.toMap
77 | val id = x.getAs[Long]("@id")
78 | val nodeType:String = propertiesMap.get("type").getOrElse("")
79 | nodeType match {
80 | case "airport" => (id, Airport(propertiesMap))
81 | case "country" => (id, Country(propertiesMap))
82 | case "continent" => (id, Continent(propertiesMap))
83 | case _ => println("Skip node with type " + nodeType); (id, null)
84 | }
85 | }}.filter( f => f._2 !=null )
86 | println(airportsRdd.take(5).deep.mkString("\n"))
87 |
88 | //Load routes
89 | val routesDF = sqlContext.read.format("com.databricks.spark.xml")
90 | .option("rowTag","edge")
91 | .option("rootTag","graphml/graph")
92 | .load("/Users/dtaieb/Downloads/air-routes-graph/air-routes.graphml")
93 | routesDF.printSchema()
94 | println(routesDF.count())
95 |
96 | val routesRdd: RDD[(Edge[Route])] =
97 | routesDF.map { x => {
98 | val propertiesMap:Map[String,String] = x.getAs[Seq[Row]]("data")
99 | .map { row => row.getAs[String]("@key")->row.getAs[String]("#VALUE") }.toMap +
100 | ("id" -> x.getAs[Long]("@id").toString)
101 | Edge(x.getAs[Long]("@source"), x.getAs[Long]("@target"),Route(propertiesMap))
102 | }}
103 | println(routesRdd.take(5).deep.mkString("\n"))
104 |
105 | val graph = Graph( airportsRdd, routesRdd )
106 |
107 | //Iterate over the graph and send the vertices/edges to Gremlin Server
108 | graph.triplets.foreach( f => {
109 | addTriplet(client, f );
110 | })
111 |
112 | //Traverse all nodes and all vertices, send them to the graphdb service via gremlin
113 | sc.stop()
114 | }
115 |
116 | def escape(s:String):String={
117 | s.replace("'", "\\'")
118 | }
119 |
120 | def addTriplet(client: Client, f: EdgeTriplet[Node with Product, Route] ){
121 | val sb = new StringBuilder()
122 |
123 | //Add the source vertex if necessary
124 | sb.append( "v1=graph.traversal().V(" + f.srcId + ").tryNext().orElse(null);")
125 | sb.append(" if(!v1) v1=graph.addVertex(id, " + f.srcId)
126 | f.srcAttr.properties.foreach { case(k,v) => sb.append(",'" + escape(k) + "','" + escape(v) + "'" ) }
127 | sb.append(");")
128 |
129 | //Add the target vertex if necessary
130 | sb.append( "v2=graph.traversal().V(" + f.dstId + ").tryNext().orElse(null);")
131 | sb.append(" if(!v2) v2=graph.addVertex(id, " + f.dstId)
132 | f.dstAttr.properties.foreach { case(k,v) => sb.append(",'" + escape(k) + "','" + escape(v) + "'") }
133 | sb.append(");")
134 |
135 | //Add the edge
136 | sb.append("v1.addEdge('edge', v2")
137 | f.attr.properties.foreach { f => sb.append(",'" + escape(f._1) + "','" + escape(f._2) + "'") }
138 | sb.append(");")
139 |
140 | runScript(client, sb.toString )
141 | }
142 |
143 | def addVertex(client: Client, id: Long, keyValues: Seq[(String,String)]){
144 | val sb = new StringBuilder();
145 | sb.append( "if(!graph.traversal().V(" + id + ")) graph.addVertex(id, " + id);
146 | keyValues.foreach { case(k,v) => sb.append("," + k + "," + v) }
147 | sb.append(")")
148 | runScript(client, sb.toString() )
149 | }
150 |
151 | def runScript(client: Client, script: String){
152 | //println("{\"gremlin\":" + JSONObject.quote( script ) + "}")
153 | val results = EntityEncoder[String].toEntity("{\"gremlin\":" + JSONObject.quote( script ) + "}" ).flatMap {
154 | entity =>
155 | val gremlinUri = Uri.fromString( "http://localhost:8182" ).getOrElse( null )
156 | client(
157 | Request(
158 | method = Method.POST,
159 | uri = gremlinUri,
160 | headers = Headers(
161 | Header("Accept", "application/json"),
162 | Header("Content-Type", "application/json")
163 | ),
164 | body = entity.body
165 | )
166 | ).flatMap { response =>
167 | val res = response.as[String]
168 | if (response.status.code == 200 ) {
169 | res
170 | } else {
171 | println( "Error received from Gremlin. Code : " + response.status.code + " reason: " + response.status.reason )
172 | res
173 | }
174 | }
175 | }.attemptRun match {
176 | case -\/(e) => //Ignore
177 | case \/-(a) => println(a)
178 | }
179 | }
180 | }
181 |
--------------------------------------------------------------------------------
/helloSpark/.settings/org.scala-ide.sdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | scala.compiler.additionalParams=\ -Xsource\:2.10 -Ymacro-expand\:none
3 | scala.compiler.installation=78943290
4 | scala.compiler.sourceLevel=2.10
5 | scala.compiler.useProjectSettings=true
6 |
--------------------------------------------------------------------------------
/helloSpark/build.sbt:
--------------------------------------------------------------------------------
1 | name := "helloSpark"
2 |
3 | version := "2.1"
4 |
5 | scalaVersion := "2.10.4"
6 |
7 | libraryDependencies ++= {
8 | val sparkVersion = "1.6.0"
9 | Seq(
10 | "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
11 | "org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
12 | "org.apache.spark" %% "spark-repl" % sparkVersion % "provided"
13 | )
14 | }
15 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false)
16 |
--------------------------------------------------------------------------------
/helloSpark/project/assembly.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0")
2 |
--------------------------------------------------------------------------------
/helloSpark/python/helloSpark.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from pyspark import SparkContext
3 |
4 | def computeStatsForCollection(sc,countPerPartitions=100000,partitions=5):
5 | totalNumber = min( countPerPartitions * partitions, sys.maxsize)
6 | rdd = sc.parallelize( range(totalNumber),partitions)
7 | return (rdd.mean(), rdd.variance())
8 |
9 | if __name__ == "__main__":
10 | sc = SparkContext(appName="Hello Spark")
11 | print("Hello Spark Demo. Compute the mean and variance of a collection")
12 | stats = computeStatsForCollection(sc);
13 | print(">>> Results: ")
14 | print(">>>>>>>Mean: " + str(stats[0]));
15 | print(">>>>>>>Variance: " + str(stats[1]));
16 | sc.stop()
--------------------------------------------------------------------------------
/helloSpark/python/helloSpark/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/helloSpark/python/helloSpark/__init__.py
--------------------------------------------------------------------------------
/helloSpark/python/setup.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/helloSpark/python/setup.py
--------------------------------------------------------------------------------
/helloSpark/readme.md:
--------------------------------------------------------------------------------
1 | # Start Developing with Spark
2 |
3 | #### Build a custom library for Apache® Spark™ and deploy it to a Jupyter Notebook.
4 |
5 | If you're new to developing Spark applications, you've come to the right place. Our [**Start Developing with Spark** tutorial](https://developer.ibm.com/clouddataservices/start-developing-with-spark-and-notebooks/) provides detailed end-to-end steps that show you how to build a simple custom library for Spark (written in Scala) and how to deploy it on IBM Analytics for Apache Spark for Bluemix.
6 |
7 | These steps are the foundation for building real-life production applications. You'll also learn how to manage your project with the import, test, and debug features of Scala IDE for Eclipse.
8 |
9 | [Get started](https://developer.ibm.com/clouddataservices/start-developing-with-spark-and-notebooks/)
10 |
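11 | Once the tutorial's `sbt assembly` step has produced `helloSpark-assembly-2.1.jar` (a pre-built copy is also checked into `dist/` in this repo) and the jar has been added to your notebook kernel, you can call the library from a Scala cell. A minimal sketch, assuming `sc` is the notebook's SparkContext:
12 |
13 | ```
14 | // Invoke the sample library shipped with this project
15 | val stats = com.ibm.cds.spark.samples.HelloSpark.computeStatsForCollection(sc)
16 | println("Mean: " + stats._1)
17 | println("Variance: " + stats._2)
18 | ```
19 |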
--------------------------------------------------------------------------------
/helloSpark/src/main/scala/com/ibm/cds/spark/samples/HelloSpark.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.ibm.cds.spark.samples
19 |
20 | import org.apache.spark._
21 |
22 | object HelloSpark {
23 |
24 | //main method invoked when running as a standalone Spark Application
25 | def main(args: Array[String]) {
26 | val conf = new SparkConf().setAppName("Hello Spark")
27 | val spark = new SparkContext(conf)
28 |
29 | println("Hello Spark Demo. Compute the mean and variance of a collection")
30 | val stats = computeStatsForCollection(spark);
31 | println(">>> Results: ")
32 | println(">>>>>>>Mean: " + stats._1 );
33 | println(">>>>>>>Variance: " + stats._2);
34 | spark.stop()
35 | }
36 |
37 | //Library method that can be invoked from Jupyter Notebook
38 | def computeStatsForCollection( spark: SparkContext, countPerPartitions: Int = 100000, partitions: Int=5): (Double, Double) = {
39 | val totalNumber = math.min( countPerPartitions * partitions, Long.MaxValue).toInt;
40 | val rdd = spark.parallelize( 1 until totalNumber,partitions);
41 | (rdd.mean(), rdd.variance())
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/helloSpark/src/main/scala/com/ibm/cds/spark/samples/package-info.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | /**
18 | * Spark Sample Applications
19 | *
20 | */
21 | package com.ibm.cds.spark.samples;
--------------------------------------------------------------------------------
/notebook/Get Service Credentials for Twitter Sentiment with Watson TA and PI.md:
--------------------------------------------------------------------------------
1 | # Set Up Services and Get Credentials
2 |
3 | These instructions accompany the [Twitter Sentiment analysis with Watson Tone Analyzer and Watson Personality Insights Notebook](https://github.com/ibm-watson-data-lab/spark.samples/tree/master/notebook). This sample notebook requires a connection to the following online services:
4 |
5 | - Twitter
6 | - Watson Tone Analyzer
7 | - Watson Personality Insights
8 |
9 | Follow these steps to set up, retrieve, and enter credentials for all 3 services:
10 |
11 | ## Get OAuth Credentials for Twitter
12 |
13 |
14 | Create a new app on your Twitter account and configure the OAuth credentials.
15 |
16 |
17 | - Go to https://apps.twitter.com/. Sign in and click the Create New App button
18 | - Complete the required fields:
19 |
20 |
21 | - Name and Description can be anything you want.
22 | - Website. It doesn't matter what URL you enter here, as long as it's valid. For example, I used my Bluemix account URL: https://davidtaiebspark.mybluemix.net .
23 |
24 | - Below the developer agreement, turn on the Yes, I agree check box and click Create your Twitter application.
25 | - Click the Keys and Access Tokens tab.
26 | - Scroll to the bottom of the page and click the Create My Access Tokens button.
27 | - Copy the Consumer Key, Consumer Secret, Access Token, and Access Token Secret. You will need them in a few minutes.
28 |
29 | 
30 |
31 |
32 | ## Get Watson Personality Insights Credentials
33 |
34 | Provision the service and grab your credentials:
35 |
36 | 1. Still in Bluemix, go to the top menu, and click Catalog.
37 | 2. In the search box, type Personality Insights.
38 | 3. Click the Personality Insights service tile, then click Create.
39 | 4. On the left side of the screen, click Service Credentials and open or create credentials.
40 |
41 | 
42 |
43 | 5. Copy the `username` and `password` values.
44 |
45 |
46 | ## Get Watson Tone Analyzer Credentials
47 |
48 | Provision the service and grab your credentials:
49 |
50 | 1. In a new browser tab or window, open Bluemix, go to the top menu, and click Catalog.
51 | 2. In the search box, type Tone Analyzer.
52 | 3. Click the Tone Analyzer tile, then click Create.
53 | 4. On the left side of the screen, click Service Credentials and open or create credentials.
54 | 5. Copy the `username` and `password` values.
55 |
56 |
57 |
58 | ## Paste Credentials into the Notebook
59 |
60 | 1. Return to your version of the [Twitter Sentiment analysis with Watson Tone Analyzer and Watson Personality Insights Notebook](https://github.com/ibm-watson-data-lab/spark.samples/tree/master/notebook)
61 |
62 | 2. Paste all the credentials you just collected into the notebook, replacing the XXXXs for each item:
63 |
64 | ```
65 | sqlContext=SQLContext(sc)
66 |
67 | #Set up the twitter credentials, they will be used both in scala and python cells below
68 | consumerKey = "XXXX"
69 | consumerSecret = "XXXX"
70 | accessToken = "XXXX"
71 | accessTokenSecret = "XXXX"
72 |
73 | #Set up the Watson Personality insight credentials
74 | piUserName = "XXXX"
75 | piPassword = "XXXX"
76 |
77 | #Set up the Watson Tone Analyzer credentials
78 | taUserName = "XXXX"
79 | taPassword = "XXXX"
80 | ```
81 |
82 |
83 |
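84 | For reference, a later Scala bridge cell in the notebook hands these values off to the streaming application. A trimmed sketch of that cell, adapted from the sample notebook in this repo:
85 |
86 | ```
87 | %%scala
88 | val demo = com.ibm.cds.spark.samples.StreamingTwitter
89 | demo.setConfig("twitter4j.oauth.consumerKey",consumerKey)
90 | demo.setConfig("twitter4j.oauth.consumerSecret",consumerSecret)
91 | demo.setConfig("twitter4j.oauth.accessToken",accessToken)
92 | demo.setConfig("twitter4j.oauth.accessTokenSecret",accessTokenSecret)
93 | demo.setConfig("watson.tone.url","https://gateway.watsonplatform.net/tone-analyzer/api")
94 | demo.setConfig("watson.tone.username",taUserName)
95 | demo.setConfig("watson.tone.password",taPassword)
96 | ```
97 |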
--------------------------------------------------------------------------------
/notebook/PYCON 2016 spark tutorial quick links.txt:
--------------------------------------------------------------------------------
1 | Bluemix:
2 | https://console.ng.bluemix.net
3 |
4 | FlightStats:
5 | https://developer.flightstats.com/signup
6 | https://developer.flightstats.com/admin/applications
7 |
8 | Simple Data Pipe:
9 | https://github.com/ibm-watson-data-lab/simple-data-pipe
10 |
11 | Flight Predict Notebook, Slides 36 & 37:
12 | https://github.com/ibm-watson-data-lab/simple-data-pipe-connector-flightstats/raw/master/notebook/Flight%20Predict%20PyCon%202016.ipynb
13 |
14 | Car Notebook, Slide 21:
15 | https://github.com/ibm-watson-data-lab/spark.samples/raw/master/notebook/DashDB%20Twitter%20Car%202015%20Python%20Notebook.ipynb
16 |
17 |
18 | SIMPLE DATA PIPE package.json:
19 | "simple-data-pipe-connector-flightstats":"git://github.com/ibm-watson-data-lab/simple-data-pipe-connector-flightstats.git"
20 |
--------------------------------------------------------------------------------
/notebook/README.md:
--------------------------------------------------------------------------------
1 | # Sample Notebooks
2 |
3 | This repository contains sample notebooks that show you how to get the most out of IBM Analytics for Apache Spark. You may run these notebooks in a locally set up notebook environment (e.g., [Jupyter Notebook](https://jupyter.readthedocs.io/en/latest/install.html)) or through the [IBM Data Science Experience (DSX)](http://datascience.ibm.com/).
4 |
5 | ## Service Credentials
6 |
7 | Some of the notebooks require credentials to various services (e.g., Twitter API, Watson Tone Analyzer, etc.). Instructions for provisioning these services and getting credentials are outlined here: [Set Up Services and Get Credentials](https://github.com/ibm-watson-data-lab/spark.samples/blob/master/notebook/Get%20Service%20Credentials%20for%20Twitter%20Sentiment%20with%20Watson%20TA%20and%20PI.md)
8 |
9 |
10 | ## Running a notebook in DSX
11 |
12 | More info and detailed instructions for DSX can be found in its [documentation](http://datascience.ibm.com/docs/content/getting-started/get-started.html).
13 |
14 | 1. Log into DSX
15 | 2. Go to __My Projects__
16 | 3. Select an existing project or create a new project
17 |
18 | ##### To set up a new project
19 | 1. Click __create project__
20 | 2. Enter a __Name__
21 | 3. Select an existing or create a new __Spark Service__ to associate with the project
22 | 4. Select an existing or create a new __Target Object Storage Instance__ to associate with the project
23 | 5. Click __Create__
24 |
25 | 4. Create a new notebook
26 |
27 | ##### To set up a new notebook
28 | 1. Click __add notebooks__
29 | 2. Click __From URL__
30 | 3. Enter a __Name__
31 | 4. Enter the __Notebook URL__
32 | 5. Select an existing __Spark Service__ to associate with the notebook
33 | 6. Click __Create Notebook__
34 |
35 | 5. Once in the notebook, follow its instructions for running the notebook
36 |
--------------------------------------------------------------------------------
/notebook/Twitter Sentiment with Watson TA and PI.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Twitter Sentiment analysis with Watson Tone Analyzer and Watson Personality Insights\n",
8 | "\n",
9 | "\n",
10 | "\n",
11 | "In this notebook, we perform the following steps: \n",
12 | "1. Install python-twitter and watson-developer-cloud modules\n",
13 | "2. Install the streaming Twitter jar using PixieDust packageManager\n",
14 | "3. Invoke the streaming Twitter app using the PixieDust Scala Bridge to get a DataFrame containing all the tweets enriched with Watson Tone Analyzer scores\n",
15 | "4. Create a new RDD that groups the tweets by author and concatenates all the associated tweets into one blob\n",
16 | "5. For each author and aggregated text, invoke the Watson Personality Insights to get the scores\n",
17 | "6. Visualize results using PixieDust display \n",
18 | "\n",
19 | "## Learn more \n",
20 | "* [Watson Tone Analyzer](http://www.ibm.com/watson/developercloud/tone-analyzer.html) \n",
21 | "* [Watson Personality Insights](http://www.ibm.com/watson/developercloud/personality-insights.html) \n",
22 | "* [python-twitter](https://github.com/bear/python-twitter) \n",
23 | "* [watson-developer-cloud](https://github.com/watson-developer-cloud) \n",
24 | "* [PixieDust](https://github.com/ibm-watson-data-lab/pixiedust)\n",
25 | "* [Realtime Sentiment Analysis of Twitter Hashtags with Spark](https://developer.ibm.com/clouddataservices/2016/01/15/real-time-sentiment-analysis-of-twitter-hashtags-with-spark)"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "# Install python-twitter and watson-developer-cloud\n",
33 | "If you haven't already installed the following modules, run these 2 cells:"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {
40 | "collapsed": false
41 | },
42 | "outputs": [],
43 | "source": [
44 | "!pip install --user python-twitter"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {
51 | "collapsed": false
52 | },
53 | "outputs": [],
54 | "source": [
55 | "!pip install --user watson-developer-cloud"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "# Install latest pixiedust\n",
63 | "Make sure you are running the latest pixiedust version. After upgrading, restart the kernel before continuing to the next cells."
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {
70 | "collapsed": true
71 | },
72 | "outputs": [],
73 | "source": [
74 | "!pip install --upgrade --user pixiedust"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "## Install the streaming Twitter jar in the notebook from the Github repo\n",
82 | "This jar file contains the Spark Streaming application (written in Scala) that connects to Twitter to fetch the tweets and send them to Watson Tone Analyzer for analysis. The resulting scores are then added to the tweets dataframe as separate columns."
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {
89 | "collapsed": false
90 | },
91 | "outputs": [],
92 | "source": [
93 | "import pixiedust\n",
94 | "jarPath = \"https://github.com/ibm-watson-data-lab/spark.samples/raw/master/dist/streaming-twitter-assembly-1.6.jar\"\n",
95 | "pixiedust.installPackage(jarPath)\n",
96 | "print(\"done\")"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "If PixieDust or the streaming Twitter jar were just installed or upgraded, restart the kernel before continuing."
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {},
109 | "source": [
110 | "## Use Scala Bridge to run the command line version of the app\n",
111 | "Insert your credentials for Twitter, Watson Tone Analyzer, and Watson Personality Insights. Then run the following cell. \n",
112 | "[Read how to provision these services and get credentials](https://github.com/ibm-watson-data-lab/spark.samples/blob/master/notebook/Get%20Service%20Credentials%20for%20Twitter%20Sentiment%20with%20Watson%20TA%20and%20PI.md). "
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {
119 | "collapsed": true
120 | },
121 | "outputs": [],
122 | "source": [
123 | "import pixiedust\n",
124 | "\n",
125 | "sqlContext=SQLContext(sc)\n",
126 | "\n",
127 | "#Set up the twitter credentials, they will be used both in scala and python cells below\n",
128 | "consumerKey = \"XXXX\"\n",
129 | "consumerSecret = \"XXXX\"\n",
130 | "accessToken = \"XXXX\"\n",
131 | "accessTokenSecret = \"XXXX\"\n",
132 | "\n",
133 | "#Set up the Watson Personality insight credentials\n",
134 | "piUserName = \"XXXX\"\n",
135 | "piPassword = \"XXXX\"\n",
136 | "\n",
137 | "#Set up the Watson Tone Analyzer credentials\n",
138 | "taUserName = \"XXXX\"\n",
139 | "taPassword = \"XXXX\""
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "metadata": {
146 | "collapsed": false,
147 | "scrolled": false
148 | },
149 | "outputs": [],
150 | "source": [
151 | "%%scala\n",
152 | "val demo = com.ibm.cds.spark.samples.StreamingTwitter\n",
153 | "demo.setConfig(\"twitter4j.oauth.consumerKey\",consumerKey)\n",
154 | "demo.setConfig(\"twitter4j.oauth.consumerSecret\",consumerSecret)\n",
155 | "demo.setConfig(\"twitter4j.oauth.accessToken\",accessToken)\n",
156 | "demo.setConfig(\"twitter4j.oauth.accessTokenSecret\",accessTokenSecret)\n",
157 | "demo.setConfig(\"watson.tone.url\",\"https://gateway.watsonplatform.net/tone-analyzer/api\")\n",
158 | "demo.setConfig(\"watson.tone.password\",taPassword)\n",
159 | "demo.setConfig(\"watson.tone.username\",taUserName)\n",
160 | "\n",
161 | "import org.apache.spark.streaming._\n",
162 | "demo.startTwitterStreaming(sc, Seconds(30)) //Run the application for a limited time"
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {},
168 | "source": [
169 | "# Create a tweets dataframe from the data fetched above and transfer it to Python\n",
170 | "Notice the __ prefix for each variable, which is used to signal PixieDust that the variable needs to be transferred back to Python"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "metadata": {
177 | "collapsed": false
178 | },
179 | "outputs": [],
180 | "source": [
181 | "%%scala\n",
182 | "val demo = com.ibm.cds.spark.samples.StreamingTwitter\n",
183 | "val (__sqlContext, __df) = demo.createTwitterDataFrames(sc)"
184 | ]
185 | },
186 | {
187 | "cell_type": "markdown",
188 | "metadata": {},
189 | "source": [
190 | "## Group the tweets by author and userid\n",
191 | "This will be used later to fetch the last 200 tweets for each author"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": null,
197 | "metadata": {
198 | "collapsed": false
199 | },
200 | "outputs": [],
201 | "source": [
202 | "import pyspark.sql.functions as F\n",
203 | "usersDF = __df.groupby(\"author\", \"userid\").agg(F.avg(\"Anger\").alias(\"Anger\"), F.avg(\"Disgust\").alias(\"Disgust\"))\n",
204 | "usersDF.show()"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {},
210 | "source": [
211 | "# Set up the Twitter API from python-twitter module"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": null,
217 | "metadata": {
218 | "collapsed": false
219 | },
220 | "outputs": [],
221 | "source": [
222 | "import twitter\n",
223 | "api = twitter.Api(consumer_key=consumerKey,\n",
224 | " consumer_secret=consumerSecret,\n",
225 | " access_token_key=accessToken,\n",
226 | " access_token_secret=accessTokenSecret)\n",
227 | "\n",
228 | "#print(api.VerifyCredentials())"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "metadata": {},
234 | "source": [
235 | "# For each author, fetch the last 200 tweets\n",
236 | "Use flatMap to return a new RDD that contains a list of tuples composed of userid and tweet text: (userid, tweetText)"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "metadata": {
243 | "collapsed": false
244 | },
245 | "outputs": [],
246 | "source": [
247 | "def getTweets(screenName):\n",
248 | " statuses = api.GetUserTimeline(screen_name=screenName,\n",
249 | " since_id=None,\n",
250 | " max_id=None,\n",
251 | " count=200,\n",
252 | " include_rts=False,\n",
253 | " trim_user=False,\n",
254 | " exclude_replies=True)\n",
255 | " return statuses\n",
256 | "\n",
257 | "usersWithTweetsRDD = usersDF.flatMap(lambda s: [(s.user.screen_name, s.text.encode('ascii', 'ignore')) for s in getTweets(s['userid'])])\n",
258 | "print(usersWithTweetsRDD.count())"
259 | ]
260 | },
261 | {
262 | "cell_type": "markdown",
263 | "metadata": {},
264 | "source": [
265 | "# Concatenate all the tweets for each user so we have enough words to send to Watson Personality Insights\n",
266 | "* Use map to create an RDD of (key, value) pairs composed of userId and tweets \n",
267 | "* Use reduceByKey to group all records with the same author and concatenate the tweets"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": null,
273 | "metadata": {
274 | "collapsed": false,
275 | "scrolled": true
276 | },
277 | "outputs": [],
278 | "source": [
279 | "import re\n",
280 | "usersWithTweetsRDD2 = usersWithTweetsRDD.map(lambda s: (s[0], s[1])).reduceByKey(lambda s,t: s + '\\n' + t)\\\n",
281 | " .filter(lambda s: len(re.findall(r'\\w+', s[1])) > 100 )\n",
282 | "print(usersWithTweetsRDD2.count())\n",
283 | "#usersWithTweetsRDD2.take(2)"
284 | ]
285 | },
286 | {
287 | "cell_type": "markdown",
288 | "metadata": {},
289 | "source": [
290 | "# Call Watson Personality Insights on the text for each author\n",
291 | "Watson Personality Insights requires at least 100 words from its lexicon to be available, which may not exist for each user. This is why the getPersonalityInsight helper function guards against exceptions from calling Watson PI. If an exception occurs, then an empty array is returned. Each record with an empty array is filtered out of the resulting RDD.\n",
292 | "\n",
293 | "Note also that we use broadcast variables to propagate the userName and password to the cluster"
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": null,
299 | "metadata": {
300 | "collapsed": false,
301 | "scrolled": true
302 | },
303 | "outputs": [],
304 | "source": [
305 | "from pyspark.sql.types import *\n",
306 | "from watson_developer_cloud import PersonalityInsightsV3\n",
307 | "broadCastPIUsername = sc.broadcast(piUserName)\n",
308 | "broadCastPIPassword = sc.broadcast(piPassword)\n",
309 | "def getPersonalityInsight(text, schema=False):\n",
310 | " personality_insights = PersonalityInsightsV3(\n",
311 | " version='2016-10-20',\n",
312 | " username=broadCastPIUsername.value,\n",
313 | " password=broadCastPIPassword.value)\n",
314 | " try:\n",
315 | " p = personality_insights.profile(\n",
316 | " text, content_type='text/plain',\n",
317 | " raw_scores=True, consumption_preferences=True)\n",
318 | "\n",
319 | " if schema:\n",
320 | " return \\\n",
321 | " [StructField(t['name'], FloatType()) for t in p[\"needs\"]] + \\\n",
322 | " [StructField(t['name'], FloatType()) for t in p[\"values\"]] + \\\n",
323 | " [StructField(t['name'], FloatType()) for t in p['personality' ]]\n",
324 | " else:\n",
325 | " return \\\n",
326 | " [t['raw_score'] for t in p[\"needs\"]] + \\\n",
327 | " [t['raw_score'] for t in p[\"values\"]] + \\\n",
328 | " [t['raw_score'] for t in p['personality']] \n",
329 | " except:\n",
330 | " return []\n",
331 | "\n",
332 | "usersWithPIRDD = usersWithTweetsRDD2.map(lambda s: [s[0]] + getPersonalityInsight(s[1])).filter(lambda s: len(s)>1)\n",
333 | "print(usersWithPIRDD.count())\n",
334 | "#usersWithPIRDD.take(2)"
335 | ]
336 | },
337 | {
338 | "cell_type": "markdown",
339 | "metadata": {},
340 | "source": [
341 | "# Convert the RDD back to a DataFrame and call PixieDust display to visualize the results\n",
342 | "The schema is automatically created from introspecting a sample payload result from Watson Personality Insights"
343 | ]
344 | },
345 | {
346 | "cell_type": "code",
347 | "execution_count": null,
348 | "metadata": {
349 | "collapsed": false,
350 | "pixiedust": {
351 | "displayParams": {
352 | "aggregation": "SUM",
353 | "handlerId": "barChart",
354 | "keyFields": "userid",
355 | "showLegend": "true",
356 | "stacked": "false",
357 | "staticFigure": "false",
358 | "title": "Personality Insights",
359 | "valueFields": "Challenge,Closeness,Curiosity,Excitement"
360 | }
361 | },
362 | "scrolled": false
363 | },
364 | "outputs": [],
365 | "source": [
366 | "#convert to dataframe\n",
367 | "schema = StructType(\n",
368 | " [StructField('userid',StringType())] + getPersonalityInsight(usersWithTweetsRDD2.take(1)[0][1], schema=True)\n",
369 | ")\n",
370 | "\n",
371 | "usersWithPIDF = sqlContext.createDataFrame(\n",
372 | " usersWithPIRDD, schema\n",
373 | ")\n",
374 | "\n",
375 | "usersWithPIDF.cache()\n",
376 | "display(usersWithPIDF)"
377 | ]
378 | },
379 | {
380 | "cell_type": "markdown",
381 | "metadata": {},
382 | "source": [
383 | "# Compare Twitter users' Personality Insights scores with this year's presidential candidates\n",
384 | "\n",
385 | "For a quick look at the differences in Personality Insights scores, Spark provides a describe() function that computes stddev and mean values over the DataFrame. Compare the differences in scores between the Twitter users and the presidential candidates."
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": null,
391 | "metadata": {
392 | "collapsed": true
393 | },
394 | "outputs": [],
395 | "source": [
396 | "candidates = \"realDonaldTrump HillaryClinton\".split(\" \")\n",
397 | "candidatesRDD = sc.parallelize(candidates)\\\n",
398 | " .flatMap(lambda s: [(t.user.screen_name, t.text.encode('ascii', 'ignore')) for t in getTweets(s)])\\\n",
399 | " .map(lambda s: (s[0], s[1]))\\\n",
400 | " .reduceByKey(lambda s,t: s + '\\n' + t)\\\n",
401 | " .filter(lambda s: len(re.findall(r'\\w+', s[1])) > 100 )\\\n",
402 | " .map(lambda s: [s[0]] + getPersonalityInsight(s[1]))\n",
403 | "\n",
404 | "candidatesPIDF = sqlContext.createDataFrame(\n",
405 | " candidatesRDD, schema\n",
406 | ")"
407 | ]
408 | },
409 | {
410 | "cell_type": "code",
411 | "execution_count": null,
412 | "metadata": {
413 | "collapsed": true
414 | },
415 | "outputs": [],
416 | "source": [
417 | "c = candidatesPIDF.collect()\n",
418 | "broadCastTrumpPI = sc.broadcast(c[0][1:])\n",
419 | "broadCastHillaryPI = sc.broadcast(c[1][1:])"
420 | ]
421 | },
422 | {
423 | "cell_type": "code",
424 | "execution_count": null,
425 | "metadata": {
426 | "collapsed": false,
427 | "pixiedust": {
428 | "displayParams": {
429 | "handlerId": "dataframe"
430 | }
431 | }
432 | },
433 | "outputs": [],
434 | "source": [
435 | "display(candidatesPIDF)"
436 | ]
437 | },
438 | {
439 | "cell_type": "code",
440 | "execution_count": null,
441 | "metadata": {
442 | "collapsed": false
443 | },
444 | "outputs": [],
445 | "source": [
446 | "candidatesPIDF.select('userid','Emotional range','Agreeableness', 'Extraversion','Conscientiousness', 'Openness').show()\n",
447 | "\n",
448 | "usersWithPIDF.describe(['Emotional range']).show()\n",
449 | "usersWithPIDF.describe(['Agreeableness']).show()\n",
450 | "usersWithPIDF.describe(['Extraversion']).show()\n",
451 | "usersWithPIDF.describe(['Conscientiousness']).show()\n",
452 | "usersWithPIDF.describe(['Openness']).show()"
453 | ]
454 | },
455 | {
456 | "cell_type": "markdown",
457 | "metadata": {},
458 | "source": [
459 | "# Calculate Euclidean distance (norm) between each Twitter user and the presidential candidates using the Personality Insights scores\n",
460 | "\n",
461 | "Add the distances into 2 extra columns and display the results"
462 | ]
463 | },
464 | {
465 | "cell_type": "code",
466 | "execution_count": null,
467 | "metadata": {
468 | "collapsed": false,
469 | "pixiedust": {
470 | "displayParams": {
471 | "aggregation": "COUNT",
472 | "handlerId": "barChart",
473 | "keyFields": "closerHillary",
474 | "showLegend": "true",
475 | "stacked": "true",
476 | "staticFigure": "false",
477 | "valueFields": "closerHillary"
478 | }
479 | }
480 | },
481 | "outputs": [],
482 | "source": [
483 | "import numpy as np\n",
484 | "from pyspark.sql.types import Row\n",
485 | "def addEuclideanDistance(s):\n",
486 | " dict = s.asDict()\n",
487 | " def getEuclideanDistance(a,b):\n",
488 | " return np.linalg.norm(np.array(a) - np.array(b)).item()\n",
489 | " dict[\"distDonaldTrump\"]=getEuclideanDistance(s[1:], broadCastTrumpPI.value)\n",
490 | " dict[\"distHillary\"]=getEuclideanDistance(s[1:], broadCastHillaryPI.value)\n",
491 | " dict[\"closerHillary\"] = \"Yes\" if dict[\"distHillary\"] < dict[\"distDonaldTrump\"] else \"No\"\n",
492 | " return Row(**dict)\n",
493 | "\n",
494 | "#add euclidean distances to Trump and Hillary\n",
495 | "euclideanDF = sqlContext.createDataFrame(usersWithPIDF.map(lambda s: addEuclideanDistance(s)))\n",
496 | "\n",
497 | "#Reorder columns to have userid and distances first\n",
498 | "cols = euclideanDF.columns\n",
499 | "reorderCols = [\"userid\",\"distHillary\",\"distDonaldTrump\", \"closerHillary\"]\n",
500 | "euclideanDF = euclideanDF.select(reorderCols + [x for x in cols if x not in reorderCols])\n",
501 | "\n",
502 | "#PixieDust display. \n",
503 | "#To visualize the distribution, select the bar chart display, use closerHillary as key and value and aggregation=count\n",
504 | "display(euclideanDF)"
505 | ]
506 | },
507 | {
508 | "cell_type": "markdown",
509 | "metadata": {},
510 | "source": [
511 | "# Optional: do some extra data science on the tweets"
512 | ]
513 | },
514 | {
515 | "cell_type": "code",
516 | "execution_count": null,
517 | "metadata": {
518 | "collapsed": false,
519 | "pixiedust": {
520 | "displayParams": {
521 | "aggregation": "COUNT",
522 | "handlerId": "barChart",
523 | "keyFields": "Anger",
524 | "showLegend": "true",
525 | "stacked": "true",
526 | "staticFigure": "false",
527 | "valueFields": "Openness"
528 | }
529 | }
530 | },
531 | "outputs": [],
532 | "source": [
533 | "tweets=__df\n",
534 | "tweets.count()\n",
535 | "display(tweets)"
536 | ]
537 | },
538 | {
539 | "cell_type": "markdown",
540 | "metadata": {},
541 | "source": [
542 | "# Compute the sentiment distributions for tweets with scores greater than 60% and create matplotlib chart visualization"
543 | ]
544 | },
545 | {
546 | "cell_type": "code",
547 | "execution_count": null,
548 | "metadata": {
549 | "collapsed": false
550 | },
551 | "outputs": [],
552 | "source": [
553 | "#create an array that will hold the count for each sentiment\n",
554 | "sentimentDistribution=[0] * 13\n",
555 | "#For each sentiment, run a sql query that counts the number of tweets for which the sentiment score is greater than 60%\n",
556 | "#Store the data in the array\n",
557 | "for i, sentiment in enumerate(tweets.columns[-13:]):\n",
558 | " sentimentDistribution[i]=__sqlContext.sql(\"SELECT count(*) as sentCount FROM tweets where \" + sentiment + \" > 60\")\\\n",
559 | " .collect()[0].sentCount"
560 | ]
561 | },
562 | {
563 | "cell_type": "code",
564 | "execution_count": null,
565 | "metadata": {
566 | "collapsed": false
567 | },
568 | "outputs": [],
569 | "source": [
570 | "%matplotlib inline\n",
571 | "import matplotlib\n",
572 | "import numpy as np\n",
573 | "import matplotlib.pyplot as plt\n",
574 | "\n",
575 | "ind=np.arange(13)\n",
576 | "width = 0.35\n",
577 | "bar = plt.bar(ind, sentimentDistribution, width, color='g', label = \"distributions\")\n",
578 | "\n",
579 | "params = plt.gcf()\n",
580 | "plSize = params.get_size_inches()\n",
581 | "params.set_size_inches( (plSize[0]*2.5, plSize[1]*2) )\n",
582 | "plt.ylabel('Tweet count')\n",
583 | "plt.xlabel('Tone')\n",
584 | "plt.title('Distribution of tweets by sentiments > 60%')\n",
585 | "plt.xticks(ind+width, tweets.columns[-13:])\n",
586 | "plt.legend()\n",
587 | "\n",
588 | "plt.show()"
589 | ]
590 | },
591 | {
592 | "cell_type": "markdown",
593 | "metadata": {},
594 | "source": [
595 | "# Compute the top hashtags used in each tweet"
596 | ]
597 | },
598 | {
599 | "cell_type": "code",
600 | "execution_count": null,
601 | "metadata": {
602 | "collapsed": true
603 | },
604 | "outputs": [],
605 | "source": [
606 | "from operator import add\n",
607 | "import re\n",
608 | "tagsRDD = tweets.flatMap( lambda t: re.split(\"\\s\", t.text))\\\n",
609 | " .filter( lambda word: word.startswith(\"#\") )\\\n",
610 | " .map( lambda word : (word, 1 ))\\\n",
611 | " .reduceByKey(add, 10).map(lambda (a,b): (b,a)).sortByKey(False).map(lambda (a,b):(b,a))\n",
612 | "top10tags = tagsRDD.take(10)"
613 | ]
614 | },
615 | {
616 | "cell_type": "code",
617 | "execution_count": null,
618 | "metadata": {
619 | "collapsed": false
620 | },
621 | "outputs": [],
622 | "source": [
623 | "%matplotlib inline\n",
624 | "import matplotlib\n",
625 | "import matplotlib.pyplot as plt\n",
626 | "\n",
627 | "params = plt.gcf()\n",
628 | "plSize = params.get_size_inches()\n",
629 | "params.set_size_inches( (plSize[0]*2, plSize[1]*2) )\n",
630 | "\n",
631 | "labels = [i[0] for i in top10tags]\n",
632 | "sizes = [int(i[1]) for i in top10tags]\n",
633 | "colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral', \"beige\", \"paleturquoise\", \"pink\", \"lightyellow\", \"coral\"]\n",
634 | "\n",
635 | "plt.pie(sizes, labels=labels, colors=colors,autopct='%1.1f%%', shadow=True, startangle=90)\n",
636 | "\n",
637 | "plt.axis('equal')\n",
638 | "plt.show()"
639 | ]
640 | },
641 | {
642 | "cell_type": "markdown",
643 | "metadata": {},
644 | "source": [
645 | "# Compute the aggregate sentiment distribution for all the tweets that contain the top hashtags"
646 | ]
647 | },
648 | {
649 | "cell_type": "code",
650 | "execution_count": null,
651 | "metadata": {
652 | "collapsed": true
653 | },
654 | "outputs": [],
655 | "source": [
656 | "cols = tweets.columns[-13:]\n",
657 | "def expand( t ):\n",
658 | " ret = []\n",
659 | " for s in [i[0] for i in top10tags]:\n",
660 | " if ( s in t.text ):\n",
661 | " for tone in cols:\n",
662 | " ret += [s.replace(':','').replace('-','') + u\"-\" + unicode(tone) + \":\" + unicode(getattr(t, tone))]\n",
663 | " return ret \n",
664 | "def makeList(l):\n",
665 | " return l if isinstance(l, list) else [l]\n",
666 | "\n",
667 | "#Create RDD from tweets dataframe\n",
668 | "tagsRDD = tweets.map(lambda t: t )\n",
669 | "\n",
670 | "#Filter to only keep the entries that are in top10tags\n",
671 | "tagsRDD = tagsRDD.filter( lambda t: any(s in t.text for s in [i[0] for i in top10tags] ) )\n",
672 | "\n",
673 | "#Create a flatMap using the expand function defined above, this will be used to collect all the scores \n",
674 | "#for a particular tag with the following format: Tag-Tone-ToneScore\n",
675 | "tagsRDD = tagsRDD.flatMap( expand )\n",
676 | "\n",
677 | "#Create a map indexed by Tag-Tone keys \n",
678 | "tagsRDD = tagsRDD.map( lambda fullTag : (fullTag.split(\":\")[0], float( fullTag.split(\":\")[1]) ))\n",
679 | "\n",
680 | "#Call combineByKey to format the data as follow\n",
681 | "#Key=Tag-Tone\n",
682 | "#Value=(count, sum_of_all_score_for_this_tone)\n",
683 | "tagsRDD = tagsRDD.combineByKey((lambda x: (x,1)),\n",
684 | " (lambda x, y: (x[0] + y, x[1] + 1)),\n",
685 | " (lambda x, y: (x[0] + y[0], x[1] + y[1])))\n",
686 | "\n",
687 | "#ReIndex the map to have the key be the Tag and value be (Tone, Average_score) tuple\n",
688 | "#Key=Tag\n",
689 | "#Value=(Tone, average_score)\n",
690 | "tagsRDD = tagsRDD.map(lambda (key, ab): (key.split(\"-\")[0], (key.split(\"-\")[1], round(ab[0]/ab[1], 2))))\n",
691 | "\n",
692 | "#Reduce the map on the Tag key, value becomes a list of (Tone,average_score) tuples\n",
693 | "tagsRDD = tagsRDD.reduceByKey( lambda x, y : makeList(x) + makeList(y) )\n",
694 | "\n",
695 | "#Sort the (Tone,average_score) tuples alphabetically by Tone\n",
696 | "tagsRDD = tagsRDD.mapValues( lambda x : sorted(x) )\n",
697 | "\n",
698 | "#Format the data as expected by the plotting code in the next cell. \n",
699 | "#map the Values to a tuple as follow: ([list of tone], [list of average score])\n",
700 | "#e.g. #someTag:([u'Agreeableness', u'Analytical', u'Anger', u'Cheerfulness', u'Confident', u'Conscientiousness', u'Negative', u'Openness', u'Tentative'], [1.0, 0.0, 0.0, 1.0, 0.0, 0.48, 0.0, 0.02, 0.0])\n",
701 | "tagsRDD = tagsRDD.mapValues( lambda x : ([elt[0] for elt in x],[elt[1] for elt in x]) )\n",
702 | "\n",
703 | "#Use custom sort function to sort the entries by order of appearance in top10tags\n",
704 | "def customCompare( key ):\n",
705 | " for (k,v) in top10tags:\n",
706 | " if k == key:\n",
707 | " return v\n",
708 | " return 0\n",
709 | "tagsRDD = tagsRDD.sortByKey(ascending=False, numPartitions=None, keyfunc = customCompare)\n",
710 | "\n",
711 | "#Take the mean tone scores for the top 10 tags\n",
712 | "top10tagsMeanScores = tagsRDD.take(10)"
713 | ]
714 | },
715 | {
716 | "cell_type": "code",
717 | "execution_count": null,
718 | "metadata": {
719 | "collapsed": false
720 | },
721 | "outputs": [],
722 | "source": [
723 | "%matplotlib inline\n",
724 | "import matplotlib\n",
725 | "import numpy as np\n",
726 | "import matplotlib.pyplot as plt\n",
727 | "\n",
728 | "params = plt.gcf()\n",
729 | "plSize = params.get_size_inches()\n",
730 | "params.set_size_inches( (plSize[0]*3, plSize[1]*2) )\n",
731 | "\n",
732 | "top5tagsMeanScores = top10tagsMeanScores[:5]\n",
733 | "width = 0\n",
734 | "ind=np.arange(13)\n",
735 | "(a,b) = top5tagsMeanScores[0]\n",
736 | "labels=b[0]\n",
737 | "colors = [\"beige\", \"paleturquoise\", \"pink\", \"lightyellow\", \"coral\", \"lightgreen\", \"gainsboro\", \"aquamarine\",\"c\"]\n",
738 | "idx=0\n",
739 | "for key, value in top5tagsMeanScores:\n",
740 | " plt.bar(ind + width, value[1], 0.15, color=colors[idx], label=key)\n",
741 | " width += 0.15\n",
742 | " idx += 1\n",
743 | "plt.xticks(ind+0.3, labels)\n",
744 | "plt.ylabel('AVERAGE SCORE')\n",
745 | "plt.xlabel('TONES')\n",
746 | "plt.title('Breakdown of top hashtags by sentiment tones')\n",
747 | "\n",
748 | "plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc='center',ncol=5, mode=\"expand\", borderaxespad=0.)\n",
749 | "\n",
750 | "plt.show()"
751 | ]
752 | },
753 | {
754 | "cell_type": "markdown",
755 | "metadata": {},
756 | "source": [
757 | "# Optional: Use Twitter demo embedded app to run the same app with a UI"
758 | ]
759 | },
760 | {
761 | "cell_type": "code",
762 | "execution_count": null,
763 | "metadata": {
764 | "collapsed": false
765 | },
766 | "outputs": [],
767 | "source": [
768 | "%%scala\n",
769 | "val demo = com.ibm.cds.spark.samples.PixiedustStreamingTwitter\n",
770 | "demo.setConfig(\"twitter4j.oauth.consumerKey\",consumerKey)\n",
771 | "demo.setConfig(\"twitter4j.oauth.consumerSecret\",consumerSecret)\n",
772 | "demo.setConfig(\"twitter4j.oauth.accessToken\",accessToken)\n",
773 | "demo.setConfig(\"twitter4j.oauth.accessTokenSecret\",accessTokenSecret)\n",
774 | "demo.setConfig(\"watson.tone.url\",\"https://gateway.watsonplatform.net/tone-analyzer/api\")\n",
775 | "demo.setConfig(\"watson.tone.password\",taPassword)\n",
776 | "demo.setConfig(\"watson.tone.username\",taUserName)\n",
777 | "demo.setConfig(\"checkpointDir\", System.getProperty(\"user.home\") + \"/pixiedust/ssc\")"
778 | ]
779 | },
780 | {
781 | "cell_type": "code",
782 | "execution_count": null,
783 | "metadata": {
784 | "collapsed": true
785 | },
786 | "outputs": [],
787 | "source": [
788 | "!pip install --upgrade --user pixiedust-twitterdemo"
789 | ]
790 | },
791 | {
792 | "cell_type": "code",
793 | "execution_count": null,
794 | "metadata": {
795 | "collapsed": false,
796 | "pixiedust": {
797 | "displayParams": {
798 | "handlerId": "twitterdemo"
799 | }
800 | }
801 | },
802 | "outputs": [],
803 | "source": [
804 | "from pixiedust_twitterdemo import *\n",
805 | "twitterDemo()"
806 | ]
807 | },
808 | {
809 | "cell_type": "markdown",
810 | "metadata": {},
811 | "source": [
812 | "## The embedded app has generated a DataFrame called __tweets. Let's use it to do some data science"
813 | ]
814 | },
815 | {
816 | "cell_type": "code",
817 | "execution_count": null,
818 | "metadata": {
819 | "collapsed": false,
820 | "pixiedust": {
821 | "displayParams": {
822 | "handlerId": "dataframe"
823 | }
824 | }
825 | },
826 | "outputs": [],
827 | "source": [
828 | "display(__tweets)"
829 | ]
830 | },
831 | {
832 | "cell_type": "code",
833 | "execution_count": null,
834 | "metadata": {
835 | "collapsed": false,
836 | "pixiedust": {
837 | "displayParams": {
838 | "aggregation": "COUNT",
839 | "handlerId": "barChart",
840 | "keyFields": "emotion",
841 | "showLegend": "true",
842 | "stacked": "true",
843 | "valueFields": "score"
844 | }
845 | }
846 | },
847 | "outputs": [],
848 | "source": [
849 | "from pyspark.sql import Row\n",
850 | "from pyspark.sql.types import *\n",
851 | "emotions=__tweets.columns[-13:]\n",
852 | "distrib = __tweets.flatMap(lambda t: [(x,t[x]) for x in emotions]).filter(lambda t: t[1]>60)\\\n",
853 | " .toDF(StructType([StructField('emotion',StringType()),StructField('score',DoubleType())]))\n",
854 | "display(distrib)"
855 | ]
856 | },
857 | {
858 | "cell_type": "code",
859 | "execution_count": null,
860 | "metadata": {
861 | "collapsed": false
862 | },
863 | "outputs": [],
864 | "source": [
865 | "__tweets.registerTempTable(\"pixiedust_tweets\")\n",
866 | "#create an array that will hold the count for each sentiment\n",
867 | "sentimentDistribution=[0] * 13\n",
868 | "#For each sentiment, run a sql query that counts the number of tweets for which the sentiment score is greater than 60%\n",
869 | "#Store the data in the array\n",
870 | "for i, sentiment in enumerate(__tweets.columns[-13:]):\n",
871 | " sentimentDistribution[i]=sqlContext.sql(\"SELECT count(*) as sentCount FROM pixiedust_tweets where \" + sentiment + \" > 60\")\\\n",
872 | " .collect()[0].sentCount"
873 | ]
874 | },
875 | {
876 | "cell_type": "code",
877 | "execution_count": null,
878 | "metadata": {
879 | "collapsed": false
880 | },
881 | "outputs": [],
882 | "source": [
883 | "%matplotlib inline\n",
884 | "import matplotlib\n",
885 | "import numpy as np\n",
886 | "import matplotlib.pyplot as plt\n",
887 | "\n",
888 | "ind=np.arange(13)\n",
889 | "width = 0.35\n",
890 | "bar = plt.bar(ind, sentimentDistribution, width, color='g', label = \"distributions\")\n",
891 | "\n",
892 | "params = plt.gcf()\n",
893 | "plSize = params.get_size_inches()\n",
894 | "params.set_size_inches( (plSize[0]*2.5, plSize[1]*2) )\n",
895 | "plt.ylabel('Tweet count')\n",
896 | "plt.xlabel('Tone')\n",
897 | "plt.title('Distribution of tweets by sentiments > 60%')\n",
898 | "plt.xticks(ind+width, __tweets.columns[-13:])\n",
899 | "plt.legend()\n",
900 | "\n",
901 | "plt.show()"
902 | ]
903 | },
904 | {
905 | "cell_type": "code",
906 | "execution_count": null,
907 | "metadata": {
908 | "collapsed": true
909 | },
910 | "outputs": [],
911 | "source": [
912 | "from operator import add\n",
913 | "import re\n",
914 | "tagsRDD = __tweets.flatMap( lambda t: re.split(\"\\s\", t.text))\\\n",
915 | " .filter( lambda word: word.startswith(\"#\") )\\\n",
916 | " .map( lambda word : (word, 1 ))\\\n",
917 | " .reduceByKey(add, 10).map(lambda (a,b): (b,a)).sortByKey(False).map(lambda (a,b):(b,a))\n",
918 | "top10tags = tagsRDD.take(10)"
919 | ]
920 | },
921 | {
922 | "cell_type": "code",
923 | "execution_count": null,
924 | "metadata": {
925 | "collapsed": false
926 | },
927 | "outputs": [],
928 | "source": [
929 | "%matplotlib inline\n",
930 | "import matplotlib\n",
931 | "import matplotlib.pyplot as plt\n",
932 | "\n",
933 | "params = plt.gcf()\n",
934 | "plSize = params.get_size_inches()\n",
935 | "params.set_size_inches( (plSize[0]*2, plSize[1]*2) )\n",
936 | "\n",
937 | "labels = [i[0] for i in top10tags]\n",
938 | "sizes = [int(i[1]) for i in top10tags]\n",
939 | "colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral', \"beige\", \"paleturquoise\", \"pink\", \"lightyellow\", \"coral\"]\n",
940 | "\n",
941 | "plt.pie(sizes, labels=labels, colors=colors,autopct='%1.1f%%', shadow=True, startangle=90)\n",
942 | "\n",
943 | "plt.axis('equal')\n",
944 | "plt.show()"
945 | ]
946 | },
947 | {
948 | "cell_type": "code",
949 | "execution_count": null,
950 | "metadata": {
951 | "collapsed": true
952 | },
953 | "outputs": [],
954 | "source": [
955 | "cols = __tweets.columns[-13:]\n",
956 | "def expand( t ):\n",
957 | " ret = []\n",
958 | " for s in [i[0] for i in top10tags]:\n",
959 | " if ( s in t.text ):\n",
960 | " for tone in cols:\n",
961 | " ret += [s.replace(':','').replace('-','') + u\"-\" + unicode(tone) + \":\" + unicode(getattr(t, tone))]\n",
962 | " return ret \n",
963 | "def makeList(l):\n",
964 | " return l if isinstance(l, list) else [l]\n",
965 | "\n",
966 | "#Create RDD from tweets dataframe\n",
967 | "tagsRDD = __tweets.map(lambda t: t )\n",
968 | "\n",
969 | "#Filter to only keep the entries that are in top10tags\n",
970 | "tagsRDD = tagsRDD.filter( lambda t: any(s in t.text for s in [i[0] for i in top10tags] ) )\n",
971 | "\n",
972 | "#Create a flatMap using the expand function defined above, this will be used to collect all the scores \n",
973 | "#for a particular tag with the following format: Tag-Tone-ToneScore\n",
974 | "tagsRDD = tagsRDD.flatMap( expand )\n",
975 | "\n",
976 | "#Create a map indexed by Tag-Tone keys \n",
977 | "tagsRDD = tagsRDD.map( lambda fullTag : (fullTag.split(\":\")[0], float( fullTag.split(\":\")[1]) ))\n",
978 | "\n",
979 | "#Call combineByKey to format the data as follow\n",
980 | "#Key=Tag-Tone\n",
981 | "#Value=(count, sum_of_all_score_for_this_tone)\n",
982 | "tagsRDD = tagsRDD.combineByKey((lambda x: (x,1)),\n",
983 | " (lambda x, y: (x[0] + y, x[1] + 1)),\n",
984 | " (lambda x, y: (x[0] + y[0], x[1] + y[1])))\n",
985 | "\n",
986 | "#ReIndex the map to have the key be the Tag and value be (Tone, Average_score) tuple\n",
987 | "#Key=Tag\n",
988 | "#Value=(Tone, average_score)\n",
989 | "tagsRDD = tagsRDD.map(lambda (key, ab): (key.split(\"-\")[0], (key.split(\"-\")[1], round(ab[0]/ab[1], 2))))\n",
990 | "\n",
991 | "#Reduce the map on the Tag key, value becomes a list of (Tone,average_score) tuples\n",
992 | "tagsRDD = tagsRDD.reduceByKey( lambda x, y : makeList(x) + makeList(y) )\n",
993 | "\n",
994 | "#Sort the (Tone,average_score) tuples alphabetically by Tone\n",
995 | "tagsRDD = tagsRDD.mapValues( lambda x : sorted(x) )\n",
996 | "\n",
997 | "#Format the data as expected by the plotting code in the next cell. \n",
998 | "#map the Values to a tuple as follow: ([list of tone], [list of average score])\n",
999 | "#e.g. #someTag:([u'Agreeableness', u'Analytical', u'Anger', u'Cheerfulness', u'Confident', u'Conscientiousness', u'Negative', u'Openness', u'Tentative'], [1.0, 0.0, 0.0, 1.0, 0.0, 0.48, 0.0, 0.02, 0.0])\n",
1000 | "tagsRDD = tagsRDD.mapValues( lambda x : ([elt[0] for elt in x],[elt[1] for elt in x]) )\n",
1001 | "\n",
1002 | "#Use custom sort function to sort the entries by order of appearance in top10tags\n",
1003 | "def customCompare( key ):\n",
1004 | " for (k,v) in top10tags:\n",
1005 | " if k == key:\n",
1006 | " return v\n",
1007 | " return 0\n",
1008 | "tagsRDD = tagsRDD.sortByKey(ascending=False, numPartitions=None, keyfunc = customCompare)\n",
1009 | "\n",
1010 | "#Take the mean tone scores for the top 10 tags\n",
1011 | "top10tagsMeanScores = tagsRDD.take(10)"
1012 | ]
1013 | },
1014 | {
1015 | "cell_type": "code",
1016 | "execution_count": null,
1017 | "metadata": {
1018 | "collapsed": false
1019 | },
1020 | "outputs": [],
1021 | "source": [
1022 | "%matplotlib inline\n",
1023 | "import matplotlib\n",
1024 | "import numpy as np\n",
1025 | "import matplotlib.pyplot as plt\n",
1026 | "\n",
1027 | "params = plt.gcf()\n",
1028 | "plSize = params.get_size_inches()\n",
1029 | "params.set_size_inches( (plSize[0]*3, plSize[1]*2) )\n",
1030 | "\n",
1031 | "top5tagsMeanScores = top10tagsMeanScores[:5]\n",
1032 | "width = 0\n",
1033 | "ind=np.arange(13)\n",
1034 | "(a,b) = top5tagsMeanScores[0]\n",
1035 | "labels=b[0]\n",
1036 | "colors = [\"beige\", \"paleturquoise\", \"pink\", \"lightyellow\", \"coral\", \"lightgreen\", \"gainsboro\", \"aquamarine\",\"c\"]\n",
1037 | "idx=0\n",
1038 | "for key, value in top5tagsMeanScores:\n",
1039 | " plt.bar(ind + width, value[1], 0.15, color=colors[idx], label=key)\n",
1040 | " width += 0.15\n",
1041 | " idx += 1\n",
1042 | "plt.xticks(ind+0.3, labels)\n",
1043 | "plt.ylabel('AVERAGE SCORE')\n",
1044 | "plt.xlabel('TONES')\n",
1045 | "plt.title('Breakdown of top hashtags by sentiment tones')\n",
1046 | "\n",
1047 | "plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc='center',ncol=5, mode=\"expand\", borderaxespad=0.)\n",
1048 | "\n",
1049 | "plt.show()"
1050 | ]
1051 | }
1052 | ],
1053 | "metadata": {
1054 | "anaconda-cloud": {},
1055 | "kernelspec": {
1056 | "display_name": "pySpark (Spark 1.6.0) Python 2",
1057 | "language": "python",
1058 | "name": "pyspark1.6python2"
1059 | },
1060 | "language_info": {
1061 | "codemirror_mode": {
1062 | "name": "ipython",
1063 | "version": 2
1064 | },
1065 | "file_extension": ".py",
1066 | "mimetype": "text/x-python",
1067 | "name": "python",
1068 | "nbconvert_exporter": "python",
1069 | "pygments_lexer": "ipython2",
1070 | "version": "2.7.11"
1071 | }
1072 | },
1073 | "nbformat": 4,
1074 | "nbformat_minor": 0
1075 | }
1076 |
--------------------------------------------------------------------------------
/streaming-twitter/.classpath:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/streaming-twitter/.gitignore:
--------------------------------------------------------------------------------
1 | /bin/
2 | /config
3 |
--------------------------------------------------------------------------------
/streaming-twitter/.project:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <projectDescription>
3 |   <name>streaming-twitter</name>
4 |   <buildSpec>
5 |     <buildCommand>
6 |       <name>org.scala-ide.sdt.core.scalabuilder</name>
7 |     </buildCommand>
8 |   </buildSpec>
9 |   <natures>
10 |     <nature>org.scala-ide.sdt.core.scalanature</nature>
11 |     <nature>org.eclipse.jdt.core.javanature</nature>
12 |   </natures>
13 | </projectDescription>
--------------------------------------------------------------------------------
/streaming-twitter/build.sbt:
--------------------------------------------------------------------------------
1 | name := "streaming-twitter"
2 |
3 | version := "1.6"
4 |
5 | scalaVersion := "2.10.4"
6 |
7 | libraryDependencies ++= {
8 | val sparkVersion = "1.6.0"
9 | Seq(
10 | "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
11 | "org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
12 | "org.apache.spark" %% "spark-streaming" % sparkVersion % "provided",
13 | "org.apache.spark" %% "spark-streaming-twitter" % sparkVersion,
14 | "org.apache.spark" %% "spark-repl" % sparkVersion % "provided",
15 | "com.ibm" %% "couchdb-scala" % "0.5.3",
16 | "org.apache.kafka" % "kafka-log4j-appender" % "0.9.0.0",
17 | "org.apache.kafka" % "kafka-clients" % "0.9.0.0",
18 | "org.apache.kafka" %% "kafka" % "0.9.0.0",
19 | "com.google.guava" % "guava" % "14.0.1"
20 | )
21 | }
22 |
23 | assemblyMergeStrategy in assembly := {
24 | case PathList("org", "apache", "spark", xs @ _*) => MergeStrategy.first
25 | case PathList("scala", xs @ _*) => MergeStrategy.discard
26 | case PathList("com", "ibm", "pixiedust", xs @ _*) => MergeStrategy.discard
27 | case PathList("META-INF", "maven", "org.slf4j", xs @ _* ) => MergeStrategy.first
28 | case x =>
29 | val oldStrategy = (assemblyMergeStrategy in assembly).value
30 | oldStrategy(x)
31 | }
32 |
33 | unmanagedBase <<= baseDirectory { base => base / "lib" }
34 |
35 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false)
36 |
37 | resolvers += "scalaz-bintray" at "https://dl.bintray.com/scalaz/releases"
38 | resolvers += "Local couchdb-scala repo" at (baseDirectory.value / "lib/couchdb-scala").toURI.toString
39 |
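40 | // A sketch of how to build the assembly jar used by the notebooks: run `sbt assembly` from this
41 | // directory. With the settings above, sbt-assembly typically writes the jar to
42 | // target/scala-2.10/streaming-twitter-assembly-1.6.jar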
--------------------------------------------------------------------------------
/streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-javadoc.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-javadoc.jar
--------------------------------------------------------------------------------
/streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-javadoc.jar.md5:
--------------------------------------------------------------------------------
1 | e5ee6d0be04b3b9fc6f2f9c7dabc2497
--------------------------------------------------------------------------------
/streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-javadoc.jar.sha1:
--------------------------------------------------------------------------------
1 | ba8a2e725a4aae35185cbc0862f93fb86dc50138
--------------------------------------------------------------------------------
/streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-sources.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-sources.jar
--------------------------------------------------------------------------------
/streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-sources.jar.md5:
--------------------------------------------------------------------------------
1 | be140baa91495e6a161eb95b3415b48d
--------------------------------------------------------------------------------
/streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-sources.jar.sha1:
--------------------------------------------------------------------------------
1 | eda716f52436863b442564400ebcecc09662d8f7
--------------------------------------------------------------------------------
/streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3.jar
--------------------------------------------------------------------------------
/streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3.jar.md5:
--------------------------------------------------------------------------------
1 | 554911d3e139c8ba42957989e4f76428
--------------------------------------------------------------------------------
/streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3.jar.sha1:
--------------------------------------------------------------------------------
1 | 6c25040548743c9ae0bb2cf4636ec9da9d55068c
--------------------------------------------------------------------------------
/streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3.pom:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 | com.ibm
5 | couchdb-scala_2.10
6 | jar
7 | A purely functional Scala client for CouchDB
8 | https://github.com/beloglazov/couchdb-scala
9 | 0.5.3
10 |
11 |
12 | The Apache Software License, Version 2.0
13 | http://www.apache.org/licenses/LICENSE-2.0.txt
14 | repo
15 |
16 |
17 | couchdb-scala
18 |
19 | com.ibm
20 | https://github.com/beloglazov/couchdb-scala
21 |
22 |
23 | scm:git:git@github.com:beloglazov/couchdb-scala.git
24 | scm:git:git@github.com:beloglazov/couchdb-scala.git
25 | https://github.com/beloglazov/couchdb-scala
26 |
27 |
28 |
29 | beloglazov
30 | Anton Beloglazov
31 | anton.beloglazov@gmail.com
32 | http://beloglazov.info
33 |
34 |
35 |
36 |
37 | org.scala-lang
38 | scala-library
39 | 2.10.4
40 |
41 |
42 | org.scalaz
43 | scalaz-core_2.10
44 | 7.1.0
45 |
46 |
47 | org.scalaz
48 | scalaz-effect_2.10
49 | 7.1.0
50 |
51 |
52 | org.http4s
53 | http4s-core_2.10
54 | 0.8.2
55 |
56 |
57 | org.http4s
58 | http4s-client_2.10
59 | 0.8.2
60 |
61 |
62 | org.http4s
63 | http4s-blazeclient_2.10
64 | 0.8.2
65 |
66 |
67 | com.lihaoyi
68 | upickle_2.10
69 | 0.2.6
70 |
71 |
72 | com.github.julien-truffaut
73 | monocle-core_2.10
74 | 1.0.1
75 |
76 |
77 | com.github.julien-truffaut
78 | monocle-macro_2.10
79 | 1.0.1
80 |
81 |
82 | org.log4s
83 | log4s_2.10
84 | 1.1.3
85 |
86 |
87 | org.specs2
88 | specs2_2.10
89 | 2.4.16
90 | test
91 |
92 |
93 | org.typelevel
94 | scalaz-specs2_2.10
95 | 0.3.0
96 | test
97 |
98 |
99 | org.scalacheck
100 | scalacheck_2.10
101 | 1.12.1
102 | test
103 |
104 |
105 | org.scalaz
106 | scalaz-scalacheck-binding_2.10
107 | 7.1.0
108 | test
109 |
110 |
111 | ch.qos.logback
112 | logback-classic
113 | 1.1.2
114 | test
115 |
116 |
117 |
--------------------------------------------------------------------------------
/streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3.pom.md5:
--------------------------------------------------------------------------------
1 | c19ebb91556b46c2e2a7ff027b351e15
--------------------------------------------------------------------------------
/streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3.pom.sha1:
--------------------------------------------------------------------------------
1 | 342d29d046750084aabf94c85081f54e19bbcaa6
--------------------------------------------------------------------------------
/streaming-twitter/lib/messagehub.login-1.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/streaming-twitter/lib/messagehub.login-1.0.0.jar
--------------------------------------------------------------------------------
/streaming-twitter/lib/pixiedust.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/streaming-twitter/lib/pixiedust.jar
--------------------------------------------------------------------------------
/streaming-twitter/notebook/Spark Streaming Twitter-Watson-MessageHub.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "#Spark Streaming sample application using Twitter, Watson Tone Analyzer, Event Hub and Message Hub\n",
8 | "In this Notebook, we show how to run a Spark Streaming application using a Notebook. There are multiple limitations to be aware of: \n",
9 | "1. The application will stop when the page is refreshed or closed\n",
10 | "2. As events are being processed, the application generates lots of console output which may cause memory to build up in the browser. Therefore it is not recommended to run the application for too long \n",
11 | "\n",
12 | "The code can be found here: https://github.com/ibm-watson-data-lab/spark.samples/tree/master/streaming-twitter \n",
13 | "The following code is using a pre-built jar that has been posted on the Github project, but you can replace with your own url if needed."
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {
20 | "collapsed": false
21 | },
22 | "outputs": [],
23 | "source": [
24 | "%AddJar https://github.com/DTAIEB/demos/raw/master/streaming-twitter-assembly-1.6.jar -f"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "##Set up the credentials for the different services\n",
32 | "Please refer to the tutorial for details on how to find the credentials for all the services, then add the value in the placeholders specified in the code below"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "metadata": {
39 | "collapsed": false
40 | },
41 | "outputs": [],
42 | "source": [
43 | "val demo = com.ibm.cds.spark.samples.MessageHubStreamingTwitter\n",
44 | "val config = demo.getConfig()\n",
45 | "\n",
46 | "//Watson Tone Analyzer service\n",
47 | "config.setConfig(\"watson.tone.url\",\"https://gateway.watsonplatform.net/tone-analyzer-beta/api\")\n",
48 | "config.setConfig(\"watson.tone.password\",\"XXXX\")\n",
49 | "config.setConfig(\"watson.tone.username\",\"XXXX\")\n",
50 | "\n",
51 | "//Message Hub/Kafka service\n",
52 | "config.setConfig(\"bootstrap.servers\",\"kafka01-prod01.messagehub.services.us-south.bluemix.net:9093,kafka02-prod01.messagehub.services.us-south.bluemix.net:9093,kafka03-prod01.messagehub.services.us-south.bluemix.net:9093,kafka04-prod01.messagehub.services.us-south.bluemix.net:9093,kafka05-prod01.messagehub.services.us-south.bluemix.net:9093\")\n",
53 | "config.setConfig(\"api_key\",\"XXXX\")\n",
54 | "config.setConfig(\"kafka.topic.tweet\",\"twitter-spark\")\n",
55 | "config.setConfig(\"kafka.user.name\",\"XXXX\")\n",
56 | "config.setConfig(\"kafka.user.password\",\"XXXX\")\n",
57 | "config.setConfig(\"kafka_rest_url\",\"https://kafka-rest-prod01.messagehub.services.us-south.bluemix.net:443\")\n",
58 | "\n",
59 | "//Spark Streaming checkpointing configuration with Object Storage Swift container\n",
60 | "config.setConfig(\"name\",\"spark\");\n",
61 | "config.setConfig(\"auth_url\",\"https://identity.open.softlayer.com\");\n",
62 | "config.setConfig(\"project_id\",\"XXXX\");\n",
63 | "config.setConfig(\"region\",\"dallas\");\n",
64 | "config.setConfig(\"user_id\",\"XXXX\");\n",
65 | "config.setConfig(\"password\",\"XXXX\");\n",
66 | "config.setConfig(\"checkpointDir\", \"swift://notebooks.spark/ssc\")"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {},
72 | "source": [
73 | "##Producing tweets directly from Twitter\n",
74 | "Optional: The following cell is to be used only if your MessageConnect service doesn't work. \n",
75 | "In the next cell, you configure your Twitter credentials and call the code that will connect to Twitter, fetch the tweets and send them to MessageHub for consumption (Please refer to the tutorial for more information)"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": null,
81 | "metadata": {
82 | "collapsed": false
83 | },
84 | "outputs": [],
85 | "source": [
86 | "config.setConfig(\"twitter4j.oauth.consumerKey\",\"XXXX\")\n",
87 | "config.setConfig(\"twitter4j.oauth.consumerSecret\",\"XXXX\")\n",
88 | "config.setConfig(\"twitter4j.oauth.accessToken\",\"XXXX\")\n",
89 | "config.setConfig(\"twitter4j.oauth.accessTokenSecret\",\"XXXX\")\n",
90 | "val twitterStream = com.ibm.cds.spark.samples.KafkaProducerTest.createTwitterStream(config)"
91 | ]
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "metadata": {},
96 | "source": [
97 | "##Start the Spark Stream to collect tweets from Message Hub\n",
98 | "Start a new Twitter Stream that collects the live tweets and enrich them with Sentiment Analysis scores. The stream is run for a duration specified in the second argument of the **startTwitterStreaming** method.\n",
99 | "Note: if no duration is specified then the stream will run until the **stopTwitterStreaming** method is called."
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "metadata": {
106 | "collapsed": false,
107 | "scrolled": false
108 | },
109 | "outputs": [],
110 | "source": [
111 | "demo.startTwitterStreaming(sc)"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "metadata": {},
117 | "source": [
118 | "##Close the Tweet producer\n",
119 | "Optional: To be used only if you have started it"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {
126 | "collapsed": false
127 | },
128 | "outputs": [],
129 | "source": [
130 | "com.ibm.cds.spark.samples.KafkaProducerTest.closeTwitterStream"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {},
136 | "source": [
137 | "##Close the Spark Streaming"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {
144 | "collapsed": false
145 | },
146 | "outputs": [],
147 | "source": [
148 | "demo.stopTwitterStreaming"
149 | ]
150 | }
151 | ],
152 | "metadata": {
153 | "kernelspec": {
154 | "display_name": "Scala 2.10",
155 | "language": "scala",
156 | "name": "spark"
157 | },
158 | "language_info": {
159 | "name": "scala"
160 | },
161 | "name": "Twitter + Watson Tone Analyzer Part 1.ipynb"
162 | },
163 | "nbformat": 4,
164 | "nbformat_minor": 0
165 | }
--------------------------------------------------------------------------------
/streaming-twitter/notebook/Twitter + Watson Tone Analyzer Part 1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "#Twitter + Watson Tone Analyzer sample Notebook Part 1: Loading the data\n",
8 | "In this Notebook, we show how to load the custom library generate as part of the Twitter + Watson Tone Analyzer streaming application. Code can be found here: https://github.com/ibm-watson-data-lab/spark.samples/tree/master/streaming-twitter.\n",
9 | "The following code is using a pre-built jar has been posted on the Github project, but you can replace with your own url if needed."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {
16 | "collapsed": false
17 | },
18 | "outputs": [
19 | {
20 | "name": "stdout",
21 | "output_type": "stream",
22 | "text": [
23 | "Starting download from https://github.com/ibm-watson-data-lab/spark.samples/raw/master/dist/streaming-twitter-assembly-1.6.jar\n",
24 | "Finished download of streaming-twitter-assembly-1.6.jar\n"
25 | ]
26 | }
27 | ],
28 | "source": [
29 | "%AddJar https://github.com/ibm-watson-data-lab/spark.samples/raw/master/dist/streaming-twitter-assembly-1.6.jar -f"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {},
35 | "source": [
36 | "##Set up the Twitter and Watson credentials\n",
37 | "Please refer to the tutorial for details on how to find the Twitter and Watson credentials, then add the value in the placeholders specified in the code below"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 2,
43 | "metadata": {
44 | "collapsed": false
45 | },
46 | "outputs": [],
47 | "source": [
48 | "val demo = com.ibm.cds.spark.samples.StreamingTwitter\n",
49 | "demo.setConfig(\"twitter4j.oauth.consumerKey\",\"XXXX\")\n",
50 | "demo.setConfig(\"twitter4j.oauth.consumerSecret\",\"XXXX\")\n",
51 | "demo.setConfig(\"twitter4j.oauth.accessToken\",\"XXXX\")\n",
52 | "demo.setConfig(\"twitter4j.oauth.accessTokenSecret\",\"XXXX\")\n",
53 | "demo.setConfig(\"watson.tone.url\",\"https://gateway.watsonplatform.net/tone-analyzer-beta/api\")\n",
54 | "demo.setConfig(\"watson.tone.password\",\"XXXX\")\n",
55 | "demo.setConfig(\"watson.tone.username\",\"XXXX\")"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "##Start the Spark Stream to collect live tweets\n",
63 | "Start a new Twitter Stream that collects the live tweets and enrich them with Sentiment Analysis scores. The stream is run for a duration specified in the second argument of the **startTwitterStreaming** method.\n",
64 | "Note: if no duration is specified then the stream will run until the **stopTwitterStreaming** method is called."
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 3,
70 | "metadata": {
71 | "collapsed": false
72 | },
73 | "outputs": [
74 | {
75 | "name": "stdout",
76 | "output_type": "stream",
77 | "text": [
78 | "Twitter stream started\n",
79 | "Tweets are collected real-time and analyzed\n",
80 | "To stop the streaming and start interacting with the data use: StreamingTwitter.stopTwitterStreaming\n",
81 | "Receiver Started: TwitterReceiver-0\n",
82 | "Batch started with 139 records\n",
83 | "Batch completed with 139 records\n",
84 | "Batch started with 270 records\n",
85 | "Stopping Twitter stream. Please wait this may take a while\n",
86 | "Receiver Stopped: TwitterReceiver-0\n",
87 | "Reason: : Stopped by driver\n",
88 | "Batch completed with 270 records\n",
89 | "Twitter stream stopped\n",
90 | "You can now create a sqlContext and DataFrame with 38 Tweets created. Sample usage: \n",
91 | "val (sqlContext, df) = com.ibm.cds.spark.samples.StreamingTwitter.createTwitterDataFrames(sc)\n",
92 | "df.printSchema\n",
93 | "sqlContext.sql(\"select author, text from tweets\").show\n"
94 | ]
95 | }
96 | ],
97 | "source": [
98 | "import org.apache.spark.streaming._\n",
99 | "demo.startTwitterStreaming(sc, Seconds(40))"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "##Create a SQLContext and a dataframe with all the tweets\n",
107 | "Note: this method will register a SparkSQL table called tweets"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 4,
113 | "metadata": {
114 | "collapsed": false
115 | },
116 | "outputs": [
117 | {
118 | "name": "stdout",
119 | "output_type": "stream",
120 | "text": [
121 | "A new table named tweets with 38 records has been correctly created and can be accessed through the SQLContext variable\n",
122 | "Here's the schema for tweets\n",
123 | "root\n",
124 | " |-- author: string (nullable = true)\n",
125 | " |-- date: string (nullable = true)\n",
126 | " |-- lang: string (nullable = true)\n",
127 | " |-- text: string (nullable = true)\n",
128 | " |-- lat: double (nullable = true)\n",
129 | " |-- long: double (nullable = true)\n",
130 | " |-- Anger: double (nullable = true)\n",
131 | " |-- Disgust: double (nullable = true)\n",
132 | " |-- Fear: double (nullable = true)\n",
133 | " |-- Joy: double (nullable = true)\n",
134 | " |-- Sadness: double (nullable = true)\n",
135 | " |-- Analytical: double (nullable = true)\n",
136 | " |-- Confident: double (nullable = true)\n",
137 | " |-- Tentative: double (nullable = true)\n",
138 | " |-- Openness: double (nullable = true)\n",
139 | " |-- Conscientiousness: double (nullable = true)\n",
140 | " |-- Extraversion: double (nullable = true)\n",
141 | " |-- Agreeableness: double (nullable = true)\n",
142 | " |-- EmotionalRange: double (nullable = true)\n",
143 | "\n"
144 | ]
145 | }
146 | ],
147 | "source": [
148 | "val (sqlContext, df) = demo.createTwitterDataFrames(sc)"
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "metadata": {},
154 | "source": [
155 | "##Execute a SparkSQL query that contains all the data"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 5,
161 | "metadata": {
162 | "collapsed": false
163 | },
164 | "outputs": [
165 | {
166 | "name": "stdout",
167 | "output_type": "stream",
168 | "text": [
169 | "+--------------------+--------------------+-----+--------------------+---+----+------------------+------------------+------------------+-----------------+------------------+----------+---------+-----------------+-----------------+------------------+-----------------+-----------------+-----------------+\n",
170 | "| author| date| lang| text|lat|long| Anger| Disgust| Fear| Joy| Sadness|Analytical|Confident| Tentative| Openness| Conscientiousness| Extraversion| Agreeableness| EmotionalRange|\n",
171 | "+--------------------+--------------------+-----+--------------------+---+----+------------------+------------------+------------------+-----------------+------------------+----------+---------+-----------------+-----------------+------------------+-----------------+-----------------+-----------------+\n",
172 | "|Three Words o Wisdom|Sun Mar 06 13:00:...|en-gb|wildebeest rebuff...|0.0| 0.0| 11.0| 20.0| 19.0| 44.0| 22.0| 0.0| 0.0| 0.0| 80.0| 56.00000000000001| 15.0| 1.0| 39.0|\n",
173 | "| Jonny P|Sun Mar 06 13:00:...| en|Getting a pizza i...|0.0| 0.0| 8.0| 5.0| 13.0|56.00000000000001| 5.0| 0.0| 0.0|56.99999999999999| 24.0| 23.0| 83.0|56.99999999999999| 82.0|\n",
174 | "| Kayla|Sun Mar 06 13:00:...| en|RT @ebhoniogarro:...|0.0| 0.0| 2.0| 0.0| 1.0| 99.0| 2.0| 0.0| 0.0| 0.0| 30.0| 56.00000000000001| 85.0| 66.0| 39.0|\n",
175 | "| Adamlbr|Sun Mar 06 13:00:...| en|New Event now on....|0.0| 0.0| 24.0| 10.0| 11.0| 46.0| 4.0| 0.0| 0.0| 0.0| 11.0| 98.0| 46.0| 49.0| 6.0|\n",
176 | "|Lexa deserved better|Sun Mar 06 13:00:...| en|RT @canoodleclexa...|0.0| 0.0| 8.0| 7.000000000000001| 9.0| 80.0| 7.000000000000001| 84.0| 0.0| 0.0| 12.0|28.000000000000004| 73.0| 59.0| 51.0|\n",
177 | "| LoveBakesGoodCakes|Sun Mar 06 13:00:...| en|Yum, yum! Honey B...|0.0| 0.0| 41.0| 2.0| 6.0| 62.0| 7.000000000000001| 0.0| 0.0| 0.0| 60.0| 69.0| 64.0| 18.0| 11.0|\n",
178 | "| High Tech Planet|Sun Mar 06 13:00:...| en|Google is testing...|0.0| 0.0| 11.0| 5.0| 32.0| 37.0| 5.0| 78.0| 0.0| 0.0|56.99999999999999| 30.0| 6.0| 13.0|57.99999999999999|\n",
179 | "| Kael|Sun Mar 06 13:00:...| en|RT @mgiseelle: Ha...|0.0| 0.0| 16.0| 4.0|14.000000000000002| 23.0| 13.0| 0.0| 0.0| 0.0| 68.0| 85.0|57.99999999999999| 35.0| 6.0|\n",
180 | "| Ryan|Sun Mar 06 13:00:...| en|ALL THAT EFFORT T...|0.0| 0.0| 19.0|14.000000000000002| 24.0| 12.0| 24.0| 61.0| 79.0| 0.0| 78.0| 3.0| 49.0| 1.0| 91.0|\n",
181 | "| princesss|Sun Mar 06 13:00:...| en|RT @SexualGif: Be...|0.0| 0.0| 13.0| 7.000000000000001| 13.0| 34.0| 15.0| 0.0| 0.0| 0.0|56.00000000000001| 93.0| 62.0| 38.0| 39.0|\n",
182 | "| Fadi Nasser|Sun Mar 06 13:00:...| en|#USA missiles cha...|0.0| 0.0| 7.000000000000001| 10.0| 8.0| 30.0| 13.0| 0.0| 0.0| 0.0| 94.0| 75.0| 27.0| 23.0| 20.0|\n",
183 | "| Briyon?e|Sun Mar 06 13:00:...| en|RT @tonestradamus...|0.0| 0.0| 52.0| 19.0| 5.0| 1.0|14.000000000000002| 23.0| 0.0| 75.0| 21.0| 6.0| 84.0| 44.0| 59.0|\n",
184 | "| BarnBurnerBBQ|Sun Mar 06 13:00:...| en|Presenting sponso...|0.0| 0.0| 10.0| 18.0| 10.0| 26.0| 8.0| 67.0| 0.0| 0.0| 36.0| 91.0| 71.0| 91.0| 2.0|\n",
185 | "| Majid Navabi|Sun Mar 06 13:00:...| en| Download|0.0| 0.0| 12.0| 9.0| 18.0|56.99999999999999|14.000000000000002| 0.0| 0.0| 0.0| 52.0| 56.00000000000001| 15.0| 100.0| 0.0|\n",
186 | "| ?????? ?????|Sun Mar 06 13:00:...| en|RT @Adel__Almalki...|0.0| 0.0| 43.0| 6.0| 20.0| 3.0| 2.0| 0.0| 0.0| 0.0| 90.0| 56.00000000000001| 15.0| 1.0| 39.0|\n",
187 | "| liv|Sun Mar 06 13:00:...| en|RT @iamjojo: You ...|0.0| 0.0| 5.0| 2.0| 9.0| 89.0| 9.0| 0.0| 0.0| 0.0| 2.0| 2.0| 100.0| 85.0| 2.0|\n",
188 | "| LADY GAGA|Sun Mar 06 13:00:...| en|Miek_tweet #TilIt...|0.0| 0.0| 16.0| 16.0| 8.0| 23.0| 21.0| 0.0| 0.0| 0.0| 80.0| 56.00000000000001| 15.0| 1.0| 39.0|\n",
189 | "| donatello ;)|Sun Mar 06 13:00:...| en|RT @__trillgawdd:...|0.0| 0.0|14.000000000000002| 3.0| 13.0| 66.0| 9.0| 0.0| 0.0| 0.0| 30.0| 56.00000000000001| 53.0| 69.0| 20.0|\n",
190 | "| Liz|Sun Mar 06 13:00:...| en|RT @Samantha_Evel...|0.0| 0.0| 12.0| 8.0| 24.0| 10.0| 33.0| 43.0| 72.0| 91.0| 5.0| 12.0| 34.0| 61.0| 97.0|\n",
191 | "| Chrystal Johnson|Sun Mar 06 13:00:...| en|Take Aromatherapy...|0.0| 0.0| 16.0| 12.0| 44.0| 8.0| 8.0| 0.0| 0.0| 0.0| 71.0| 96.0| 40.0| 60.0| 2.0|\n",
192 | "+--------------------+--------------------+-----+--------------------+---+----+------------------+------------------+------------------+-----------------+------------------+----------+---------+-----------------+-----------------+------------------+-----------------+-----------------+-----------------+\n",
193 | "only showing top 20 rows\n",
194 | "\n"
195 | ]
196 | }
197 | ],
198 | "source": [
199 | "val fullSet = sqlContext.sql(\"select * from tweets\") //Select all columns\n",
200 | "fullSet.show"
201 | ]
202 | },
203 | {
204 | "cell_type": "markdown",
205 | "metadata": {},
206 | "source": [
207 | "##SparkSQL query example on the data.\n",
208 | "Select all the tweets that have Anger score greated than 70%"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 6,
214 | "metadata": {
215 | "collapsed": false
216 | },
217 | "outputs": [
218 | {
219 | "name": "stdout",
220 | "output_type": "stream",
221 | "text": [
222 | "0\n",
223 | "+----+\n",
224 | "|text|\n",
225 | "+----+\n",
226 | "+----+\n",
227 | "\n"
228 | ]
229 | }
230 | ],
231 | "source": [
232 | "val set = sqlContext.sql(\"select text from tweets where Anger > 60\")\n",
233 | "println(set.count)\n",
234 | "set.show"
235 | ]
236 | },
237 | {
238 | "cell_type": "markdown",
239 | "metadata": {},
240 | "source": [
241 | "##Persist the dataset into a parquet file on Object Storage service\n",
242 | "The parquet file will be reloaded in IPython Part 2 Notebook\n",
243 | "Note: you can disregard the warning messages related to SLF4J"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 7,
249 | "metadata": {
250 | "collapsed": false
251 | },
252 | "outputs": [
253 | {
254 | "name": "stdout",
255 | "output_type": "stream",
256 | "text": [
257 | "SLF4J: Failed to load class \"org.slf4j.impl.StaticLoggerBinder\".\n",
258 | "SLF4J: Defaulting to no-operation (NOP) logger implementation\n",
259 | "SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.\n"
260 | ]
261 | }
262 | ],
263 | "source": [
264 | "fullSet.repartition(1).saveAsParquetFile(\"swift://notebooks.spark/tweetsFull.parquet\")"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": null,
270 | "metadata": {
271 | "collapsed": true
272 | },
273 | "outputs": [],
274 | "source": []
275 | }
276 | ],
277 | "metadata": {
278 | "kernelspec": {
279 | "display_name": "Scala 2.10",
280 | "language": "scala",
281 | "name": "spark"
282 | },
283 | "language_info": {
284 | "name": "scala"
285 | },
286 | "name": "Twitter + Watson Tone Analyzer Part 1.ipynb"
287 | },
288 | "nbformat": 4,
289 | "nbformat_minor": 0
290 | }
--------------------------------------------------------------------------------
/streaming-twitter/project/assembly.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0")
2 |
--------------------------------------------------------------------------------
/streaming-twitter/readme.md:
--------------------------------------------------------------------------------
1 | #Sentiment Analysis of Twitter Hashtags
2 |
3 | ####Use Spark Streaming in combination with IBM Watson to perform sentiment analysis showing how a conversation is trending on Twitter.
4 |
5 | Track how consumers feel about you based on their tweets. To get real-time sentiment analysis, deploy our sample **Spark Streaming with Twitter and Watson** app on Bluemix and use its Notebook to analyze public opinion.
6 |
7 |
8 | This sample app uses Spark Streaming to create a feed that captures live tweets from Twitter. You can filter the tweets that contain the hashtag(s) of your choice. The tweet data is enriched in real time with various sentiment scores provided by the Watson Tone Analyzer service (available on Bluemix). This service provides insight into sentiment, or how the author feels. Then use Spark SQL to load the data into a DataFrame for further analysis; a minimal code sketch of this flow is shown at the end of this readme. Here's the basic architecture of this app:
9 | 
10 |
11 | Follow the full tutorial to understand how it works and create your own stream.
12 |
13 | [Get started](https://developer.ibm.com/clouddataservices/sentiment-analysis-of-twitter-hashtags/)
14 |
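15 | A minimal sketch of the notebook flow described above, assuming the pre-built `streaming-twitter-assembly-1.6.jar` is loaded into a Scala notebook and you have your Twitter and Watson Tone Analyzer credentials at hand (replace the `XXXX` placeholders with your own values; the calls mirror the ones used in the sample notebooks):
16 | 
17 | ```scala
18 | // Load the pre-built assembly jar (Scala notebook magic)
19 | %AddJar https://github.com/ibm-watson-data-lab/spark.samples/raw/master/dist/streaming-twitter-assembly-1.6.jar -f
20 | 
21 | val demo = com.ibm.cds.spark.samples.StreamingTwitter
22 | 
23 | // Twitter and Watson Tone Analyzer credentials (see the tutorial for where to find them)
24 | demo.setConfig("twitter4j.oauth.consumerKey", "XXXX")
25 | demo.setConfig("twitter4j.oauth.consumerSecret", "XXXX")
26 | demo.setConfig("twitter4j.oauth.accessToken", "XXXX")
27 | demo.setConfig("twitter4j.oauth.accessTokenSecret", "XXXX")
28 | demo.setConfig("watson.tone.url", "https://gateway.watsonplatform.net/tone-analyzer/api")
29 | demo.setConfig("watson.tone.username", "XXXX")
30 | demo.setConfig("watson.tone.password", "XXXX")
31 | 
32 | // Collect and enrich live tweets for 40 seconds
33 | import org.apache.spark.streaming._
34 | demo.startTwitterStreaming(sc, Seconds(40))
35 | 
36 | // Load the enriched tweets into a DataFrame (also registers a SparkSQL table named "tweets")
37 | val (sqlContext, df) = demo.createTwitterDataFrames(sc)
38 | sqlContext.sql("select author, text from tweets where Anger > 60").show()
39 | ```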
--------------------------------------------------------------------------------
/streaming-twitter/sampleConfig/sampleconf.properties:
--------------------------------------------------------------------------------
1 | #Twitter credentials
2 | twitter4j.oauth.consumerKey=XXXX
3 | twitter4j.oauth.consumerSecret=XXXX
4 | twitter4j.oauth.accessToken=XXXX
5 | twitter4j.oauth.accessTokenSecret=XXXX
6 |
7 | #MessageHub
8 | kafka.topic.tweet=twitter-spark
9 | kafka.user.name=XXXX
10 | kafka.user.password=XXXX
11 | bootstrap.servers=kafka01-prod01.messagehub.services.us-south.bluemix.net:9093,\
12 | kafka02-prod01.messagehub.services.us-south.bluemix.net:9093,\
13 | kafka03-prod01.messagehub.services.us-south.bluemix.net:9093,\
14 | kafka04-prod01.messagehub.services.us-south.bluemix.net:9093,\
15 | kafka05-prod01.messagehub.services.us-south.bluemix.net:9093
16 | api_key=XXXX
17 | kafka_rest_url=https://kafka-rest-prod01.messagehub.services.us-south.bluemix.net:443
18 |
19 | #Watson Tone Analyzer
20 | watson.tone.url=https://gateway.watsonplatform.net/tone-analyzer-experimental/api
21 | watson.tone.password=XXXX
22 | watson.tone.username=XXXX
23 |
24 | #Checkpoint directory
25 | checkpointDir=XXXX
26 |
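27 | #Example checkpoint directory (a sketch based on the sample notebooks, which use an Object
28 | #Storage Swift container named "notebooks"; adjust to your own setup):
29 | #checkpointDir=swift://notebooks.spark/ssc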
--------------------------------------------------------------------------------
/streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/KafkaProducerTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package com.ibm.cds.spark.samples
18 |
19 | import java.io.ByteArrayInputStream
20 | import java.io.ByteArrayOutputStream
21 | import java.io.ObjectInputStream
22 | import java.io.ObjectOutputStream
23 | import java.util.concurrent.TimeUnit
24 | import scala.collection.JavaConversions.mapAsJavaMap
25 | import scala.collection.JavaConversions.seqAsJavaList
26 | import org.apache.kafka.clients.consumer.KafkaConsumer
27 | import org.apache.kafka.clients.producer.ProducerRecord
28 | import org.apache.kafka.common.serialization.Deserializer
29 | import org.apache.kafka.common.serialization.Serializer
30 | import org.apache.kafka.common.serialization.StringDeserializer
31 | import org.apache.log4j.Level
32 | import org.apache.log4j.Logger
33 | import com.ibm.cds.spark.samples.config.MessageHubConfig
34 | import twitter4j.StallWarning
35 | import twitter4j.Status
36 | import twitter4j.StatusDeletionNotice
37 | import twitter4j.StatusListener
38 | import twitter4j.TwitterStreamFactory
39 | import scala.util.parsing.json.JSON
40 | import java.io.InputStream
41 | import twitter4j.TwitterStream
42 | import com.ibm.cds.spark.samples.config.DemoConfig
43 | import org.apache.spark.Logging
44 |
45 |
46 | /**
47 | * @author dtaieb
48 | */
49 | object KafkaProducerTest extends Logging{
50 | //Very verbose, enable only if necessary
51 | //Logger.getLogger("org.apache.kafka").setLevel(Level.ALL)
52 | //Logger.getLogger("kafka").setLevel(Level.ALL)
53 |
54 | var twitterStream : TwitterStream = _;
55 |
56 | def main(args: Array[String]): Unit = {
57 | createTwitterStream();
58 | }
59 |
60 | def createTwitterStream(props: DemoConfig=null):TwitterStream = {
61 | if( twitterStream != null){
62 | println("Twitter Stream already running. Please call closeTwitterStream first");
63 | return twitterStream;
64 | }
65 | var kafkaProps:MessageHubConfig = null;
66 | if ( props == null ){
67 | kafkaProps = new MessageHubConfig
68 | }else{
69 | kafkaProps = props.cloneConfig
70 | }
71 | kafkaProps.setValueSerializer[StatusSerializer]
72 | kafkaProps.validateConfiguration("watson.tone.")
73 | kafkaProps.createTopicsIfNecessary( kafkaProps.getConfig(MessageHubConfig.KAFKA_TOPIC_TWEETS ) )
74 | val kafkaProducer = new org.apache.kafka.clients.producer.KafkaProducer[java.lang.String, Status]( kafkaProps.toImmutableMap() );
75 |
76 | twitterStream = new TwitterStreamFactory().getInstance();
77 | twitterStream.addListener( new StatusListener(){
78 | var lastSent:Long = 0;
79 | def onStatus(status: Status){
80 | if ( lastSent == 0 || System.currentTimeMillis() - lastSent > 200L){
81 | lastSent = System.currentTimeMillis()
82 | logInfo("Got a status " + status.getText )
83 | val producerRecord = new ProducerRecord(kafkaProps.getConfig(MessageHubConfig.KAFKA_TOPIC_TWEETS ), "tweet", status )
84 | try{
85 |             val metadata = kafkaProducer.send( producerRecord ).get(2000, TimeUnit.MILLISECONDS);  //wait at most 2 seconds for the broker ack
86 | logInfo("Successfully sent record: Topic: " + metadata.topic + " Offset: " + metadata.offset )
87 | }catch{
88 | case e:Throwable => e.printStackTrace
89 | }
90 | }
91 | }
92 | def onDeletionNotice( notice: StatusDeletionNotice){
93 |
94 | }
95 | def onTrackLimitationNotice( numLimitation : Int){
96 | println("Received track limitation notice from Twitter: " + numLimitation)
97 | }
98 |
99 | def onException( e: Exception){
100 | println("Unexpected error from twitterStream: " + e.getMessage);
101 | logError(e.getMessage, e)
102 | }
103 |
104 | def onScrubGeo(lat: Long, long: Long ){
105 |
106 | }
107 |
108 | def onStallWarning(warning: StallWarning ){
109 |
110 | }
111 | })
112 |
113 | //Start twitter stream sampling
114 | twitterStream.sample();
115 |
116 | println("Twitter stream started. Tweets will flow to MessageHub instance. Please call closeTwitterStream to stop the stream")
117 | twitterStream
118 | }
119 |
120 | def closeTwitterStream(){
121 | if ( twitterStream==null){
122 | println("Nothing to close. Twitter stream has not been started")
123 | }else{
124 | println("Stopping twitter stream");
125 | twitterStream.shutdown()
126 | twitterStream=null
127 | println("Twitter Stream stopped")
128 | }
129 | }
130 | }
131 |
132 | object KafkaConsumerTest {
133 | def main(args: Array[String]): Unit = {
134 | val kafkaProps = new MessageHubConfig
135 | kafkaProps.validateConfiguration("watson.tone.")
136 | val kafkaConsumer = new KafkaConsumer[java.lang.String, StatusAdapter](kafkaProps.toImmutableMap, new StringDeserializer(), new StatusDeserializer())
137 |
138 | kafkaConsumer.subscribe( List(kafkaProps.getConfig(MessageHubConfig.KAFKA_TOPIC_TWEETS )) )
139 | new Thread( new Runnable {
140 | def run(){
141 | while( true ){
142 | Thread.sleep( 1000L )
143 | val it = kafkaConsumer.poll(1000L).iterator
144 | while( it.hasNext() ){
145 | val record = it.next();
146 | println( record.value );
147 | }
148 | }
149 | }
150 | }).start
151 | }
152 | }
--------------------------------------------------------------------------------
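A minimal usage sketch for the two test objects above, assuming an interactive Spark shell (or notebook) that has this assembly jar on the classpath and a valid properties file referenced by DEMO_CONFIG_PATH; everything used below is defined in KafkaProducerTest.scala itself:

    import com.ibm.cds.spark.samples.{KafkaProducerTest, KafkaConsumerTest}

    //Start sampling the public Twitter stream and forward tweets to the configured MessageHub topic
    val stream = KafkaProducerTest.createTwitterStream()

    //Typically run from another shell/process: print the tweets read back from the Kafka topic
    KafkaConsumerTest.main(Array.empty[String])

    //Stop the Twitter sampling when done
    KafkaProducerTest.closeTwitterStream()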
/streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/MessageHubStreamingTwitter.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.ibm.cds.spark.samples
19 |
20 | import scala.BigDecimal
21 | import scala.collection.JavaConversions.mapAsJavaMap
22 | import scala.collection.immutable.Seq.canBuildFrom
23 | import scala.collection.mutable.ListBuffer
24 | import scala.collection.mutable.Map
25 | import scala.reflect.ClassTag
26 | import org.apache.kafka.clients.producer.ProducerRecord
27 | import org.apache.kafka.common.serialization.StringDeserializer
28 | import org.apache.kafka.common.serialization.StringSerializer
29 | import org.apache.spark.HashPartitioner
30 | import org.apache.spark.SparkConf
31 | import org.apache.spark.SparkContext
32 | import org.apache.spark.rdd.RDD
33 | import org.apache.spark.sql.Row
34 | import org.apache.spark.streaming.Duration
35 | import org.apache.spark.streaming.Seconds
36 | import org.apache.spark.streaming.StreamingContext
37 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
38 | import org.http4s.client.blaze.PooledHttp1Client
39 | import com.google.common.base.CharMatcher
40 | import com.ibm.cds.spark.samples.config.MessageHubConfig
41 | import com.ibm.cds.spark.samples.dstream.KafkaStreaming.KafkaStreamingContextAdapter
42 | import twitter4j.Status
43 | import org.apache.spark.streaming.scheduler.StreamingListenerBatchStarted
44 | import org.apache.spark.streaming.scheduler.StreamingListenerBatchCompleted
45 | import com.ibm.cds.spark.samples.config.DemoConfig
46 | import org.apache.log4j.Level
47 | import org.apache.log4j.Logger
48 | import org.apache.spark.streaming.dstream.DStream
49 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStopped
50 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverError
51 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStarted
52 | import org.apache.spark.broadcast.Broadcast
53 | import org.apache.spark.Logging
54 | import java.util.Arrays
55 |
56 | /**
57 | * @author dtaieb
58 | * Twitter+Watson sample app with MessageHub/Kafka
59 | */
60 | object MessageHubStreamingTwitter extends Logging{
61 |
62 | var ssc: StreamingContext = null
63 | val reuseCheckpoint = false;
64 |
65 | val queue = new scala.collection.mutable.Queue[(String, String)]
66 |
67 | final val KAFKA_TOPIC_TOP_HASHTAGS = "topHashTags"
68 | final val KAFKA_TOPIC_TONE_SCORES = "topHashTags.toneScores"
69 | final val KAFKA_TOPIC_TOTAL_TWEETS_PROCESSED = "total_tweets"
70 |
71 | //Logger.getLogger("org.apache.kafka").setLevel(Level.ALL)
72 | //Logger.getLogger("kafka").setLevel(Level.ALL)
73 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
74 |
75 | def main(args: Array[String]): Unit = {
76 | println("Printing arguments: ");
77 | args.foreach { println }
78 |
79 | if(args.length>0 && System.getProperty("DEMO_CONFIG_PATH") == null ){
80 |       //On the Spark service, input files are passed as parameters; if present, the first one is assumed to be the config file
81 | System.setProperty("DEMO_CONFIG_PATH", args(0))
82 | }
83 | val conf = new SparkConf().setAppName("Spark Streaming Twitter + Watson with MessageHub/Kafka Demo")
84 | val sc = new SparkContext(conf)
85 | startTwitterStreaming(sc);
86 |
87 | if(ssc!=null){
88 |       //When running as a standalone app, call awaitTermination so the JVM doesn't exit prematurely once all
89 |       //non-daemon threads have finished. Note: don't call awaitTermination from startTwitterStreaming itself,
90 |       //because that method can also be invoked from a Notebook
91 | ssc.awaitTermination()
92 | }
93 | }
94 |
95 | //Hold configuration key/value pairs
96 | lazy val kafkaProps = new MessageHubConfig
97 |
98 | //Wrapper api for Notebook access
99 | def getConfig():DemoConfig={
100 | kafkaProps
101 | }
102 |
103 | def startTwitterStreaming( sc: SparkContext, stopAfter: Duration = Seconds(0) ){
104 | if ( ssc != null ){
105 | println("Twitter Stream already running");
106 | return;
107 | }
108 |
109 | kafkaProps.setValueSerializer[StringSerializer];
110 |
111 | if ( !kafkaProps.validateConfiguration("twitter4j.oauth") ){
112 | return;
113 | }
114 |
115 | //Set the hadoop configuration if needed
116 | val checkpointDir = kafkaProps.getConfig( MessageHubConfig.CHECKPOINT_DIR_KEY );
117 | if ( checkpointDir.startsWith("swift") ){
118 | println("Setting hadoop configuration for swift container")
119 | kafkaProps.set_hadoop_config(sc)
120 | }
121 |
122 | //Make sure the topics are already created
123 | kafkaProps.createTopicsIfNecessary( KAFKA_TOPIC_TONE_SCORES, KAFKA_TOPIC_TOP_HASHTAGS, KAFKA_TOPIC_TOTAL_TWEETS_PROCESSED )
124 |
125 | val kafkaProducer = new org.apache.kafka.clients.producer.KafkaProducer[String, String]( kafkaProps.toImmutableMap );
126 |
127 | if ( !reuseCheckpoint ){
128 | createStreamingContextAndRunAnalytics(sc);
129 | }else{
130 | ssc = StreamingContext.getOrCreate(
131 | kafkaProps.getConfig( MessageHubConfig.CHECKPOINT_DIR_KEY ),
132 | () => {
133 | createStreamingContextAndRunAnalytics(sc);
134 | },
135 | sc.hadoopConfiguration,
136 | true
137 | );
138 | }
139 |
140 | ssc.addStreamingListener( new StreamingListener() )
141 |
142 | new Thread( new Runnable() {
143 | def run(){
144 | while(ssc!=null){
145 | while(!queue.isEmpty ){
146 | try{
147 | var task:(String,String) = null;
148 | queue.synchronized{
149 | task = queue.dequeue();
150 | }
151 | if ( task != null ){
152 | val producerRecord = new ProducerRecord[String,String](task._1, "tweet", task._2 )
153 | val metadata = kafkaProducer.send( producerRecord ).get;
154 | logInfo("Sent record " + metadata.offset() + " Topic " + task._1)
155 | }
156 | }catch{
157 | case e:Throwable => logError(e.getMessage, e)
158 | }
159 | }
160 | queue.synchronized{
161 | queue.wait();
162 | }
163 | }
164 | }
165 | },"Message Hub producer").start
166 |
167 | ssc.start
168 |
169 | println("Twitter stream started");
170 | println("Tweets are collected real-time and analyzed")
171 | println("To stop the streaming and start interacting with the data use: StreamingTwitter.stopTwitterStreaming")
172 |
173 | if ( !stopAfter.isZero ){
174 |       //Automatically stop the stream after the requested duration
175 | new Thread( new Runnable {
176 | def run(){
177 | Thread.sleep( stopAfter.milliseconds )
178 | stopTwitterStreaming
179 | }
180 | }).start
181 | }
182 | }
183 |
184 | def createStreamingContextAndRunAnalytics(sc:SparkContext):StreamingContext={
185 | //Broadcast the config to each worker node
186 | val broadcastVar = sc.broadcast( kafkaProps.toImmutableMap )
187 | ssc = new StreamingContext( sc, Seconds(5) )
188 | ssc.checkpoint(kafkaProps.getConfig( MessageHubConfig.CHECKPOINT_DIR_KEY ));
189 | val stream = ssc.createKafkaStream[String, StatusAdapter,StringDeserializer, StatusDeserializer](
190 | kafkaProps,
191 | List(kafkaProps.getConfig(MessageHubConfig.KAFKA_TOPIC_TWEETS ))
192 | );
193 | runAnalytics(sc, broadcastVar, stream)
194 | ssc;
195 | }
196 |
197 | def runAnalytics(sc:SparkContext, broadcastVar: Broadcast[scala.collection.immutable.Map[String,String]], stream:DStream[(String,StatusAdapter)]){
198 | val keys = broadcastVar.value.get("tweets.key").get.split(",");
199 | val tweets = stream.map( t => t._2)
200 | .filter { status =>
201 | status.userLang.startsWith("en") && CharMatcher.ASCII.matchesAllOf(status.text) && ( keys.isEmpty || keys.exists{status.text.contains(_)})
202 | }
203 |
204 | val rowTweets = tweets.map(status=> {
205 | lazy val client = PooledHttp1Client()
206 | val sentiment = ToneAnalyzer.computeSentiment( client, status, broadcastVar )
207 | var scoreMap : Map[String, Double] = Map()
208 | if ( sentiment != null ){
209 | for( toneCategory <- Option(sentiment.tone_categories).getOrElse( Seq() )){
210 | for ( tone <- Option( toneCategory.tones ).getOrElse( Seq() ) ){
211 | scoreMap.put( tone.tone_id, (BigDecimal(tone.score).setScale(2, BigDecimal.RoundingMode.HALF_UP).toDouble) * 100.0 )
212 | }
213 | }
214 | }
215 |
216 | EnrichedTweet(
217 | status.userName,
218 | status.userId,
219 | status.createdAt,
220 | status.userLang,
221 | status.text,
222 | status.long,
223 | status.lat,
224 | scoreMap
225 | )
226 | })
227 |
228 | val delimTagTone = "-%!"
229 | val delimToneScore = ":%@"
230 | val statsStream = rowTweets.map { eTweet => ("total_tweets", 1L) }
231 | .reduceByKey( _+_ )
232 | .updateStateByKey( (a:Seq[Long], b:Option[Long] ) => {
233 | var runningCount=b.getOrElse(0L)
234 | a.foreach { v => runningCount=runningCount+v }
235 | Some(runningCount)
236 | })
237 | statsStream.foreachRDD( rdd =>{
238 | queue.synchronized{
239 | queue+=((KAFKA_TOPIC_TOTAL_TWEETS_PROCESSED, TweetsMetricJsonSerializer.serialize(rdd.collect())))
240 | try{
241 | queue.notify
242 | }catch{
243 | case e:Throwable=>logError(e.getMessage, e)
244 | }
245 | }
246 | })
247 |
248 | val metricsStream = rowTweets.flatMap { eTweet => {
249 | val retList = ListBuffer[String]()
250 | for ( tag <- eTweet.text.split("\\s+") ){
251 | if ( tag.startsWith( "#") && tag.length > 1 ){
252 | for ( tone <- Option( eTweet.sentimentScores.keys ).getOrElse( Seq() ) ){
253 | retList += (tag + delimTagTone + tone + delimToneScore + eTweet.sentimentScores.getOrElse( tone, 0.0))
254 | }
255 | }
256 | }
257 | retList.toList
258 | }}
259 | .map { fullTag => {
260 | val split = fullTag.split(delimToneScore);
261 | (split(0), split(1).toFloat)
262 | }}
263 | .combineByKey(
264 | (x:Float) => (x,1),
265 | (x:(Float,Int), y:Float) => (x._1 + y, x._2+1),
266 | (x:(Float,Int),y:(Float,Int)) => (x._1 + y._1, x._2 + y._2),
267 | new HashPartitioner(sc.defaultParallelism)
268 | )
269 | .map[(String,(Long/*count*/, List[(String, Double)]))]{ t => {
270 | val key = t._1;
271 | val ab = t._2;
272 | val split = key.split(delimTagTone)
273 | (split(0), (ab._2, List((split(1), BigDecimal(ab._1/ab._2).setScale(2, BigDecimal.RoundingMode.HALF_UP).toDouble ))))
274 | }}
275 | .reduceByKey( (t,u) => (t._1+u._1, (t._2 ::: u._2).sortWith( (l,r) => l._1.compareTo( r._1 ) < 0 )))
276 | .mapValues( (item:(Long, List[(String,Double)])) => {
277 | val unzip = item._2.unzip
278 | (item._1/(item._2.size), unzip._1, unzip._2)
279 | })
280 | .updateStateByKey( (a:scala.collection.Seq[(Long, List[String], List[Double])], b: Option[(Long, List[String], List[Double])]) => {
281 | val safeB = b.getOrElse( (0L, List(), List() ) )
282 | var listTones = safeB._2
283 | var listScores = safeB._3
284 | var count = safeB._1
285 | for( item <- a ){
286 | count += item._1
287 | listScores = listScores.zipAll( item._3, 0.0, 0.0).map{ case(a,b)=>(a+b)/2 }.toList
288 | listTones = item._2
289 | }
290 |
291 | Some( (count, listTones, listScores) )
292 | })
293 |
294 | metricsStream.print
295 |
296 | metricsStream.foreachRDD( rdd =>{
297 | val topHashTags = rdd.sortBy( f => f._2._1, false ).take(5)
298 | if ( !topHashTags.isEmpty){
299 | queue.synchronized{
300 | queue += ((KAFKA_TOPIC_TOP_HASHTAGS, TweetsMetricJsonSerializer.serialize(topHashTags.map( f => (f._1, f._2._1 )))))
301 | queue += ((KAFKA_TOPIC_TONE_SCORES, ToneScoreJsonSerializer.serialize(topHashTags)))
302 | try{
303 | queue.notify
304 | }catch{
305 | case e:Throwable=>logError(e.getMessage, e)
306 | }
307 | }
308 | }
309 | })
310 | }
311 |
312 | def stopTwitterStreaming(){
313 | if ( ssc == null){
314 | println("No Twitter stream to stop");
315 | return;
316 | }
317 |
318 | println("Stopping Twitter stream. Please wait this may take a while")
319 | ssc.stop(stopSparkContext = false, stopGracefully = true)
320 | ssc = null
321 | println("Twitter stream stopped");
322 | }
323 | }
324 |
325 | object TweetsMetricJsonSerializer extends Logging{
326 | def serialize(value: Seq[(String,Long)] ): String = {
327 | val sb = new StringBuilder("[")
328 | var comma = ""
329 | value.foreach( item => {
330 | sb.append( comma + "[\"" + item._1.replaceAll("\"", "") + "\"," + item._2 + "]")
331 | comma=","
332 | })
333 | sb.append("]")
334 | logInfo("Serialized json: " + sb)
335 | sb.toString()
336 | }
337 | }
338 |
339 | object ToneScoreJsonSerializer extends Logging{
340 | def serializeList[U:ClassTag]( label: String, value: List[U] ):String = {
341 | val sb = new StringBuilder("[\"" + label.replaceAll("\"", "") + "\"")
342 | value.foreach { item => {
343 | if ( item.isInstanceOf[String] ) {
344 | val s = ",\"" + item.toString().replaceAll("\"", "") + "\"";
345 | sb.append( s.replaceAll("\"\"", "\"") )
346 | }else if ( item.isInstanceOf[Double] ){
347 | sb.append("," + item )
348 | }
349 | }}
350 | sb.append("]")
351 | sb.toString
352 | }
353 | def serialize(value:Seq[(String, (Long, List[String], List[Double]))]):String={
354 | val sb = new StringBuilder("[")
355 | var comma = ""
356 | var appendToneData = true;
357 | value.foreach( item => {
358 | if ( appendToneData ){
359 | sb.append( comma + serializeList( "x", item._2._2 ) )
360 | appendToneData = false
361 | comma = ","
362 | }
363 | sb.append( comma + serializeList( item._1, item._2._3 ) )
364 | comma=","
365 | })
366 | sb.append("]")
367 | logInfo("Serialized size: " + value.size + ". Tone json: " + sb)
368 | sb.toString()
369 | }
370 | }
--------------------------------------------------------------------------------
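The entry points above are also designed to be driven interactively. A short sketch, assuming a notebook where sc is an existing SparkContext and the MessageHub/Watson credentials are supplied via DEMO_CONFIG_PATH or setConfig (key names follow sampleConfig/sampleconf.properties):

    import com.ibm.cds.spark.samples.MessageHubStreamingTwitter
    import org.apache.spark.streaming.Seconds

    //Optionally override configuration before starting
    MessageHubStreamingTwitter.getConfig().setConfig("tweets.key", "spark")

    //Start the Kafka-backed streaming job and stop it automatically after 60 seconds
    MessageHubStreamingTwitter.startTwitterStreaming(sc, Seconds(60))

    //...or stop it manually at any time
    MessageHubStreamingTwitter.stopTwitterStreaming()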
/streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/PixiedustStreamingTwitter.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.ibm.cds.spark.samples
19 |
20 | import scala.collection.mutable._
21 | import com.ibm.pixiedust.ChannelReceiver
22 | import org.apache.spark.Logging
23 | import org.apache.log4j.Logger
24 | import org.apache.log4j.Level
25 | import org.apache.spark.SparkContext
26 | import org.apache.spark.streaming.StreamingContext
27 | import org.apache.spark.rdd.RDD
28 | import org.apache.spark.sql.types.StructType
29 | import org.apache.spark.sql.Row
30 | import com.ibm.cds.spark.samples.config.DemoConfig
31 | import org.apache.spark.streaming.Seconds
32 | import org.apache.spark.sql.types.IntegerType
33 | import org.apache.spark.sql.types.DoubleType
34 | import org.http4s.client.blaze.PooledHttp1Client
35 | import org.apache.spark.sql.types.StructField
36 | import org.apache.spark.sql.types.StringType
37 | import com.google.common.base.CharMatcher
38 | import com.ibm.couchdb.CouchDb
39 | import com.ibm.couchdb.TypeMapping
40 | import com.ibm.couchdb.CouchDbApi
41 | import org.apache.spark.sql.SQLContext
42 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverError
43 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStopped
44 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStarted
45 | import org.apache.spark.streaming.scheduler.StreamingListenerBatchCompleted
46 | import org.apache.spark.streaming.scheduler.StreamingListenerBatchStarted
47 | import org.apache.spark.SparkConf
48 | import org.apache.spark.streaming.dstream.DStream
49 | import org.apache.spark.broadcast.Broadcast
50 | import org.apache.spark.HashPartitioner
51 | import twitter4j.Status
52 | import org.codehaus.jettison.json.JSONObject
53 | import org.apache.spark.AccumulableParam
54 | import org.apache.spark.streaming.StreamingContextState
55 | import org.apache.spark.sql.DataFrame
56 |
57 | /** @author dtaieb
58 | * Twitter+Watson sentiment analysis app powered by Pixiedust
59 | */
60 | object PixiedustStreamingTwitter extends ChannelReceiver() with Logging{
61 | var ssc: StreamingContext = null
62 | var workingRDD: RDD[Row] = null
63 | //Hold configuration key/value pairs
64 | lazy val config = new DemoConfig
65 | lazy val logger: Logger = Logger.getLogger( "com.ibm.cds.spark.samples.PixiedustStreamingTwitter" )
66 |
67 | val BEGINSTREAM = "@BEGINSTREAM@"
68 | val ENDSTREAM = "@ENDSTREAM@"
69 |
70 | def sendLog(s:String){
71 | send("log", s)
72 | }
73 |
74 | //Wrapper api for Notebook access
75 | def setConfig(key:String, value:String){
76 | config.setConfig(key, value)
77 | }
78 |
79 | //main method invoked when running as a standalone Spark Application
80 | def main(args: Array[String]) {
81 | val conf = new SparkConf().setAppName("Pixiedust Spark Streaming Twitter Demo")
82 | val sc = new SparkContext(conf)
83 | startStreaming();
84 | }
85 |
86 | def createTwitterDataFrames(sqlContext: SQLContext) : DataFrame = {
87 | if ( workingRDD == null || workingRDD.count <= 0 ){
88 | println("No data receive. Please start the Twitter stream again to collect data")
89 | return null
90 | }
91 |
92 | sqlContext.createDataFrame( workingRDD, schemaTweets )
93 | }
94 |
95 | class PixiedustStreamingListener extends org.apache.spark.streaming.scheduler.StreamingListener {
96 | override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted) {
97 | sendLog("Receiver Started: " + receiverStarted.receiverInfo.name )
98 | //Signal the frontend that we started streaming
99 | sendLog(BEGINSTREAM)
100 | }
101 |
102 | override def onReceiverError(receiverError: StreamingListenerReceiverError) {
103 | sendLog("Receiver Error: " + receiverError.receiverInfo.lastError)
104 | }
105 |
106 | override def onReceiverStopped(receiverStopped: StreamingListenerReceiverStopped) {
107 | sendLog("Receiver Stopped: " + receiverStopped.receiverInfo.name)
108 | sendLog("Reason: " + receiverStopped.receiverInfo.lastError + " : " + receiverStopped.receiverInfo.lastErrorMessage)
109 | //signal the front end that we're done streaming
110 | sendLog(ENDSTREAM)
111 | }
112 |
113 | override def onBatchStarted(batchStarted: StreamingListenerBatchStarted){
114 | sendLog("Batch started with " + batchStarted.batchInfo.numRecords + " records")
115 | }
116 |
117 | override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted){
118 | sendLog("Batch completed with " + batchCompleted.batchInfo.numRecords + " records");
119 | }
120 | }
121 |
122 | val reuseCheckpoint = false;
123 |
124 | def startStreaming(){
125 | val sc = SparkContext.getOrCreate
126 | sendLog("Starting twitter stream");
127 | if ( ssc != null ){
128 | sendLog("Twitter Stream already running");
129 | sendLog("Please use stopTwitterStreaming() first and try again");
130 | return;
131 | }
132 |
133 | if ( !config.validateConfiguration() ){
134 | sendLog("Unable to validate config")
135 | sendLog(ENDSTREAM)
136 | return;
137 | }
138 |
139 | Logger.getLogger("org.apache.spark").setLevel(Level.OFF)
140 |
141 | //Set the hadoop configuration if needed
142 | val checkpointDir = config.getConfig( DemoConfig.CHECKPOINT_DIR_KEY );
143 | if ( checkpointDir.startsWith("swift") ){
144 | println("Setting hadoop configuration for swift container")
145 | config.set_hadoop_config(sc)
146 | }
147 |
148 | workingRDD = sc.emptyRDD
149 |
150 | if ( !reuseCheckpoint ){
151 | ssc = createStreamingContextAndRunAnalytics(sc);
152 | }else{
153 | ssc = StreamingContext.getOrCreate(
154 | config.getConfig( DemoConfig.CHECKPOINT_DIR_KEY ),
155 | () => {
156 | createStreamingContextAndRunAnalytics(sc);
157 | },
158 | sc.hadoopConfiguration,
159 | true
160 | );
161 | }
162 |
163 | ssc.addStreamingListener( new PixiedustStreamingListener )
164 |
165 | ssc.start()
166 |
167 | sendLog("Twitter stream started");
168 | }
169 |
170 | def stopStreaming(){
171 | if ( ssc == null){
172 | sendLog("No Twitter stream to stop");
173 | return;
174 | }
175 |
176 | sendLog("Stopping Twitter stream. Please wait this may take a while")
177 | ssc.stop(stopSparkContext = false, stopGracefully = false)
178 | ssc = null
179 | sendLog("Twitter stream stopped");
180 | }
181 |
182 | def createStreamingContextAndRunAnalytics(sc:SparkContext):StreamingContext={
183 | //Broadcast the config to each worker node
184 | val broadcastVar = sc.broadcast( config.toImmutableMap )
185 | ssc = new StreamingContext( sc, Seconds(5) )
186 | ssc.checkpoint(config.getConfig( DemoConfig.CHECKPOINT_DIR_KEY ));
187 | val stream = org.apache.spark.streaming.twitter.TwitterUtils.createStream( ssc, None );
188 | runAnalytics(sc, broadcastVar, stream)
189 | ssc;
190 | }
191 |
192 | def runAnalytics(sc:SparkContext, broadcastVar: Broadcast[scala.collection.immutable.Map[String,String]], stream:DStream[Status]){
193 | val keys = broadcastVar.value.get("tweets.key").get.split(",");
194 | val tweets = stream.filter { status =>
195 | Option(status.getUser).flatMap[String] {
196 | u => Option(u.getLang)
197 | }.getOrElse("").startsWith("en") && CharMatcher.ASCII.matchesAllOf(status.getText) && ( keys.isEmpty || keys.exists{key => status.getText.toLowerCase.contains(key.toLowerCase)})
198 | }
199 |
200 | val tweetAccumulator = sc.accumulable(Array[(String,String)]())(TweetsAccumulatorParam)
201 |
202 | new Thread( new Runnable() {
203 | def run(){
204 | try{
205 | while(ssc!=null && ssc.getState() != StreamingContextState.STOPPED ){
206 | val accuValue = tweetAccumulator.value
207 | if ( accuValue.size > 0 ){
208 | tweetAccumulator.setValue(Array[(String,String)]() )
209 | accuValue.foreach( v => send(v._1, v._2) )
210 | }
211 | Thread.sleep( 1000L )
212 | }
213 | System.out.println("Stopping the accumulator thread")
214 | }catch{
215 | case e:Throwable => e.printStackTrace()
216 | }
217 | }
218 | },"Accumulator").start
219 |
220 | val rowTweets = tweets.map(status=> {
221 | lazy val client = PooledHttp1Client()
222 | val sentiment = ToneAnalyzer.computeSentiment( client, status, broadcastVar )
223 | var scoreMap : Map[String, Double] = Map()
224 | if ( sentiment != null ){
225 | for( toneCategory <- Option(sentiment.tone_categories).getOrElse( Seq() )){
226 | for ( tone <- Option( toneCategory.tones ).getOrElse( Seq() ) ){
227 | scoreMap.put( tone.tone_id, (BigDecimal(tone.score).setScale(2, BigDecimal.RoundingMode.HALF_UP).toDouble) * 100.0 )
228 | }
229 | }
230 | }
231 |
232 | var jsonSentiment="{";
233 | scoreMap.foreach( t => jsonSentiment = jsonSentiment + (if (jsonSentiment.length() == 1) "" else ",") + "\"" + t._1 + "\":" + t._2)
234 | jsonSentiment += "}";
235 | val sendValue:String = "{\"author\": \"" +
236 | status.getUser.getName +
237 | "\", \"userid\":\"" + status.getUser.getScreenName +
238 | "\", \"pic\":\"" + status.getUser.getOriginalProfileImageURLHttps +
239 | "\",\"text\":" + JSONObject.quote( status.getText ) + ", \"sentiment\": " + jsonSentiment + "}"
240 |
241 | tweetAccumulator+=("tweets",sendValue)
242 |
243 | EnrichedTweet(
244 | status.getUser.getName,
245 | status.getUser.getScreenName,
246 | status.getCreatedAt.toString,
247 | status.getUser.getLang,
248 | status.getText,
249 | Option(status.getGeoLocation).map{ _.getLatitude}.getOrElse(0.0),
250 | Option(status.getGeoLocation).map{ _.getLongitude}.getOrElse(0.0),
251 | scoreMap
252 | )
253 | })
254 |
255 | rowTweets.foreachRDD( rdd => {
256 | if( rdd.count > 0 ){
257 | workingRDD = SparkContext.getOrCreate().parallelize( rdd.map( t => t.toRow() ).collect()).union( workingRDD )
258 | }
259 | })
260 |
261 | val delimTagTone = "-%!"
262 | val delimToneScore = ":%@"
263 | val statsStream = rowTweets.map { eTweet => ("total_tweets", 1L) }
264 | .reduceByKey( _+_ )
265 | .updateStateByKey( (a:scala.collection.Seq[Long], b:Option[Long] ) => {
266 | var runningCount=b.getOrElse(0L)
267 | a.foreach { v => runningCount=runningCount+v }
268 | Some(runningCount)
269 | })
270 | statsStream.foreachRDD( rdd =>{
271 | send("TweetProcessed", TweetsMetricJsonSerializer.serialize(rdd.collect()))
272 | })
273 |
274 | val metricsStream = rowTweets.flatMap { eTweet => {
275 | val retList = ListBuffer[String]()
276 | for ( tag <- eTweet.text.split("\\s+") ){
277 | if ( tag.startsWith( "#") && tag.length > 1 ){
278 | for ( tone <- Option( eTweet.sentimentScores.keys ).getOrElse( Seq() ) ){
279 | retList += (tag + delimTagTone + tone + delimToneScore + eTweet.sentimentScores.getOrElse( tone, 0.0))
280 | }
281 | }
282 | }
283 | retList.toList
284 | }}
285 | .map { fullTag => {
286 | val split = fullTag.split(delimToneScore);
287 | (split(0), split(1).toFloat)
288 | }}
289 | .combineByKey(
290 | (x:Float) => (x,1),
291 | (x:(Float,Int), y:Float) => (x._1 + y, x._2+1),
292 | (x:(Float,Int),y:(Float,Int)) => (x._1 + y._1, x._2 + y._2),
293 | new HashPartitioner(sc.defaultParallelism)
294 | )
295 | .map[(String,(Long/*count*/, List[(String, Double)]))]{ t => {
296 | val key = t._1;
297 | val ab = t._2;
298 | val split = key.split(delimTagTone)
299 | (split(0), (ab._2, List((split(1), BigDecimal(ab._1/ab._2).setScale(2, BigDecimal.RoundingMode.HALF_UP).toDouble ))))
300 | }}
301 | .reduceByKey( (t,u) => (t._1+u._1, (t._2 ::: u._2).sortWith( (l,r) => l._1.compareTo( r._1 ) < 0 )))
302 | .mapValues( (item:(Long, List[(String,Double)])) => {
303 | val unzip = item._2.unzip
304 | (item._1/(item._2.size), unzip._1, unzip._2)
305 | })
306 | .updateStateByKey( (a:scala.collection.Seq[(Long, List[String], List[Double])], b: Option[(Long, List[String], List[Double])]) => {
307 | val safeB = b.getOrElse( (0L, List(), List() ) )
308 | var listTones = safeB._2
309 | var listScores = safeB._3
310 | var count = safeB._1
311 | for( item <- a ){
312 | count += item._1
313 | listScores = listScores.zipAll( item._3, 0.0, 0.0).map{ case(a,b)=>(a+b)/2 }.toList
314 | listTones = item._2
315 | }
316 |
317 | Some( (count, listTones, listScores) )
318 | })
319 |
320 | metricsStream.print
321 |
322 | metricsStream.foreachRDD( rdd =>{
323 | val topHashTags = rdd.sortBy( f => f._2._1, false ).take(5)
324 | if ( !topHashTags.isEmpty){
325 | tweetAccumulator+=("topHashtags", TweetsMetricJsonSerializer.serialize(topHashTags.map( f => (f._1, f._2._1 ))))
326 | tweetAccumulator+=("toneScores", ToneScoreJsonSerializer.serialize(topHashTags))
327 | }
328 | })
329 |
330 | }
331 | }
332 |
333 | object TweetsAccumulatorParam extends AccumulableParam[Array[(String,String)], (String,String)]{
334 | def zero(initialValue:Array[(String,String)]):Array[(String,String)] = {
335 | Array()
336 | }
337 |
338 | def addInPlace(s1:Array[(String,String)], s2:Array[(String,String)]):Array[(String,String)] = {
339 | s1 ++ s2
340 | }
341 |
342 | def addAccumulator(current:Array[(String,String)], s:(String,String)):Array[(String,String)] = {
343 | current :+ s
344 | }
345 | }
--------------------------------------------------------------------------------
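A short sketch of how the Pixiedust variant above might be driven from a notebook cell; sqlContext is assumed to already exist, and the configuration keys are the ones registered by DemoConfig:

    import com.ibm.cds.spark.samples.PixiedustStreamingTwitter

    //Supply credentials (placeholders shown)
    PixiedustStreamingTwitter.setConfig("twitter4j.oauth.consumerKey", "XXXX")
    PixiedustStreamingTwitter.setConfig("watson.tone.username", "XXXX")

    //Start streaming; progress is reported to the frontend through sendLog
    PixiedustStreamingTwitter.startStreaming()

    //Later: stop the stream and materialize the collected tweets as a DataFrame
    PixiedustStreamingTwitter.stopStreaming()
    val df = PixiedustStreamingTwitter.createTwitterDataFrames(sqlContext)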
/streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/StatusSerializer.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.ibm.cds.spark.samples
19 |
20 | import java.io.ObjectOutputStream
21 | import java.io.ByteArrayOutputStream
22 | import org.apache.kafka.common.serialization.Serializer
23 | import twitter4j.Status
24 |
25 | /**
26 | * @author dtaieb
27 | */
28 | class StatusSerializer extends Serializer[Status]{
29 | def configure( props: java.util.Map[String, _], isKey: Boolean) = {
30 |
31 | }
32 |
33 | def close(){
34 |
35 | }
36 |
37 | def serialize(topic: String, value: Status ): Array[Byte] = {
38 | val baos = new ByteArrayOutputStream(1024)
39 | val oos = new ObjectOutputStream(baos)
40 | oos.writeObject( value )
41 | oos.close
42 | baos.toByteArray()
43 | }
44 | }
--------------------------------------------------------------------------------
/streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/StreamingListener.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.ibm.cds.spark.samples
19 |
20 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverError
21 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStopped
22 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStarted
23 | import org.apache.spark.streaming.scheduler.StreamingListenerBatchCompleted
24 | import org.apache.spark.streaming.scheduler.StreamingListenerBatchStarted
25 |
26 | /**
27 | * @author dtaieb
28 | */
29 | class StreamingListener extends org.apache.spark.streaming.scheduler.StreamingListener {
30 | override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted) {
31 | println("Receiver Started: " + receiverStarted.receiverInfo.name )
32 | }
33 |
34 | override def onReceiverError(receiverError: StreamingListenerReceiverError) {
35 | println("Receiver Error: " + receiverError.receiverInfo.lastError)
36 | }
37 |
38 | override def onReceiverStopped(receiverStopped: StreamingListenerReceiverStopped) {
39 | println("Receiver Stopped: " + receiverStopped.receiverInfo.name)
40 | println("Reason: " + receiverStopped.receiverInfo.lastError + " : " + receiverStopped.receiverInfo.lastErrorMessage)
41 | }
42 |
43 | override def onBatchStarted(batchStarted: StreamingListenerBatchStarted){
44 | println("Batch started with " + batchStarted.batchInfo.numRecords + " records")
45 | }
46 |
47 | override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted){
48 | println("Batch completed with " + batchCompleted.batchInfo.numRecords + " records");
49 | }
50 | }
--------------------------------------------------------------------------------
/streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/StreamingTwitter.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.ibm.cds.spark.samples
19 |
20 | import scala.collection.mutable._
21 | import org.apache.commons.lang3.StringEscapeUtils
22 | import org.apache.log4j.Level
23 | import org.apache.log4j.Logger
24 | import org.apache.spark.Accumulator
25 | import org.apache.spark.SparkConf
26 | import org.apache.spark.SparkContext
27 | import org.apache.spark.streaming._
28 | import org.apache.spark.streaming.dstream._
29 | import org.http4s._
30 | import org.http4s.Http4s._
31 | import org.http4s.Status._
32 | import org.http4s.client.Client
33 | import org.http4s.client.blaze.PooledHttp1Client
34 | import org.http4s.headers.Authorization
35 | import com.ibm.couchdb._
36 | import scalaz._
37 | import scalaz.concurrent.Task
38 | import twitter4j.Status
39 | import org.apache.spark.sql.SQLContext
40 | import org.apache.spark.sql.Row
41 | import org.apache.spark.sql.types._
42 | import org.apache.spark.sql.DataFrame
43 | import org.apache.spark.rdd.RDD
44 | import org.apache.spark.rdd.EmptyRDD
45 | import com.google.common.base.CharMatcher
46 | import scala.math.BigDecimal
47 | import com.ibm.cds.spark.samples.config.DemoConfig
48 | import com.ibm.cds.spark.samples.ToneAnalyzer.ToneCategory
49 | import org.apache.spark.Logging
50 |
51 |
52 |
53 |
54 | /**
55 | * @author dtaieb
56 | */
57 | object StreamingTwitter extends Logging{
58 | var ssc: StreamingContext = null
59 | var sqlContext: SQLContext = null
60 | var workingRDD: RDD[Row] = null
61 | var schemaTweets : StructType = null
62 | val logger: Logger = Logger.getLogger( "com.ibm.cds.spark.samples.StreamingTwitter" )
63 |
64 | //main method invoked when running as a standalone Spark Application
65 | def main(args: Array[String]) {
66 |
67 | val conf = new SparkConf().setAppName("Spark Streaming Twitter Demo")
68 | val sc = new SparkContext(conf)
69 | startTwitterStreaming(sc, Seconds(10));
70 | }
71 |
72 | //Hold configuration key/value pairs
73 | val config = new DemoConfig
74 |
75 | //Wrapper api for Notebook access
76 | def setConfig(key:String, value:String){
77 | config.setConfig(key, value)
78 | }
79 |
80 | def startTwitterStreaming( sc: SparkContext, stopAfter: Duration = Seconds(0) ){
81 | println("Starting twitter stream");
82 | if ( ssc != null ){
83 | println("Twitter Stream already running");
84 | println("Please use stopTwitterStreaming() first and try again");
85 | return;
86 | }
87 |
88 | if ( !config.validateConfiguration(DemoConfig.CHECKPOINT_DIR_KEY) ){
89 | println("Unable to validate config")
90 | return;
91 | }
92 |
93 | Logger.getLogger("org.apache.spark").setLevel(Level.OFF)
94 |
95 | workingRDD = sc.emptyRDD
96 | //Broadcast the config to each worker node
97 | val broadcastVar = sc.broadcast(config.toImmutableMap)
98 |
99 | var canStopTwitterStream = true
100 | var batchesProcessed=0
101 |
102 | ssc = new StreamingContext( sc, Seconds(5) )
103 |
104 | ssc.addStreamingListener( new StreamingListener )
105 |
106 | try{
107 | sqlContext = new SQLContext(sc)
108 | val keys = config.getConfig("tweets.key").split(",");
109 | val stream = org.apache.spark.streaming.twitter.TwitterUtils.createStream( ssc, None );
110 |
111 | if ( schemaTweets == null ){
112 | val schemaString = "author userid date lang text lat:double long:double"
113 | schemaTweets =
114 | StructType(
115 | schemaString.split(" ").map(
116 | fieldName => {
117 | val ar = fieldName.split(":")
118 | StructField(
119 | ar.lift(0).get,
120 | ar.lift(1).getOrElse("string") match{
121 | case "int" => IntegerType
122 | case "double" => DoubleType
123 | case _ => StringType
124 | },
125 | true)
126 | }
127 | ).union(
128 | ToneAnalyzer.sentimentFactors.map( f => StructField( f._1, DoubleType )).toArray[StructField]
129 | )
130 | )
131 | }
132 | val tweets = stream.filter { status =>
133 | Option(status.getUser).flatMap[String] {
134 | u => Option(u.getLang)
135 | }.getOrElse("").startsWith("en") && CharMatcher.ASCII.matchesAllOf(status.getText) && ( keys.isEmpty || keys.exists{status.getText.contains(_)})
136 | }
137 |
138 | lazy val client = PooledHttp1Client()
139 | val rowTweets = tweets.map(status=> {
140 | val sentiment = ToneAnalyzer.computeSentiment( client, status, broadcastVar )
141 |
142 | var colValues = Array[Any](
143 | status.getUser.getName, //author
144 | status.getUser.getScreenName, //Userid
145 | status.getCreatedAt.toString, //date
146 | status.getUser.getLang, //Lang
147 | status.getText, //text
148 | Option(status.getGeoLocation).map{ _.getLatitude}.getOrElse(0.0), //lat
149 | Option(status.getGeoLocation).map{_.getLongitude}.getOrElse(0.0) //long
150 | //exception
151 | )
152 |
153 | var scoreMap : Map[String, Double] = Map()
154 | if ( sentiment != null ){
155 | for( toneCategory <- Option(sentiment.tone_categories).getOrElse( Seq() )){
156 | for ( tone <- Option( toneCategory.tones ).getOrElse( Seq() ) ){
157 | scoreMap.put( tone.tone_id, tone.score )
158 | }
159 | }
160 | }
161 |
162 | colValues = colValues ++ ToneAnalyzer.sentimentFactors.map { f => (BigDecimal(scoreMap.get(f._2).getOrElse(0.0)).setScale(2, BigDecimal.RoundingMode.HALF_UP).toDouble) * 100.0 }
163 | //Return [Row, (sentiment, status)]
164 | (Row(colValues.toArray:_*),(sentiment, status))
165 | })
166 |
167 | rowTweets.foreachRDD( rdd => {
168 | if(batchesProcessed==0){
169 | canStopTwitterStream=false
170 | }
171 | try{
172 | if( rdd.count > 0 ){
173 | batchesProcessed += 1
174 | workingRDD = sc.parallelize( rdd.map( t => t._1 ).collect()).union( workingRDD )
175 |
176 | val saveToCloudant = broadcastVar.value.get("cloudant.save").get.toBoolean
177 | if ( saveToCloudant ){
178 | rdd.foreachPartition { iterator =>
179 | var db: CouchDbApi = null;
180 | val couch = CouchDb( broadcastVar.value.get("cloudant.hostName").get,
181 | broadcastVar.value.get("cloudant.port").get.toInt,
182 | broadcastVar.value.get("cloudant.https").get.toBoolean,
183 | broadcastVar.value.get("cloudant.username").get,
184 | broadcastVar.value.get("cloudant.password").get
185 | );
186 | val dbName = "spark-streaming-twitter"
187 | couch.dbs.get(dbName).attemptRun match{
188 | case -\/(e) => logger.trace("Couch Database does not exist, creating it now"); couch.dbs.create(dbName).run
189 | case \/-(a) => println("Connected to cloudant db " + dbName )
190 | }
191 | val typeMapping = TypeMapping(classOf[ToneAnalyzer.Tweet] -> "Tweet")
192 | db = couch.db(dbName, typeMapping)
193 | iterator.foreach( t => {
194 | saveTweetToCloudant( client, db, t._2._2, t._2._1 )
195 | }
196 | )
197 | }
198 | }
199 | }
200 | }catch{
201 | case e: InterruptedException=>//Ignore
202 | case e: Exception => logError(e.getMessage, e )
203 | }finally{
204 | canStopTwitterStream = true
205 | }
206 | })
207 |
208 | }catch{
209 | case e : Exception => logError(e.getMessage, e )
210 | return
211 | }
212 | ssc.start()
213 |
214 | println("Twitter stream started");
215 | println("Tweets are collected real-time and analyzed")
216 | println("To stop the streaming and start interacting with the data use: StreamingTwitter.stopTwitterStreaming")
217 |
218 | if ( !stopAfter.isZero ){
219 |       //Automatically stop the stream after the requested duration
220 | new Thread( new Runnable {
221 | var displayMessage = true;
222 | def run(){
223 | Thread.sleep( stopAfter.milliseconds )
224 | var loop = true
225 | while(loop){
226 | if (canStopTwitterStream){
227 | stopTwitterStreaming
228 | loop = false
229 | }else{
230 | if ( displayMessage ){
231 | displayMessage = false
232 | println("Received directive to stop twitter Stream: Waiting for already received tweets to be processed...")
233 | }
234 | Thread.sleep(5000L)
235 | }
236 | }
237 | }
238 | }).start
239 | }
240 | }
241 |
242 | def saveTweetToCloudant(client: Client, db: CouchDbApi, status:Status, sentiment: ToneAnalyzer.Sentiment) : Status = {
243 | if ( db != null){
244 | logger.trace("Creating new Tweet in Couch Database " + status.getText())
245 | val task:Task[Res.DocOk] = db.docs.create(
246 | ToneAnalyzer.Tweet(
247 | status.getUser().getName,
248 | status.getCreatedAt().toString(),
249 | status.getUser().getLang(),
250 | status.getText(),
251 | ToneAnalyzer.Geo(
252 | Option(status.getGeoLocation).map{ _.getLatitude}.getOrElse(0.0),
253 | Option(status.getGeoLocation).map{_.getLongitude}.getOrElse(0.0)
254 | ),
255 | sentiment
256 | )
257 | )
258 |
259 | // Execute the actions and process the result
260 | task.attemptRun match {
261 | case -\/(e) => logError(e.getMessage, e );
262 | case \/-(a) => logger.trace("Successfully create new Tweet in Couch Database " + status.getText() )
263 | }
264 | }
265 |
266 | status
267 | }
268 |
269 | def createTwitterDataFrames(sc: SparkContext) : (SQLContext, DataFrame) = {
270 | if ( workingRDD.count <= 0 ){
271 | println("No data receive. Please start the Twitter stream again to collect data")
272 | return null
273 | }
274 |
275 | try{
276 | val df = sqlContext.createDataFrame( workingRDD, schemaTweets )
277 | df.registerTempTable("tweets")
278 |
279 | println("A new table named tweets with " + df.count() + " records has been correctly created and can be accessed through the SQLContext variable")
280 | println("Here's the schema for tweets")
281 | df.printSchema()
282 |
283 | (sqlContext, df)
284 | }catch{
285 | case e: Exception => {logError(e.getMessage, e ); return null}
286 | }
287 | }
288 |
289 | def stopTwitterStreaming(){
290 | if ( ssc == null){
291 | println("No Twitter stream to stop");
292 | return;
293 | }
294 |
295 | println("Stopping Twitter stream. Please wait this may take a while")
296 | ssc.stop(stopSparkContext = false, stopGracefully = false)
297 | ssc = null
298 | println("Twitter stream stopped");
299 |
300 | println( "You can now create a sqlContext and DataFrame with " + workingRDD.count + " Tweets created. Sample usage: ")
301 | println("val (sqlContext, df) = com.ibm.cds.spark.samples.StreamingTwitter.createTwitterDataFrames(sc)")
302 | println("df.printSchema")
303 | println("sqlContext.sql(\"select author, text from tweets\").show")
304 | }
305 | }
--------------------------------------------------------------------------------
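The console messages printed by stopTwitterStreaming above already hint at the interactive flow; a compact sketch, assuming a Spark shell where sc is available and the twitter4j/Watson keys have been configured:

    import com.ibm.cds.spark.samples.StreamingTwitter
    import org.apache.spark.streaming.Seconds

    StreamingTwitter.setConfig("tweets.key", "ibm,spark")
    StreamingTwitter.startTwitterStreaming(sc, Seconds(30))

    //Once the stream has stopped, query the collected tweets with Spark SQL
    val (sqlContext, df) = StreamingTwitter.createTwitterDataFrames(sc)
    sqlContext.sql("select author, text from tweets").show()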
/streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/ToneAnalyzer.scala:
--------------------------------------------------------------------------------
1 | package com.ibm.cds.spark.samples
2 |
3 | import org.http4s.EntityEncoder
4 | import org.http4s.Uri
5 | import org.http4s.client.Client
6 | import org.http4s.Request
7 | import org.http4s.BasicCredentials
8 | import org.http4s.Header
9 | import org.http4s.Headers
10 | import org.http4s.Method
11 | import org.http4s.headers.Authorization
12 | import org.apache.log4j.Logger
13 | import org.apache.spark.broadcast.Broadcast
14 | import org.apache.spark.Logging
15 | import scala.util.parsing.json.JSON
16 | import org.codehaus.jettison.json.JSONObject
17 |
18 | /**
19 | * @author dtaieb
20 | */
21 |
22 | object ToneAnalyzer extends Logging{
23 |
24 | val sentimentFactors = Array(
25 | ("Anger","anger"),
26 | ("Disgust","disgust"),
27 | ("Fear","fear"),
28 | ("Joy","joy"),
29 | ("Sadness","sadness"),
30 | ("Analytical","analytical"),
31 | ("Confident","confident"),
32 | ("Tentative","tentative"),
33 | ("Openness","openness_big5"),
34 | ("Conscientiousness","conscientiousness_big5"),
35 | ("Extraversion","extraversion_big5"),
36 | ("Agreeableness","agreeableness_big5"),
37 | ("EmotionalRange","neuroticism_big5")
38 | )
39 |
40 | //Class models for Sentiment JSON
41 | case class DocumentTone( document_tone: Sentiment )
42 | case class Sentiment(tone_categories: Seq[ToneCategory]);
43 | case class ToneCategory(category_id: String, category_name: String, tones: Seq[Tone]);
44 | case class Tone(score: Double, tone_id: String, tone_name: String)
45 | // case class Sentiment( scorecard: String, children: Seq[Tone] )
46 | // case class Tone( name: String, id: String, children: Seq[ToneResult])
47 | // case class ToneResult(name: String, id: String, word_count: Double, normalized_score: Double, raw_score: Double, linguistic_evidence: Seq[LinguisticEvidence] )
48 | // case class LinguisticEvidence( evidence_score: Double, word_count: Double, correlation: String, words : Seq[String])
49 |
50 | case class Geo( lat: Double, long: Double )
51 | case class Tweet(author: String, date: String, language: String, text: String, geo : Geo, sentiment : Sentiment )
52 |
53 | def computeSentiment( client: Client, status:StatusAdapter, broadcastVar: Broadcast[Map[String,String]] ) : Sentiment = {
54 | logTrace("Calling sentiment from Watson Tone Analyzer: " + status.text)
55 | try{
56 | //Get Sentiment on the tweet
57 | val sentimentResults: String =
58 | EntityEncoder[String].toEntity("{\"text\": " + JSONObject.quote( status.text ) + "}" ).flatMap {
59 | entity =>
60 | val s = broadcastVar.value.get("watson.tone.url").get + "/v3/tone?version=" + broadcastVar.value.get("watson.api.version").get
61 | val toneuri: Uri = Uri.fromString( s ).getOrElse( null )
62 | client(
63 | Request(
64 | method = Method.POST,
65 | uri = toneuri,
66 | headers = Headers(
67 | Authorization(
68 | BasicCredentials(broadcastVar.value.get("watson.tone.username").get, broadcastVar.value.get("watson.tone.password").get)
69 | ),
70 | Header("Accept", "application/json"),
71 | Header("Content-Type", "application/json; charset=utf-8")
72 | ),
73 | body = entity.body
74 | )
75 | ).flatMap { response =>
76 | if (response.status.code == 200 ) {
77 | response.as[String]
78 | } else {
79 | println( "Error received from Watson Tone Analyzer. Code : " + response.status.code + " reason: " + response.status.reason )
80 | null
81 | }
82 | }
83 | }.run
84 |
85 | upickle.read[DocumentTone](sentimentResults).document_tone
86 | }catch{
87 | case e:Throwable => {
88 | e.printStackTrace()
89 | null
90 | }
91 | }
92 | }
93 | }
--------------------------------------------------------------------------------
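For reference, the case classes above imply a Tone Analyzer response of roughly the following shape; a hand-built example (all ids and scores are illustrative only), which can be handy when exercising the upickle mapping without calling the service:

    import com.ibm.cds.spark.samples.ToneAnalyzer._

    val sample = DocumentTone( Sentiment( Seq(
      ToneCategory( "emotion_tone", "Emotion Tone", Seq(
        Tone( 0.12, "anger", "Anger" ),
        Tone( 0.85, "joy", "Joy" )
      ))
    )))

    //computeSentiment returns the document_tone portion, i.e. a Sentiment value like sample.document_tone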
/streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/TwitterAdapter.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.ibm.cds.spark.samples
19 |
20 | import java.io.ObjectInputStream
21 | import java.io.ByteArrayInputStream
22 | import scala.util.parsing.json.JSON
23 | import org.apache.kafka.common.serialization.Deserializer
24 | import twitter4j.Status
25 |
26 | /**
27 | * @author dtaieb
28 | * Deserialization adapters for Twitter4J Status
29 | */
30 |
31 | case class StatusAdapter(userName:String, userId: String, userLang: String,createdAt:String,text:String, long:Double, lat:Double);
32 |
33 | object StatusAdapter{
34 | implicit def statusAdapterWrapper(status: Status) =
35 | StatusAdapter(
36 | status.getUser.getName,
37 | status.getUser.getScreenName,
38 | status.getUser.getLang,
39 | status.getCreatedAt.toString,
40 | status.getText,
41 | Option(status.getGeoLocation).map{ _.getLongitude}.getOrElse(0.0),
42 | Option(status.getGeoLocation).map{ _.getLatitude}.getOrElse(0.0)
43 | )
44 | }
45 |
46 | class StatusDeserializer extends Deserializer[StatusAdapter]{
47 | def configure( props: java.util.Map[String, _], isKey: Boolean) = {
48 |
49 | }
50 |
51 | def close(){
52 |
53 | }
54 |
55 | def deserialize(topic: String, data: Array[Byte] ): StatusAdapter = {
56 | try{
57 | val bais = new ByteArrayInputStream( data )
58 | var ois:ObjectInputStream = null
59 | try{
60 | ois = new ObjectInputStream( bais )
61 | ois.readObject().asInstanceOf[Status]
62 | }finally{
63 | if (bais != null ){
64 | bais.close
65 | }
66 | if ( ois != null ){
67 | ois.close
68 | }
69 | }
70 | }catch{
71 | case e:Throwable=>{
72 | val jsonObject = JSON.parseFull( new String(data) ).getOrElse(Map.empty).asInstanceOf[Map[String, Any]]
73 | val user=jsonObject.get("user").getOrElse( Map.empty ).asInstanceOf[Map[String,Any]]
74 | val geo = Option(jsonObject.get("geo").orNull).getOrElse(Map.empty).asInstanceOf[Map[String,Any]]
75 | StatusAdapter(
76 | user.get("name").getOrElse("").asInstanceOf[String],
77 | user.get("userid").getOrElse("").asInstanceOf[String],
78 | user.get("lang").getOrElse("").asInstanceOf[String],
79 | jsonObject.get("created_at").getOrElse("").asInstanceOf[String],
80 | jsonObject.get("text").getOrElse("").asInstanceOf[String],
81 | geo.get("long").getOrElse(0.0).asInstanceOf[Double],
82 | geo.get("lat").getOrElse(0.0).asInstanceOf[Double]
83 | )
84 | }
85 | }
86 | }
87 | }
--------------------------------------------------------------------------------
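Because the implicit conversion above lives in the StatusAdapter companion object, a twitter4j Status can be passed wherever a StatusAdapter is expected without an explicit import; a tiny illustrative sketch (the Status instance itself would come from the live stream):

    import com.ibm.cds.spark.samples.StatusAdapter
    import twitter4j.Status

    def describe(adapter: StatusAdapter): String =
      adapter.userName + " (" + adapter.userLang + "): " + adapter.text

    //Given some status: Status from twitter4j, the wrapper is applied automatically:
    //describe(status)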
/streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/config/DemoConfig.scala:
--------------------------------------------------------------------------------
1 | package com.ibm.cds.spark.samples.config
2 |
3 | import org.apache.kafka.clients.CommonClientConfigs
4 | import java.io.FileInputStream
5 | import java.io.InputStream
6 | import scala.collection.JavaConversions._
7 | import org.apache.spark.SparkContext
8 |
9 |
10 | /**
11 | * @author dtaieb
12 | */
13 |
14 | class DemoConfig() extends Serializable{
15 |
16 | //Hold configuration key/value pairs
17 | var config = scala.collection.mutable.Map[String, String](
18 | registerConfigKey("twitter4j.oauth.consumerKey" ),
19 | registerConfigKey("twitter4j.oauth.consumerSecret" ),
20 | registerConfigKey("twitter4j.oauth.accessToken" ),
21 | registerConfigKey("twitter4j.oauth.accessTokenSecret"),
22 | registerConfigKey("tweets.key",""),
23 | registerConfigKey("cloudant.hostName" ),
24 | registerConfigKey("cloudant.https", "true"),
25 | registerConfigKey("cloudant.port" ),
26 | registerConfigKey("cloudant.username" ),
27 | registerConfigKey("cloudant.password" ),
28 | registerConfigKey("watson.tone.url" ),
29 | registerConfigKey("watson.tone.username" ),
30 | registerConfigKey("watson.tone.password" ),
31 | registerConfigKey("watson.api.version", "2016-05-19"),
32 | registerConfigKey("cloudant.save", "false" ),
33 | registerConfigKey(DemoConfig.CHECKPOINT_DIR_KEY)
34 | )
35 |
36 | private def getKeyOrFail(key:String):String={
37 | config.get(key).getOrElse( {
38 | throw new IllegalStateException("Missing key: " + key)
39 | })
40 | }
41 |
42 | def cloneConfig():MessageHubConfig={
43 | val props = new MessageHubConfig
44 | config.foreach{ entry => props.setConfig(entry._1, entry._2)}
45 | props
46 | }
47 |
48 | def set_hadoop_config(sc:SparkContext){
49 | val prefix = "fs.swift.service." + getKeyOrFail("name")
50 | val hconf = sc.hadoopConfiguration
51 | hconf.set(prefix + ".auth.url", getKeyOrFail("auth_url")+"/v3/auth/tokens")
52 | hconf.set(prefix + ".auth.endpoint.prefix", "endpoints")
53 | hconf.set(prefix + ".tenant", getKeyOrFail("project_id"))
54 | hconf.set(prefix + ".username", getKeyOrFail("user_id"))
55 | hconf.set(prefix + ".password", getKeyOrFail("password"))
56 | hconf.setInt(prefix + ".http.port", 8080)
57 | hconf.set(prefix + ".region", getKeyOrFail("region"))
58 | hconf.setBoolean(prefix + ".public", true)
59 | }
60 |
61 | def initConfigKeys(){
62 | //Overridable by subclasses
63 | }
64 |
65 | //Give a chance to subclasses to init the keys
66 | initConfigKeys;
67 |
68 | {
69 | //Load config from property file if specified
70 | val configPath = Option(System.getProperty("DEMO_CONFIG_PATH") ).orElse( Option(System.getenv("DEMO_CONFIG_PATH")))
71 | .orElse( Option(System.getProperty("spark.service.user.DEMO_CONFIG_PATH") )).orElse(Option(System.getenv("spark.service.user.DEMO_CONFIG_PATH") ))
72 | .getOrElse(null)
73 |     //DEMO_CONFIG_PATH can be set either as a system property or as an
74 |     //environment variable (optionally prefixed with "spark.service.user.")
75 |     if ( configPath != null ){
76 |       println("Loading config from DEMO_CONFIG_PATH: " + configPath)
77 |       //Each property in the file sets the corresponding config key
78 | val props = new java.util.Properties
79 | var fis:InputStream = null
80 | try{
81 | fis = new FileInputStream(configPath)
82 | props.load(fis)
83 | for( key <- props.keysIterator ){
84 | setConfig( key, props.getProperty(key))
85 | }
86 | }catch{
87 | case e:Throwable => e.printStackTrace
88 | }finally{
89 | if ( fis != null ){
90 | fis.close
91 | }
92 | }
93 | }
94 | }
95 |
96 |   private[config] def registerConfigKey( key: String, default: String = null ) : (String,String) = {
97 |     //Resolve the value from a system property if present, otherwise use the provided default
98 |     if ( default == null ){
99 |       (key, Option(System.getProperty(key)).orNull )
100 |     } else (key, Option(System.getProperty(key)).getOrElse( default ) )
101 |   }
102 |
103 | def setConfig(key:String, value:String){
104 | config.put( key, value )
105 | }
106 |
107 | def getConfig(key:String):String={
108 | config.get(key).getOrElse("")
109 | }
110 |
111 | implicit def toImmutableMap(): Map[String,String]= {
112 | Map( config.toList: _* )
113 | }
114 |
115 | //Validate configuration settings
116 | def validateConfiguration(ignorePrefix:String*) : Boolean = {
117 |     def ignoreKey( key: String ): Boolean = {
118 |       //A key is ignored when it starts with one of the supplied prefixes
119 |       ignorePrefix.exists { p => key.startsWith( p ) }
120 |     }
121 | var ret: Boolean = true;
122 | val saveToCloudant = config.get("cloudant.save").get.toBoolean
123 | config.foreach( (t:(String, Any)) =>
124 | if ( t._2 == null ){
125 | if ( saveToCloudant || !t._1.startsWith("cloudant") ){
126 | if ( !ignoreKey( t._1) ){
127 |             println(t._1 + " configuration not set. Use setConfig(\"" + t._1 + "\", \"<value>\")");
128 | ret = false;
129 | }
130 | }
131 | }
132 | )
133 |
134 | if ( ret ){
135 | config.foreach( (t:(String,Any)) =>
136 | try{
137 | if ( t._1.startsWith( "twitter4j") && t._2 != null && !ignoreKey(t._1) ) {
138 | System.setProperty( t._1, t._2.asInstanceOf[String] )
139 | }
140 | }catch{
141 |         case e:Throwable => println("Error setting system property for " + t)
142 | }
143 | )
144 | }
145 | ret
146 | }
147 | }
148 |
149 | object DemoConfig extends DemoConfig{
150 | final val CHECKPOINT_DIR_KEY = "checkpointDir"
151 | }
152 |
--------------------------------------------------------------------------------
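
A minimal usage sketch for DemoConfig (not one of the repository sources): the credential values are placeholders, and the same keys could instead be supplied as -D system properties or through a property file referenced by DEMO_CONFIG_PATH. Only methods defined above (setConfig, validateConfiguration) are used.

import com.ibm.cds.spark.samples.config.DemoConfig

object DemoConfigSketch {
  def main(args: Array[String]): Unit = {
    val demoConfig = new DemoConfig()

    //Placeholder Twitter OAuth credentials; real values come from a Twitter application
    demoConfig.setConfig("twitter4j.oauth.consumerKey", "XXXX")
    demoConfig.setConfig("twitter4j.oauth.consumerSecret", "XXXX")
    demoConfig.setConfig("twitter4j.oauth.accessToken", "XXXX")
    demoConfig.setConfig("twitter4j.oauth.accessTokenSecret", "XXXX")
    demoConfig.setConfig("checkpointDir", System.getProperty("java.io.tmpdir"))

    //cloudant.* keys may stay unset because cloudant.save defaults to "false";
    //the watson.* keys are explicitly ignored for this sketch
    val valid = demoConfig.validateConfiguration("watson.")
    println("Configuration valid: " + valid)
  }
}
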
/streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/config/MessageHubConfig.scala:
--------------------------------------------------------------------------------
1 | package com.ibm.cds.spark.samples.config
2 |
3 | import scala.collection.mutable.ListBuffer
4 | import scala.reflect.ClassTag
5 | import org.apache.kafka.clients.CommonClientConfigs
6 | import org.apache.kafka.common.config.SslConfigs
7 | import org.apache.kafka.common.security.JaasUtils
8 | import scala.io.Source
9 | import java.io.InputStream
10 | import java.io.FileWriter
11 | import java.io.File
12 | import org.http4s.EntityEncoder
13 | import org.http4s.Uri
14 | import org.http4s.client.blaze.PooledHttp1Client
15 | import org.http4s.Request
16 | import org.http4s.Method
17 | import org.http4s.Headers
18 | import org.http4s.headers.Authorization
19 | import org.http4s.BasicCredentials
20 | import org.http4s.Header
21 | import javax.net.ssl.SSLContext
22 | import org.codehaus.jettison.json.JSONObject
23 |
24 |
25 | /**
26 | * @author dtaieb
27 | */
28 | class MessageHubConfig extends DemoConfig{
29 | lazy val kafkaOptionKeys = ListBuffer[String]()
30 | override def initConfigKeys(){
31 | config = config ++ Map[String,String](
32 | registerConfigKey(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG),
33 | registerConfigKey(CommonClientConfigs.CLIENT_ID_CONFIG, "demo.watson.twitter.messagehub"),
34 | registerConfigKey("auto.offset.reset", "latest"),
35 | registerConfigKey("acks", "-1"),
36 | registerConfigKey("retries", "0"),
37 | registerConfigKey("batch.size", "16384"),
38 | registerConfigKey("linger.ms", "1"),
39 | registerConfigKey("buffer.memory", "33554432"),
40 | registerConfigKey("key.serializer", "org.apache.kafka.common.serialization.StringSerializer"),
41 | registerConfigKey("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer"),
42 | registerConfigKey(SslConfigs.SSL_PROTOCOL_CONFIG, "TLSv1.2"),
43 | registerConfigKey(SslConfigs.SSL_ENABLED_PROTOCOLS_CONFIG, "TLSv1.2"),
44 | registerConfigKey(SslConfigs.SSL_TRUSTSTORE_TYPE_CONFIG, "JKS"),
45 | registerConfigKey(SslConfigs.SSL_TRUSTSTORE_LOCATION_CONFIG, getDefaultSSLTrustStoreLocation),
46 | registerConfigKey(SslConfigs.SSL_TRUSTSTORE_PASSWORD_CONFIG, "changeit"),
47 | registerConfigKey(SslConfigs.SSL_ENDPOINT_IDENTIFICATION_ALGORITHM_CONFIG, "HTTPS"),
48 | registerConfigKey(CommonClientConfigs.SECURITY_PROTOCOL_CONFIG, "SASL_SSL" ),
49 |
50 | registerConfigKey(MessageHubConfig.CHECKPOINT_DIR_KEY),
51 | registerConfigKey(MessageHubConfig.KAFKA_TOPIC_TWEETS, "demo.tweets.watson.topic"),
52 | registerConfigKey(MessageHubConfig.KAFKA_USER_NAME),
53 | registerConfigKey(MessageHubConfig.KAFKA_USER_PASSWORD),
54 | registerConfigKey(MessageHubConfig.MESSAGEHUB_API_KEY),
55 | registerConfigKey(MessageHubConfig.MESSAGEHUB_REST_URL)
56 | )
57 | }
58 |
59 |   private def getDefaultSSLTrustStoreLocation():String={
60 |     val trustStorePath = System.getProperty("java.home") + File.separator + "lib" + File.separator + "security" + File.separator + "cacerts"  //cacerts shipped with the JRE
61 |     println("Default location of the SSL trust store is: " + trustStorePath)
62 |     trustStorePath
63 |   }
64 |
65 | override private[config] def registerConfigKey( key: String, default: String = null ) : (String,String) = {
66 | kafkaOptionKeys += key
67 | super.registerConfigKey(key,default)
68 | }
69 |
70 | override def validateConfiguration(ignorePrefix:String*) : Boolean = {
71 | val ret = super.validateConfiguration(ignorePrefix:_*)
72 | if ( ret ){
73 | //Create the jaas configuration
74 | MessageHubConfig.createJaasConfiguration(getConfig(MessageHubConfig.KAFKA_USER_NAME ), getConfig(MessageHubConfig.KAFKA_USER_PASSWORD) )
75 | }
76 | ret
77 | }
78 |
79 | def copyKafkaOptionKeys(other:MessageHubConfig){
80 | kafkaOptionKeys.foreach { key => other.setConfig(key, getConfig(key) ) }
81 | }
82 |
83 | def setValueSerializer[U]()(implicit c: ClassTag[U]){
84 | setConfig("value.serializer", c.runtimeClass.getName);
85 | }
86 |
87 | def setValueDeserializer[U]()(implicit c: ClassTag[U]){
88 | setConfig("value.deserializer", c.runtimeClass.getName);
89 | }
90 |
91 | def createTopicsIfNecessary( topics: String* ){
92 | val sslContext = SSLContext.getInstance("TLSv1.2")
93 | sslContext.init(null, null, null)
94 | lazy val client = PooledHttp1Client(sslContext=Option(sslContext))
95 | for( topic <- topics ){
96 | EntityEncoder[String].toEntity("{\"name\":" + JSONObject.quote( topic ) + "}" ).flatMap {
97 | entity =>
98 | val topicUri: Uri = Uri.fromString( getConfig(MessageHubConfig.MESSAGEHUB_REST_URL) + "/admin/topics" ).getOrElse( null )
99 | println(topicUri)
100 | client(
101 | Request(
102 | method = Method.POST,
103 | uri = topicUri,
104 | headers = Headers(
105 | Header("Content-Type", "application/json"),
106 | Header("X-Auth-Token", getConfig(MessageHubConfig.MESSAGEHUB_API_KEY))
107 | ),
108 | body = entity.body
109 | )
110 | ).flatMap { response =>
111 | response.status.code match {
112 | case 200 | 202 => println("Successfully created topic: " + topic)
113 | case 422 | 403 => println("Topic already exists in the server: " + topic)
114 | case _ => throw new IllegalStateException("Error when trying to create topic: " + response.status.code + " Reason: " + response.status.reason)
115 | }
116 | response.as[String]
117 | }
118 | }.run
119 | }
120 | }
121 | }
122 |
123 | object MessageHubConfig{
124 | final val CHECKPOINT_DIR_KEY = DemoConfig.CHECKPOINT_DIR_KEY
125 |   final val KAFKA_TOPIC_TWEETS = "kafka.topic.tweet"    //Key for the name of the Kafka topic used to publish the tweets
126 | final val KAFKA_USER_NAME = "kafka.user.name"
127 | final val KAFKA_USER_PASSWORD = "kafka.user.password"
128 |
129 | final val MESSAGEHUB_API_KEY = "api_key"
130 | final val MESSAGEHUB_REST_URL = "kafka_rest_url"
131 |
132 |   private def fixPath(path: String):String = {
133 |     path.replaceAll("[\\\\/:*?\"<>|, ]", "_")  //replace characters that are invalid in file names with "_"
134 |   }
135 |
136 | def createJaasConfiguration( userName: String, password: String){
137 | //Create the jaas configuration
138 | var is:InputStream = null
139 | try{
140 | val packageName = MessageHubConfig.getClass.getPackage.getName.replace('.', File.separatorChar)
141 | is = MessageHubConfig.getClass.getClassLoader.getResourceAsStream(packageName + "/jaas.conf");
142 | val confString = Source.fromInputStream( is ).mkString
143 | .replace( "$USERNAME", userName)
144 | .replace( "$PASSWORD", password )
145 |
146 | val confDir= new File( System.getProperty("java.io.tmpdir") + File.separator +
147 | fixPath( userName ) )
148 | confDir.mkdirs
149 | val confFile = new File( confDir, "jaas.conf");
150 | val fw = new FileWriter( confFile );
151 | fw.write( confString )
152 | fw.close
153 |
154 | //Set the jaas login config property
155 | println("Registering JaasConfiguration: " + confFile.getAbsolutePath)
156 | System.setProperty(JaasUtils.JAVA_LOGIN_CONFIG_PARAM, confFile.getAbsolutePath )
157 | }catch{
158 | case e:Throwable => {
159 | e.printStackTrace
160 | throw e
161 | }
162 | }finally{
163 | if ( is != null ) is.close
164 | }
165 | }
166 | }
--------------------------------------------------------------------------------
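
A hedged sketch of wiring MessageHubConfig by hand (not a repository source); the endpoints and credentials are placeholders shaped like a Message Hub service credential, and only methods defined above (setConfig, setValueDeserializer, validateConfiguration, createTopicsIfNecessary) are used. validateConfiguration also generates the jaas.conf needed for SASL_SSL before any topic is created.

import com.ibm.cds.spark.samples.config.MessageHubConfig
import org.apache.kafka.clients.CommonClientConfigs
import org.apache.kafka.common.serialization.StringDeserializer

object MessageHubConfigSketch {
  def main(args: Array[String]): Unit = {
    val kafkaConfig = new MessageHubConfig

    //Placeholder values; real ones come from the Message Hub service credentials
    kafkaConfig.setConfig(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, "kafka01.example.net:9093")
    kafkaConfig.setConfig(MessageHubConfig.KAFKA_USER_NAME, "serviceUser")
    kafkaConfig.setConfig(MessageHubConfig.KAFKA_USER_PASSWORD, "servicePassword")
    kafkaConfig.setConfig(MessageHubConfig.MESSAGEHUB_API_KEY, "apiKey")
    kafkaConfig.setConfig(MessageHubConfig.MESSAGEHUB_REST_URL, "https://kafka-rest.example.net:443")
    kafkaConfig.setConfig(MessageHubConfig.CHECKPOINT_DIR_KEY, System.getProperty("java.io.tmpdir"))
    kafkaConfig.setValueDeserializer[StringDeserializer]

    //Twitter, Watson and Cloudant keys are not needed for a pure Kafka test, so they are ignored here
    if (kafkaConfig.validateConfiguration("twitter4j.", "watson.", "cloudant.")) {
      kafkaConfig.createTopicsIfNecessary(kafkaConfig.getConfig(MessageHubConfig.KAFKA_TOPIC_TWEETS))
    }
  }
}
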
/streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/config/jaas.conf:
--------------------------------------------------------------------------------
1 | KafkaClient {
2 | com.ibm.messagehub.login.MessageHubLoginModule required
3 | serviceName="kafka"
4 | username="$USERNAME"
5 | password="$PASSWORD";
6 | };
--------------------------------------------------------------------------------
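
The $USERNAME and $PASSWORD placeholders above are substituted at runtime by MessageHubConfig.createJaasConfiguration; a minimal sketch of invoking it directly, with placeholder credentials:

import com.ibm.cds.spark.samples.config.MessageHubConfig

//Writes <java.io.tmpdir>/<sanitized user name>/jaas.conf with the placeholders filled in
//and points the java.security.auth.login.config system property at the generated file
MessageHubConfig.createJaasConfiguration("serviceUser", "servicePassword")
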
/streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/dstream/KafkaInputDStream.scala:
--------------------------------------------------------------------------------
1 | package com.ibm.cds.spark.samples.dstream
2 |
3 | import scala.collection.JavaConversions._
4 | import scala.collection.Map
5 | import scala.reflect.ClassTag
6 | import scala.reflect.classTag
7 | import org.apache.kafka.clients.consumer.ConsumerRecord
8 | import org.apache.kafka.clients.consumer.KafkaConsumer
9 | import org.apache.kafka.common.serialization.Deserializer
10 | import org.apache.spark.Logging
11 | import org.apache.spark.storage.StorageLevel
12 | import org.apache.spark.streaming.StreamingContext
13 | import org.apache.spark.streaming.dstream._
14 | import org.apache.spark.streaming.receiver.Receiver
15 | import org.apache.log4j.Level
16 | import org.apache.log4j.Logger
17 | import java.util.Properties
18 | import com.ibm.cds.spark.samples.config.MessageHubConfig
19 | import org.apache.kafka.common.security.JaasUtils
20 |
21 | class KafkaInputDStream[
22 | K: ClassTag,
23 | V: ClassTag,
24 | U <: Deserializer[_]: ClassTag,
25 | T <: Deserializer[_]: ClassTag](
26 | ssc : StreamingContext,
27 | kafkaParams: Map[String, String],
28 | topics: List[String],
29 | storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK
30 | ) extends ReceiverInputDStream[(K, V)](ssc) with Logging {
31 |
32 | def getReceiver(): Receiver[(K, V)] = {
33 | new KafkaReceiver[K, V, U, T](kafkaParams, topics, storageLevel)
34 | }
35 | }
36 |
37 | object KafkaStreaming{
38 | implicit class KafkaStreamingContextAdapter( val ssc : StreamingContext ){
39 | def createKafkaStream[K: ClassTag, V: ClassTag, U <: Deserializer[_]: ClassTag, T <: Deserializer[_]: ClassTag](
40 | bootStrapKafkaConfig: MessageHubConfig,
41 | topics: List[String]
42 | ): ReceiverInputDStream[(K, V)] = {
43 | val kafkaProps = new MessageHubConfig;
44 | bootStrapKafkaConfig.copyKafkaOptionKeys( kafkaProps)
45 | kafkaProps.setValueDeserializer[T];
46 | new KafkaInputDStream[K, V, U, T](ssc, kafkaProps.toImmutableMap, topics)
47 | }
48 | }
49 | }
50 |
51 | class KafkaReceiver[
52 | K: ClassTag,
53 | V: ClassTag,
54 | U <: Deserializer[_]: ClassTag,
55 | T <: Deserializer[_]: ClassTag](
56 | kafkaParams: Map[String,String],
57 | topics: List[String],
58 | storageLevel: StorageLevel
59 | ) extends Receiver[(K, V)](storageLevel) with Logging {
60 |
61 | // Connection to Kafka
62 | var kafkaConsumer: KafkaConsumer[K,V] = null
63 |
64 | def onStop() {
65 | if (kafkaConsumer != null) {
66 | kafkaConsumer.synchronized {
67 |         logInfo("Stopping the Kafka consumer")
68 | kafkaConsumer.close()
69 | kafkaConsumer = null
70 | }
71 | }
72 | }
73 |
74 | def onStart() {
75 | logInfo("Starting Kafka Consumer Stream")
76 |
77 | //Make sure the Jaas Login config param is set
78 | val jaasLoginParam = System.getProperty(JaasUtils.JAVA_LOGIN_CONFIG_PARAM);
79 | if ( jaasLoginParam == null ){
80 | MessageHubConfig.createJaasConfiguration( kafkaParams.get(MessageHubConfig.KAFKA_USER_NAME).get, kafkaParams.get(MessageHubConfig.KAFKA_USER_PASSWORD).get)
81 | }
82 |
83 |
84 | val keyDeserializer = classTag[U].runtimeClass.getConstructor().newInstance().asInstanceOf[Deserializer[K]]
85 | val valueDeserializer = classTag[T].runtimeClass.getConstructor().newInstance().asInstanceOf[Deserializer[V]]
86 |
87 | //Create a new kafka consumer and subscribe to the relevant topics
88 |     kafkaConsumer = new KafkaConsumer[K, V](kafkaParams, keyDeserializer, valueDeserializer)
89 | kafkaConsumer.subscribe( topics )
90 |
91 | new Thread( new Runnable {
92 | def run(){
93 | try{
94 | while( kafkaConsumer != null ){
95 | var it:Iterator[ConsumerRecord[K, V]] = null;
96 |
97 | if ( kafkaConsumer != null ){
98 | kafkaConsumer.synchronized{
99 | //Poll for new events
100 | it = kafkaConsumer.poll(1000L).iterator
101 |                 while( it != null && it.hasNext ){
102 | //Get the record and store it
103 | val record = it.next();
104 | store( (record.key, record.value) )
105 | }
106 | kafkaConsumer.commitSync
107 | }
108 | }
109 |
110 | Thread.sleep( 1000L )
111 | }
112 | println("Exiting Thread")
113 | }catch{
114 | case e:Throwable => {
115 | reportError( "Error in KafkaConsumer thread", e);
116 | e.printStackTrace()
117 | }
118 | }
119 | }
120 | }).start
121 | }
122 | }
123 |
--------------------------------------------------------------------------------
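
A hedged end-to-end sketch (not a repository source) of consuming a topic through the KafkaStreamingContextAdapter implicit defined above. The application name and batch interval are arbitrary, and the MessageHubConfig is assumed to be populated as in the earlier MessageHubConfig sketch.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.kafka.common.serialization.StringDeserializer
import com.ibm.cds.spark.samples.config.MessageHubConfig
import com.ibm.cds.spark.samples.dstream.KafkaStreaming._

object KafkaStreamSketch {
  def main(args: Array[String]): Unit = {
    val kafkaConfig = new MessageHubConfig
    //...populate bootstrap.servers, kafka.user.name, kafka.user.password, api_key,
    //kafka_rest_url and checkpointDir here, as in the MessageHubConfig sketch

    val ssc = new StreamingContext(new SparkConf().setAppName("Kafka stream sketch"), Seconds(5))

    //The implicit class adds createKafkaStream to the StreamingContext
    val stream = ssc.createKafkaStream[String, String, StringDeserializer, StringDeserializer](
      kafkaConfig,
      List(kafkaConfig.getConfig(MessageHubConfig.KAFKA_TOPIC_TWEETS))
    )
    stream.map(_._2).print()

    ssc.start()
    ssc.awaitTermination()
  }
}
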
/streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/package-info.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | /**
18 | * Spark Streaming sample application
19 | *
20 | */
21 | package com.ibm.cds.spark.samples;
--------------------------------------------------------------------------------
/streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/package.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.ibm.cds.spark
19 |
20 | /**
21 | * @author dtaieb
22 | */
23 | import scala.collection.mutable._
24 | import org.apache.spark.sql.types.IntegerType
25 | import org.apache.spark.sql.types.DoubleType
26 | import org.apache.spark.sql.types.StructField
27 | import org.apache.spark.sql.types.StringType
28 | import org.apache.spark.sql.types.StructField
29 | import org.apache.spark.sql.types.StructType
30 | import org.apache.spark.sql.Row
31 |
32 | package object samples {
33 |
34 | case class EnrichedTweet( author:String="", userid: String="", date: String, lang: String, text: String, lat: Double, long: Double, sentimentScores: Map[String, Double]){
35 | def toRow():Row={
36 | var colValues = Array[Any](author,userid,date,lang,text,lat,long)
37 | val scores = for {
38 | (_,emotion)<-ToneAnalyzer.sentimentFactors
39 | score=sentimentScores.getOrElse(emotion, 0.0)
40 | }yield score
41 | colValues = colValues ++ scores
42 | Row(colValues.toArray:_*)
43 | }
44 | }
45 |
46 | val schemaString = "author userid date lang text lat:double long:double"
47 | val schemaTweets =
48 | StructType(
49 | schemaString.split(" ").map(
50 | fieldName => {
51 | val ar = fieldName.split(":");
52 | StructField(
53 | ar.lift(0).get,
54 | ar.lift(1).getOrElse("string") match{
55 | case "int" => IntegerType
56 | case "double" => DoubleType
57 | case _ => StringType
58 | },
59 | true
60 | )
61 | }
62 | ).union(
63 | ToneAnalyzer.sentimentFactors.map( f => StructField( f._1, DoubleType )).toArray[StructField]
64 | )
65 | )
66 | }
--------------------------------------------------------------------------------
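
A small sketch (not a repository source) of turning one EnrichedTweet into a DataFrame with the schemaTweets schema above. It assumes a Spark 1.x SQLContext, and the "Anger" key is only a guess at one of the ToneAnalyzer.sentimentFactors emotion names; emotions missing from the map simply score 0.0 in toRow.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import com.ibm.cds.spark.samples.{EnrichedTweet, schemaTweets}

object SchemaTweetsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("schemaTweets sketch").setMaster("local[*]"))
    val sqlContext = new SQLContext(sc)

    //One hand-built tweet; the sentiment scores map uses the mutable Map expected by EnrichedTweet
    val tweet = EnrichedTweet(
      author = "sample author", userid = "sample", date = "2016-05-19",
      lang = "en", text = "hello spark", lat = 0.0, long = 0.0,
      sentimentScores = scala.collection.mutable.Map("Anger" -> 0.5))

    val df = sqlContext.createDataFrame(sc.parallelize(Seq(tweet.toRow())), schemaTweets)
    df.printSchema()
    df.show()
  }
}
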