├── .gitignore ├── LICENSE ├── README.md ├── dist ├── helloSpark-assembly-2.1.jar └── streaming-twitter-assembly-1.6.jar ├── docs ├── Twitter Sentiment with Watson TA and PI architecture diagram.orig.png └── Twitter Sentiment with Watson TA and PI architecture diagram.png ├── helloGraphx ├── build.sbt ├── project │ └── assembly.sbt ├── readme.md └── src │ └── main │ └── scala │ └── com │ └── ibm │ └── cds │ └── spark │ └── samples │ └── HelloGraphx.scala ├── helloSpark ├── .settings │ └── org.scala-ide.sdt.core.prefs ├── build.sbt ├── project │ └── assembly.sbt ├── python │ ├── helloSpark.py │ ├── helloSpark │ │ └── __init__.py │ └── setup.py ├── readme.md └── src │ └── main │ └── scala │ └── com │ └── ibm │ └── cds │ └── spark │ └── samples │ ├── HelloSpark.scala │ └── package-info.java ├── notebook ├── DashDB Twitter Car 2015 Python Notebook.ipynb ├── Get Service Credentials for Twitter Sentiment with Watson TA and PI.md ├── PYCON 2016 spark tutorial quick links.txt ├── README.md └── Twitter Sentiment with Watson TA and PI.ipynb └── streaming-twitter ├── .classpath ├── .gitignore ├── .project ├── build.sbt ├── lib ├── couchdb-scala │ └── com │ │ └── ibm │ │ └── couchdb-scala_2.10 │ │ └── 0.5.3 │ │ ├── couchdb-scala_2.10-0.5.3-javadoc.jar │ │ ├── couchdb-scala_2.10-0.5.3-javadoc.jar.md5 │ │ ├── couchdb-scala_2.10-0.5.3-javadoc.jar.sha1 │ │ ├── couchdb-scala_2.10-0.5.3-sources.jar │ │ ├── couchdb-scala_2.10-0.5.3-sources.jar.md5 │ │ ├── couchdb-scala_2.10-0.5.3-sources.jar.sha1 │ │ ├── couchdb-scala_2.10-0.5.3.jar │ │ ├── couchdb-scala_2.10-0.5.3.jar.md5 │ │ ├── couchdb-scala_2.10-0.5.3.jar.sha1 │ │ ├── couchdb-scala_2.10-0.5.3.pom │ │ ├── couchdb-scala_2.10-0.5.3.pom.md5 │ │ └── couchdb-scala_2.10-0.5.3.pom.sha1 ├── messagehub.login-1.0.0.jar └── pixiedust.jar ├── notebook ├── Spark Streaming Twitter-Watson-MessageHub.ipynb ├── Twitter + Watson Tone Analyzer Part 1.ipynb ├── Twitter + Watson Tone Analyzer Part 2.ipynb └── Twitter Sentiment with Pixiedust.ipynb ├── project └── assembly.sbt ├── readme.md ├── sampleConfig └── sampleconf.properties └── src └── main └── scala └── com └── ibm └── cds └── spark └── samples ├── KafkaProducerTest.scala ├── MessageHubStreamingTwitter.scala ├── PixiedustStreamingTwitter.scala ├── StatusSerializer.scala ├── StreamingListener.scala ├── StreamingTwitter.scala ├── ToneAnalyzer.scala ├── TwitterAdapter.scala ├── config ├── DemoConfig.scala ├── MessageHubConfig.scala └── jaas.conf ├── dstream └── KafkaInputDStream.scala ├── package-info.java └── package.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache 6 | .history 7 | .lib/ 8 | target/ 9 | lib_managed/ 10 | src_managed/ 11 | project/boot/ 12 | project/plugins/project/ 13 | 14 | # Scala-IDE specific 15 | .scala_dependencies 16 | .worksheet 17 | 18 | helloSpark/.cache-main 19 | 20 | helloSpark/.classpath 21 | 22 | helloSpark/.project 23 | 24 | streaming-twitter/.cache-main 25 | 26 | streaming-twitter/.settings/org.scala-ide.sdt.core.prefs 27 | 28 | streaming-twitter/config/MessageHubYP.properties 29 | 30 | *.pyc 31 | 32 | pixiedust/pixiedust.egg-info 33 | 34 | pixiedust/dist 35 | 36 | .DS_Store 37 | 38 | streaming-twitter/conf/log4j.properties 39 | 40 | streaming-twitter/conf/log4j.properties.template 41 | 42 | streaming-twitter/src/main/scala/resources/log4j.properties 43 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #Spark Tutorials 2 | 3 | This repository contains tutorials and samples that show you how get the most out of IBM Analytics for Apache Spark. 4 | 5 | Watch this repo for new content. Meanwhile, try these tutorials: 6 | 7 | - [Start Developing with Spark](https://developer.ibm.com/clouddataservices/start-developing-with-spark-and-notebooks/) 8 | 9 | - [Sentiment Analysis of Twitter Hashtags](https://developer.ibm.com/clouddataservices/sentiment-analysis-of-twitter-hashtags/) 10 | 11 | - [Real-time Sentiment Analysis of Twitter Hashtags with Spark](https://developer.ibm.com/clouddataservices/2016/01/15/real-time-sentiment-analysis-of-twitter-hashtags-with-spark/) 12 | 13 | - [Getting started with GraphFrames in Apache Spark](https://developer.ibm.com/clouddataservices/2016/07/15/intro-to-apache-spark-graphframes/) 14 | 15 | - [Predict Flight Delays with Apache Spark MLLib, FlightStats, and Weather Data](https://developer.ibm.com/clouddataservices/2016/08/04/predict-flight-delays-with-apache-spark-mllib-flightstats-and-weather-data/) 16 | 17 | - [Analyze Market Trends in Twitter Using Apache Spark, Python, and dashDB](https://developer.ibm.com/clouddataservices/2016/06/13/analyze-market-trends-in-twitter-using-apache-spark-python-and-dashdb/) 18 | 19 | - [PixieDust: Magic for Your Python Notebook](https://developer.ibm.com/clouddataservices/2016/10/11/pixiedust-magic-for-python-notebook/) 20 | 21 | 22 | -------------------------------------------------------------------------------- /dist/helloSpark-assembly-2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/dist/helloSpark-assembly-2.1.jar -------------------------------------------------------------------------------- /dist/streaming-twitter-assembly-1.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/dist/streaming-twitter-assembly-1.6.jar 
-------------------------------------------------------------------------------- /docs/Twitter Sentiment with Watson TA and PI architecture diagram.orig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/docs/Twitter Sentiment with Watson TA and PI architecture diagram.orig.png -------------------------------------------------------------------------------- /docs/Twitter Sentiment with Watson TA and PI architecture diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/docs/Twitter Sentiment with Watson TA and PI architecture diagram.png -------------------------------------------------------------------------------- /helloGraphx/build.sbt: -------------------------------------------------------------------------------- 1 | name := "helloGraphx" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies ++= { 8 | val sparkVersion = "1.6.0" 9 | Seq( 10 | "org.apache.spark" %% "spark-core" % sparkVersion % "provided", 11 | "org.apache.spark" %% "spark-sql" % sparkVersion % "provided", 12 | "org.apache.spark" %% "spark-graphx" % sparkVersion % "provided", 13 | "org.apache.spark" %% "spark-repl" % sparkVersion % "provided", 14 | "org.http4s" %% "http4s-core" % "0.8.2", 15 | "org.http4s" %% "http4s-client" % "0.8.2", 16 | "org.http4s" %% "http4s-blazeclient" % "0.8.2" 17 | ) 18 | } 19 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false) 20 | -------------------------------------------------------------------------------- /helloGraphx/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0") 2 | -------------------------------------------------------------------------------- /helloGraphx/readme.md: -------------------------------------------------------------------------------- 1 | # Start Developing with GraphX 2 | 3 | -------------------------------------------------------------------------------- /helloGraphx/src/main/scala/com/ibm/cds/spark/samples/HelloGraphx.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.ibm.cds.spark.samples 19 | 20 | import org.apache.spark._ 21 | import scalaz._ 22 | import java.net.URL 23 | import java.util.Calendar 24 | import java.net.URLEncoder 25 | import java.text.SimpleDateFormat 26 | import org.apache.spark.sql.SQLContext 27 | import scala.collection.immutable.Map 28 | import org.apache.spark.rdd.RDD 29 | import org.apache.spark.graphx.VertexId 30 | import org.apache.spark.sql.Row 31 | import org.apache.spark.graphx.Edge 32 | import org.apache.spark.graphx.Graph 33 | import org.http4s.EntityEncoder 34 | import org.codehaus.jettison.json.JSONObject 35 | import org.http4s.Uri 36 | import org.http4s.Request 37 | import org.http4s.BasicCredentials 38 | import org.http4s.headers.Authorization 39 | import org.http4s.Header 40 | import org.http4s.Headers 41 | import org.http4s.Method 42 | import org.http4s.client.blaze.PooledHttp1Client 43 | import org.http4s.client.Client 44 | import org.http4s.EntityDecoder 45 | import org.apache.spark.graphx.EdgeTriplet 46 | 47 | class Node(val properties: Map[String, String]) extends Serializable 48 | case class Airport(override val properties: Map[String,String]) extends Node(properties) 49 | case class Country(override val properties: Map[String,String]) extends Node(properties) 50 | case class Continent(override val properties: Map[String,String]) extends Node(properties) 51 | case class Route(override val properties: Map[String, String]) extends Node(properties) 52 | 53 | object HelloGraphx { 54 | 55 | //main method invoked when running as a standalone Spark Application 56 | def main(args: Array[String]) { 57 | lazy val client = PooledHttp1Client() 58 | val conf = new SparkConf().setAppName("Hello Graphx") 59 | val sc = new SparkContext(conf) 60 | 61 | println("Hello Graphx Demo. 
Load/Save a graph to/from Graphx RDDs") 62 | 63 | val sqlContext = new SQLContext(sc); 64 | 65 | //Load airports 66 | val airportsDF = sqlContext.read.format("com.databricks.spark.xml") 67 | .option("rowTag","node") 68 | .option("rootTag","graphml/graph") 69 | .load("/Users/dtaieb/Downloads/air-routes-graph/air-routes.graphml") 70 | airportsDF.printSchema() 71 | println(airportsDF.count()) 72 | 73 | val airportsRdd: RDD[(VertexId, Node with Product)] = 74 | airportsDF.map { x => { 75 | val propertiesMap:Map[String,String] = x.getAs[Seq[Row]]("data") 76 | .map { row => row.getAs[String]("@key")->row.getAs[String]("#VALUE") }.toMap 77 | val id = x.getAs[Long]("@id") 78 | val nodeType:String = propertiesMap.get("type").getOrElse("") 79 | nodeType match { 80 | case "airport" => (id, Airport(propertiesMap)) 81 | case "country" => (id, Country(propertiesMap)) 82 | case "continent" => (id, Continent(propertiesMap)) 83 | case _ => println("Skip node with type " + nodeType); (id, null) 84 | } 85 | }}.filter( f => f._2 !=null ) 86 | println(airportsRdd.take(5).deep.mkString("\n")) 87 | 88 | //Load routes 89 | val routesDF = sqlContext.read.format("com.databricks.spark.xml") 90 | .option("rowTag","edge") 91 | .option("rootTag","graphml/graph") 92 | .load("/Users/dtaieb/Downloads/air-routes-graph/air-routes.graphml") 93 | routesDF.printSchema() 94 | println(routesDF.count()) 95 | 96 | val routesRdd: RDD[(Edge[Route])] = 97 | routesDF.map { x => { 98 | val propertiesMap:Map[String,String] = x.getAs[Seq[Row]]("data") 99 | .map { row => row.getAs[String]("@key")->row.getAs[String]("#VALUE") }.toMap + 100 | ("id" -> x.getAs[Long]("@id").toString) 101 | Edge(x.getAs[Long]("@source"), x.getAs[Long]("@target"),Route(propertiesMap)) 102 | }} 103 | println(routesRdd.take(5).deep.mkString("\n")) 104 | 105 | val graph = Graph( airportsRdd, routesRdd ) 106 | 107 | //Iterate over the graph and send the vertices/edges to Gremlin Server 108 | graph.triplets.foreach( f => { 109 | addTriplet(client, f ); 110 | }) 111 | 112 | //Traverse all nodes and all vertices, send them to the graphdb service via gremlin 113 | sc.stop() 114 | } 115 | 116 | def escape(s:String):String={ 117 | s.replace("'", "\\'") 118 | } 119 | 120 | def addTriplet(client: Client, f: EdgeTriplet[Node with Product, Route] ){ 121 | val sb = new StringBuilder() 122 | 123 | //Add the source vertex if necessary 124 | sb.append( "v1=graph.traversal().V(" + f.srcId + ").tryNext().orElse(null);") 125 | sb.append(" if(!v1) v1=graph.addVertex(id, " + f.srcId) 126 | f.srcAttr.properties.foreach { case(k,v) => sb.append(",'" + escape(k) + "','" + escape(v) + "'" ) } 127 | sb.append(");") 128 | 129 | //Add the target vertex if necessary 130 | sb.append( "v2=graph.traversal().V(" + f.dstId + ").tryNext().orElse(null);") 131 | sb.append(" if(!v2) v2=graph.addVertex(id, " + f.dstId) 132 | f.dstAttr.properties.foreach { case(k,v) => sb.append(",'" + escape(k) + "','" + escape(v) + "'") } 133 | sb.append(");") 134 | 135 | //Add the edge 136 | sb.append("v1.addEdge('edge', v2") 137 | f.attr.properties.foreach { f => sb.append(",'" + escape(f._1) + "','" + escape(f._2) + "'") } 138 | sb.append(");") 139 | 140 | runScript(client, sb.toString ) 141 | } 142 | 143 | def addVertex(client: Client, id: Long, keyValues: Seq[(String,String)]){ 144 | val sb = new StringBuilder(); 145 | sb.append( "if(!graph.traversal().V(" + id + ")) graph.addVertex(id, " + id); 146 | keyValues.foreach { case(k,v) => sb.append("," + k + "," + v) } 147 | sb.append(")") 148 | runScript(client, 
sb.toString() ) 149 | } 150 | 151 | def runScript(client: Client, script: String){ 152 | //println("{\"gremlin\":" + JSONObject.quote( script ) + "}") 153 | val results = EntityEncoder[String].toEntity("{\"gremlin\":" + JSONObject.quote( script ) + "}" ).flatMap { 154 | entity => 155 | val gremlinUri = Uri.fromString( "http://localhost:8182" ).getOrElse( null ) 156 | client( 157 | Request( 158 | method = Method.POST, 159 | uri = gremlinUri, 160 | headers = Headers( 161 | Header("Accept", "application/json"), 162 | Header("Content-Type", "application/json") 163 | ), 164 | body = entity.body 165 | ) 166 | ).flatMap { response => 167 | val res = response.as[String] 168 | if (response.status.code == 200 ) { 169 | res 170 | } else { 171 | println( "Error received from Gremlin. Code : " + response.status.code + " reason: " + response.status.reason ) 172 | res 173 | } 174 | } 175 | }.attemptRun match { 176 | case -\/(e) => //Ignore 177 | case \/-(a) => println(a) 178 | } 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /helloSpark/.settings/org.scala-ide.sdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | scala.compiler.additionalParams=\ -Xsource\:2.10 -Ymacro-expand\:none 3 | scala.compiler.installation=78943290 4 | scala.compiler.sourceLevel=2.10 5 | scala.compiler.useProjectSettings=true 6 | -------------------------------------------------------------------------------- /helloSpark/build.sbt: -------------------------------------------------------------------------------- 1 | name := "helloSpark" 2 | 3 | version := "2.1" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies ++= { 8 | val sparkVersion = "1.6.0" 9 | Seq( 10 | "org.apache.spark" %% "spark-core" % sparkVersion % "provided", 11 | "org.apache.spark" %% "spark-sql" % sparkVersion % "provided", 12 | "org.apache.spark" %% "spark-repl" % sparkVersion % "provided" 13 | ) 14 | } 15 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false) 16 | -------------------------------------------------------------------------------- /helloSpark/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0") 2 | -------------------------------------------------------------------------------- /helloSpark/python/helloSpark.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark import SparkContext 3 | 4 | def computeStatsForCollection(sc,countPerPartitions=100000,partitions=5): 5 | totalNumber = min( countPerPartitions * partitions, sys.maxsize) 6 | rdd = sc.parallelize( range(totalNumber),partitions) 7 | return (rdd.mean(), rdd.variance()) 8 | 9 | if __name__ == "__main__": 10 | sc = SparkContext(appName="Hello Spark") 11 | print("Hello Spark Demo. 
Compute the mean and variance of a collection") 12 | stats = computeStatsForCollection(sc); 13 | print(">>> Results: ") 14 | print(">>>>>>>Mean: " + str(stats[0])); 15 | print(">>>>>>>Variance: " + str(stats[1])); 16 | sc.stop() -------------------------------------------------------------------------------- /helloSpark/python/helloSpark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/helloSpark/python/helloSpark/__init__.py -------------------------------------------------------------------------------- /helloSpark/python/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/helloSpark/python/setup.py -------------------------------------------------------------------------------- /helloSpark/readme.md: -------------------------------------------------------------------------------- 1 | # Start Developing with Spark 2 | 3 | ####Build a custom library for Apache® Spark™ and deploy it to a Jupyter Notebook. 4 | 5 | If you're new to developing Spark applications you've come to the right place. Our [**Start Developing with Spark** tutorial](https://developer.ibm.com/clouddataservices/start-developing-with-spark-and-notebooks/) provides detailed end-to-end steps that show you how to build a simple custom library for Spark (written in scala) and how to deploy it on IBM Analytics for Apache Spark for Bluemix. 6 | 7 | These steps are the foundation for building real-life production applications. You'll also learn how to manage your project with the import, test, and debug features of Scala IDE for Eclipse. 8 | 9 | [Get started](https://developer.ibm.com/clouddataservices/start-developing-with-spark-and-notebooks/) 10 | -------------------------------------------------------------------------------- /helloSpark/src/main/scala/com/ibm/cds/spark/samples/HelloSpark.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.ibm.cds.spark.samples 19 | 20 | import org.apache.spark._ 21 | 22 | object HelloSpark { 23 | 24 | //main method invoked when running as a standalone Spark Application 25 | def main(args: Array[String]) { 26 | val conf = new SparkConf().setAppName("Hello Spark") 27 | val spark = new SparkContext(conf) 28 | 29 | println("Hello Spark Demo. 
Compute the mean and variance of a collection") 30 | val stats = computeStatsForCollection(spark); 31 | println(">>> Results: ") 32 | println(">>>>>>>Mean: " + stats._1 ); 33 | println(">>>>>>>Variance: " + stats._2); 34 | spark.stop() 35 | } 36 | 37 | //Library method that can be invoked from Jupyter Notebook 38 | def computeStatsForCollection( spark: SparkContext, countPerPartitions: Int = 100000, partitions: Int=5): (Double, Double) = { 39 | val totalNumber = math.min( countPerPartitions * partitions, Long.MaxValue).toInt; 40 | val rdd = spark.parallelize( 1 until totalNumber,partitions); 41 | (rdd.mean(), rdd.variance()) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /helloSpark/src/main/scala/com/ibm/cds/spark/samples/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | /** 18 | * Spark Sample Applications 19 | * 20 | */ 21 | package com.ibm.cds.spark.samples; -------------------------------------------------------------------------------- /notebook/Get Service Credentials for Twitter Sentiment with Watson TA and PI.md: -------------------------------------------------------------------------------- 1 | # Set Up Services and Get Credentials 2 | 3 | These instructions accompany the [Twitter Sentiment analysis with Watson Tone Analyzer and Watson Personality Insights Notebook](https://github.com/ibm-watson-data-lab/spark.samples/tree/master/notebook). This sample notebook requires a connection to the following online services: 4 | 5 | - Twitter 6 | - Watson Tone Analyzer 7 | - Watson Personality Insights 8 | 9 | Follow these steps to set up, retrieve, and enter credentials for all 3 services: 10 | 11 | ## Get OAuth Credentials for Twitter 12 | 13 | 14 | Create a new app on your Twitter account and configure the OAuth credentials. 15 | 16 |
17 | 1. Go to https://apps.twitter.com/. Sign in and click the Create New App button
18 | 2. Complete the required fields.
19 | 3. Below the developer agreement, turn on the Yes, I agree check box and click Create your Twitter application.
20 | 4. Click the Keys and Access Tokens tab.
21 | 5. Scroll to the bottom of the page and click the Create My Access Tokens button.
22 | 6. Copy the Consumer Key, Consumer Secret, Access Token, and Access Token Secret. You will need them in a few minutes.
23 | 
24 |     twitter_keys (screenshot)
25 | 
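Optionally, sanity-check these four values before moving on. A minimal check with the python-twitter module (the same module the notebook installs) could look like the sketch below; the XXXX placeholders stand in for your own keys and tokens:

```
import twitter  # pip install --user python-twitter

# Replace the XXXX placeholders with the values copied above
api = twitter.Api(consumer_key="XXXX",
                  consumer_secret="XXXX",
                  access_token_key="XXXX",
                  access_token_secret="XXXX")

# Prints the account details when the credentials are valid;
# raises a TwitterError if any of the four values is wrong.
print(api.VerifyCredentials())
```
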
31 | 32 | ## Get Watson Personality Insights Credentials 33 | 34 | Provision the service and grab your credentials: 35 | 36 | 1. Still in Bluemix, go to the top menu, and click Catalog. 37 | 2. In the search box, type Personality Insights. 38 | 3. Click the Personality Insights service tile, then click Create. 39 | 4. On the left side of the screen, click Service Credentials and open or create credentials. 40 | 41 | ![creds](http://developer.ibm.com/clouddataservices/wp-content/uploads/sites/85/2016/10/pi_creds.png) 42 | 43 | 5. Copy the `username` and `password` values. 44 | 45 | 46 | ## Get Watson Tone Analyzer Credentials 47 | 48 | Provision the service and grab your credentials: 49 | 50 | 1. In a new browser tab or window, open Bluemix, go to the top menu, and click Catalog. 51 | 2. In the search box, type Tone Analyzer. 52 | 3. Click the Tone Analyzer tile, then click Create. 53 | 4. On the left side of the screen, click Service Credentials and open or create credentials. 54 | 5. Copy the `username` and `password` values. 55 | 56 | 57 | 58 | ## Paste Credentials into the Notebook 59 | 60 | 1. Return to your version of the [Twitter Sentiment analysis with Watson Tone Analyzer and Watson Personality Insights Notebook](https://github.com/ibm-watson-data-lab/spark.samples/tree/master/notebook) 61 | 62 | 2. Paste all the credentials you just collected into the notebook, replacing the XXXXs for each item: 63 | 64 | ``` 65 | sqlContext=SQLContext(sc) 66 | 67 | #Set up the twitter credentials, they will be used both in scala and python cells below 68 | consumerKey = "XXXX" 69 | consumerSecret = "XXXX" 70 | accessToken = "XXXX" 71 | accessTokenSecret = "XXXX" 72 | 73 | #Set up the Watson Personality insight credentials 74 | piUserName = "XXXX" 75 | piPassword = "XXXX" 76 | 77 | #Set up the Watson Tone Analyzer credentials 78 | taUserName = "XXXX" 79 | taPassword = "XXXX" 80 | ``` 81 | 82 | 83 | -------------------------------------------------------------------------------- /notebook/PYCON 2016 spark tutorial quick links.txt: -------------------------------------------------------------------------------- 1 | Bluemix: 2 | https://console.ng.bluemix.net 3 | 4 | FlightStats: 5 | https://developer.flightstats.com/signup 6 | https://developer.flightstats.com/admin/applications 7 | 8 | Simple Data Pipe: 9 | https://github.com/ibm-watson-data-lab/simple-data-pipe 10 | 11 | Flight Predict Notebook, Slides 36 & 37: 12 | https://github.com/ibm-watson-data-lab/simple-data-pipe-connector-flightstats/raw/master/notebook/Flight%20Predict%20PyCon%202016.ipynb 13 | 14 | Car Notebook, Slide 21: 15 | https://github.com/ibm-watson-data-lab/spark.samples/raw/master/notebook/DashDB%20Twitter%20Car%202015%20Python%20Notebook.ipynb 16 | 17 | 18 | SIMPLE DATA PIPE package.json: 19 | "simple-data-pipe-connector-flightstats":"git://github.com/ibm-watson-data-lab/simple-data-pipe-connector-flightstats.git" 20 | -------------------------------------------------------------------------------- /notebook/README.md: -------------------------------------------------------------------------------- 1 | # Sample Notebooks 2 | 3 | This repository contains sample notebooks that show you how to get the most out of IBM Analytics for Apache Spark. You may run these notebooks in a locally set up notebook environment (i.e., [Jupyter Notebook](https://jupyter.readthedocs.io/en/latest/install.html)) or through the [IBM Data Science Experience (DSX)](http://datascience.ibm.com/). 
4 | 5 | ## Service Credentials 6 | 7 | Some of the notebooks require credentials to various services (e.g., Twitter API, Watson Tone Analyzer, etc.). Instructions for provisioning these services and getting credentials are outlined here: [Set Up Services and Get Credentials](https://github.com/ibm-watson-data-lab/spark.samples/blob/master/notebook/Get%20Service%20Credentials%20for%20Twitter%20Sentiment%20with%20Watson%20TA%20and%20PI.md) 8 | 9 | 10 | ## Running a notebook in DSX 11 | 12 | More info and detailed instructions for DSX can be found in its [documentation](http://datascience.ibm.com/docs/content/getting-started/get-started.html). 13 | 14 | 1. Log into DSX 15 | 2. Go to __My Projects__ 16 | 3. Select an existing project or create a new project 17 | 18 | ##### To set up a new project 19 | 1. Click __create project__ 20 | 2. Enter a __Name__ 21 | 3. Select an existing or create a new __Spark Service__ to associate with the project 22 | 4. Select an existing or create a new __Target Object Storage Instance__ to associate with the project 23 | 5. Click __Create__ 24 | 25 | 4. Create a new notebook 26 | 27 | ##### To set up a new notebook 28 | 1. Click __add notebooks__ 29 | 2. Click __From URL__ 30 | 3. Enter a __Name__ 31 | 4. Enter the __Notebook URL__ 32 | 5. Select an existing __Spark Service__ to associate with the notebook 33 | 6. Click __Create Notebook__ 34 | 35 | 5. Once in the notebook, follow its instructions for running the notebook 36 | -------------------------------------------------------------------------------- /notebook/Twitter Sentiment with Watson TA and PI.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Twitter Sentiment analysis with Watson Tone Analyzer and Watson Personality Insights\n", 8 | "\n", 9 | "\n", 10 | "\n", 11 | "In this notebook, we perform the following steps: \n", 12 | "1. Install python-twitter and watson-developer-cloud modules\n", 13 | "2. Install the streaming Twitter jar using PixieDust packageManager\n", 14 | "3. Invoke the streaming Twitter app using the PixieDust Scala Bridge to get a DataFrame containing all the tweets enriched with Watson Tone Analyzer scores\n", 15 | "4. Create a new RDD that groups the tweets by author and concatenates all the associated tweets into one blob\n", 16 | "5. For each author and aggregated text, invoke the Watson Personality Insights to get the scores\n", 17 | "6. 
Visualize results using PixieDust display \n", 18 | "\n", 19 | "## Learn more \n", 20 | "* [Watson Tone Analyzer](http://www.ibm.com/watson/developercloud/tone-analyzer.html) \n", 21 | "* [Watson Personality Insights](http://www.ibm.com/watson/developercloud/personality-insights.html) \n", 22 | "* [python-twitter](https://github.com/bear/python-twitter) \n", 23 | "* [watson-developer-cloud](https://github.com/watson-developer-cloud) \n", 24 | "* [PixieDust](https://github.com/ibm-watson-data-lab/pixiedust)\n", 25 | "* [Realtime Sentiment Analysis of Twitter Hashtags with Spark](https://developer.ibm.com/clouddataservices/2016/01/15/real-time-sentiment-analysis-of-twitter-hashtags-with-spark)" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "# Install python-twitter and watson-developer-cloud\n", 33 | "If you haven't already installed the following modules, run these 2 cells:" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "!pip install --user python-twitter" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "!pip install --user watson-developer-cloud" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "# Install latest pixiedust\n", 63 | "Make sure you are running the latest pixiedust version. After upgrading restart the kernel before continuing to the next cells." 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": { 70 | "collapsed": true 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "!pip install --upgrade --user pixiedust" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## Install the streaming Twitter jar in the notebook from the Github repo\n", 82 | "This jar file contains the Spark Streaming application (written in Scala) that connects to Twitter to fetch the tweets and send them to Watson Tone Analyzer for analysis. The resulting scores are then added to the tweets dataframe as separate columns." 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "import pixiedust\n", 94 | "jarPath = \"https://github.com/ibm-watson-data-lab/spark.samples/raw/master/dist/streaming-twitter-assembly-1.6.jar\"\n", 95 | "pixiedust.installPackage(jarPath)\n", 96 | "print(\"done\")" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "

If PixieDust or the streaming Twitter jar were just installed or upgraded, restart the kernel before continuing.

" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "## Use Scala Bridge to run the command line version of the app\n", 111 | "Insert your credentials for Twitter, Watson Tone Analyzer, and Watson Personality Insights. Then run the following cell. \n", 112 | "[Read how to provision these services and get credentials](https://github.com/ibm-watson-data-lab/spark.samples/blob/master/notebook/Get%20Service%20Credentials%20for%20Twitter%20Sentiment%20with%20Watson%20TA%20and%20PI.md). " 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "collapsed": true 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "import pixiedust\n", 124 | "\n", 125 | "sqlContext=SQLContext(sc)\n", 126 | "\n", 127 | "#Set up the twitter credentials, they will be used both in scala and python cells below\n", 128 | "consumerKey = \"XXXX\"\n", 129 | "consumerSecret = \"XXXX\"\n", 130 | "accessToken = \"XXXX\"\n", 131 | "accessTokenSecret = \"XXXX\"\n", 132 | "\n", 133 | "#Set up the Watson Personality insight credentials\n", 134 | "piUserName = \"XXXX\"\n", 135 | "piPassword = \"XXXX\"\n", 136 | "\n", 137 | "#Set up the Watson Tone Analyzer credentials\n", 138 | "taUserName = \"XXXX\"\n", 139 | "taPassword = \"XXXX\"" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": { 146 | "collapsed": false, 147 | "scrolled": false 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "%%scala\n", 152 | "val demo = com.ibm.cds.spark.samples.StreamingTwitter\n", 153 | "demo.setConfig(\"twitter4j.oauth.consumerKey\",consumerKey)\n", 154 | "demo.setConfig(\"twitter4j.oauth.consumerSecret\",consumerSecret)\n", 155 | "demo.setConfig(\"twitter4j.oauth.accessToken\",accessToken)\n", 156 | "demo.setConfig(\"twitter4j.oauth.accessTokenSecret\",accessTokenSecret)\n", 157 | "demo.setConfig(\"watson.tone.url\",\"https://gateway.watsonplatform.net/tone-analyzer/api\")\n", 158 | "demo.setConfig(\"watson.tone.password\",taPassword)\n", 159 | "demo.setConfig(\"watson.tone.username\",taUserName)\n", 160 | "\n", 161 | "import org.apache.spark.streaming._\n", 162 | "demo.startTwitterStreaming(sc, Seconds(30)) //Run the application for a limited time" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "# Create a tweets dataframe from the data fetched above and transfer it to Python\n", 170 | "Notice the __ prefix for each variable which is used to signal PixieDust that the variable needs to be transfered back to Python" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "collapsed": false 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "%%scala\n", 182 | "val demo = com.ibm.cds.spark.samples.StreamingTwitter\n", 183 | "val (__sqlContext, __df) = demo.createTwitterDataFrames(sc)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "## Group the tweets by author and userid\n", 191 | "This will be used later to fetch the last 200 tweets for each author" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": false 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "import pyspark.sql.functions as F\n", 203 | "usersDF = __df.groupby(\"author\", \"userid\").agg(F.avg(\"Anger\").alias(\"Anger\"), F.avg(\"Disgust\").alias(\"Disgust\"))\n", 204 | "usersDF.show()" 205 | ] 
206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "# Set up the Twitter API from python-twitter module" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": { 218 | "collapsed": false 219 | }, 220 | "outputs": [], 221 | "source": [ 222 | "import twitter\n", 223 | "api = twitter.Api(consumer_key=consumerKey,\n", 224 | " consumer_secret=consumerSecret,\n", 225 | " access_token_key=accessToken,\n", 226 | " access_token_secret=accessTokenSecret)\n", 227 | "\n", 228 | "#print(api.VerifyCredentials())" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "# For each author, fetch the last 200 tweets\n", 236 | "use flatMap to return a new RDD that contains a list of tuples composed of userid and tweets text: (userid, tweetText)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": { 243 | "collapsed": false 244 | }, 245 | "outputs": [], 246 | "source": [ 247 | "def getTweets(screenName):\n", 248 | " statuses = api.GetUserTimeline(screen_name=screenName,\n", 249 | " since_id=None,\n", 250 | " max_id=None,\n", 251 | " count=200,\n", 252 | " include_rts=False,\n", 253 | " trim_user=False,\n", 254 | " exclude_replies=True)\n", 255 | " return statuses\n", 256 | "\n", 257 | "usersWithTweetsRDD = usersDF.flatMap(lambda s: [(s.user.screen_name, s.text.encode('ascii', 'ignore')) for s in getTweets(s['userid'])])\n", 258 | "print(usersWithTweetsRDD.count())" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "# Concatenate all the tweets for each user so we have enough words to send to Watson Personality Insights\n", 266 | "* Use map to create an RDD of key, value pair composed of userId and tweets \n", 267 | "* Use reduceByKey to group all record with same author and concatenate the tweets" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": { 274 | "collapsed": false, 275 | "scrolled": true 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "import re\n", 280 | "usersWithTweetsRDD2 = usersWithTweetsRDD.map(lambda s: (s[0], s[1])).reduceByKey(lambda s,t: s + '\\n' + t)\\\n", 281 | " .filter(lambda s: len(re.findall(r'\\w+', s[1])) > 100 )\n", 282 | "print(usersWithTweetsRDD2.count())\n", 283 | "#usersWithTweetsRDD2.take(2)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "# Call Watson Personality Insights on the text for each author\n", 291 | "Watson Personality Insights requires at least 100 words from its lexicon to be available, which may not exist for each user. This is why the getPersonlityInsight helper function guards against exceptions from calling Watson PI. If an exception occurs, then an empty array is returned. 
Each record with empty array is filtered out of the resulting RDD.\n", 292 | "\n", 293 | "Note also that we use broadcast variables to propagate the userName and password to the cluster" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "collapsed": false, 301 | "scrolled": true 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "from pyspark.sql.types import *\n", 306 | "from watson_developer_cloud import PersonalityInsightsV3\n", 307 | "broadCastPIUsername = sc.broadcast(piUserName)\n", 308 | "broadCastPIPassword = sc.broadcast(piPassword)\n", 309 | "def getPersonalityInsight(text, schema=False):\n", 310 | " personality_insights = PersonalityInsightsV3(\n", 311 | " version='2016-10-20',\n", 312 | " username=broadCastPIUsername.value,\n", 313 | " password=broadCastPIPassword.value)\n", 314 | " try:\n", 315 | " p = personality_insights.profile(\n", 316 | " text, content_type='text/plain',\n", 317 | " raw_scores=True, consumption_preferences=True)\n", 318 | "\n", 319 | " if schema:\n", 320 | " return \\\n", 321 | " [StructField(t['name'], FloatType()) for t in p[\"needs\"]] + \\\n", 322 | " [StructField(t['name'], FloatType()) for t in p[\"values\"]] + \\\n", 323 | " [StructField(t['name'], FloatType()) for t in p['personality' ]]\n", 324 | " else:\n", 325 | " return \\\n", 326 | " [t['raw_score'] for t in p[\"needs\"]] + \\\n", 327 | " [t['raw_score'] for t in p[\"values\"]] + \\\n", 328 | " [t['raw_score'] for t in p['personality']] \n", 329 | " except:\n", 330 | " return []\n", 331 | "\n", 332 | "usersWithPIRDD = usersWithTweetsRDD2.map(lambda s: [s[0]] + getPersonalityInsight(s[1])).filter(lambda s: len(s)>1)\n", 333 | "print(usersWithPIRDD.count())\n", 334 | "#usersWithPIRDD.take(2)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "# Convert the RDD back to a DataFrame and call PixieDust display to visualize the results\n", 342 | "The schema is automatically created from introspecting a sample payload result from Watson Personality Insights" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": { 349 | "collapsed": false, 350 | "pixiedust": { 351 | "displayParams": { 352 | "aggregation": "SUM", 353 | "handlerId": "barChart", 354 | "keyFields": "userid", 355 | "showLegend": "true", 356 | "stacked": "false", 357 | "staticFigure": "false", 358 | "title": "Personality Insights", 359 | "valueFields": "Challenge,Closeness,Curiosity,Excitement" 360 | } 361 | }, 362 | "scrolled": false 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "#convert to dataframe\n", 367 | "schema = StructType(\n", 368 | " [StructField('userid',StringType())] + getPersonalityInsight(usersWithTweetsRDD2.take(1)[0][1], schema=True)\n", 369 | ")\n", 370 | "\n", 371 | "usersWithPIDF = sqlContext.createDataFrame(\n", 372 | " usersWithPIRDD, schema\n", 373 | ")\n", 374 | "\n", 375 | "usersWithPIDF.cache()\n", 376 | "display(usersWithPIDF)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "# Compare Twitter users Personality Insights scores with this year presidential candidates\n", 384 | "\n", 385 | "For a quick look on the difference in Personality Insights scores Spark provides a describe() function that computes stddev and mean values off the dataframe. Compare differences in the scores of twitter users and presidential candidates." 
386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": { 392 | "collapsed": true 393 | }, 394 | "outputs": [], 395 | "source": [ 396 | "candidates = \"realDonaldTrump HillaryClinton\".split(\" \")\n", 397 | "candidatesRDD = sc.parallelize(candidates)\\\n", 398 | " .flatMap(lambda s: [(t.user.screen_name, t.text.encode('ascii', 'ignore')) for t in getTweets(s)])\\\n", 399 | " .map(lambda s: (s[0], s[1]))\\\n", 400 | " .reduceByKey(lambda s,t: s + '\\n' + t)\\\n", 401 | " .filter(lambda s: len(re.findall(r'\\w+', s[1])) > 100 )\\\n", 402 | " .map(lambda s: [s[0]] + getPersonalityInsight(s[1]))\n", 403 | "\n", 404 | "candidatesPIDF = sqlContext.createDataFrame(\n", 405 | " candidatesRDD, schema\n", 406 | ")" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "metadata": { 413 | "collapsed": true 414 | }, 415 | "outputs": [], 416 | "source": [ 417 | "c = candidatesPIDF.collect()\n", 418 | "broadCastTrumpPI = sc.broadcast(c[0][1:])\n", 419 | "broadCastHillaryPI = sc.broadcast(c[1][1:])" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": { 426 | "collapsed": false, 427 | "pixiedust": { 428 | "displayParams": { 429 | "handlerId": "dataframe" 430 | } 431 | } 432 | }, 433 | "outputs": [], 434 | "source": [ 435 | "display(candidatesPIDF)" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": { 442 | "collapsed": false 443 | }, 444 | "outputs": [], 445 | "source": [ 446 | "candidatesPIDF.select('userid','Emotional range','Agreeableness', 'Extraversion','Conscientiousness', 'Openness').show()\n", 447 | "\n", 448 | "usersWithPIDF.describe(['Emotional range']).show()\n", 449 | "usersWithPIDF.describe(['Agreeableness']).show()\n", 450 | "usersWithPIDF.describe(['Extraversion']).show()\n", 451 | "usersWithPIDF.describe(['Conscientiousness']).show()\n", 452 | "usersWithPIDF.describe(['Openness']).show()" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": {}, 458 | "source": [ 459 | "# Calculate Euclidean distance (norm) between each Twitter user and the presidential candidates using the Personality Insights scores\n", 460 | "\n", 461 | "Add the distances into 2 extra columns and display the results" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": { 468 | "collapsed": false, 469 | "pixiedust": { 470 | "displayParams": { 471 | "aggregation": "COUNT", 472 | "handlerId": "barChart", 473 | "keyFields": "closerHillary", 474 | "showLegend": "true", 475 | "stacked": "true", 476 | "staticFigure": "false", 477 | "valueFields": "closerHillary" 478 | } 479 | } 480 | }, 481 | "outputs": [], 482 | "source": [ 483 | "import numpy as np\n", 484 | "from pyspark.sql.types import Row\n", 485 | "def addEuclideanDistance(s):\n", 486 | " dict = s.asDict()\n", 487 | " def getEuclideanDistance(a,b):\n", 488 | " return np.linalg.norm(np.array(a) - np.array(b)).item()\n", 489 | " dict[\"distDonaldTrump\"]=getEuclideanDistance(s[1:], broadCastTrumpPI.value)\n", 490 | " dict[\"distHillary\"]=getEuclideanDistance(s[1:], broadCastHillaryPI.value)\n", 491 | " dict[\"closerHillary\"] = \"Yes\" if dict[\"distHillary\"] < dict[\"distDonaldTrump\"] else \"No\"\n", 492 | " return Row(**dict)\n", 493 | "\n", 494 | "#add euclidean distances to Trump and Hillary\n", 495 | "euclideanDF = sqlContext.createDataFrame(usersWithPIDF.map(lambda s: addEuclideanDistance(s)))\n", 496 | "\n", 497 | 
"#Reorder columns to have userid and distances first\n", 498 | "cols = euclideanDF.columns\n", 499 | "reorderCols = [\"userid\",\"distHillary\",\"distDonaldTrump\", \"closerHillary\"]\n", 500 | "euclideanDF = euclideanDF.select(reorderCols + [x for x in cols if x not in reorderCols])\n", 501 | "\n", 502 | "#PixieDust display. \n", 503 | "#To visualize the distribution, select the bar chart display, use closerHillary as key and value and aggregation=count\n", 504 | "display(euclideanDF)" 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "metadata": {}, 510 | "source": [ 511 | "# Optional: do some extra data science on the tweets" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": { 518 | "collapsed": false, 519 | "pixiedust": { 520 | "displayParams": { 521 | "aggregation": "COUNT", 522 | "handlerId": "barChart", 523 | "keyFields": "Anger", 524 | "showLegend": "true", 525 | "stacked": "true", 526 | "staticFigure": "false", 527 | "valueFields": "Openness" 528 | } 529 | } 530 | }, 531 | "outputs": [], 532 | "source": [ 533 | "tweets=__df\n", 534 | "tweets.count()\n", 535 | "display(tweets)" 536 | ] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "metadata": {}, 541 | "source": [ 542 | "# Compute the sentiment distributions for tweets with scores greater than 60% and create matplotlib chart visualization" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "metadata": { 549 | "collapsed": false 550 | }, 551 | "outputs": [], 552 | "source": [ 553 | "#create an array that will hold the count for each sentiment\n", 554 | "sentimentDistribution=[0] * 13\n", 555 | "#For each sentiment, run a sql query that counts the number of tweets for which the sentiment score is greater than 60%\n", 556 | "#Store the data in the array\n", 557 | "for i, sentiment in enumerate(tweets.columns[-13:]):\n", 558 | " sentimentDistribution[i]=__sqlContext.sql(\"SELECT count(*) as sentCount FROM tweets where \" + sentiment + \" > 60\")\\\n", 559 | " .collect()[0].sentCount" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "metadata": { 566 | "collapsed": false 567 | }, 568 | "outputs": [], 569 | "source": [ 570 | "%matplotlib inline\n", 571 | "import matplotlib\n", 572 | "import numpy as np\n", 573 | "import matplotlib.pyplot as plt\n", 574 | "\n", 575 | "ind=np.arange(13)\n", 576 | "width = 0.35\n", 577 | "bar = plt.bar(ind, sentimentDistribution, width, color='g', label = \"distributions\")\n", 578 | "\n", 579 | "params = plt.gcf()\n", 580 | "plSize = params.get_size_inches()\n", 581 | "params.set_size_inches( (plSize[0]*2.5, plSize[1]*2) )\n", 582 | "plt.ylabel('Tweet count')\n", 583 | "plt.xlabel('Tone')\n", 584 | "plt.title('Distribution of tweets by sentiments > 60%')\n", 585 | "plt.xticks(ind+width, tweets.columns[-13:])\n", 586 | "plt.legend()\n", 587 | "\n", 588 | "plt.show()" 589 | ] 590 | }, 591 | { 592 | "cell_type": "markdown", 593 | "metadata": {}, 594 | "source": [ 595 | "# Compute the top hashtags used in each tweet" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": null, 601 | "metadata": { 602 | "collapsed": true 603 | }, 604 | "outputs": [], 605 | "source": [ 606 | "from operator import add\n", 607 | "import re\n", 608 | "tagsRDD = tweets.flatMap( lambda t: re.split(\"\\s\", t.text))\\\n", 609 | " .filter( lambda word: word.startswith(\"#\") )\\\n", 610 | " .map( lambda word : (word, 1 ))\\\n", 611 | " .reduceByKey(add, 10).map(lambda 
(a,b): (b,a)).sortByKey(False).map(lambda (a,b):(b,a))\n", 612 | "top10tags = tagsRDD.take(10)" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": null, 618 | "metadata": { 619 | "collapsed": false 620 | }, 621 | "outputs": [], 622 | "source": [ 623 | "%matplotlib inline\n", 624 | "import matplotlib\n", 625 | "import matplotlib.pyplot as plt\n", 626 | "\n", 627 | "params = plt.gcf()\n", 628 | "plSize = params.get_size_inches()\n", 629 | "params.set_size_inches( (plSize[0]*2, plSize[1]*2) )\n", 630 | "\n", 631 | "labels = [i[0] for i in top10tags]\n", 632 | "sizes = [int(i[1]) for i in top10tags]\n", 633 | "colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral', \"beige\", \"paleturquoise\", \"pink\", \"lightyellow\", \"coral\"]\n", 634 | "\n", 635 | "plt.pie(sizes, labels=labels, colors=colors,autopct='%1.1f%%', shadow=True, startangle=90)\n", 636 | "\n", 637 | "plt.axis('equal')\n", 638 | "plt.show()" 639 | ] 640 | }, 641 | { 642 | "cell_type": "markdown", 643 | "metadata": {}, 644 | "source": [ 645 | "# Compute the aggregate sentiment distribution for all the tweets that contain the top hashtags" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": null, 651 | "metadata": { 652 | "collapsed": true 653 | }, 654 | "outputs": [], 655 | "source": [ 656 | "cols = tweets.columns[-13:]\n", 657 | "def expand( t ):\n", 658 | " ret = []\n", 659 | " for s in [i[0] for i in top10tags]:\n", 660 | " if ( s in t.text ):\n", 661 | " for tone in cols:\n", 662 | " ret += [s.replace(':','').replace('-','') + u\"-\" + unicode(tone) + \":\" + unicode(getattr(t, tone))]\n", 663 | " return ret \n", 664 | "def makeList(l):\n", 665 | " return l if isinstance(l, list) else [l]\n", 666 | "\n", 667 | "#Create RDD from tweets dataframe\n", 668 | "tagsRDD = tweets.map(lambda t: t )\n", 669 | "\n", 670 | "#Filter to only keep the entries that are in top10tags\n", 671 | "tagsRDD = tagsRDD.filter( lambda t: any(s in t.text for s in [i[0] for i in top10tags] ) )\n", 672 | "\n", 673 | "#Create a flatMap using the expand function defined above, this will be used to collect all the scores \n", 674 | "#for a particular tag with the following format: Tag-Tone-ToneScore\n", 675 | "tagsRDD = tagsRDD.flatMap( expand )\n", 676 | "\n", 677 | "#Create a map indexed by Tag-Tone keys \n", 678 | "tagsRDD = tagsRDD.map( lambda fullTag : (fullTag.split(\":\")[0], float( fullTag.split(\":\")[1]) ))\n", 679 | "\n", 680 | "#Call combineByKey to format the data as follow\n", 681 | "#Key=Tag-Tone\n", 682 | "#Value=(count, sum_of_all_score_for_this_tone)\n", 683 | "tagsRDD = tagsRDD.combineByKey((lambda x: (x,1)),\n", 684 | " (lambda x, y: (x[0] + y, x[1] + 1)),\n", 685 | " (lambda x, y: (x[0] + y[0], x[1] + y[1])))\n", 686 | "\n", 687 | "#ReIndex the map to have the key be the Tag and value be (Tone, Average_score) tuple\n", 688 | "#Key=Tag\n", 689 | "#Value=(Tone, average_score)\n", 690 | "tagsRDD = tagsRDD.map(lambda (key, ab): (key.split(\"-\")[0], (key.split(\"-\")[1], round(ab[0]/ab[1], 2))))\n", 691 | "\n", 692 | "#Reduce the map on the Tag key, value becomes a list of (Tone,average_score) tuples\n", 693 | "tagsRDD = tagsRDD.reduceByKey( lambda x, y : makeList(x) + makeList(y) )\n", 694 | "\n", 695 | "#Sort the (Tone,average_score) tuples alphabetically by Tone\n", 696 | "tagsRDD = tagsRDD.mapValues( lambda x : sorted(x) )\n", 697 | "\n", 698 | "#Format the data as expected by the plotting code in the next cell. 
\n", 699 | "#map the Values to a tuple as follow: ([list of tone], [list of average score])\n", 700 | "#e.g. #someTag:([u'Agreeableness', u'Analytical', u'Anger', u'Cheerfulness', u'Confident', u'Conscientiousness', u'Negative', u'Openness', u'Tentative'], [1.0, 0.0, 0.0, 1.0, 0.0, 0.48, 0.0, 0.02, 0.0])\n", 701 | "tagsRDD = tagsRDD.mapValues( lambda x : ([elt[0] for elt in x],[elt[1] for elt in x]) )\n", 702 | "\n", 703 | "#Use custom sort function to sort the entries by order of appearance in top10tags\n", 704 | "def customCompare( key ):\n", 705 | " for (k,v) in top10tags:\n", 706 | " if k == key:\n", 707 | " return v\n", 708 | " return 0\n", 709 | "tagsRDD = tagsRDD.sortByKey(ascending=False, numPartitions=None, keyfunc = customCompare)\n", 710 | "\n", 711 | "#Take the mean tone scores for the top 10 tags\n", 712 | "top10tagsMeanScores = tagsRDD.take(10)" 713 | ] 714 | }, 715 | { 716 | "cell_type": "code", 717 | "execution_count": null, 718 | "metadata": { 719 | "collapsed": false 720 | }, 721 | "outputs": [], 722 | "source": [ 723 | "%matplotlib inline\n", 724 | "import matplotlib\n", 725 | "import numpy as np\n", 726 | "import matplotlib.pyplot as plt\n", 727 | "\n", 728 | "params = plt.gcf()\n", 729 | "plSize = params.get_size_inches()\n", 730 | "params.set_size_inches( (plSize[0]*3, plSize[1]*2) )\n", 731 | "\n", 732 | "top5tagsMeanScores = top10tagsMeanScores[:5]\n", 733 | "width = 0\n", 734 | "ind=np.arange(13)\n", 735 | "(a,b) = top5tagsMeanScores[0]\n", 736 | "labels=b[0]\n", 737 | "colors = [\"beige\", \"paleturquoise\", \"pink\", \"lightyellow\", \"coral\", \"lightgreen\", \"gainsboro\", \"aquamarine\",\"c\"]\n", 738 | "idx=0\n", 739 | "for key, value in top5tagsMeanScores:\n", 740 | " plt.bar(ind + width, value[1], 0.15, color=colors[idx], label=key)\n", 741 | " width += 0.15\n", 742 | " idx += 1\n", 743 | "plt.xticks(ind+0.3, labels)\n", 744 | "plt.ylabel('AVERAGE SCORE')\n", 745 | "plt.xlabel('TONES')\n", 746 | "plt.title('Breakdown of top hashtags by sentiment tones')\n", 747 | "\n", 748 | "plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc='center',ncol=5, mode=\"expand\", borderaxespad=0.)\n", 749 | "\n", 750 | "plt.show()" 751 | ] 752 | }, 753 | { 754 | "cell_type": "markdown", 755 | "metadata": {}, 756 | "source": [ 757 | "# Optional: Use Twitter demo embedded app to run the same app with a UI" 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": null, 763 | "metadata": { 764 | "collapsed": false 765 | }, 766 | "outputs": [], 767 | "source": [ 768 | "%%scala\n", 769 | "val demo = com.ibm.cds.spark.samples.PixiedustStreamingTwitter\n", 770 | "demo.setConfig(\"twitter4j.oauth.consumerKey\",consumerKey)\n", 771 | "demo.setConfig(\"twitter4j.oauth.consumerSecret\",consumerSecret)\n", 772 | "demo.setConfig(\"twitter4j.oauth.accessToken\",accessToken)\n", 773 | "demo.setConfig(\"twitter4j.oauth.accessTokenSecret\",accessTokenSecret)\n", 774 | "demo.setConfig(\"watson.tone.url\",\"https://gateway.watsonplatform.net/tone-analyzer/api\")\n", 775 | "demo.setConfig(\"watson.tone.password\",taPassword)\n", 776 | "demo.setConfig(\"watson.tone.username\",taUserName)\n", 777 | "demo.setConfig(\"checkpointDir\", System.getProperty(\"user.home\") + \"/pixiedust/ssc\")" 778 | ] 779 | }, 780 | { 781 | "cell_type": "code", 782 | "execution_count": null, 783 | "metadata": { 784 | "collapsed": true 785 | }, 786 | "outputs": [], 787 | "source": [ 788 | "!pip install --upgrade --user pixiedust-twitterdemo" 789 | ] 790 | }, 791 | { 792 | "cell_type": "code", 793 | 
"execution_count": null, 794 | "metadata": { 795 | "collapsed": false, 796 | "pixiedust": { 797 | "displayParams": { 798 | "handlerId": "twitterdemo" 799 | } 800 | } 801 | }, 802 | "outputs": [], 803 | "source": [ 804 | "from pixiedust_twitterdemo import *\n", 805 | "twitterDemo()" 806 | ] 807 | }, 808 | { 809 | "cell_type": "markdown", 810 | "metadata": {}, 811 | "source": [ 812 | "## The embedded app has generated a DataFrame called __tweets. Let's use it to do some data science" 813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": null, 818 | "metadata": { 819 | "collapsed": false, 820 | "pixiedust": { 821 | "displayParams": { 822 | "handlerId": "dataframe" 823 | } 824 | } 825 | }, 826 | "outputs": [], 827 | "source": [ 828 | "display(__tweets)" 829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": null, 834 | "metadata": { 835 | "collapsed": false, 836 | "pixiedust": { 837 | "displayParams": { 838 | "aggregation": "COUNT", 839 | "handlerId": "barChart", 840 | "keyFields": "emotion", 841 | "showLegend": "true", 842 | "stacked": "true", 843 | "valueFields": "score" 844 | } 845 | } 846 | }, 847 | "outputs": [], 848 | "source": [ 849 | "from pyspark.sql import Row\n", 850 | "from pyspark.sql.types import *\n", 851 | "emotions=__tweets.columns[-13:]\n", 852 | "distrib = __tweets.flatMap(lambda t: [(x,t[x]) for x in emotions]).filter(lambda t: t[1]>60)\\\n", 853 | " .toDF(StructType([StructField('emotion',StringType()),StructField('score',DoubleType())]))\n", 854 | "display(distrib)" 855 | ] 856 | }, 857 | { 858 | "cell_type": "code", 859 | "execution_count": null, 860 | "metadata": { 861 | "collapsed": false 862 | }, 863 | "outputs": [], 864 | "source": [ 865 | "__tweets.registerTempTable(\"pixiedust_tweets\")\n", 866 | "#create an array that will hold the count for each sentiment\n", 867 | "sentimentDistribution=[0] * 13\n", 868 | "#For each sentiment, run a sql query that counts the number of tweets for which the sentiment score is greater than 60%\n", 869 | "#Store the data in the array\n", 870 | "for i, sentiment in enumerate(__tweets.columns[-13:]):\n", 871 | " sentimentDistribution[i]=sqlContext.sql(\"SELECT count(*) as sentCount FROM pixiedust_tweets where \" + sentiment + \" > 60\")\\\n", 872 | " .collect()[0].sentCount" 873 | ] 874 | }, 875 | { 876 | "cell_type": "code", 877 | "execution_count": null, 878 | "metadata": { 879 | "collapsed": false 880 | }, 881 | "outputs": [], 882 | "source": [ 883 | "%matplotlib inline\n", 884 | "import matplotlib\n", 885 | "import numpy as np\n", 886 | "import matplotlib.pyplot as plt\n", 887 | "\n", 888 | "ind=np.arange(13)\n", 889 | "width = 0.35\n", 890 | "bar = plt.bar(ind, sentimentDistribution, width, color='g', label = \"distributions\")\n", 891 | "\n", 892 | "params = plt.gcf()\n", 893 | "plSize = params.get_size_inches()\n", 894 | "params.set_size_inches( (plSize[0]*2.5, plSize[1]*2) )\n", 895 | "plt.ylabel('Tweet count')\n", 896 | "plt.xlabel('Tone')\n", 897 | "plt.title('Distribution of tweets by sentiments > 60%')\n", 898 | "plt.xticks(ind+width, __tweets.columns[-13:])\n", 899 | "plt.legend()\n", 900 | "\n", 901 | "plt.show()" 902 | ] 903 | }, 904 | { 905 | "cell_type": "code", 906 | "execution_count": null, 907 | "metadata": { 908 | "collapsed": true 909 | }, 910 | "outputs": [], 911 | "source": [ 912 | "from operator import add\n", 913 | "import re\n", 914 | "tagsRDD = __tweets.flatMap( lambda t: re.split(\"\\s\", t.text))\\\n", 915 | " .filter( lambda word: word.startswith(\"#\") 
)\\\n", 916 | " .map( lambda word : (word, 1 ))\\\n", 917 | " .reduceByKey(add, 10).map(lambda (a,b): (b,a)).sortByKey(False).map(lambda (a,b):(b,a))\n", 918 | "top10tags = tagsRDD.take(10)" 919 | ] 920 | }, 921 | { 922 | "cell_type": "code", 923 | "execution_count": null, 924 | "metadata": { 925 | "collapsed": false 926 | }, 927 | "outputs": [], 928 | "source": [ 929 | "%matplotlib inline\n", 930 | "import matplotlib\n", 931 | "import matplotlib.pyplot as plt\n", 932 | "\n", 933 | "params = plt.gcf()\n", 934 | "plSize = params.get_size_inches()\n", 935 | "params.set_size_inches( (plSize[0]*2, plSize[1]*2) )\n", 936 | "\n", 937 | "labels = [i[0] for i in top10tags]\n", 938 | "sizes = [int(i[1]) for i in top10tags]\n", 939 | "colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral', \"beige\", \"paleturquoise\", \"pink\", \"lightyellow\", \"coral\"]\n", 940 | "\n", 941 | "plt.pie(sizes, labels=labels, colors=colors,autopct='%1.1f%%', shadow=True, startangle=90)\n", 942 | "\n", 943 | "plt.axis('equal')\n", 944 | "plt.show()" 945 | ] 946 | }, 947 | { 948 | "cell_type": "code", 949 | "execution_count": null, 950 | "metadata": { 951 | "collapsed": true 952 | }, 953 | "outputs": [], 954 | "source": [ 955 | "cols = __tweets.columns[-13:]\n", 956 | "def expand( t ):\n", 957 | " ret = []\n", 958 | " for s in [i[0] for i in top10tags]:\n", 959 | " if ( s in t.text ):\n", 960 | " for tone in cols:\n", 961 | " ret += [s.replace(':','').replace('-','') + u\"-\" + unicode(tone) + \":\" + unicode(getattr(t, tone))]\n", 962 | " return ret \n", 963 | "def makeList(l):\n", 964 | " return l if isinstance(l, list) else [l]\n", 965 | "\n", 966 | "#Create RDD from tweets dataframe\n", 967 | "tagsRDD = __tweets.map(lambda t: t )\n", 968 | "\n", 969 | "#Filter to only keep the entries that are in top10tags\n", 970 | "tagsRDD = tagsRDD.filter( lambda t: any(s in t.text for s in [i[0] for i in top10tags] ) )\n", 971 | "\n", 972 | "#Create a flatMap using the expand function defined above, this will be used to collect all the scores \n", 973 | "#for a particular tag with the following format: Tag-Tone-ToneScore\n", 974 | "tagsRDD = tagsRDD.flatMap( expand )\n", 975 | "\n", 976 | "#Create a map indexed by Tag-Tone keys \n", 977 | "tagsRDD = tagsRDD.map( lambda fullTag : (fullTag.split(\":\")[0], float( fullTag.split(\":\")[1]) ))\n", 978 | "\n", 979 | "#Call combineByKey to format the data as follow\n", 980 | "#Key=Tag-Tone\n", 981 | "#Value=(count, sum_of_all_score_for_this_tone)\n", 982 | "tagsRDD = tagsRDD.combineByKey((lambda x: (x,1)),\n", 983 | " (lambda x, y: (x[0] + y, x[1] + 1)),\n", 984 | " (lambda x, y: (x[0] + y[0], x[1] + y[1])))\n", 985 | "\n", 986 | "#ReIndex the map to have the key be the Tag and value be (Tone, Average_score) tuple\n", 987 | "#Key=Tag\n", 988 | "#Value=(Tone, average_score)\n", 989 | "tagsRDD = tagsRDD.map(lambda (key, ab): (key.split(\"-\")[0], (key.split(\"-\")[1], round(ab[0]/ab[1], 2))))\n", 990 | "\n", 991 | "#Reduce the map on the Tag key, value becomes a list of (Tone,average_score) tuples\n", 992 | "tagsRDD = tagsRDD.reduceByKey( lambda x, y : makeList(x) + makeList(y) )\n", 993 | "\n", 994 | "#Sort the (Tone,average_score) tuples alphabetically by Tone\n", 995 | "tagsRDD = tagsRDD.mapValues( lambda x : sorted(x) )\n", 996 | "\n", 997 | "#Format the data as expected by the plotting code in the next cell. \n", 998 | "#map the Values to a tuple as follow: ([list of tone], [list of average score])\n", 999 | "#e.g. 
#someTag:([u'Agreeableness', u'Analytical', u'Anger', u'Cheerfulness', u'Confident', u'Conscientiousness', u'Negative', u'Openness', u'Tentative'], [1.0, 0.0, 0.0, 1.0, 0.0, 0.48, 0.0, 0.02, 0.0])\n", 1000 | "tagsRDD = tagsRDD.mapValues( lambda x : ([elt[0] for elt in x],[elt[1] for elt in x]) )\n", 1001 | "\n", 1002 | "#Use custom sort function to sort the entries by order of appearance in top10tags\n", 1003 | "def customCompare( key ):\n", 1004 | " for (k,v) in top10tags:\n", 1005 | " if k == key:\n", 1006 | " return v\n", 1007 | " return 0\n", 1008 | "tagsRDD = tagsRDD.sortByKey(ascending=False, numPartitions=None, keyfunc = customCompare)\n", 1009 | "\n", 1010 | "#Take the mean tone scores for the top 10 tags\n", 1011 | "top10tagsMeanScores = tagsRDD.take(10)" 1012 | ] 1013 | }, 1014 | { 1015 | "cell_type": "code", 1016 | "execution_count": null, 1017 | "metadata": { 1018 | "collapsed": false 1019 | }, 1020 | "outputs": [], 1021 | "source": [ 1022 | "%matplotlib inline\n", 1023 | "import matplotlib\n", 1024 | "import numpy as np\n", 1025 | "import matplotlib.pyplot as plt\n", 1026 | "\n", 1027 | "params = plt.gcf()\n", 1028 | "plSize = params.get_size_inches()\n", 1029 | "params.set_size_inches( (plSize[0]*3, plSize[1]*2) )\n", 1030 | "\n", 1031 | "top5tagsMeanScores = top10tagsMeanScores[:5]\n", 1032 | "width = 0\n", 1033 | "ind=np.arange(13)\n", 1034 | "(a,b) = top5tagsMeanScores[0]\n", 1035 | "labels=b[0]\n", 1036 | "colors = [\"beige\", \"paleturquoise\", \"pink\", \"lightyellow\", \"coral\", \"lightgreen\", \"gainsboro\", \"aquamarine\",\"c\"]\n", 1037 | "idx=0\n", 1038 | "for key, value in top5tagsMeanScores:\n", 1039 | " plt.bar(ind + width, value[1], 0.15, color=colors[idx], label=key)\n", 1040 | " width += 0.15\n", 1041 | " idx += 1\n", 1042 | "plt.xticks(ind+0.3, labels)\n", 1043 | "plt.ylabel('AVERAGE SCORE')\n", 1044 | "plt.xlabel('TONES')\n", 1045 | "plt.title('Breakdown of top hashtags by sentiment tones')\n", 1046 | "\n", 1047 | "plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc='center',ncol=5, mode=\"expand\", borderaxespad=0.)\n", 1048 | "\n", 1049 | "plt.show()" 1050 | ] 1051 | } 1052 | ], 1053 | "metadata": { 1054 | "anaconda-cloud": {}, 1055 | "kernelspec": { 1056 | "display_name": "pySpark (Spark 1.6.0) Python 2", 1057 | "language": "python", 1058 | "name": "pyspark1.6python2" 1059 | }, 1060 | "language_info": { 1061 | "codemirror_mode": { 1062 | "name": "ipython", 1063 | "version": 2 1064 | }, 1065 | "file_extension": ".py", 1066 | "mimetype": "text/x-python", 1067 | "name": "python", 1068 | "nbconvert_exporter": "python", 1069 | "pygments_lexer": "ipython2", 1070 | "version": "2.7.11" 1071 | } 1072 | }, 1073 | "nbformat": 4, 1074 | "nbformat_minor": 0 1075 | } 1076 | -------------------------------------------------------------------------------- /streaming-twitter/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 
| 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | -------------------------------------------------------------------------------- /streaming-twitter/.gitignore: -------------------------------------------------------------------------------- 1 | /bin/ 2 | /config 3 | -------------------------------------------------------------------------------- /streaming-twitter/.project: -------------------------------------------------------------------------------- 1 | 2 | streaming-twitter 3 | 4 | 5 | org.scala-ide.sdt.core.scalabuilder 6 | 7 | 8 | 9 | org.scala-ide.sdt.core.scalanature 10 | org.eclipse.jdt.core.javanature 11 | 12 | 13 | -------------------------------------------------------------------------------- /streaming-twitter/build.sbt: -------------------------------------------------------------------------------- 1 | name := "streaming-twitter" 2 | 3 | version := "1.6" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies ++= { 8 | val sparkVersion = "1.6.0" 9 | Seq( 10 | "org.apache.spark" %% "spark-core" % sparkVersion % "provided", 11 | "org.apache.spark" %% "spark-sql" % sparkVersion % "provided", 12 | "org.apache.spark" %% "spark-streaming" % sparkVersion % "provided", 13 | "org.apache.spark" %% "spark-streaming-twitter" % sparkVersion, 14 | "org.apache.spark" %% "spark-repl" % sparkVersion % "provided", 15 | "com.ibm" %% "couchdb-scala" % "0.5.3", 16 | "org.apache.kafka" % "kafka-log4j-appender" % "0.9.0.0", 17 | "org.apache.kafka" % "kafka-clients" % "0.9.0.0", 18 | "org.apache.kafka" %% "kafka" % "0.9.0.0", 19 | "com.google.guava" % "guava" % "14.0.1" 20 | ) 21 | } 22 | 23 | assemblyMergeStrategy in assembly := { 24 | case PathList("org", "apache", "spark", xs @ _*) => MergeStrategy.first 25 | case PathList("scala", xs @ _*) => MergeStrategy.discard 26 | case PathList("com", "ibm", "pixiedust", xs @ _*) => MergeStrategy.discard 27 | case PathList("META-INF", "maven", "org.slf4j", xs @ _* ) => MergeStrategy.first 28 | case x => 29 | val oldStrategy = (assemblyMergeStrategy in assembly).value 30 | oldStrategy(x) 31 | } 32 | 33 | unmanagedBase <<= baseDirectory { base => base / "lib" } 34 | 35 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false) 36 | 37 | resolvers += "scalaz-bintray" at "https://dl.bintray.com/scalaz/releases" 38 | resolvers += "Local couchdb-scala repo" at (baseDirectory.value / "lib/couchdb-scala").toURI.toString 39 | -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-javadoc.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-javadoc.jar -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-javadoc.jar.md5: 
-------------------------------------------------------------------------------- 1 | e5ee6d0be04b3b9fc6f2f9c7dabc2497 -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-javadoc.jar.sha1: -------------------------------------------------------------------------------- 1 | ba8a2e725a4aae35185cbc0862f93fb86dc50138 -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-sources.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-sources.jar -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-sources.jar.md5: -------------------------------------------------------------------------------- 1 | be140baa91495e6a161eb95b3415b48d -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3-sources.jar.sha1: -------------------------------------------------------------------------------- 1 | eda716f52436863b442564400ebcecc09662d8f7 -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3.jar -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3.jar.md5: -------------------------------------------------------------------------------- 1 | 554911d3e139c8ba42957989e4f76428 -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3.jar.sha1: -------------------------------------------------------------------------------- 1 | 6c25040548743c9ae0bb2cf4636ec9da9d55068c -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3.pom: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | com.ibm 5 | couchdb-scala_2.10 6 | jar 7 | A purely functional Scala client for CouchDB 8 | https://github.com/beloglazov/couchdb-scala 9 | 0.5.3 10 | 11 | 12 | The Apache Software License, Version 2.0 13 | http://www.apache.org/licenses/LICENSE-2.0.txt 14 | repo 15 | 16 | 17 | couchdb-scala 18 | 19 | com.ibm 20 | https://github.com/beloglazov/couchdb-scala 21 | 22 | 23 | scm:git:git@github.com:beloglazov/couchdb-scala.git 24 | scm:git:git@github.com:beloglazov/couchdb-scala.git 25 | https://github.com/beloglazov/couchdb-scala 26 | 27 | 28 | 29 | beloglazov 30 | Anton Beloglazov 31 | anton.beloglazov@gmail.com 32 | 
http://beloglazov.info 33 | 34 | 35 | 36 | 37 | org.scala-lang 38 | scala-library 39 | 2.10.4 40 | 41 | 42 | org.scalaz 43 | scalaz-core_2.10 44 | 7.1.0 45 | 46 | 47 | org.scalaz 48 | scalaz-effect_2.10 49 | 7.1.0 50 | 51 | 52 | org.http4s 53 | http4s-core_2.10 54 | 0.8.2 55 | 56 | 57 | org.http4s 58 | http4s-client_2.10 59 | 0.8.2 60 | 61 | 62 | org.http4s 63 | http4s-blazeclient_2.10 64 | 0.8.2 65 | 66 | 67 | com.lihaoyi 68 | upickle_2.10 69 | 0.2.6 70 | 71 | 72 | com.github.julien-truffaut 73 | monocle-core_2.10 74 | 1.0.1 75 | 76 | 77 | com.github.julien-truffaut 78 | monocle-macro_2.10 79 | 1.0.1 80 | 81 | 82 | org.log4s 83 | log4s_2.10 84 | 1.1.3 85 | 86 | 87 | org.specs2 88 | specs2_2.10 89 | 2.4.16 90 | test 91 | 92 | 93 | org.typelevel 94 | scalaz-specs2_2.10 95 | 0.3.0 96 | test 97 | 98 | 99 | org.scalacheck 100 | scalacheck_2.10 101 | 1.12.1 102 | test 103 | 104 | 105 | org.scalaz 106 | scalaz-scalacheck-binding_2.10 107 | 7.1.0 108 | test 109 | 110 | 111 | ch.qos.logback 112 | logback-classic 113 | 1.1.2 114 | test 115 | 116 | 117 | -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3.pom.md5: -------------------------------------------------------------------------------- 1 | c19ebb91556b46c2e2a7ff027b351e15 -------------------------------------------------------------------------------- /streaming-twitter/lib/couchdb-scala/com/ibm/couchdb-scala_2.10/0.5.3/couchdb-scala_2.10-0.5.3.pom.sha1: -------------------------------------------------------------------------------- 1 | 342d29d046750084aabf94c85081f54e19bbcaa6 -------------------------------------------------------------------------------- /streaming-twitter/lib/messagehub.login-1.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/streaming-twitter/lib/messagehub.login-1.0.0.jar -------------------------------------------------------------------------------- /streaming-twitter/lib/pixiedust.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibm-watson-data-lab/spark.samples/60e55f7f07e49d43dd8c5b38185bbdf971fbcd60/streaming-twitter/lib/pixiedust.jar -------------------------------------------------------------------------------- /streaming-twitter/notebook/Spark Streaming Twitter-Watson-MessageHub.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#Spark Streaming sample application using Twitter, Watson Tone Analyzer, Event Hub and Message Hub\n", 8 | "In this Notebook, we show how to run a Spark Streaming application using a Notebook. There are multiple limitations to be aware of: \n", 9 | "1. The application will stop when the page is refreshed or closed\n", 10 | "2. As events are being processed, the application generates lots of console output which may cause memory to build up in the browser. Therefore it is not recommended to run the application for too long \n", 11 | "\n", 12 | "The code can be found here: https://github.com/ibm-watson-data-lab/spark.samples/tree/master/streaming-twitter \n", 13 | "The following code is using a pre-built jar that has been posted on the Github project, but you can replace with your own url if needed." 
14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "%AddJar https://github.com/DTAIEB/demos/raw/master/streaming-twitter-assembly-1.6.jar -f" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "##Set up the credentials for the different services\n", 32 | "Please refer to the tutorial for details on how to find the credentials for all the services, then add the value in the placeholders specified in the code below" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "val demo = com.ibm.cds.spark.samples.MessageHubStreamingTwitter\n", 44 | "val config = demo.getConfig()\n", 45 | "\n", 46 | "//Watson Tone Analyzer service\n", 47 | "config.setConfig(\"watson.tone.url\",\"https://gateway.watsonplatform.net/tone-analyzer-beta/api\")\n", 48 | "config.setConfig(\"watson.tone.password\",\"XXXX\")\n", 49 | "config.setConfig(\"watson.tone.username\",\"XXXX\")\n", 50 | "\n", 51 | "//Message Hub/Kafka service\n", 52 | "config.setConfig(\"bootstrap.servers\",\"kafka01-prod01.messagehub.services.us-south.bluemix.net:9093,kafka02-prod01.messagehub.services.us-south.bluemix.net:9093,kafka03-prod01.messagehub.services.us-south.bluemix.net:9093,kafka04-prod01.messagehub.services.us-south.bluemix.net:9093,kafka05-prod01.messagehub.services.us-south.bluemix.net:9093\")\n", 53 | "config.setConfig(\"api_key\",\"XXXX\")\n", 54 | "config.setConfig(\"kafka.topic.tweet\",\"twitter-spark\")\n", 55 | "config.setConfig(\"kafka.user.name\",\"XXXX\")\n", 56 | "config.setConfig(\"kafka.user.password\",\"XXXX\")\n", 57 | "config.setConfig(\"kafka_rest_url\",\"https://kafka-rest-prod01.messagehub.services.us-south.bluemix.net:443\")\n", 58 | "\n", 59 | "//Spark Streaming checkpointing configuration with Object Storage Swift container\n", 60 | "config.setConfig(\"name\",\"spark\");\n", 61 | "config.setConfig(\"auth_url\",\"https://identity.open.softlayer.com\");\n", 62 | "config.setConfig(\"project_id\",\"XXXX\");\n", 63 | "config.setConfig(\"region\",\"dallas\");\n", 64 | "config.setConfig(\"user_id\",\"XXXX\");\n", 65 | "config.setConfig(\"password\",\"XXXX\");\n", 66 | "config.setConfig(\"checkpointDir\", \"swift://notebooks.spark/ssc\")" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "##Producing tweets directly from Twitter\n", 74 | "Optional: The following cell is to be used only if your MessageConnect service doesn't work. 
\n", 75 | "In the next cell, you configure your Twitter credentials and call the code that will connect to Twitter, fetch the tweets and send them to MessageHub for consumption (Please refer to the tutorial for more information)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "config.setConfig(\"twitter4j.oauth.consumerKey\",\"XXXX\")\n", 87 | "config.setConfig(\"twitter4j.oauth.consumerSecret\",\"XXXX\")\n", 88 | "config.setConfig(\"twitter4j.oauth.accessToken\",\"XXXX\")\n", 89 | "config.setConfig(\"twitter4j.oauth.accessTokenSecret\",\"XXXX\")\n", 90 | "val twitterStream = com.ibm.cds.spark.samples.KafkaProducerTest.createTwitterStream(config)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "##Start the Spark Stream to collect tweets from Message Hub\n", 98 | "Start a new Twitter Stream that collects the live tweets and enrich them with Sentiment Analysis scores. The stream is run for a duration specified in the second argument of the **startTwitterStreaming** method.\n", 99 | "Note: if no duration is specified then the stream will run until the **stopTwitterStreaming** method is called." 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": false, 107 | "scrolled": false 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "demo.startTwitterStreaming(sc)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "##Close the Tweet producer\n", 119 | "Optional: To be used only if you have started it" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "com.ibm.cds.spark.samples.KafkaProducerTest.closeTwitterStream" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "##Close the Spark Streaming" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": { 144 | "collapsed": false 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "demo.stopTwitterStreaming" 149 | ] 150 | } 151 | ], 152 | "metadata": { 153 | "kernelspec": { 154 | "display_name": "Scala 2.10", 155 | "language": "scala", 156 | "name": "spark" 157 | }, 158 | "language_info": { 159 | "name": "scala" 160 | }, 161 | "name": "Twitter + Watson Tone Analyzer Part 1.ipynb" 162 | }, 163 | "nbformat": 4, 164 | "nbformat_minor": 0 165 | } -------------------------------------------------------------------------------- /streaming-twitter/notebook/Twitter + Watson Tone Analyzer Part 1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#Twitter + Watson Tone Analyzer sample Notebook Part 1: Loading the data\n", 8 | "In this Notebook, we show how to load the custom library generate as part of the Twitter + Watson Tone Analyzer streaming application. Code can be found here: https://github.com/ibm-watson-data-lab/spark.samples/tree/master/streaming-twitter.\n", 9 | "The following code is using a pre-built jar has been posted on the Github project, but you can replace with your own url if needed." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "collapsed": false 17 | }, 18 | "outputs": [ 19 | { 20 | "name": "stdout", 21 | "output_type": "stream", 22 | "text": [ 23 | "Starting download from https://github.com/ibm-watson-data-lab/spark.samples/raw/master/dist/streaming-twitter-assembly-1.6.jar\n", 24 | "Finished download of streaming-twitter-assembly-1.6.jar\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "%AddJar https://github.com/ibm-watson-data-lab/spark.samples/raw/master/dist/streaming-twitter-assembly-1.6.jar -f" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "##Set up the Twitter and Watson credentials\n", 37 | "Please refer to the tutorial for details on how to find the Twitter and Watson credentials, then add the value in the placeholders specified in the code below" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "val demo = com.ibm.cds.spark.samples.StreamingTwitter\n", 49 | "demo.setConfig(\"twitter4j.oauth.consumerKey\",\"XXXX\")\n", 50 | "demo.setConfig(\"twitter4j.oauth.consumerSecret\",\"XXXX\")\n", 51 | "demo.setConfig(\"twitter4j.oauth.accessToken\",\"XXXX\")\n", 52 | "demo.setConfig(\"twitter4j.oauth.accessTokenSecret\",\"XXXX\")\n", 53 | "demo.setConfig(\"watson.tone.url\",\"https://gateway.watsonplatform.net/tone-analyzer-beta/api\")\n", 54 | "demo.setConfig(\"watson.tone.password\",\"XXXX\")\n", 55 | "demo.setConfig(\"watson.tone.username\",\"XXXX\")" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "##Start the Spark Stream to collect live tweets\n", 63 | "Start a new Twitter Stream that collects the live tweets and enrich them with Sentiment Analysis scores. The stream is run for a duration specified in the second argument of the **startTwitterStreaming** method.\n", 64 | "Note: if no duration is specified then the stream will run until the **stopTwitterStreaming** method is called." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "Twitter stream started\n", 79 | "Tweets are collected real-time and analyzed\n", 80 | "To stop the streaming and start interacting with the data use: StreamingTwitter.stopTwitterStreaming\n", 81 | "Receiver Started: TwitterReceiver-0\n", 82 | "Batch started with 139 records\n", 83 | "Batch completed with 139 records\n", 84 | "Batch started with 270 records\n", 85 | "Stopping Twitter stream. Please wait this may take a while\n", 86 | "Receiver Stopped: TwitterReceiver-0\n", 87 | "Reason: : Stopped by driver\n", 88 | "Batch completed with 270 records\n", 89 | "Twitter stream stopped\n", 90 | "You can now create a sqlContext and DataFrame with 38 Tweets created. 
Sample usage: \n", 91 | "val (sqlContext, df) = com.ibm.cds.spark.samples.StreamingTwitter.createTwitterDataFrames(sc)\n", 92 | "df.printSchema\n", 93 | "sqlContext.sql(\"select author, text from tweets\").show\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "import org.apache.spark.streaming._\n", 99 | "demo.startTwitterStreaming(sc, Seconds(40))" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "##Create a SQLContext and a dataframe with all the tweets\n", 107 | "Note: this method will register a SparkSQL table called tweets" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 4, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "A new table named tweets with 38 records has been correctly created and can be accessed through the SQLContext variable\n", 122 | "Here's the schema for tweets\n", 123 | "root\n", 124 | " |-- author: string (nullable = true)\n", 125 | " |-- date: string (nullable = true)\n", 126 | " |-- lang: string (nullable = true)\n", 127 | " |-- text: string (nullable = true)\n", 128 | " |-- lat: double (nullable = true)\n", 129 | " |-- long: double (nullable = true)\n", 130 | " |-- Anger: double (nullable = true)\n", 131 | " |-- Disgust: double (nullable = true)\n", 132 | " |-- Fear: double (nullable = true)\n", 133 | " |-- Joy: double (nullable = true)\n", 134 | " |-- Sadness: double (nullable = true)\n", 135 | " |-- Analytical: double (nullable = true)\n", 136 | " |-- Confident: double (nullable = true)\n", 137 | " |-- Tentative: double (nullable = true)\n", 138 | " |-- Openness: double (nullable = true)\n", 139 | " |-- Conscientiousness: double (nullable = true)\n", 140 | " |-- Extraversion: double (nullable = true)\n", 141 | " |-- Agreeableness: double (nullable = true)\n", 142 | " |-- EmotionalRange: double (nullable = true)\n", 143 | "\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "val (sqlContext, df) = demo.createTwitterDataFrames(sc)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "##Execute a SparkSQL query that contains all the data" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 5, 161 | "metadata": { 162 | "collapsed": false 163 | }, 164 | "outputs": [ 165 | { 166 | "name": "stdout", 167 | "output_type": "stream", 168 | "text": [ 169 | "+--------------------+--------------------+-----+--------------------+---+----+------------------+------------------+------------------+-----------------+------------------+----------+---------+-----------------+-----------------+------------------+-----------------+-----------------+-----------------+\n", 170 | "| author| date| lang| text|lat|long| Anger| Disgust| Fear| Joy| Sadness|Analytical|Confident| Tentative| Openness| Conscientiousness| Extraversion| Agreeableness| EmotionalRange|\n", 171 | "+--------------------+--------------------+-----+--------------------+---+----+------------------+------------------+------------------+-----------------+------------------+----------+---------+-----------------+-----------------+------------------+-----------------+-----------------+-----------------+\n", 172 | "|Three Words o Wisdom|Sun Mar 06 13:00:...|en-gb|wildebeest rebuff...|0.0| 0.0| 11.0| 20.0| 19.0| 44.0| 22.0| 0.0| 0.0| 0.0| 80.0| 56.00000000000001| 15.0| 1.0| 39.0|\n", 173 | "| Jonny P|Sun Mar 06 13:00:...| en|Getting a pizza i...|0.0| 0.0| 8.0| 5.0| 
13.0|56.00000000000001| 5.0| 0.0| 0.0|56.99999999999999| 24.0| 23.0| 83.0|56.99999999999999| 82.0|\n", 174 | "| Kayla|Sun Mar 06 13:00:...| en|RT @ebhoniogarro:...|0.0| 0.0| 2.0| 0.0| 1.0| 99.0| 2.0| 0.0| 0.0| 0.0| 30.0| 56.00000000000001| 85.0| 66.0| 39.0|\n", 175 | "| Adamlbr|Sun Mar 06 13:00:...| en|New Event now on....|0.0| 0.0| 24.0| 10.0| 11.0| 46.0| 4.0| 0.0| 0.0| 0.0| 11.0| 98.0| 46.0| 49.0| 6.0|\n", 176 | "|Lexa deserved better|Sun Mar 06 13:00:...| en|RT @canoodleclexa...|0.0| 0.0| 8.0| 7.000000000000001| 9.0| 80.0| 7.000000000000001| 84.0| 0.0| 0.0| 12.0|28.000000000000004| 73.0| 59.0| 51.0|\n", 177 | "| LoveBakesGoodCakes|Sun Mar 06 13:00:...| en|Yum, yum! Honey B...|0.0| 0.0| 41.0| 2.0| 6.0| 62.0| 7.000000000000001| 0.0| 0.0| 0.0| 60.0| 69.0| 64.0| 18.0| 11.0|\n", 178 | "| High Tech Planet|Sun Mar 06 13:00:...| en|Google is testing...|0.0| 0.0| 11.0| 5.0| 32.0| 37.0| 5.0| 78.0| 0.0| 0.0|56.99999999999999| 30.0| 6.0| 13.0|57.99999999999999|\n", 179 | "| Kael|Sun Mar 06 13:00:...| en|RT @mgiseelle: Ha...|0.0| 0.0| 16.0| 4.0|14.000000000000002| 23.0| 13.0| 0.0| 0.0| 0.0| 68.0| 85.0|57.99999999999999| 35.0| 6.0|\n", 180 | "| Ryan|Sun Mar 06 13:00:...| en|ALL THAT EFFORT T...|0.0| 0.0| 19.0|14.000000000000002| 24.0| 12.0| 24.0| 61.0| 79.0| 0.0| 78.0| 3.0| 49.0| 1.0| 91.0|\n", 181 | "| princesss|Sun Mar 06 13:00:...| en|RT @SexualGif: Be...|0.0| 0.0| 13.0| 7.000000000000001| 13.0| 34.0| 15.0| 0.0| 0.0| 0.0|56.00000000000001| 93.0| 62.0| 38.0| 39.0|\n", 182 | "| Fadi Nasser|Sun Mar 06 13:00:...| en|#USA missiles cha...|0.0| 0.0| 7.000000000000001| 10.0| 8.0| 30.0| 13.0| 0.0| 0.0| 0.0| 94.0| 75.0| 27.0| 23.0| 20.0|\n", 183 | "| Briyon?e|Sun Mar 06 13:00:...| en|RT @tonestradamus...|0.0| 0.0| 52.0| 19.0| 5.0| 1.0|14.000000000000002| 23.0| 0.0| 75.0| 21.0| 6.0| 84.0| 44.0| 59.0|\n", 184 | "| BarnBurnerBBQ|Sun Mar 06 13:00:...| en|Presenting sponso...|0.0| 0.0| 10.0| 18.0| 10.0| 26.0| 8.0| 67.0| 0.0| 0.0| 36.0| 91.0| 71.0| 91.0| 2.0|\n", 185 | "| Majid Navabi|Sun Mar 06 13:00:...| en| Download|0.0| 0.0| 12.0| 9.0| 18.0|56.99999999999999|14.000000000000002| 0.0| 0.0| 0.0| 52.0| 56.00000000000001| 15.0| 100.0| 0.0|\n", 186 | "| ?????? 
?????|Sun Mar 06 13:00:...| en|RT @Adel__Almalki...|0.0| 0.0| 43.0| 6.0| 20.0| 3.0| 2.0| 0.0| 0.0| 0.0| 90.0| 56.00000000000001| 15.0| 1.0| 39.0|\n", 187 | "| liv|Sun Mar 06 13:00:...| en|RT @iamjojo: You ...|0.0| 0.0| 5.0| 2.0| 9.0| 89.0| 9.0| 0.0| 0.0| 0.0| 2.0| 2.0| 100.0| 85.0| 2.0|\n", 188 | "| LADY GAGA|Sun Mar 06 13:00:...| en|Miek_tweet #TilIt...|0.0| 0.0| 16.0| 16.0| 8.0| 23.0| 21.0| 0.0| 0.0| 0.0| 80.0| 56.00000000000001| 15.0| 1.0| 39.0|\n", 189 | "| donatello ;)|Sun Mar 06 13:00:...| en|RT @__trillgawdd:...|0.0| 0.0|14.000000000000002| 3.0| 13.0| 66.0| 9.0| 0.0| 0.0| 0.0| 30.0| 56.00000000000001| 53.0| 69.0| 20.0|\n", 190 | "| Liz|Sun Mar 06 13:00:...| en|RT @Samantha_Evel...|0.0| 0.0| 12.0| 8.0| 24.0| 10.0| 33.0| 43.0| 72.0| 91.0| 5.0| 12.0| 34.0| 61.0| 97.0|\n", 191 | "| Chrystal Johnson|Sun Mar 06 13:00:...| en|Take Aromatherapy...|0.0| 0.0| 16.0| 12.0| 44.0| 8.0| 8.0| 0.0| 0.0| 0.0| 71.0| 96.0| 40.0| 60.0| 2.0|\n", 192 | "+--------------------+--------------------+-----+--------------------+---+----+------------------+------------------+------------------+-----------------+------------------+----------+---------+-----------------+-----------------+------------------+-----------------+-----------------+-----------------+\n", 193 | "only showing top 20 rows\n", 194 | "\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "val fullSet = sqlContext.sql(\"select * from tweets\") //Select all columns\n", 200 | "fullSet.show" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "##SparkSQL query example on the data.\n", 208 | "Select all the tweets that have Anger score greated than 70%" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 6, 214 | "metadata": { 215 | "collapsed": false 216 | }, 217 | "outputs": [ 218 | { 219 | "name": "stdout", 220 | "output_type": "stream", 221 | "text": [ 222 | "0\n", 223 | "+----+\n", 224 | "|text|\n", 225 | "+----+\n", 226 | "+----+\n", 227 | "\n" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "val set = sqlContext.sql(\"select text from tweets where Anger > 60\")\n", 233 | "println(set.count)\n", 234 | "set.show" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "##Persist the dataset into a parquet file on Object Storage service\n", 242 | "The parquet file will be reloaded in IPython Part 2 Notebook\n", 243 | "Note: you can disregard the warning messages related to SLF4J" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 7, 249 | "metadata": { 250 | "collapsed": false 251 | }, 252 | "outputs": [ 253 | { 254 | "name": "stdout", 255 | "output_type": "stream", 256 | "text": [ 257 | "SLF4J: Failed to load class \"org.slf4j.impl.StaticLoggerBinder\".\n", 258 | "SLF4J: Defaulting to no-operation (NOP) logger implementation\n", 259 | "SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.\n" 260 | ] 261 | } 262 | ], 263 | "source": [ 264 | "fullSet.repartition(1).saveAsParquetFile(\"swift://notebooks.spark/tweetsFull.parquet\")" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": { 271 | "collapsed": true 272 | }, 273 | "outputs": [], 274 | "source": [] 275 | } 276 | ], 277 | "metadata": { 278 | "kernelspec": { 279 | "display_name": "Scala 2.10", 280 | "language": "scala", 281 | "name": "spark" 282 | }, 283 | "language_info": { 284 | "name": "scala" 285 | }, 286 | "name": "Twitter + Watson Tone Analyzer Part 1.ipynb" 287 
| }, 288 | "nbformat": 4, 289 | "nbformat_minor": 0 290 | } -------------------------------------------------------------------------------- /streaming-twitter/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0") 2 | -------------------------------------------------------------------------------- /streaming-twitter/readme.md: -------------------------------------------------------------------------------- 1 | #Sentiment Analysis of Twitter Hashtags 2 | 3 | ####Use Spark Streaming in combination with IBM Watson to perform sentiment analysis showing how a conversation is trending on Twitter. 4 | 5 | Track how consumers feel about you based on their tweets. To get real-time sentiment analysis, deploy our sample **Spark Streaming with Twitter and Watson** app on Bluemix and use its Notebook to analyze public opinion. 6 | 7 | 8 | This sample app uses Spark Streaming to create a feed that captures live tweets from Twitter. You can filter the tweets that contain the hashtag(s) of your choice. The tweet data is enriched in real time with various sentiment scores provided by the Watson Tone Analyzer service (available on Bluemix). This service provides insight into sentiment, or how the author feels. Then use Spark SQL to load the data into a DataFrame for further analysis. Here's the basic architecture of this app: 9 | ![Twitter + Watson high level architecture](https://i2.wp.com/developer.ibm.com/clouddataservices/wp-content/uploads/sites/47/2015/10/Spark-Streaming-Twitter-architecture.png) 10 | 11 | Follow the full tutorial to understand how it works and create your own stream. 12 | 13 | [Get started](https://developer.ibm.com/clouddataservices/sentiment-analysis-of-twitter-hashtags/) 14 | -------------------------------------------------------------------------------- /streaming-twitter/sampleConfig/sampleconf.properties: -------------------------------------------------------------------------------- 1 | #Twitter credentials 2 | twitter4j.oauth.consumerKey=XXXX 3 | twitter4j.oauth.consumerSecret=XXXX 4 | twitter4j.oauth.accessToken=XXXX 5 | twitter4j.oauth.accessTokenSecret=XXXX 6 | 7 | #MessageHub 8 | kafka.topic.tweet=twitter-spark 9 | kafka.user.name=XXXX 10 | kafka.user.password=XXXX 11 | bootstrap.servers=kafka01-prod01.messagehub.services.us-south.bluemix.net:9093,\ 12 | kafka02-prod01.messagehub.services.us-south.bluemix.net:9093,\ 13 | kafka03-prod01.messagehub.services.us-south.bluemix.net:9093,\ 14 | kafka04-prod01.messagehub.services.us-south.bluemix.net:9093,\ 15 | kafka05-prod01.messagehub.services.us-south.bluemix.net:9093 16 | api_key=XXXX 17 | kafka_rest_url=https://kafka-rest-prod01.messagehub.services.us-south.bluemix.net:443 18 | 19 | #Watson Tone Analyzer 20 | watson.tone.url=https://gateway.watsonplatform.net/tone-analyzer-experimental/api 21 | watson.tone.password=XXXX 22 | watson.tone.username=XXXX 23 | 24 | #Checkpoint directory 25 | checkpointDir=XXXX 26 | -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/KafkaProducerTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package com.ibm.cds.spark.samples 18 | 19 | import java.io.ByteArrayInputStream 20 | import java.io.ByteArrayOutputStream 21 | import java.io.ObjectInputStream 22 | import java.io.ObjectOutputStream 23 | import java.util.concurrent.TimeUnit 24 | import scala.collection.JavaConversions.mapAsJavaMap 25 | import scala.collection.JavaConversions.seqAsJavaList 26 | import org.apache.kafka.clients.consumer.KafkaConsumer 27 | import org.apache.kafka.clients.producer.ProducerRecord 28 | import org.apache.kafka.common.serialization.Deserializer 29 | import org.apache.kafka.common.serialization.Serializer 30 | import org.apache.kafka.common.serialization.StringDeserializer 31 | import org.apache.log4j.Level 32 | import org.apache.log4j.Logger 33 | import com.ibm.cds.spark.samples.config.MessageHubConfig 34 | import twitter4j.StallWarning 35 | import twitter4j.Status 36 | import twitter4j.StatusDeletionNotice 37 | import twitter4j.StatusListener 38 | import twitter4j.TwitterStreamFactory 39 | import scala.util.parsing.json.JSON 40 | import java.io.InputStream 41 | import twitter4j.TwitterStream 42 | import com.ibm.cds.spark.samples.config.DemoConfig 43 | import org.apache.spark.Logging 44 | 45 | 46 | /** 47 | * @author dtaieb 48 | */ 49 | object KafkaProducerTest extends Logging{ 50 | //Very verbose, enable only if necessary 51 | //Logger.getLogger("org.apache.kafka").setLevel(Level.ALL) 52 | //Logger.getLogger("kafka").setLevel(Level.ALL) 53 | 54 | var twitterStream : TwitterStream = _; 55 | 56 | def main(args: Array[String]): Unit = { 57 | createTwitterStream(); 58 | } 59 | 60 | def createTwitterStream(props: DemoConfig=null):TwitterStream = { 61 | if( twitterStream != null){ 62 | println("Twitter Stream already running. 
Please call closeTwitterStream first"); 63 | return twitterStream; 64 | } 65 | var kafkaProps:MessageHubConfig = null; 66 | if ( props == null ){ 67 | kafkaProps = new MessageHubConfig 68 | }else{ 69 | kafkaProps = props.cloneConfig 70 | } 71 | kafkaProps.setValueSerializer[StatusSerializer] 72 | kafkaProps.validateConfiguration("watson.tone.") 73 | kafkaProps.createTopicsIfNecessary( kafkaProps.getConfig(MessageHubConfig.KAFKA_TOPIC_TWEETS ) ) 74 | val kafkaProducer = new org.apache.kafka.clients.producer.KafkaProducer[java.lang.String, Status]( kafkaProps.toImmutableMap() ); 75 | 76 | twitterStream = new TwitterStreamFactory().getInstance(); 77 | twitterStream.addListener( new StatusListener(){ 78 | var lastSent:Long = 0; 79 | def onStatus(status: Status){ 80 | if ( lastSent == 0 || System.currentTimeMillis() - lastSent > 200L){ 81 | lastSent = System.currentTimeMillis() 82 | logInfo("Got a status " + status.getText ) 83 | val producerRecord = new ProducerRecord(kafkaProps.getConfig(MessageHubConfig.KAFKA_TOPIC_TWEETS ), "tweet", status ) 84 | try{ 85 | val metadata = kafkaProducer.send( producerRecord ).get(2000, TimeUnit.SECONDS); 86 | logInfo("Successfully sent record: Topic: " + metadata.topic + " Offset: " + metadata.offset ) 87 | }catch{ 88 | case e:Throwable => e.printStackTrace 89 | } 90 | } 91 | } 92 | def onDeletionNotice( notice: StatusDeletionNotice){ 93 | 94 | } 95 | def onTrackLimitationNotice( numLimitation : Int){ 96 | println("Received track limitation notice from Twitter: " + numLimitation) 97 | } 98 | 99 | def onException( e: Exception){ 100 | println("Unexpected error from twitterStream: " + e.getMessage); 101 | logError(e.getMessage, e) 102 | } 103 | 104 | def onScrubGeo(lat: Long, long: Long ){ 105 | 106 | } 107 | 108 | def onStallWarning(warning: StallWarning ){ 109 | 110 | } 111 | }) 112 | 113 | //Start twitter stream sampling 114 | twitterStream.sample(); 115 | 116 | println("Twitter stream started. Tweets will flow to MessageHub instance. Please call closeTwitterStream to stop the stream") 117 | twitterStream 118 | } 119 | 120 | def closeTwitterStream(){ 121 | if ( twitterStream==null){ 122 | println("Nothing to close. Twitter stream has not been started") 123 | }else{ 124 | println("Stopping twitter stream"); 125 | twitterStream.shutdown() 126 | twitterStream=null 127 | println("Twitter Stream stopped") 128 | } 129 | } 130 | } 131 | 132 | object KafkaConsumerTest { 133 | def main(args: Array[String]): Unit = { 134 | val kafkaProps = new MessageHubConfig 135 | kafkaProps.validateConfiguration("watson.tone.") 136 | val kafkaConsumer = new KafkaConsumer[java.lang.String, StatusAdapter](kafkaProps.toImmutableMap, new StringDeserializer(), new StatusDeserializer()) 137 | 138 | kafkaConsumer.subscribe( List(kafkaProps.getConfig(MessageHubConfig.KAFKA_TOPIC_TWEETS )) ) 139 | new Thread( new Runnable { 140 | def run(){ 141 | while( true ){ 142 | Thread.sleep( 1000L ) 143 | val it = kafkaConsumer.poll(1000L).iterator 144 | while( it.hasNext() ){ 145 | val record = it.next(); 146 | println( record.value ); 147 | } 148 | } 149 | } 150 | }).start 151 | } 152 | } -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/MessageHubStreamingTwitter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.ibm.cds.spark.samples 19 | 20 | import scala.BigDecimal 21 | import scala.collection.JavaConversions.mapAsJavaMap 22 | import scala.collection.immutable.Seq.canBuildFrom 23 | import scala.collection.mutable.ListBuffer 24 | import scala.collection.mutable.Map 25 | import scala.reflect.ClassTag 26 | import org.apache.kafka.clients.producer.ProducerRecord 27 | import org.apache.kafka.common.serialization.StringDeserializer 28 | import org.apache.kafka.common.serialization.StringSerializer 29 | import org.apache.spark.HashPartitioner 30 | import org.apache.spark.SparkConf 31 | import org.apache.spark.SparkContext 32 | import org.apache.spark.rdd.RDD 33 | import org.apache.spark.sql.Row 34 | import org.apache.spark.streaming.Duration 35 | import org.apache.spark.streaming.Seconds 36 | import org.apache.spark.streaming.StreamingContext 37 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 38 | import org.http4s.client.blaze.PooledHttp1Client 39 | import com.google.common.base.CharMatcher 40 | import com.ibm.cds.spark.samples.config.MessageHubConfig 41 | import com.ibm.cds.spark.samples.dstream.KafkaStreaming.KafkaStreamingContextAdapter 42 | import twitter4j.Status 43 | import org.apache.spark.streaming.scheduler.StreamingListenerBatchStarted 44 | import org.apache.spark.streaming.scheduler.StreamingListenerBatchCompleted 45 | import com.ibm.cds.spark.samples.config.DemoConfig 46 | import org.apache.log4j.Level 47 | import org.apache.log4j.Logger 48 | import org.apache.spark.streaming.dstream.DStream 49 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStopped 50 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverError 51 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStarted 52 | import org.apache.spark.broadcast.Broadcast 53 | import org.apache.spark.Logging 54 | import java.util.Arrays 55 | 56 | /** 57 | * @author dtaieb 58 | * Twitter+Watson sample app with MessageHub/Kafka 59 | */ 60 | object MessageHubStreamingTwitter extends Logging{ 61 | 62 | var ssc: StreamingContext = null 63 | val reuseCheckpoint = false; 64 | 65 | val queue = new scala.collection.mutable.Queue[(String, String)] 66 | 67 | final val KAFKA_TOPIC_TOP_HASHTAGS = "topHashTags" 68 | final val KAFKA_TOPIC_TONE_SCORES = "topHashTags.toneScores" 69 | final val KAFKA_TOPIC_TOTAL_TWEETS_PROCESSED = "total_tweets" 70 | 71 | //Logger.getLogger("org.apache.kafka").setLevel(Level.ALL) 72 | //Logger.getLogger("kafka").setLevel(Level.ALL) 73 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN) 74 | 75 | def main(args: Array[String]): Unit = { 76 | println("Printing arguments: "); 77 | args.foreach { println } 78 | 79 | if(args.length>0 && System.getProperty("DEMO_CONFIG_PATH") == null ){ 80 | //On 
Spark Service, input files are passed as parameters, if available, we assume first parameter is config file 81 | System.setProperty("DEMO_CONFIG_PATH", args(0)) 82 | } 83 | val conf = new SparkConf().setAppName("Spark Streaming Twitter + Watson with MessageHub/Kafka Demo") 84 | val sc = new SparkContext(conf) 85 | startTwitterStreaming(sc); 86 | 87 | if(ssc!=null){ 88 | //When running as stand alone app, we call awaitTermination to make sure the JVM doesn't exit prematurely due to the fact 89 | //that all non-daemon threads have terminated. Note: Don't call awaitTermination directly from startTwitterStreaming as it can be run 90 | //From Notebook 91 | ssc.awaitTermination() 92 | } 93 | } 94 | 95 | //Hold configuration key/value pairs 96 | lazy val kafkaProps = new MessageHubConfig 97 | 98 | //Wrapper api for Notebook access 99 | def getConfig():DemoConfig={ 100 | kafkaProps 101 | } 102 | 103 | def startTwitterStreaming( sc: SparkContext, stopAfter: Duration = Seconds(0) ){ 104 | if ( ssc != null ){ 105 | println("Twitter Stream already running"); 106 | return; 107 | } 108 | 109 | kafkaProps.setValueSerializer[StringSerializer]; 110 | 111 | if ( !kafkaProps.validateConfiguration("twitter4j.oauth") ){ 112 | return; 113 | } 114 | 115 | //Set the hadoop configuration if needed 116 | val checkpointDir = kafkaProps.getConfig( MessageHubConfig.CHECKPOINT_DIR_KEY ); 117 | if ( checkpointDir.startsWith("swift") ){ 118 | println("Setting hadoop configuration for swift container") 119 | kafkaProps.set_hadoop_config(sc) 120 | } 121 | 122 | //Make sure the topics are already created 123 | kafkaProps.createTopicsIfNecessary( KAFKA_TOPIC_TONE_SCORES, KAFKA_TOPIC_TOP_HASHTAGS, KAFKA_TOPIC_TOTAL_TWEETS_PROCESSED ) 124 | 125 | val kafkaProducer = new org.apache.kafka.clients.producer.KafkaProducer[String, String]( kafkaProps.toImmutableMap ); 126 | 127 | if ( !reuseCheckpoint ){ 128 | createStreamingContextAndRunAnalytics(sc); 129 | }else{ 130 | ssc = StreamingContext.getOrCreate( 131 | kafkaProps.getConfig( MessageHubConfig.CHECKPOINT_DIR_KEY ), 132 | () => { 133 | createStreamingContextAndRunAnalytics(sc); 134 | }, 135 | sc.hadoopConfiguration, 136 | true 137 | ); 138 | } 139 | 140 | ssc.addStreamingListener( new StreamingListener() ) 141 | 142 | new Thread( new Runnable() { 143 | def run(){ 144 | while(ssc!=null){ 145 | while(!queue.isEmpty ){ 146 | try{ 147 | var task:(String,String) = null; 148 | queue.synchronized{ 149 | task = queue.dequeue(); 150 | } 151 | if ( task != null ){ 152 | val producerRecord = new ProducerRecord[String,String](task._1, "tweet", task._2 ) 153 | val metadata = kafkaProducer.send( producerRecord ).get; 154 | logInfo("Sent record " + metadata.offset() + " Topic " + task._1) 155 | } 156 | }catch{ 157 | case e:Throwable => logError(e.getMessage, e) 158 | } 159 | } 160 | queue.synchronized{ 161 | queue.wait(); 162 | } 163 | } 164 | } 165 | },"Message Hub producer").start 166 | 167 | ssc.start 168 | 169 | println("Twitter stream started"); 170 | println("Tweets are collected real-time and analyzed") 171 | println("To stop the streaming and start interacting with the data use: StreamingTwitter.stopTwitterStreaming") 172 | 173 | if ( !stopAfter.isZero ){ 174 | //Automatically stop it after 10s 175 | new Thread( new Runnable { 176 | def run(){ 177 | Thread.sleep( stopAfter.milliseconds ) 178 | stopTwitterStreaming 179 | } 180 | }).start 181 | } 182 | } 183 | 184 | def createStreamingContextAndRunAnalytics(sc:SparkContext):StreamingContext={ 185 | //Broadcast the config to each worker 
node 186 | val broadcastVar = sc.broadcast( kafkaProps.toImmutableMap ) 187 | ssc = new StreamingContext( sc, Seconds(5) ) 188 | ssc.checkpoint(kafkaProps.getConfig( MessageHubConfig.CHECKPOINT_DIR_KEY )); 189 | val stream = ssc.createKafkaStream[String, StatusAdapter,StringDeserializer, StatusDeserializer]( 190 | kafkaProps, 191 | List(kafkaProps.getConfig(MessageHubConfig.KAFKA_TOPIC_TWEETS )) 192 | ); 193 | runAnalytics(sc, broadcastVar, stream) 194 | ssc; 195 | } 196 | 197 | def runAnalytics(sc:SparkContext, broadcastVar: Broadcast[scala.collection.immutable.Map[String,String]], stream:DStream[(String,StatusAdapter)]){ 198 | val keys = broadcastVar.value.get("tweets.key").get.split(","); 199 | val tweets = stream.map( t => t._2) 200 | .filter { status => 201 | status.userLang.startsWith("en") && CharMatcher.ASCII.matchesAllOf(status.text) && ( keys.isEmpty || keys.exists{status.text.contains(_)}) 202 | } 203 | 204 | val rowTweets = tweets.map(status=> { 205 | lazy val client = PooledHttp1Client() 206 | val sentiment = ToneAnalyzer.computeSentiment( client, status, broadcastVar ) 207 | var scoreMap : Map[String, Double] = Map() 208 | if ( sentiment != null ){ 209 | for( toneCategory <- Option(sentiment.tone_categories).getOrElse( Seq() )){ 210 | for ( tone <- Option( toneCategory.tones ).getOrElse( Seq() ) ){ 211 | scoreMap.put( tone.tone_id, (BigDecimal(tone.score).setScale(2, BigDecimal.RoundingMode.HALF_UP).toDouble) * 100.0 ) 212 | } 213 | } 214 | } 215 | 216 | EnrichedTweet( 217 | status.userName, 218 | status.userId, 219 | status.createdAt, 220 | status.userLang, 221 | status.text, 222 | status.long, 223 | status.lat, 224 | scoreMap 225 | ) 226 | }) 227 | 228 | val delimTagTone = "-%!" 229 | val delimToneScore = ":%@" 230 | val statsStream = rowTweets.map { eTweet => ("total_tweets", 1L) } 231 | .reduceByKey( _+_ ) 232 | .updateStateByKey( (a:Seq[Long], b:Option[Long] ) => { 233 | var runningCount=b.getOrElse(0L) 234 | a.foreach { v => runningCount=runningCount+v } 235 | Some(runningCount) 236 | }) 237 | statsStream.foreachRDD( rdd =>{ 238 | queue.synchronized{ 239 | queue+=((KAFKA_TOPIC_TOTAL_TWEETS_PROCESSED, TweetsMetricJsonSerializer.serialize(rdd.collect()))) 240 | try{ 241 | queue.notify 242 | }catch{ 243 | case e:Throwable=>logError(e.getMessage, e) 244 | } 245 | } 246 | }) 247 | 248 | val metricsStream = rowTweets.flatMap { eTweet => { 249 | val retList = ListBuffer[String]() 250 | for ( tag <- eTweet.text.split("\\s+") ){ 251 | if ( tag.startsWith( "#") && tag.length > 1 ){ 252 | for ( tone <- Option( eTweet.sentimentScores.keys ).getOrElse( Seq() ) ){ 253 | retList += (tag + delimTagTone + tone + delimToneScore + eTweet.sentimentScores.getOrElse( tone, 0.0)) 254 | } 255 | } 256 | } 257 | retList.toList 258 | }} 259 | .map { fullTag => { 260 | val split = fullTag.split(delimToneScore); 261 | (split(0), split(1).toFloat) 262 | }} 263 | .combineByKey( 264 | (x:Float) => (x,1), 265 | (x:(Float,Int), y:Float) => (x._1 + y, x._2+1), 266 | (x:(Float,Int),y:(Float,Int)) => (x._1 + y._1, x._2 + y._2), 267 | new HashPartitioner(sc.defaultParallelism) 268 | ) 269 | .map[(String,(Long/*count*/, List[(String, Double)]))]{ t => { 270 | val key = t._1; 271 | val ab = t._2; 272 | val split = key.split(delimTagTone) 273 | (split(0), (ab._2, List((split(1), BigDecimal(ab._1/ab._2).setScale(2, BigDecimal.RoundingMode.HALF_UP).toDouble )))) 274 | }} 275 | .reduceByKey( (t,u) => (t._1+u._1, (t._2 ::: u._2).sortWith( (l,r) => l._1.compareTo( r._1 ) < 0 ))) 276 | .mapValues( (item:(Long, 
List[(String,Double)])) => { 277 | val unzip = item._2.unzip 278 | (item._1/(item._2.size), unzip._1, unzip._2) 279 | }) 280 | .updateStateByKey( (a:scala.collection.Seq[(Long, List[String], List[Double])], b: Option[(Long, List[String], List[Double])]) => { 281 | val safeB = b.getOrElse( (0L, List(), List() ) ) 282 | var listTones = safeB._2 283 | var listScores = safeB._3 284 | var count = safeB._1 285 | for( item <- a ){ 286 | count += item._1 287 | listScores = listScores.zipAll( item._3, 0.0, 0.0).map{ case(a,b)=>(a+b)/2 }.toList 288 | listTones = item._2 289 | } 290 | 291 | Some( (count, listTones, listScores) ) 292 | }) 293 | 294 | metricsStream.print 295 | 296 | metricsStream.foreachRDD( rdd =>{ 297 | val topHashTags = rdd.sortBy( f => f._2._1, false ).take(5) 298 | if ( !topHashTags.isEmpty){ 299 | queue.synchronized{ 300 | queue += ((KAFKA_TOPIC_TOP_HASHTAGS, TweetsMetricJsonSerializer.serialize(topHashTags.map( f => (f._1, f._2._1 ))))) 301 | queue += ((KAFKA_TOPIC_TONE_SCORES, ToneScoreJsonSerializer.serialize(topHashTags))) 302 | try{ 303 | queue.notify 304 | }catch{ 305 | case e:Throwable=>logError(e.getMessage, e) 306 | } 307 | } 308 | } 309 | }) 310 | } 311 | 312 | def stopTwitterStreaming(){ 313 | if ( ssc == null){ 314 | println("No Twitter stream to stop"); 315 | return; 316 | } 317 | 318 | println("Stopping Twitter stream. Please wait this may take a while") 319 | ssc.stop(stopSparkContext = false, stopGracefully = true) 320 | ssc = null 321 | println("Twitter stream stopped"); 322 | } 323 | } 324 | 325 | object TweetsMetricJsonSerializer extends Logging{ 326 | def serialize(value: Seq[(String,Long)] ): String = { 327 | val sb = new StringBuilder("[") 328 | var comma = "" 329 | value.foreach( item => { 330 | sb.append( comma + "[\"" + item._1.replaceAll("\"", "") + "\"," + item._2 + "]") 331 | comma="," 332 | }) 333 | sb.append("]") 334 | logInfo("Serialized json: " + sb) 335 | sb.toString() 336 | } 337 | } 338 | 339 | object ToneScoreJsonSerializer extends Logging{ 340 | def serializeList[U:ClassTag]( label: String, value: List[U] ):String = { 341 | val sb = new StringBuilder("[\"" + label.replaceAll("\"", "") + "\"") 342 | value.foreach { item => { 343 | if ( item.isInstanceOf[String] ) { 344 | val s = ",\"" + item.toString().replaceAll("\"", "") + "\""; 345 | sb.append( s.replaceAll("\"\"", "\"") ) 346 | }else if ( item.isInstanceOf[Double] ){ 347 | sb.append("," + item ) 348 | } 349 | }} 350 | sb.append("]") 351 | sb.toString 352 | } 353 | def serialize(value:Seq[(String, (Long, List[String], List[Double]))]):String={ 354 | val sb = new StringBuilder("[") 355 | var comma = "" 356 | var appendToneData = true; 357 | value.foreach( item => { 358 | if ( appendToneData ){ 359 | sb.append( comma + serializeList( "x", item._2._2 ) ) 360 | appendToneData = false 361 | comma = "," 362 | } 363 | sb.append( comma + serializeList( item._1, item._2._3 ) ) 364 | comma="," 365 | }) 366 | sb.append("]") 367 | logInfo("Serialized size: " + value.size + ". Tone json: " + sb) 368 | sb.toString() 369 | } 370 | } -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/PixiedustStreamingTwitter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.ibm.cds.spark.samples 19 | 20 | import scala.collection.mutable._ 21 | import com.ibm.pixiedust.ChannelReceiver 22 | import org.apache.spark.Logging 23 | import org.apache.log4j.Logger 24 | import org.apache.log4j.Level 25 | import org.apache.spark.SparkContext 26 | import org.apache.spark.streaming.StreamingContext 27 | import org.apache.spark.rdd.RDD 28 | import org.apache.spark.sql.types.StructType 29 | import org.apache.spark.sql.Row 30 | import com.ibm.cds.spark.samples.config.DemoConfig 31 | import org.apache.spark.streaming.Seconds 32 | import org.apache.spark.sql.types.IntegerType 33 | import org.apache.spark.sql.types.DoubleType 34 | import org.http4s.client.blaze.PooledHttp1Client 35 | import org.apache.spark.sql.types.StructField 36 | import org.apache.spark.sql.types.StringType 37 | import com.google.common.base.CharMatcher 38 | import com.ibm.couchdb.CouchDb 39 | import com.ibm.couchdb.TypeMapping 40 | import com.ibm.couchdb.CouchDbApi 41 | import org.apache.spark.sql.SQLContext 42 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverError 43 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStopped 44 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStarted 45 | import org.apache.spark.streaming.scheduler.StreamingListenerBatchCompleted 46 | import org.apache.spark.streaming.scheduler.StreamingListenerBatchStarted 47 | import org.apache.spark.SparkConf 48 | import org.apache.spark.streaming.dstream.DStream 49 | import org.apache.spark.broadcast.Broadcast 50 | import org.apache.spark.HashPartitioner 51 | import twitter4j.Status 52 | import org.codehaus.jettison.json.JSONObject 53 | import org.apache.spark.AccumulableParam 54 | import org.apache.spark.streaming.StreamingContextState 55 | import org.apache.spark.sql.DataFrame 56 | 57 | /* @author dtaieb 58 | * Twitter+Watson sentiment analysis app powered by Pixiedust 59 | */ 60 | object PixiedustStreamingTwitter extends ChannelReceiver() with Logging{ 61 | var ssc: StreamingContext = null 62 | var workingRDD: RDD[Row] = null 63 | //Hold configuration key/value pairs 64 | lazy val config = new DemoConfig 65 | lazy val logger: Logger = Logger.getLogger( "com.ibm.cds.spark.samples.PixiedustStreamingTwitter" ) 66 | 67 | val BEGINSTREAM = "@BEGINSTREAM@" 68 | val ENDSTREAM = "@ENDSTREAM@" 69 | 70 | def sendLog(s:String){ 71 | send("log", s) 72 | } 73 | 74 | //Wrapper api for Notebook access 75 | def setConfig(key:String, value:String){ 76 | config.setConfig(key, value) 77 | } 78 | 79 | //main method invoked when running as a standalone Spark Application 80 | def main(args: Array[String]) { 81 | val conf = new SparkConf().setAppName("Pixiedust Spark Streaming Twitter Demo") 82 | val sc = new SparkContext(conf) 83 | startStreaming(); 
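//Note: startStreaming() picks up the SparkContext created above via SparkContext.getOrCreate, so no context handle is passed in explicitly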
84 | } 85 | 86 | def createTwitterDataFrames(sqlContext: SQLContext) : DataFrame = { 87 | if ( workingRDD == null || workingRDD.count <= 0 ){ 88 | println("No data receive. Please start the Twitter stream again to collect data") 89 | return null 90 | } 91 | 92 | sqlContext.createDataFrame( workingRDD, schemaTweets ) 93 | } 94 | 95 | class PixiedustStreamingListener extends org.apache.spark.streaming.scheduler.StreamingListener { 96 | override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted) { 97 | sendLog("Receiver Started: " + receiverStarted.receiverInfo.name ) 98 | //Signal the frontend that we started streaming 99 | sendLog(BEGINSTREAM) 100 | } 101 | 102 | override def onReceiverError(receiverError: StreamingListenerReceiverError) { 103 | sendLog("Receiver Error: " + receiverError.receiverInfo.lastError) 104 | } 105 | 106 | override def onReceiverStopped(receiverStopped: StreamingListenerReceiverStopped) { 107 | sendLog("Receiver Stopped: " + receiverStopped.receiverInfo.name) 108 | sendLog("Reason: " + receiverStopped.receiverInfo.lastError + " : " + receiverStopped.receiverInfo.lastErrorMessage) 109 | //signal the front end that we're done streaming 110 | sendLog(ENDSTREAM) 111 | } 112 | 113 | override def onBatchStarted(batchStarted: StreamingListenerBatchStarted){ 114 | sendLog("Batch started with " + batchStarted.batchInfo.numRecords + " records") 115 | } 116 | 117 | override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted){ 118 | sendLog("Batch completed with " + batchCompleted.batchInfo.numRecords + " records"); 119 | } 120 | } 121 | 122 | val reuseCheckpoint = false; 123 | 124 | def startStreaming(){ 125 | val sc = SparkContext.getOrCreate 126 | sendLog("Starting twitter stream"); 127 | if ( ssc != null ){ 128 | sendLog("Twitter Stream already running"); 129 | sendLog("Please use stopTwitterStreaming() first and try again"); 130 | return; 131 | } 132 | 133 | if ( !config.validateConfiguration() ){ 134 | sendLog("Unable to validate config") 135 | sendLog(ENDSTREAM) 136 | return; 137 | } 138 | 139 | Logger.getLogger("org.apache.spark").setLevel(Level.OFF) 140 | 141 | //Set the hadoop configuration if needed 142 | val checkpointDir = config.getConfig( DemoConfig.CHECKPOINT_DIR_KEY ); 143 | if ( checkpointDir.startsWith("swift") ){ 144 | println("Setting hadoop configuration for swift container") 145 | config.set_hadoop_config(sc) 146 | } 147 | 148 | workingRDD = sc.emptyRDD 149 | 150 | if ( !reuseCheckpoint ){ 151 | ssc = createStreamingContextAndRunAnalytics(sc); 152 | }else{ 153 | ssc = StreamingContext.getOrCreate( 154 | config.getConfig( DemoConfig.CHECKPOINT_DIR_KEY ), 155 | () => { 156 | createStreamingContextAndRunAnalytics(sc); 157 | }, 158 | sc.hadoopConfiguration, 159 | true 160 | ); 161 | } 162 | 163 | ssc.addStreamingListener( new PixiedustStreamingListener ) 164 | 165 | ssc.start() 166 | 167 | sendLog("Twitter stream started"); 168 | } 169 | 170 | def stopStreaming(){ 171 | if ( ssc == null){ 172 | sendLog("No Twitter stream to stop"); 173 | return; 174 | } 175 | 176 | sendLog("Stopping Twitter stream. 
Please wait this may take a while") 177 | ssc.stop(stopSparkContext = false, stopGracefully = false) 178 | ssc = null 179 | sendLog("Twitter stream stopped"); 180 | } 181 | 182 | def createStreamingContextAndRunAnalytics(sc:SparkContext):StreamingContext={ 183 | //Broadcast the config to each worker node 184 | val broadcastVar = sc.broadcast( config.toImmutableMap ) 185 | ssc = new StreamingContext( sc, Seconds(5) ) 186 | ssc.checkpoint(config.getConfig( DemoConfig.CHECKPOINT_DIR_KEY )); 187 | val stream = org.apache.spark.streaming.twitter.TwitterUtils.createStream( ssc, None ); 188 | runAnalytics(sc, broadcastVar, stream) 189 | ssc; 190 | } 191 | 192 | def runAnalytics(sc:SparkContext, broadcastVar: Broadcast[scala.collection.immutable.Map[String,String]], stream:DStream[Status]){ 193 | val keys = broadcastVar.value.get("tweets.key").get.split(","); 194 | val tweets = stream.filter { status => 195 | Option(status.getUser).flatMap[String] { 196 | u => Option(u.getLang) 197 | }.getOrElse("").startsWith("en") && CharMatcher.ASCII.matchesAllOf(status.getText) && ( keys.isEmpty || keys.exists{key => status.getText.toLowerCase.contains(key.toLowerCase)}) 198 | } 199 | 200 | val tweetAccumulator = sc.accumulable(Array[(String,String)]())(TweetsAccumulatorParam) 201 | 202 | new Thread( new Runnable() { 203 | def run(){ 204 | try{ 205 | while(ssc!=null && ssc.getState() != StreamingContextState.STOPPED ){ 206 | val accuValue = tweetAccumulator.value 207 | if ( accuValue.size > 0 ){ 208 | tweetAccumulator.setValue(Array[(String,String)]() ) 209 | accuValue.foreach( v => send(v._1, v._2) ) 210 | } 211 | Thread.sleep( 1000L ) 212 | } 213 | System.out.println("Stopping the accumulator thread") 214 | }catch{ 215 | case e:Throwable => e.printStackTrace() 216 | } 217 | } 218 | },"Accumulator").start 219 | 220 | val rowTweets = tweets.map(status=> { 221 | lazy val client = PooledHttp1Client() 222 | val sentiment = ToneAnalyzer.computeSentiment( client, status, broadcastVar ) 223 | var scoreMap : Map[String, Double] = Map() 224 | if ( sentiment != null ){ 225 | for( toneCategory <- Option(sentiment.tone_categories).getOrElse( Seq() )){ 226 | for ( tone <- Option( toneCategory.tones ).getOrElse( Seq() ) ){ 227 | scoreMap.put( tone.tone_id, (BigDecimal(tone.score).setScale(2, BigDecimal.RoundingMode.HALF_UP).toDouble) * 100.0 ) 228 | } 229 | } 230 | } 231 | 232 | var jsonSentiment="{"; 233 | scoreMap.foreach( t => jsonSentiment = jsonSentiment + (if (jsonSentiment.length() == 1) "" else ",") + "\"" + t._1 + "\":" + t._2) 234 | jsonSentiment += "}"; 235 | val sendValue:String = "{\"author\": \"" + 236 | status.getUser.getName + 237 | "\", \"userid\":\"" + status.getUser.getScreenName + 238 | "\", \"pic\":\"" + status.getUser.getOriginalProfileImageURLHttps + 239 | "\",\"text\":" + JSONObject.quote( status.getText ) + ", \"sentiment\": " + jsonSentiment + "}" 240 | 241 | tweetAccumulator+=("tweets",sendValue) 242 | 243 | EnrichedTweet( 244 | status.getUser.getName, 245 | status.getUser.getScreenName, 246 | status.getCreatedAt.toString, 247 | status.getUser.getLang, 248 | status.getText, 249 | Option(status.getGeoLocation).map{ _.getLatitude}.getOrElse(0.0), 250 | Option(status.getGeoLocation).map{ _.getLongitude}.getOrElse(0.0), 251 | scoreMap 252 | ) 253 | }) 254 | 255 | rowTweets.foreachRDD( rdd => { 256 | if( rdd.count > 0 ){ 257 | workingRDD = SparkContext.getOrCreate().parallelize( rdd.map( t => t.toRow() ).collect()).union( workingRDD ) 258 | } 259 | }) 260 | 261 | val delimTagTone = "-%!" 
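//These two delimiters encode each (hashtag, tone, score) triple as a single string,
//e.g. "#spark" + delimTagTone + "joy" + delimToneScore + "73.0" => "#spark-%!joy:%@73.0".
//The metrics pipeline below splits first on delimToneScore and then on delimTagTone to
//recover the hashtag and tone before averaging tone scores per hashtag.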
262 | val delimToneScore = ":%@" 263 | val statsStream = rowTweets.map { eTweet => ("total_tweets", 1L) } 264 | .reduceByKey( _+_ ) 265 | .updateStateByKey( (a:scala.collection.Seq[Long], b:Option[Long] ) => { 266 | var runningCount=b.getOrElse(0L) 267 | a.foreach { v => runningCount=runningCount+v } 268 | Some(runningCount) 269 | }) 270 | statsStream.foreachRDD( rdd =>{ 271 | send("TweetProcessed", TweetsMetricJsonSerializer.serialize(rdd.collect())) 272 | }) 273 | 274 | val metricsStream = rowTweets.flatMap { eTweet => { 275 | val retList = ListBuffer[String]() 276 | for ( tag <- eTweet.text.split("\\s+") ){ 277 | if ( tag.startsWith( "#") && tag.length > 1 ){ 278 | for ( tone <- Option( eTweet.sentimentScores.keys ).getOrElse( Seq() ) ){ 279 | retList += (tag + delimTagTone + tone + delimToneScore + eTweet.sentimentScores.getOrElse( tone, 0.0)) 280 | } 281 | } 282 | } 283 | retList.toList 284 | }} 285 | .map { fullTag => { 286 | val split = fullTag.split(delimToneScore); 287 | (split(0), split(1).toFloat) 288 | }} 289 | .combineByKey( 290 | (x:Float) => (x,1), 291 | (x:(Float,Int), y:Float) => (x._1 + y, x._2+1), 292 | (x:(Float,Int),y:(Float,Int)) => (x._1 + y._1, x._2 + y._2), 293 | new HashPartitioner(sc.defaultParallelism) 294 | ) 295 | .map[(String,(Long/*count*/, List[(String, Double)]))]{ t => { 296 | val key = t._1; 297 | val ab = t._2; 298 | val split = key.split(delimTagTone) 299 | (split(0), (ab._2, List((split(1), BigDecimal(ab._1/ab._2).setScale(2, BigDecimal.RoundingMode.HALF_UP).toDouble )))) 300 | }} 301 | .reduceByKey( (t,u) => (t._1+u._1, (t._2 ::: u._2).sortWith( (l,r) => l._1.compareTo( r._1 ) < 0 ))) 302 | .mapValues( (item:(Long, List[(String,Double)])) => { 303 | val unzip = item._2.unzip 304 | (item._1/(item._2.size), unzip._1, unzip._2) 305 | }) 306 | .updateStateByKey( (a:scala.collection.Seq[(Long, List[String], List[Double])], b: Option[(Long, List[String], List[Double])]) => { 307 | val safeB = b.getOrElse( (0L, List(), List() ) ) 308 | var listTones = safeB._2 309 | var listScores = safeB._3 310 | var count = safeB._1 311 | for( item <- a ){ 312 | count += item._1 313 | listScores = listScores.zipAll( item._3, 0.0, 0.0).map{ case(a,b)=>(a+b)/2 }.toList 314 | listTones = item._2 315 | } 316 | 317 | Some( (count, listTones, listScores) ) 318 | }) 319 | 320 | metricsStream.print 321 | 322 | metricsStream.foreachRDD( rdd =>{ 323 | val topHashTags = rdd.sortBy( f => f._2._1, false ).take(5) 324 | if ( !topHashTags.isEmpty){ 325 | tweetAccumulator+=("topHashtags", TweetsMetricJsonSerializer.serialize(topHashTags.map( f => (f._1, f._2._1 )))) 326 | tweetAccumulator+=("toneScores", ToneScoreJsonSerializer.serialize(topHashTags)) 327 | } 328 | }) 329 | 330 | } 331 | } 332 | 333 | object TweetsAccumulatorParam extends AccumulableParam[Array[(String,String)], (String,String)]{ 334 | def zero(initialValue:Array[(String,String)]):Array[(String,String)] = { 335 | Array() 336 | } 337 | 338 | def addInPlace(s1:Array[(String,String)], s2:Array[(String,String)]):Array[(String,String)] = { 339 | s1 ++ s2 340 | } 341 | 342 | def addAccumulator(current:Array[(String,String)], s:(String,String)):Array[(String,String)] = { 343 | current :+ s 344 | } 345 | } -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/StatusSerializer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 
| * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.ibm.cds.spark.samples 19 | 20 | import java.io.ObjectOutputStream 21 | import java.io.ByteArrayOutputStream 22 | import org.apache.kafka.common.serialization.Serializer 23 | import twitter4j.Status 24 | 25 | /** 26 | * @author dtaieb 27 | */ 28 | class StatusSerializer extends Serializer[Status]{ 29 | def configure( props: java.util.Map[String, _], isKey: Boolean) = { 30 | 31 | } 32 | 33 | def close(){ 34 | 35 | } 36 | 37 | def serialize(topic: String, value: Status ): Array[Byte] = { 38 | val baos = new ByteArrayOutputStream(1024) 39 | val oos = new ObjectOutputStream(baos) 40 | oos.writeObject( value ) 41 | oos.close 42 | baos.toByteArray() 43 | } 44 | } -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/StreamingListener.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.ibm.cds.spark.samples 19 | 20 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverError 21 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStopped 22 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStarted 23 | import org.apache.spark.streaming.scheduler.StreamingListenerBatchCompleted 24 | import org.apache.spark.streaming.scheduler.StreamingListenerBatchStarted 25 | 26 | /** 27 | * @author dtaieb 28 | */ 29 | class StreamingListener extends org.apache.spark.streaming.scheduler.StreamingListener { 30 | override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted) { 31 | println("Receiver Started: " + receiverStarted.receiverInfo.name ) 32 | } 33 | 34 | override def onReceiverError(receiverError: StreamingListenerReceiverError) { 35 | println("Receiver Error: " + receiverError.receiverInfo.lastError) 36 | } 37 | 38 | override def onReceiverStopped(receiverStopped: StreamingListenerReceiverStopped) { 39 | println("Receiver Stopped: " + receiverStopped.receiverInfo.name) 40 | println("Reason: " + receiverStopped.receiverInfo.lastError + " : " + receiverStopped.receiverInfo.lastErrorMessage) 41 | } 42 | 43 | override def onBatchStarted(batchStarted: StreamingListenerBatchStarted){ 44 | println("Batch started with " + batchStarted.batchInfo.numRecords + " records") 45 | } 46 | 47 | override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted){ 48 | println("Batch completed with " + batchCompleted.batchInfo.numRecords + " records"); 49 | } 50 | } -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/StreamingTwitter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.ibm.cds.spark.samples 19 | 20 | import scala.collection.mutable._ 21 | import org.apache.commons.lang3.StringEscapeUtils 22 | import org.apache.log4j.Level 23 | import org.apache.log4j.Logger 24 | import org.apache.spark.Accumulator 25 | import org.apache.spark.SparkConf 26 | import org.apache.spark.SparkContext 27 | import org.apache.spark.streaming._ 28 | import org.apache.spark.streaming.dstream._ 29 | import org.http4s._ 30 | import org.http4s.Http4s._ 31 | import org.http4s.Status._ 32 | import org.http4s.client.Client 33 | import org.http4s.client.blaze.PooledHttp1Client 34 | import org.http4s.headers.Authorization 35 | import com.ibm.couchdb._ 36 | import scalaz._ 37 | import scalaz.concurrent.Task 38 | import twitter4j.Status 39 | import org.apache.spark.sql.SQLContext 40 | import org.apache.spark.sql.Row 41 | import org.apache.spark.sql.types._ 42 | import org.apache.spark.sql.DataFrame 43 | import org.apache.spark.rdd.RDD 44 | import org.apache.spark.rdd.EmptyRDD 45 | import com.google.common.base.CharMatcher 46 | import scala.math.BigDecimal 47 | import com.ibm.cds.spark.samples.config.DemoConfig 48 | import com.ibm.cds.spark.samples.ToneAnalyzer.ToneCategory 49 | import org.apache.spark.Logging 50 | 51 | 52 | 53 | 54 | /** 55 | * @author dtaieb 56 | */ 57 | object StreamingTwitter extends Logging{ 58 | var ssc: StreamingContext = null 59 | var sqlContext: SQLContext = null 60 | var workingRDD: RDD[Row] = null 61 | var schemaTweets : StructType = null 62 | val logger: Logger = Logger.getLogger( "com.ibm.cds.spark.samples.StreamingTwitter" ) 63 | 64 | //main method invoked when running as a standalone Spark Application 65 | def main(args: Array[String]) { 66 | 67 | val conf = new SparkConf().setAppName("Spark Streaming Twitter Demo") 68 | val sc = new SparkContext(conf) 69 | startTwitterStreaming(sc, Seconds(10)); 70 | } 71 | 72 | //Hold configuration key/value pairs 73 | val config = new DemoConfig 74 | 75 | //Wrapper api for Notebook access 76 | def setConfig(key:String, value:String){ 77 | config.setConfig(key, value) 78 | } 79 | 80 | def startTwitterStreaming( sc: SparkContext, stopAfter: Duration = Seconds(0) ){ 81 | println("Starting twitter stream"); 82 | if ( ssc != null ){ 83 | println("Twitter Stream already running"); 84 | println("Please use stopTwitterStreaming() first and try again"); 85 | return; 86 | } 87 | 88 | if ( !config.validateConfiguration(DemoConfig.CHECKPOINT_DIR_KEY) ){ 89 | println("Unable to validate config") 90 | return; 91 | } 92 | 93 | Logger.getLogger("org.apache.spark").setLevel(Level.OFF) 94 | 95 | workingRDD = sc.emptyRDD 96 | //Broadcast the config to each worker node 97 | val broadcastVar = sc.broadcast(config.toImmutableMap) 98 | 99 | var canStopTwitterStream = true 100 | var batchesProcessed=0 101 | 102 | ssc = new StreamingContext( sc, Seconds(5) ) 103 | 104 | ssc.addStreamingListener( new StreamingListener ) 105 | 106 | try{ 107 | sqlContext = new SQLContext(sc) 108 | val keys = config.getConfig("tweets.key").split(","); 109 | val stream = org.apache.spark.streaming.twitter.TwitterUtils.createStream( ssc, None ); 110 | 111 | if ( schemaTweets == null ){ 112 | val schemaString = "author userid date lang text lat:double long:double" 113 | schemaTweets = 114 | StructType( 115 | schemaString.split(" ").map( 116 | fieldName => { 117 | val ar = fieldName.split(":") 118 | StructField( 119 | ar.lift(0).get, 120 | ar.lift(1).getOrElse("string") match{ 121 | case "int" => IntegerType 122 | case "double" => DoubleType 
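//Fields in schemaString without an explicit ":type" suffix (or with an unrecognized one) default to StringType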
123 | case _ => StringType 124 | }, 125 | true) 126 | } 127 | ).union( 128 | ToneAnalyzer.sentimentFactors.map( f => StructField( f._1, DoubleType )).toArray[StructField] 129 | ) 130 | ) 131 | } 132 | val tweets = stream.filter { status => 133 | Option(status.getUser).flatMap[String] { 134 | u => Option(u.getLang) 135 | }.getOrElse("").startsWith("en") && CharMatcher.ASCII.matchesAllOf(status.getText) && ( keys.isEmpty || keys.exists{status.getText.contains(_)}) 136 | } 137 | 138 | lazy val client = PooledHttp1Client() 139 | val rowTweets = tweets.map(status=> { 140 | val sentiment = ToneAnalyzer.computeSentiment( client, status, broadcastVar ) 141 | 142 | var colValues = Array[Any]( 143 | status.getUser.getName, //author 144 | status.getUser.getScreenName, //Userid 145 | status.getCreatedAt.toString, //date 146 | status.getUser.getLang, //Lang 147 | status.getText, //text 148 | Option(status.getGeoLocation).map{ _.getLatitude}.getOrElse(0.0), //lat 149 | Option(status.getGeoLocation).map{_.getLongitude}.getOrElse(0.0) //long 150 | //exception 151 | ) 152 | 153 | var scoreMap : Map[String, Double] = Map() 154 | if ( sentiment != null ){ 155 | for( toneCategory <- Option(sentiment.tone_categories).getOrElse( Seq() )){ 156 | for ( tone <- Option( toneCategory.tones ).getOrElse( Seq() ) ){ 157 | scoreMap.put( tone.tone_id, tone.score ) 158 | } 159 | } 160 | } 161 | 162 | colValues = colValues ++ ToneAnalyzer.sentimentFactors.map { f => (BigDecimal(scoreMap.get(f._2).getOrElse(0.0)).setScale(2, BigDecimal.RoundingMode.HALF_UP).toDouble) * 100.0 } 163 | //Return [Row, (sentiment, status)] 164 | (Row(colValues.toArray:_*),(sentiment, status)) 165 | }) 166 | 167 | rowTweets.foreachRDD( rdd => { 168 | if(batchesProcessed==0){ 169 | canStopTwitterStream=false 170 | } 171 | try{ 172 | if( rdd.count > 0 ){ 173 | batchesProcessed += 1 174 | workingRDD = sc.parallelize( rdd.map( t => t._1 ).collect()).union( workingRDD ) 175 | 176 | val saveToCloudant = broadcastVar.value.get("cloudant.save").get.toBoolean 177 | if ( saveToCloudant ){ 178 | rdd.foreachPartition { iterator => 179 | var db: CouchDbApi = null; 180 | val couch = CouchDb( broadcastVar.value.get("cloudant.hostName").get, 181 | broadcastVar.value.get("cloudant.port").get.toInt, 182 | broadcastVar.value.get("cloudant.https").get.toBoolean, 183 | broadcastVar.value.get("cloudant.username").get, 184 | broadcastVar.value.get("cloudant.password").get 185 | ); 186 | val dbName = "spark-streaming-twitter" 187 | couch.dbs.get(dbName).attemptRun match{ 188 | case -\/(e) => logger.trace("Couch Database does not exist, creating it now"); couch.dbs.create(dbName).run 189 | case \/-(a) => println("Connected to cloudant db " + dbName ) 190 | } 191 | val typeMapping = TypeMapping(classOf[ToneAnalyzer.Tweet] -> "Tweet") 192 | db = couch.db(dbName, typeMapping) 193 | iterator.foreach( t => { 194 | saveTweetToCloudant( client, db, t._2._2, t._2._1 ) 195 | } 196 | ) 197 | } 198 | } 199 | } 200 | }catch{ 201 | case e: InterruptedException=>//Ignore 202 | case e: Exception => logError(e.getMessage, e ) 203 | }finally{ 204 | canStopTwitterStream = true 205 | } 206 | }) 207 | 208 | }catch{ 209 | case e : Exception => logError(e.getMessage, e ) 210 | return 211 | } 212 | ssc.start() 213 | 214 | println("Twitter stream started"); 215 | println("Tweets are collected real-time and analyzed") 216 | println("To stop the streaming and start interacting with the data use: StreamingTwitter.stopTwitterStreaming") 217 | 218 | if ( !stopAfter.isZero ){ 219 | //Automatically 
stop it after 10s 220 | new Thread( new Runnable { 221 | var displayMessage = true; 222 | def run(){ 223 | Thread.sleep( stopAfter.milliseconds ) 224 | var loop = true 225 | while(loop){ 226 | if (canStopTwitterStream){ 227 | stopTwitterStreaming 228 | loop = false 229 | }else{ 230 | if ( displayMessage ){ 231 | displayMessage = false 232 | println("Received directive to stop twitter Stream: Waiting for already received tweets to be processed...") 233 | } 234 | Thread.sleep(5000L) 235 | } 236 | } 237 | } 238 | }).start 239 | } 240 | } 241 | 242 | def saveTweetToCloudant(client: Client, db: CouchDbApi, status:Status, sentiment: ToneAnalyzer.Sentiment) : Status = { 243 | if ( db != null){ 244 | logger.trace("Creating new Tweet in Couch Database " + status.getText()) 245 | val task:Task[Res.DocOk] = db.docs.create( 246 | ToneAnalyzer.Tweet( 247 | status.getUser().getName, 248 | status.getCreatedAt().toString(), 249 | status.getUser().getLang(), 250 | status.getText(), 251 | ToneAnalyzer.Geo( 252 | Option(status.getGeoLocation).map{ _.getLatitude}.getOrElse(0.0), 253 | Option(status.getGeoLocation).map{_.getLongitude}.getOrElse(0.0) 254 | ), 255 | sentiment 256 | ) 257 | ) 258 | 259 | // Execute the actions and process the result 260 | task.attemptRun match { 261 | case -\/(e) => logError(e.getMessage, e ); 262 | case \/-(a) => logger.trace("Successfully create new Tweet in Couch Database " + status.getText() ) 263 | } 264 | } 265 | 266 | status 267 | } 268 | 269 | def createTwitterDataFrames(sc: SparkContext) : (SQLContext, DataFrame) = { 270 | if ( workingRDD.count <= 0 ){ 271 | println("No data receive. Please start the Twitter stream again to collect data") 272 | return null 273 | } 274 | 275 | try{ 276 | val df = sqlContext.createDataFrame( workingRDD, schemaTweets ) 277 | df.registerTempTable("tweets") 278 | 279 | println("A new table named tweets with " + df.count() + " records has been correctly created and can be accessed through the SQLContext variable") 280 | println("Here's the schema for tweets") 281 | df.printSchema() 282 | 283 | (sqlContext, df) 284 | }catch{ 285 | case e: Exception => {logError(e.getMessage, e ); return null} 286 | } 287 | } 288 | 289 | def stopTwitterStreaming(){ 290 | if ( ssc == null){ 291 | println("No Twitter stream to stop"); 292 | return; 293 | } 294 | 295 | println("Stopping Twitter stream. Please wait this may take a while") 296 | ssc.stop(stopSparkContext = false, stopGracefully = false) 297 | ssc = null 298 | println("Twitter stream stopped"); 299 | 300 | println( "You can now create a sqlContext and DataFrame with " + workingRDD.count + " Tweets created. 
Sample usage: ") 301 | println("val (sqlContext, df) = com.ibm.cds.spark.samples.StreamingTwitter.createTwitterDataFrames(sc)") 302 | println("df.printSchema") 303 | println("sqlContext.sql(\"select author, text from tweets\").show") 304 | } 305 | } -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/ToneAnalyzer.scala: -------------------------------------------------------------------------------- 1 | package com.ibm.cds.spark.samples 2 | 3 | import org.http4s.EntityEncoder 4 | import org.http4s.Uri 5 | import org.http4s.client.Client 6 | import org.http4s.Request 7 | import org.http4s.BasicCredentials 8 | import org.http4s.Header 9 | import org.http4s.Headers 10 | import org.http4s.Method 11 | import org.http4s.headers.Authorization 12 | import org.apache.log4j.Logger 13 | import org.apache.spark.broadcast.Broadcast 14 | import org.apache.spark.Logging 15 | import scala.util.parsing.json.JSON 16 | import org.codehaus.jettison.json.JSONObject 17 | 18 | /** 19 | * @author dtaieb 20 | */ 21 | 22 | object ToneAnalyzer extends Logging{ 23 | 24 | val sentimentFactors = Array( 25 | ("Anger","anger"), 26 | ("Disgust","disgust"), 27 | ("Fear","fear"), 28 | ("Joy","joy"), 29 | ("Sadness","sadness"), 30 | ("Analytical","analytical"), 31 | ("Confident","confident"), 32 | ("Tentative","tentative"), 33 | ("Openness","openness_big5"), 34 | ("Conscientiousness","conscientiousness_big5"), 35 | ("Extraversion","extraversion_big5"), 36 | ("Agreeableness","agreeableness_big5"), 37 | ("EmotionalRange","neuroticism_big5") 38 | ) 39 | 40 | //Class models for Sentiment JSON 41 | case class DocumentTone( document_tone: Sentiment ) 42 | case class Sentiment(tone_categories: Seq[ToneCategory]); 43 | case class ToneCategory(category_id: String, category_name: String, tones: Seq[Tone]); 44 | case class Tone(score: Double, tone_id: String, tone_name: String) 45 | // case class Sentiment( scorecard: String, children: Seq[Tone] ) 46 | // case class Tone( name: String, id: String, children: Seq[ToneResult]) 47 | // case class ToneResult(name: String, id: String, word_count: Double, normalized_score: Double, raw_score: Double, linguistic_evidence: Seq[LinguisticEvidence] ) 48 | // case class LinguisticEvidence( evidence_score: Double, word_count: Double, correlation: String, words : Seq[String]) 49 | 50 | case class Geo( lat: Double, long: Double ) 51 | case class Tweet(author: String, date: String, language: String, text: String, geo : Geo, sentiment : Sentiment ) 52 | 53 | def computeSentiment( client: Client, status:StatusAdapter, broadcastVar: Broadcast[Map[String,String]] ) : Sentiment = { 54 | logTrace("Calling sentiment from Watson Tone Analyzer: " + status.text) 55 | try{ 56 | //Get Sentiment on the tweet 57 | val sentimentResults: String = 58 | EntityEncoder[String].toEntity("{\"text\": " + JSONObject.quote( status.text ) + "}" ).flatMap { 59 | entity => 60 | val s = broadcastVar.value.get("watson.tone.url").get + "/v3/tone?version=" + broadcastVar.value.get("watson.api.version").get 61 | val toneuri: Uri = Uri.fromString( s ).getOrElse( null ) 62 | client( 63 | Request( 64 | method = Method.POST, 65 | uri = toneuri, 66 | headers = Headers( 67 | Authorization( 68 | BasicCredentials(broadcastVar.value.get("watson.tone.username").get, broadcastVar.value.get("watson.tone.password").get) 69 | ), 70 | Header("Accept", "application/json"), 71 | Header("Content-Type", "application/json; charset=utf-8") 72 | ), 73 | body = 
entity.body 74 | ) 75 | ).flatMap { response => 76 | if (response.status.code == 200 ) { 77 | response.as[String] 78 | } else { 79 | println( "Error received from Watson Tone Analyzer. Code : " + response.status.code + " reason: " + response.status.reason ) 80 | null 81 | } 82 | } 83 | }.run 84 | 85 | upickle.read[DocumentTone](sentimentResults).document_tone 86 | }catch{ 87 | case e:Throwable => { 88 | e.printStackTrace() 89 | null 90 | } 91 | } 92 | } 93 | } -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/TwitterAdapter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.ibm.cds.spark.samples 19 | 20 | import java.io.ObjectInputStream 21 | import java.io.ByteArrayInputStream 22 | import scala.util.parsing.json.JSON 23 | import org.apache.kafka.common.serialization.Deserializer 24 | import twitter4j.Status 25 | 26 | /** 27 | * @author dtaieb 28 | * Deserialization adapters for Twitter4J Status 29 | */ 30 | 31 | case class StatusAdapter(userName:String, userId: String, userLang: String,createdAt:String,text:String, long:Double, lat:Double); 32 | 33 | object StatusAdapter{ 34 | implicit def statusAdapterWrapper(status: Status) = 35 | StatusAdapter( 36 | status.getUser.getName, 37 | status.getUser.getScreenName, 38 | status.getUser.getLang, 39 | status.getCreatedAt.toString, 40 | status.getText, 41 | Option(status.getGeoLocation).map{ _.getLongitude}.getOrElse(0.0), 42 | Option(status.getGeoLocation).map{ _.getLatitude}.getOrElse(0.0) 43 | ) 44 | } 45 | 46 | class StatusDeserializer extends Deserializer[StatusAdapter]{ 47 | def configure( props: java.util.Map[String, _], isKey: Boolean) = { 48 | 49 | } 50 | 51 | def close(){ 52 | 53 | } 54 | 55 | def deserialize(topic: String, data: Array[Byte] ): StatusAdapter = { 56 | try{ 57 | val bais = new ByteArrayInputStream( data ) 58 | var ois:ObjectInputStream = null 59 | try{ 60 | ois = new ObjectInputStream( bais ) 61 | ois.readObject().asInstanceOf[Status] 62 | }finally{ 63 | if (bais != null ){ 64 | bais.close 65 | } 66 | if ( ois != null ){ 67 | ois.close 68 | } 69 | } 70 | }catch{ 71 | case e:Throwable=>{ 72 | val jsonObject = JSON.parseFull( new String(data) ).getOrElse(Map.empty).asInstanceOf[Map[String, Any]] 73 | val user=jsonObject.get("user").getOrElse( Map.empty ).asInstanceOf[Map[String,Any]] 74 | val geo = Option(jsonObject.get("geo").orNull).getOrElse(Map.empty).asInstanceOf[Map[String,Any]] 75 | StatusAdapter( 76 | user.get("name").getOrElse("").asInstanceOf[String], 77 | user.get("userid").getOrElse("").asInstanceOf[String], 78 | 
user.get("lang").getOrElse("").asInstanceOf[String], 79 | jsonObject.get("created_at").getOrElse("").asInstanceOf[String], 80 | jsonObject.get("text").getOrElse("").asInstanceOf[String], 81 | geo.get("long").getOrElse(0.0).asInstanceOf[Double], 82 | geo.get("lat").getOrElse(0.0).asInstanceOf[Double] 83 | ) 84 | } 85 | } 86 | } 87 | } -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/config/DemoConfig.scala: -------------------------------------------------------------------------------- 1 | package com.ibm.cds.spark.samples.config 2 | 3 | import org.apache.kafka.clients.CommonClientConfigs 4 | import java.io.FileInputStream 5 | import java.io.InputStream 6 | import scala.collection.JavaConversions._ 7 | import org.apache.spark.SparkContext 8 | 9 | 10 | /** 11 | * @author dtaieb 12 | */ 13 | 14 | class DemoConfig() extends Serializable{ 15 | 16 | //Hold configuration key/value pairs 17 | var config = scala.collection.mutable.Map[String, String]( 18 | registerConfigKey("twitter4j.oauth.consumerKey" ), 19 | registerConfigKey("twitter4j.oauth.consumerSecret" ), 20 | registerConfigKey("twitter4j.oauth.accessToken" ), 21 | registerConfigKey("twitter4j.oauth.accessTokenSecret"), 22 | registerConfigKey("tweets.key",""), 23 | registerConfigKey("cloudant.hostName" ), 24 | registerConfigKey("cloudant.https", "true"), 25 | registerConfigKey("cloudant.port" ), 26 | registerConfigKey("cloudant.username" ), 27 | registerConfigKey("cloudant.password" ), 28 | registerConfigKey("watson.tone.url" ), 29 | registerConfigKey("watson.tone.username" ), 30 | registerConfigKey("watson.tone.password" ), 31 | registerConfigKey("watson.api.version", "2016-05-19"), 32 | registerConfigKey("cloudant.save", "false" ), 33 | registerConfigKey(DemoConfig.CHECKPOINT_DIR_KEY) 34 | ) 35 | 36 | private def getKeyOrFail(key:String):String={ 37 | config.get(key).getOrElse( { 38 | throw new IllegalStateException("Missing key: " + key) 39 | }) 40 | } 41 | 42 | def cloneConfig():MessageHubConfig={ 43 | val props = new MessageHubConfig 44 | config.foreach{ entry => props.setConfig(entry._1, entry._2)} 45 | props 46 | } 47 | 48 | def set_hadoop_config(sc:SparkContext){ 49 | val prefix = "fs.swift.service." 
+ getKeyOrFail("name") 50 | val hconf = sc.hadoopConfiguration 51 | hconf.set(prefix + ".auth.url", getKeyOrFail("auth_url")+"/v3/auth/tokens") 52 | hconf.set(prefix + ".auth.endpoint.prefix", "endpoints") 53 | hconf.set(prefix + ".tenant", getKeyOrFail("project_id")) 54 | hconf.set(prefix + ".username", getKeyOrFail("user_id")) 55 | hconf.set(prefix + ".password", getKeyOrFail("password")) 56 | hconf.setInt(prefix + ".http.port", 8080) 57 | hconf.set(prefix + ".region", getKeyOrFail("region")) 58 | hconf.setBoolean(prefix + ".public", true) 59 | } 60 | 61 | def initConfigKeys(){ 62 | //Overridable by subclasses 63 | } 64 | 65 | //Give a chance to subclasses to init the keys 66 | initConfigKeys; 67 | 68 | { 69 | //Load config from property file if specified 70 | val configPath = Option(System.getProperty("DEMO_CONFIG_PATH") ).orElse( Option(System.getenv("DEMO_CONFIG_PATH"))) 71 | .orElse( Option(System.getProperty("spark.service.user.DEMO_CONFIG_PATH") )).orElse(Option(System.getenv("spark.service.user.DEMO_CONFIG_PATH") )) 72 | .getOrElse(null) 73 | if (configPath != null ){ 74 | println("ConfigPath is: " + configPath ) 75 | } 76 | if ( configPath != null ){ 77 | println("Loading config from DEMO_CONFIG_PATH env variable: " + configPath) 78 | val props = new java.util.Properties 79 | var fis:InputStream = null 80 | try{ 81 | fis = new FileInputStream(configPath) 82 | props.load(fis) 83 | for( key <- props.keysIterator ){ 84 | setConfig( key, props.getProperty(key)) 85 | } 86 | }catch{ 87 | case e:Throwable => e.printStackTrace 88 | }finally{ 89 | if ( fis != null ){ 90 | fis.close 91 | } 92 | } 93 | } 94 | } 95 | 96 | private[config] def registerConfigKey( key: String, default: String = null ) : (String,String) = { 97 | if ( default == null ){ 98 | (key, Option(System.getProperty(key)).orNull ) 99 | } 100 | (key, Option(System.getProperty(key)) getOrElse default ) 101 | } 102 | 103 | def setConfig(key:String, value:String){ 104 | config.put( key, value ) 105 | } 106 | 107 | def getConfig(key:String):String={ 108 | config.get(key).getOrElse("") 109 | } 110 | 111 | implicit def toImmutableMap(): Map[String,String]= { 112 | Map( config.toList: _* ) 113 | } 114 | 115 | //Validate configuration settings 116 | def validateConfiguration(ignorePrefix:String*) : Boolean = { 117 | def ignoreKey( key: String ): Boolean = { 118 | var o = ignorePrefix.find { p => p.startsWith( key ) }; 119 | o.isDefined 120 | } 121 | var ret: Boolean = true; 122 | val saveToCloudant = config.get("cloudant.save").get.toBoolean 123 | config.foreach( (t:(String, Any)) => 124 | if ( t._2 == null ){ 125 | if ( saveToCloudant || !t._1.startsWith("cloudant") ){ 126 | if ( !ignoreKey( t._1) ){ 127 | println(t._1 + " configuration not set. 
Use setConfig(\"" + t._1 + "\",)"); 128 | ret = false; 129 | } 130 | } 131 | } 132 | ) 133 | 134 | if ( ret ){ 135 | config.foreach( (t:(String,Any)) => 136 | try{ 137 | if ( t._1.startsWith( "twitter4j") && t._2 != null && !ignoreKey(t._1) ) { 138 | System.setProperty( t._1, t._2.asInstanceOf[String] ) 139 | } 140 | }catch{ 141 | case e:Throwable => println("error" + t) 142 | } 143 | ) 144 | } 145 | ret 146 | } 147 | } 148 | 149 | object DemoConfig extends DemoConfig{ 150 | final val CHECKPOINT_DIR_KEY = "checkpointDir" 151 | } 152 | -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/config/MessageHubConfig.scala: -------------------------------------------------------------------------------- 1 | package com.ibm.cds.spark.samples.config 2 | 3 | import scala.collection.mutable.ListBuffer 4 | import scala.reflect.ClassTag 5 | import org.apache.kafka.clients.CommonClientConfigs 6 | import org.apache.kafka.common.config.SslConfigs 7 | import org.apache.kafka.common.security.JaasUtils 8 | import scala.io.Source 9 | import java.io.InputStream 10 | import java.io.FileWriter 11 | import java.io.File 12 | import org.http4s.EntityEncoder 13 | import org.http4s.Uri 14 | import org.http4s.client.blaze.PooledHttp1Client 15 | import org.http4s.Request 16 | import org.http4s.Method 17 | import org.http4s.Headers 18 | import org.http4s.headers.Authorization 19 | import org.http4s.BasicCredentials 20 | import org.http4s.Header 21 | import javax.net.ssl.SSLContext 22 | import org.codehaus.jettison.json.JSONObject 23 | 24 | 25 | /** 26 | * @author dtaieb 27 | */ 28 | class MessageHubConfig extends DemoConfig{ 29 | lazy val kafkaOptionKeys = ListBuffer[String]() 30 | override def initConfigKeys(){ 31 | config = config ++ Map[String,String]( 32 | registerConfigKey(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG), 33 | registerConfigKey(CommonClientConfigs.CLIENT_ID_CONFIG, "demo.watson.twitter.messagehub"), 34 | registerConfigKey("auto.offset.reset", "latest"), 35 | registerConfigKey("acks", "-1"), 36 | registerConfigKey("retries", "0"), 37 | registerConfigKey("batch.size", "16384"), 38 | registerConfigKey("linger.ms", "1"), 39 | registerConfigKey("buffer.memory", "33554432"), 40 | registerConfigKey("key.serializer", "org.apache.kafka.common.serialization.StringSerializer"), 41 | registerConfigKey("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer"), 42 | registerConfigKey(SslConfigs.SSL_PROTOCOL_CONFIG, "TLSv1.2"), 43 | registerConfigKey(SslConfigs.SSL_ENABLED_PROTOCOLS_CONFIG, "TLSv1.2"), 44 | registerConfigKey(SslConfigs.SSL_TRUSTSTORE_TYPE_CONFIG, "JKS"), 45 | registerConfigKey(SslConfigs.SSL_TRUSTSTORE_LOCATION_CONFIG, getDefaultSSLTrustStoreLocation), 46 | registerConfigKey(SslConfigs.SSL_TRUSTSTORE_PASSWORD_CONFIG, "changeit"), 47 | registerConfigKey(SslConfigs.SSL_ENDPOINT_IDENTIFICATION_ALGORITHM_CONFIG, "HTTPS"), 48 | registerConfigKey(CommonClientConfigs.SECURITY_PROTOCOL_CONFIG, "SASL_SSL" ), 49 | 50 | registerConfigKey(MessageHubConfig.CHECKPOINT_DIR_KEY), 51 | registerConfigKey(MessageHubConfig.KAFKA_TOPIC_TWEETS, "demo.tweets.watson.topic"), 52 | registerConfigKey(MessageHubConfig.KAFKA_USER_NAME), 53 | registerConfigKey(MessageHubConfig.KAFKA_USER_PASSWORD), 54 | registerConfigKey(MessageHubConfig.MESSAGEHUB_API_KEY), 55 | registerConfigKey(MessageHubConfig.MESSAGEHUB_REST_URL) 56 | ) 57 | } 58 | 59 | private def getDefaultSSLTrustStoreLocation():String={ 60 | val javaHome = 
System.getProperty("java.home") + File.separator + "lib" + File.separator + "security" + File.separator + "cacerts" 61 | println("default location of ssl Trust store is: " + javaHome) 62 | javaHome 63 | } 64 | 65 | override private[config] def registerConfigKey( key: String, default: String = null ) : (String,String) = { 66 | kafkaOptionKeys += key 67 | super.registerConfigKey(key,default) 68 | } 69 | 70 | override def validateConfiguration(ignorePrefix:String*) : Boolean = { 71 | val ret = super.validateConfiguration(ignorePrefix:_*) 72 | if ( ret ){ 73 | //Create the jaas configuration 74 | MessageHubConfig.createJaasConfiguration(getConfig(MessageHubConfig.KAFKA_USER_NAME ), getConfig(MessageHubConfig.KAFKA_USER_PASSWORD) ) 75 | } 76 | ret 77 | } 78 | 79 | def copyKafkaOptionKeys(other:MessageHubConfig){ 80 | kafkaOptionKeys.foreach { key => other.setConfig(key, getConfig(key) ) } 81 | } 82 | 83 | def setValueSerializer[U]()(implicit c: ClassTag[U]){ 84 | setConfig("value.serializer", c.runtimeClass.getName); 85 | } 86 | 87 | def setValueDeserializer[U]()(implicit c: ClassTag[U]){ 88 | setConfig("value.deserializer", c.runtimeClass.getName); 89 | } 90 | 91 | def createTopicsIfNecessary( topics: String* ){ 92 | val sslContext = SSLContext.getInstance("TLSv1.2") 93 | sslContext.init(null, null, null) 94 | lazy val client = PooledHttp1Client(sslContext=Option(sslContext)) 95 | for( topic <- topics ){ 96 | EntityEncoder[String].toEntity("{\"name\":" + JSONObject.quote( topic ) + "}" ).flatMap { 97 | entity => 98 | val topicUri: Uri = Uri.fromString( getConfig(MessageHubConfig.MESSAGEHUB_REST_URL) + "/admin/topics" ).getOrElse( null ) 99 | println(topicUri) 100 | client( 101 | Request( 102 | method = Method.POST, 103 | uri = topicUri, 104 | headers = Headers( 105 | Header("Content-Type", "application/json"), 106 | Header("X-Auth-Token", getConfig(MessageHubConfig.MESSAGEHUB_API_KEY)) 107 | ), 108 | body = entity.body 109 | ) 110 | ).flatMap { response => 111 | response.status.code match { 112 | case 200 | 202 => println("Successfully created topic: " + topic) 113 | case 422 | 403 => println("Topic already exists in the server: " + topic) 114 | case _ => throw new IllegalStateException("Error when trying to create topic: " + response.status.code + " Reason: " + response.status.reason) 115 | } 116 | response.as[String] 117 | } 118 | }.run 119 | } 120 | } 121 | } 122 | 123 | object MessageHubConfig{ 124 | final val CHECKPOINT_DIR_KEY = DemoConfig.CHECKPOINT_DIR_KEY 125 | final val KAFKA_TOPIC_TWEETS = "kafka.topic.tweet" //Key for name of the kafka topic holding used for publishing the tweets 126 | final val KAFKA_USER_NAME = "kafka.user.name" 127 | final val KAFKA_USER_PASSWORD = "kafka.user.password" 128 | 129 | final val MESSAGEHUB_API_KEY = "api_key" 130 | final val MESSAGEHUB_REST_URL = "kafka_rest_url" 131 | 132 | private def fixPath(path: String):String = { 133 | path.replaceAll("\\ / : * ? 
\" < > |,", "_") 134 | } 135 | 136 | def createJaasConfiguration( userName: String, password: String){ 137 | //Create the jaas configuration 138 | var is:InputStream = null 139 | try{ 140 | val packageName = MessageHubConfig.getClass.getPackage.getName.replace('.', File.separatorChar) 141 | is = MessageHubConfig.getClass.getClassLoader.getResourceAsStream(packageName + "/jaas.conf"); 142 | val confString = Source.fromInputStream( is ).mkString 143 | .replace( "$USERNAME", userName) 144 | .replace( "$PASSWORD", password ) 145 | 146 | val confDir= new File( System.getProperty("java.io.tmpdir") + File.separator + 147 | fixPath( userName ) ) 148 | confDir.mkdirs 149 | val confFile = new File( confDir, "jaas.conf"); 150 | val fw = new FileWriter( confFile ); 151 | fw.write( confString ) 152 | fw.close 153 | 154 | //Set the jaas login config property 155 | println("Registering JaasConfiguration: " + confFile.getAbsolutePath) 156 | System.setProperty(JaasUtils.JAVA_LOGIN_CONFIG_PARAM, confFile.getAbsolutePath ) 157 | }catch{ 158 | case e:Throwable => { 159 | e.printStackTrace 160 | throw e 161 | } 162 | }finally{ 163 | if ( is != null ) is.close 164 | } 165 | } 166 | } -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/config/jaas.conf: -------------------------------------------------------------------------------- 1 | KafkaClient { 2 | com.ibm.messagehub.login.MessageHubLoginModule required 3 | serviceName="kafka" 4 | username="$USERNAME" 5 | password="$PASSWORD"; 6 | }; -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/dstream/KafkaInputDStream.scala: -------------------------------------------------------------------------------- 1 | package com.ibm.cds.spark.samples.dstream 2 | 3 | import scala.collection.JavaConversions._ 4 | import scala.collection.Map 5 | import scala.reflect.ClassTag 6 | import scala.reflect.classTag 7 | import org.apache.kafka.clients.consumer.ConsumerRecord 8 | import org.apache.kafka.clients.consumer.KafkaConsumer 9 | import org.apache.kafka.common.serialization.Deserializer 10 | import org.apache.spark.Logging 11 | import org.apache.spark.storage.StorageLevel 12 | import org.apache.spark.streaming.StreamingContext 13 | import org.apache.spark.streaming.dstream._ 14 | import org.apache.spark.streaming.receiver.Receiver 15 | import org.apache.log4j.Level 16 | import org.apache.log4j.Logger 17 | import java.util.Properties 18 | import com.ibm.cds.spark.samples.config.MessageHubConfig 19 | import org.apache.kafka.common.security.JaasUtils 20 | 21 | class KafkaInputDStream[ 22 | K: ClassTag, 23 | V: ClassTag, 24 | U <: Deserializer[_]: ClassTag, 25 | T <: Deserializer[_]: ClassTag]( 26 | ssc : StreamingContext, 27 | kafkaParams: Map[String, String], 28 | topics: List[String], 29 | storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK 30 | ) extends ReceiverInputDStream[(K, V)](ssc) with Logging { 31 | 32 | def getReceiver(): Receiver[(K, V)] = { 33 | new KafkaReceiver[K, V, U, T](kafkaParams, topics, storageLevel) 34 | } 35 | } 36 | 37 | object KafkaStreaming{ 38 | implicit class KafkaStreamingContextAdapter( val ssc : StreamingContext ){ 39 | def createKafkaStream[K: ClassTag, V: ClassTag, U <: Deserializer[_]: ClassTag, T <: Deserializer[_]: ClassTag]( 40 | bootStrapKafkaConfig: MessageHubConfig, 41 | topics: List[String] 42 | ): ReceiverInputDStream[(K, V)] = { 43 | val kafkaProps = new 
MessageHubConfig; 44 | bootStrapKafkaConfig.copyKafkaOptionKeys( kafkaProps) 45 | kafkaProps.setValueDeserializer[T]; 46 | new KafkaInputDStream[K, V, U, T](ssc, kafkaProps.toImmutableMap, topics) 47 | } 48 | } 49 | } 50 | 51 | class KafkaReceiver[ 52 | K: ClassTag, 53 | V: ClassTag, 54 | U <: Deserializer[_]: ClassTag, 55 | T <: Deserializer[_]: ClassTag]( 56 | kafkaParams: Map[String,String], 57 | topics: List[String], 58 | storageLevel: StorageLevel 59 | ) extends Receiver[(K, V)](storageLevel) with Logging { 60 | 61 | // Connection to Kafka 62 | var kafkaConsumer: KafkaConsumer[K,V] = null 63 | 64 | def onStop() { 65 | if (kafkaConsumer != null) { 66 | kafkaConsumer.synchronized { 67 | print("Stopping kafkaConsumer") 68 | kafkaConsumer.close() 69 | kafkaConsumer = null 70 | } 71 | } 72 | } 73 | 74 | def onStart() { 75 | logInfo("Starting Kafka Consumer Stream") 76 | 77 | //Make sure the Jaas Login config param is set 78 | val jaasLoginParam = System.getProperty(JaasUtils.JAVA_LOGIN_CONFIG_PARAM); 79 | if ( jaasLoginParam == null ){ 80 | MessageHubConfig.createJaasConfiguration( kafkaParams.get(MessageHubConfig.KAFKA_USER_NAME).get, kafkaParams.get(MessageHubConfig.KAFKA_USER_PASSWORD).get) 81 | } 82 | 83 | 84 | val keyDeserializer = classTag[U].runtimeClass.getConstructor().newInstance().asInstanceOf[Deserializer[K]] 85 | val valueDeserializer = classTag[T].runtimeClass.getConstructor().newInstance().asInstanceOf[Deserializer[V]] 86 | 87 | //Create a new kafka consumer and subscribe to the relevant topics 88 | kafkaConsumer = new KafkaConsumer[K, V](kafkaParams) 89 | kafkaConsumer.subscribe( topics ) 90 | 91 | new Thread( new Runnable { 92 | def run(){ 93 | try{ 94 | while( kafkaConsumer != null ){ 95 | var it:Iterator[ConsumerRecord[K, V]] = null; 96 | 97 | if ( kafkaConsumer != null ){ 98 | kafkaConsumer.synchronized{ 99 | //Poll for new events 100 | it = kafkaConsumer.poll(1000L).iterator 101 | while( it != null && it.hasNext() ){ 102 | //Get the record and store it 103 | val record = it.next(); 104 | store( (record.key, record.value) ) 105 | } 106 | kafkaConsumer.commitSync 107 | } 108 | } 109 | 110 | Thread.sleep( 1000L ) 111 | } 112 | println("Exiting Thread") 113 | }catch{ 114 | case e:Throwable => { 115 | reportError( "Error in KafkaConsumer thread", e); 116 | e.printStackTrace() 117 | } 118 | } 119 | } 120 | }).start 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | /** 18 | * Spark Streaming sample application 19 | * 20 | */ 21 | package com.ibm.cds.spark.samples; -------------------------------------------------------------------------------- /streaming-twitter/src/main/scala/com/ibm/cds/spark/samples/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.ibm.cds.spark 19 | 20 | /** 21 | * @author dtaieb 22 | */ 23 | import scala.collection.mutable._ 24 | import org.apache.spark.sql.types.IntegerType 25 | import org.apache.spark.sql.types.DoubleType 26 | import org.apache.spark.sql.types.StructField 27 | import org.apache.spark.sql.types.StringType 28 | import org.apache.spark.sql.types.StructField 29 | import org.apache.spark.sql.types.StructType 30 | import org.apache.spark.sql.Row 31 | 32 | package object samples { 33 | 34 | case class EnrichedTweet( author:String="", userid: String="", date: String, lang: String, text: String, lat: Double, long: Double, sentimentScores: Map[String, Double]){ 35 | def toRow():Row={ 36 | var colValues = Array[Any](author,userid,date,lang,text,lat,long) 37 | val scores = for { 38 | (_,emotion)<-ToneAnalyzer.sentimentFactors 39 | score=sentimentScores.getOrElse(emotion, 0.0) 40 | }yield score 41 | colValues = colValues ++ scores 42 | Row(colValues.toArray:_*) 43 | } 44 | } 45 | 46 | val schemaString = "author userid date lang text lat:double long:double" 47 | val schemaTweets = 48 | StructType( 49 | schemaString.split(" ").map( 50 | fieldName => { 51 | val ar = fieldName.split(":"); 52 | StructField( 53 | ar.lift(0).get, 54 | ar.lift(1).getOrElse("string") match{ 55 | case "int" => IntegerType 56 | case "double" => DoubleType 57 | case _ => StringType 58 | }, 59 | true 60 | ) 61 | } 62 | ).union( 63 | ToneAnalyzer.sentimentFactors.map( f => StructField( f._1, DoubleType )).toArray[StructField] 64 | ) 65 | ) 66 | } --------------------------------------------------------------------------------
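Usage note: the listings above define the configuration classes and the custom Kafka receiver, but not a driver that wires them together. The following is a minimal, hypothetical sketch (not a file from this repository) of how MessageHubConfig and the KafkaStreaming.createKafkaStream adapter might be combined; the object name, placeholder credentials, and batch interval are illustrative assumptions only.

// Hypothetical driver sketch, assuming MessageHub credentials are supplied by hand
// rather than through a DEMO_CONFIG_PATH properties file.
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.kafka.common.serialization.StringDeserializer
import com.ibm.cds.spark.samples.config.MessageHubConfig
import com.ibm.cds.spark.samples.dstream.KafkaStreaming._

object TwitterStreamSketch {
  def main(args: Array[String]): Unit = {
    val kafkaConfig = new MessageHubConfig
    // These keys are registered by initConfigKeys; set the ones with no defaults
    kafkaConfig.setConfig("bootstrap.servers", "<messagehub-broker-list>")
    kafkaConfig.setConfig(MessageHubConfig.KAFKA_USER_NAME, "<username>")
    kafkaConfig.setConfig(MessageHubConfig.KAFKA_USER_PASSWORD, "<password>")
    kafkaConfig.validateConfiguration() // prints any keys that are still unset

    val ssc = new StreamingContext(new SparkConf().setAppName("sketch"), Seconds(5))
    // The implicit KafkaStreamingContextAdapter adds createKafkaStream to the context
    val tweets = ssc.createKafkaStream[String, String, StringDeserializer, StringDeserializer](
      kafkaConfig, List(kafkaConfig.getConfig(MessageHubConfig.KAFKA_TOPIC_TWEETS)))
    tweets.map(_._2).print()
    ssc.start()
    ssc.awaitTermination()
  }
}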