├── .gitignore ├── README.md ├── python └── network_wordcount.py ├── scala ├── ConnectToCassandra │ ├── build.sbt │ ├── project │ │ └── assembly.sbt │ └── src │ │ └── main │ │ └── scala │ │ └── ConnectToCassandra.scala ├── FinalProject │ ├── build.sbt │ ├── project │ │ └── assembly.sbt │ └── src │ │ └── main │ │ └── scala │ │ ├── FinalProject.scala │ │ └── utils │ │ ├── CassandraUtils.scala │ │ ├── CommunityUtils.scala │ │ ├── GraphUtils.scala │ │ ├── MllibUtils.scala │ │ └── RDDUtils.scala ├── FindCommunities │ ├── build.sbt │ ├── launch.sh │ ├── project │ │ └── assembly.sbt │ └── src │ │ └── main │ │ └── scala │ │ ├── FindCommunities.scala │ │ └── utils │ │ ├── CassandraUtils.scala │ │ ├── CommunityUtils.scala │ │ ├── GraphUtils.scala │ │ ├── MllibUtils.scala │ │ └── RDDUtils.scala ├── GraphxTesting │ ├── build.sbt │ ├── project │ │ └── assembly.sbt │ └── src │ │ └── main │ │ └── scala │ │ ├── GraphxTesting.scala │ │ └── utils │ │ ├── CassandraUtils.scala │ │ ├── CommunityUtils.scala │ │ ├── GraphUtils.scala │ │ ├── MllibUtils.scala │ │ └── RDDUtils.scala ├── RDDFromCassandra │ ├── build.sbt │ ├── project │ │ └── assembly.sbt │ └── src │ │ └── main │ │ └── scala │ │ └── RDDFromCassandra.scala ├── SaveCommunicationToCassandra │ ├── build.sbt │ ├── project │ │ └── assembly.sbt │ └── src │ │ └── main │ │ └── scala │ │ └── SaveCommunicationToCassandra.scala ├── ScalaTwitterStreaming │ ├── build.sbt │ ├── project │ │ └── assembly.sbt │ └── src │ │ └── main │ │ └── scala │ │ └── ScalaTwitterStreaming.scala └── SimpleAppUsingSBT │ ├── build.sbt │ └── src │ └── main │ └── scala │ └── SimpleAppUsingSBT.scala └── visualization ├── d3.slider.css ├── d3.slider.js ├── data.php └── graph.html /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | TwitterConfig.scala 5 | /TwitterConfig.scala 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Pycharm 11 | .idea/* 12 | .idea/ 13 | .metadata 14 | .metadata/* 15 | 16 | # Distribution / packaging 17 | .Python 18 | env/ 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | 58 | # Sphinx documentation 59 | docs/_build/ 60 | 61 | # PyBuilder 62 | target/ 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Community detection and LDA 2 | 3 | For further informations -> wiki 4 | -------------------------------------------------------------------------------- /python/network_wordcount.py: -------------------------------------------------------------------------------- 1 | __author__ = 'michaelcaraccio' 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. 
See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | """ 21 | Counts words in UTF8 encoded, '\n' delimited text received from the network every second. 22 | Usage: network_wordcount.py 23 | and describe the TCP server that Spark Streaming would connect to receive data. 24 | To run this on your local machine, you need to first run a Netcat server 25 | `$ nc -lk 9999` 26 | and then run the example 27 | `$ bin/spark-submit examples/src/main/python/streaming/network_wordcount.py localhost 9999` 28 | """ 29 | 30 | import sys 31 | 32 | from pyspark import SparkContext 33 | from pyspark.streaming import StreamingContext 34 | 35 | if __name__ == "__main__": 36 | if len(sys.argv) != 3: 37 | print >> sys.stderr, "Usage: network_wordcount.py " 38 | exit(-1) 39 | sc = SparkContext(appName="PythonStreamingNetworkWordCount") 40 | ssc = StreamingContext(sc, 1) 41 | 42 | lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2])) 43 | counts = lines.flatMap(lambda line: line.split(" "))\ 44 | .map(lambda word: (word, 1))\ 45 | .reduceByKey(lambda a, b: a+b) 46 | counts.pprint() 47 | 48 | ssc.start() 49 | ssc.awaitTermination() -------------------------------------------------------------------------------- /scala/ConnectToCassandra/build.sbt: -------------------------------------------------------------------------------- 1 | name := "ConnectToCassandra" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies ++= Seq( 8 | "org.apache.spark" %% "spark-core" % "1.2.0" % "provided", 9 | "org.apache.spark" %% "spark-streaming" % "1.2.0" % "provided", 10 | "org.apache.spark" %% "spark-streaming-twitter" % "1.2.1") 11 | 12 | libraryDependencies += "org.twitter4j" % "twitter4j-stream" % "3.0.6" 13 | 14 | libraryDependencies += "org.twitter4j" % "twitter4j-core" % "3.0.6" 15 | 16 | libraryDependencies += "com.datastax.spark" %% "spark-cassandra-connector" % "1.2.0-rc3" -------------------------------------------------------------------------------- /scala/ConnectToCassandra/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") -------------------------------------------------------------------------------- /scala/ConnectToCassandra/src/main/scala/ConnectToCassandra.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.streaming.{Seconds, StreamingContext} 2 | import StreamingContext._ 3 | 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.SparkContext._ 6 | 7 | import org.apache.spark.streaming.twitter 8 | import org.apache.spark.streaming.twitter._ 9 | import org.apache.spark.streaming.twitter.TwitterUtils 10 | 11 | import org.apache.spark.SparkConf 12 | 13 | import org.apache.spark.streaming.dstream.DStream 14 | 
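// A minimal Scala counterpart to python/network_wordcount.py above, shown as a sketch only:
// the object name, host and port are illustrative, and it reuses the Spark Streaming classes
// already imported in this file (SparkConf, StreamingContext, Seconds).
object ScalaNetworkWordCount {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("ScalaNetworkWordCount").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(1))

    // Read '\n'-delimited text from a TCP socket, then count words in each 1-second batch
    val lines = ssc.socketTextStream("localhost", 9999)
    val counts = lines.flatMap(_.split(" "))
                      .map(word => (word, 1))
                      .reduceByKey(_ + _)
    counts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}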
import org.apache.spark.streaming.Seconds 15 | import org.apache.spark.streaming.StreamingContext 16 | import org.apache.spark.streaming.StreamingContext._ 17 | 18 | import twitter4j.TwitterFactory 19 | import twitter4j.auth.AccessToken 20 | import twitter4j._ 21 | import collection.JavaConversions._ 22 | 23 | import org.apache.log4j.Logger 24 | import org.apache.log4j.Level 25 | 26 | import com.datastax.spark.connector._ 27 | import com.datastax.spark.connector.streaming._ 28 | 29 | import scala.util.matching.Regex 30 | 31 | 32 | // Useful links 33 | // https://github.com/datastax/spark-cassandra-connector/blob/master/doc/0_quick_start.md 34 | // http://planetcassandra.org/getting-started-with-apache-spark-and-cassandra/ 35 | // https://bcomposes.wordpress.com/2013/02/09/using-twitter4j-with-scala-to-access-streaming-tweets/ 36 | // https://github.com/datastax/spark-cassandra-connector/blob/master/doc/5_saving.md 37 | 38 | object ConnectToCassandra { 39 | def main(args: Array[String]) { 40 | 41 | // Display only warning messages 42 | Logger.getLogger("org").setLevel(Level.ERROR) 43 | Logger.getLogger("akka").setLevel(Level.ERROR) 44 | 45 | val filters = args 46 | 47 | // Spark configuration 48 | val sparkConf = new SparkConf(true) 49 | .setMaster("local[4]") 50 | .setAppName("ConnectToCassandra") 51 | .set("spark.cassandra.connection.host", "127.0.0.1") // Add this line to link to Cassandra 52 | 53 | // Filters by words that contains @ 54 | val words = Array("@") 55 | 56 | // Pattern used to find users 57 | val pattern = new Regex("\\@\\w+") 58 | 59 | // First twitter instance : Used for stream 60 | val twitterstream = new TwitterFactory().getInstance() 61 | twitterstream.setOAuthConsumer("MCrQfOAttGZnIIkrqZ4lQA9gr", "5NnYhhGdfyqOE4pIXXdYkploCybQMzFJiQejZssK4a3mNdkCoa") 62 | twitterstream.setOAuthAccessToken(new AccessToken("237197078-6zwzHsuB3VY3psD5873hhU3KQ1lSVQlOXyBhDqpG", "UIMZ1aD06DObpKI741zC8wHZF8jkj1bh02Lqfl5cQ76Pl")) 63 | System.setProperty("twitter4j.http.retryCount", "3"); 64 | System.setProperty("twitter4j.http.retryIntervalSecs", "10") 65 | System.setProperty("twitter4j.async.numThreads", "1"); 66 | 67 | val ssc = new StreamingContext(sparkConf, Seconds(1)) 68 | val stream = TwitterUtils.createStream(ssc, Option(twitterstream.getAuthorization()), words) 69 | 70 | // Second twitter instance : Used to query user's informations 71 | val twitter = new TwitterFactory().getInstance() 72 | twitter.setOAuthConsumer("Vb0BxXrK933CDEeQ3Myj69kkC", "q55rXOM8pQnnAyPrYhHh6LHK4IFHw0U01tfe6VDoleaxmvOL3B") 73 | twitter.setOAuthAccessToken(new AccessToken("237197078-iXi3ANEAUXNmoDbcbH3lvS93vDO6PvEQj3255ToL", "Skv8J9xcfhbKV2Lwddke2g7llTDwwh6S9QyAlNR6fanqY")) 74 | 75 | // Stream about users 76 | val usersStream = stream.map{status => (status.getUser.getId.toString, 77 | status.getUser.getName.toString, 78 | status.getUser.getLang, 79 | status.getUser.getFollowersCount.toString, 80 | status.getUser.getFriendsCount.toString, 81 | status.getUser.getScreenName, 82 | status.getUser.getStatusesCount.toString)} 83 | 84 | // Stream about tweets 85 | val tweetsStream = stream.map{status => (status.getId.toString, 86 | status.getUser.getId.toString, 87 | status.getUser.getName.toString, 88 | status.getText, 89 | 90 | if(pattern.findFirstIn(status.getText).isEmpty){ 91 | "" 92 | } 93 | else 94 | { 95 | twitterstream.showUser(pattern.findFirstIn(status.getText).getOrElse("@MichaelCaraccio").tail).getName 96 | }, 97 | 98 | if(pattern.findFirstIn(status.getText).isEmpty){ 99 | "" 100 | } 101 | else{ 102 | 
twitterstream.showUser(pattern.findFirstIn(status.getText).getOrElse("@MichaelCaraccio").tail).getId 103 | }, 104 | 105 | status.getRetweetCount.toString, 106 | new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss").format(status.getCreatedAt), 107 | 108 | Option(status.getGeoLocation) match { 109 | case Some(theValue) => 110 | status.getGeoLocation.getLongitude.toString 111 | case None => 112 | "" 113 | }, 114 | 115 | Option(status.getGeoLocation) match { 116 | case Some(theValue) => 117 | status.getGeoLocation.getLatitude.toString 118 | case None => 119 | "" 120 | } 121 | )} 122 | 123 | // Save user's informations in Cassandra 124 | usersStream.foreachRDD(rdd => { 125 | //rdd.saveToCassandra("twitter", "user_filtered", SomeColumns("user_id", "user_name", "user_lang", "user_follower_count", "user_friends_count", "user_screen_name", "user_status_count")) 126 | println("user added") 127 | }) 128 | 129 | // Save tweet's informations in Cassandra 130 | tweetsStream.foreachRDD(rdd => { 131 | //rdd.saveToCassandra("twitter", "tweet_filtered", SomeColumns("tweet_id", "user_id", "tweet_text", "tweet_retweet", "tweet_create_at", "user_longitude", "user_latitude")) 132 | 133 | 134 | /* val twitter = new TwitterFactory().getInstance 135 | val userName = twitter.getScreenName 136 | 137 | val statuses = twitter.getMentionsTimeline.take(2) 138 | 139 | statuses.foreach { status => { 140 | val statusAuthor = status.getUser.getScreenName 141 | val mentionedEntities = status.getUserMentionEntities.map(_.getScreenName).toList 142 | val participants = (statusAuthor :: mentionedEntities).toSet - userName 143 | val text = participants.map(p=>"@"+p).mkString(" ") + " OK." 144 | val reply = new StatusUpdate(text).inReplyToStatusId(status.getId) 145 | println("Replying: " + text) 146 | //twitter.updateStatus(reply) 147 | println("DAT BITCH" + mentionedEntities) 148 | println("DAT BITCH2" + reply) 149 | }}*/ 150 | 151 | 152 | 153 | rdd.foreach {r => { 154 | val sender_name = r._3 155 | val sender_id = r._2 156 | val tweet_text = r._4 157 | val dest_name = r._5 158 | val dest_id = r._6 159 | 160 | println("----------------------------------------------") 161 | println("Sender ID : " + sender_id) 162 | println("Sender Name : " + sender_name) 163 | println("Tweet : " + tweet_text) 164 | println("Dest name :" + dest_name) 165 | println("Dest ID : " + dest_id) 166 | println("----------------------------------------------") 167 | 168 | }} 169 | println("tweet added") 170 | }) 171 | 172 | ssc.start() 173 | ssc.awaitTermination() 174 | } 175 | } -------------------------------------------------------------------------------- /scala/FinalProject/build.sbt: -------------------------------------------------------------------------------- 1 | name := "FinalProject" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.5" 6 | 7 | libraryDependencies ++= Seq( 8 | "org.apache.spark" %% "spark-core" % "1.3.0" % "provided", 9 | "org.apache.spark" %% "spark-graphx" % "1.3.0" % "provided", 10 | //"org.apache.spark" %% "spark-streaming" % "1.3.0" % "provided", 11 | "org.apache.spark" %% "spark-mllib" % "1.3.0" % "provided"//, 12 | // "org.apache.commons" % "commons-lang3" % "3.3.2", 13 | /*"org.apache.spark" %% "spark-streaming-twitter" % "1.3.0"*/) 14 | 15 | //libraryDependencies += "org.apache.spark" % "spark-streaming_2.10" % "1.3.0" 16 | libraryDependencies += "org.apache.spark" % "spark-streaming-twitter_2.10" % "1.3.0" 17 | 18 | libraryDependencies += "com.datastax.spark" %% "spark-cassandra-connector" % "1.3.0-M1" 19 | 20 | 
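// The spark-cassandra-connector declared just above is what ConnectToCassandra.scala (and the
// FinalProject sources below) use to persist batches. A minimal sketch of that write path,
// assuming a SparkContext `sc` whose conf sets "spark.cassandra.connection.host" and a
// pre-existing twitter.user_filtered table; only a subset of the columns named in the
// commented-out saveToCassandra calls above is shown:
/*
import com.datastax.spark.connector._

val users = sc.parallelize(Seq(
  ("12345", "Michael", "en")            // user_id, user_name, user_lang (illustrative row)
))

// Write the tuples to Cassandra, mapping tuple positions onto the named columns
users.saveToCassandra("twitter", "user_filtered",
  SomeColumns("user_id", "user_name", "user_lang"))
*/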
//libraryDependencies += "com.google.code.gson" % "gson" % "2.3" 21 | 22 | //libraryDependencies += "com.github.fommil.netlib" % "all" % "1.1.2" pomOnly() 23 | 24 | // http://stackoverflow.com/questions/28459333/how-to-build-an-uber-jar-fat-jar-using-sbt-within-intellij-idea 25 | // META-INF discarding 26 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => 27 | { 28 | case PathList("META-INF", xs @ _*) => MergeStrategy.discard 29 | case x => MergeStrategy.first 30 | } 31 | } 32 | 33 | resolvers ++= Seq( 34 | // "JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/", 35 | // "Spray Repository" at "http://repo.spray.cc/", 36 | // "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", 37 | // "Akka Repository" at "http://repo.akka.io/releases/", 38 | // "Twitter4J Repository" at "http://twitter4j.org/maven2/", 39 | // "Apache HBase" at "https://repository.apache.org/content/repositories/releases", 40 | // "Twitter Maven Repo" at "http://maven.twttr.com/", 41 | // "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools", 42 | // "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/", 43 | // "Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/" 44 | // "Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven", 45 | // Resolver.sonatypeRepo("public") 46 | ) -------------------------------------------------------------------------------- /scala/FinalProject/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") -------------------------------------------------------------------------------- /scala/FinalProject/src/main/scala/utils/CassandraUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import scala.collection.mutable.ArrayBuffer 4 | 5 | // Enable Cassandra-specific functions on the StreamingContext, DStream and RDD: 6 | 7 | import com.datastax.spark.connector._ 8 | 9 | // To make some of the examples work we will also need RDD 10 | 11 | import org.apache.spark.SparkContext 12 | import org.apache.spark.graphx._ 13 | import org.apache.spark.rdd.RDD 14 | import org.apache.spark.sql.cassandra.CassandraSQLContext 15 | 16 | //@SerialVersionUID(100L) 17 | class CassandraUtils /*extends Serializable*/ { 18 | 19 | val RED = "\033[1;30m" 20 | val ENDC = "\033[0m" 21 | 22 | /** 23 | * @constructor getTweetContentFromID 24 | * 25 | * Return tweet content 26 | * 27 | * @param SparkContext sc - SparkContext 28 | * @param String $id - tweet id 29 | * @return Unit 30 | */ 31 | def getTweetContentFromID(sc: SparkContext, id: String): String = { 32 | 33 | println(color("\nCall getTweetContentFromID", RED)) 34 | 35 | val query = sc.cassandraTable("twitter", "tweet_filtered").select("tweet_text").where("tweet_id = ?", id) 36 | 37 | if (query.collect().length != 0) { 38 | query.first().getString("tweet_text") 39 | } 40 | else 41 | "Tweet not found" 42 | } 43 | 44 | /** 45 | * @constructor getTweetsIDFromUser 46 | * 47 | * Return tweet id 48 | * 49 | * @param SparkContext sc - SparkContext 50 | * @param String $id - user (sender) id 51 | * @return Unit 52 | */ 53 | def getTweetsIDFromUser(sc: SparkContext, id: String): ArrayBuffer[String] = { 54 | 55 | println(color("\nCall getTweetsIDFromUser", RED)) 56 | println("Tweets found:") 57 | 58 | val query = sc.cassandraTable("twitter", 
"users_communicate").select("tweet_id").where("user_send_local_id = ?", id) 59 | 60 | // Result will be stored in an array 61 | var result = ArrayBuffer[String]() 62 | 63 | if (query.collect().length != 0) { 64 | result += query.first().getString("tweet_id") 65 | } 66 | 67 | // Display result 68 | result.foreach(println(_)) 69 | 70 | // Return 71 | result 72 | } 73 | 74 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 75 | 76 | /** 77 | * @constructor getTweetsContentFromEdge 78 | * 79 | * Return an array of tweets content for a given Graph 80 | * 81 | * @param SparkContext sc - SparkContext 82 | * @param RDD[Edge[String]] $edge - graph's edge 83 | * @return Unit 84 | */ 85 | def getTweetsContentFromEdge(sc: SparkContext, edge: RDD[Edge[String]], displayResult: Boolean): RDD[String] = { 86 | 87 | println(color("\nCall getTweetsContentFromEdge", RED)) 88 | 89 | // Get the tweets ID for every communication 90 | val tweetsID = edge.flatMap({ 91 | case Edge(idSend, idExp, idTweet) => Seq(idTweet) 92 | }) 93 | 94 | // Result will be stored in an array 95 | var result = ArrayBuffer[String]() 96 | 97 | // Queries 98 | for (tweet <- tweetsID.collect()) { 99 | val query = sc.cassandraTable("twitter", "tweet_filtered").select("tweet_text").where("tweet_id = ?", tweet) 100 | 101 | if (query.collect().length != 0) { 102 | result += query.first().getString("tweet_text") 103 | } 104 | } 105 | 106 | // Display results 107 | if (displayResult) { 108 | result.foreach(println(_)) 109 | } 110 | 111 | 112 | // return 113 | sc.parallelize(result) 114 | } 115 | 116 | /*def getAllTweetsText(sc: SparkContext): ArrayBuffer[String] = { 117 | val rdd = sc.cassandraTable("twitter", "tweet_filtered2").select("tweet_text").cache() 118 | 119 | var dictionnary = new ArrayBuffer[String] 120 | 121 | println("Tweets by tweets -> Create documents and vocabulary") 122 | rdd.select("tweet_text").as((i: String) => i).foreach(x => { 123 | 124 | val tweet = x 125 | .toLowerCase.split("\\s") 126 | .filter(_.length > 3) 127 | .filter(_.forall(java.lang.Character.isLetter)).mkString(" ") 128 | 129 | if (tweet.length > 1) 130 | dictionnary += tweet 131 | }) 132 | }*/ 133 | 134 | // (RDD[(VertexId, (String))], RDD[Edge[String]]) 135 | def getAllCommunicationsToGraph(sc: SparkContext): Graph[String, String] = { 136 | println(color("\nCall getAllCommunications", RED)) 137 | 138 | 139 | /* val users: RDD[(VertexId, (String))] = 140 | sc.parallelize(List( 141 | (2732329846L, "Michael"), 142 | (132988448L, "David"), 143 | (473822999L, "Sarah"), 144 | (2932436311L, "Jean"), 145 | (2249679902L, "Raphael"), 146 | (601389784L, "Lucie"), 147 | (2941487254L, "Harold"), 148 | (1192483885L, "Pierre"), 149 | (465776805L, "Christophe"), 150 | (838147628L, "Zoe"), 151 | (2564641105L, "Fabien"), 152 | (1518391292L, "Nicolas") 153 | ))*/ 154 | 155 | 156 | // Collection of vertices (contains users) 157 | // val collectionVertices = ListBuffer[(Long, String)]() 158 | 159 | 160 | // val users: RDD[(VertexId, (String))] = sc.parallelize(collectionVertices) 161 | 162 | 163 | //val con = sc.cassandraTable("twitter", "user_filtered") 164 | //con.toArray.foreach(println) 165 | /*println("Test -1") 166 | 167 | var t0 = System.nanoTime() 168 | for (row <- query) { 169 | 170 | } 171 | 172 | var t1 = System.nanoTime() 173 | println("Elapsed time: " + (t1 - t0) + "ns")*/ 174 | 175 | // val query = sc.cassandraTable("twitter", "user_filtered").select("user_local_id", "user_screen_name") 176 | 177 | 178 | /*val con = query.map{ 179 | 
case result => (result._1, result._2) 180 | }*/ 181 | val cc = new CassandraSQLContext(sc) 182 | 183 | println("Test 0") 184 | var t0 = System.nanoTime() 185 | val rdd0 = cc.sql("SELECT user_local_id, user_screen_name from twitter.user_filtered") 186 | 187 | val pelo = rdd0.map(p => (p(0).toString.toLong, p(1).toString)).cache() 188 | 189 | val rdd1 = cc.sql("SELECT tweet_id, user_send_local_id, user_dest_id from twitter.users_communicate") 190 | 191 | val pelo2 = rdd1.map(p => Edge(p(1).toString.toLong, p(2).toString.toLong, p(0).toString)).cache() 192 | 193 | Graph(pelo, pelo2) 194 | 195 | /*println("okkk") 196 | 197 | graphh.vertices.foreach(println(_)) 198 | 199 | 200 | //pelo.foreach(println(_)) 201 | 202 | println("After collecting") 203 | 204 | rdd0.show() 205 | 206 | for (row <- rdd0) { 207 | //println(row(0)) 208 | 209 | collectionVertices += ((row(0).toString.toLong, row(1).toString)) 210 | //collectionVertices.append((row(0).toString.toLong, row(1).toString)) 211 | } 212 | var t1 = System.nanoTime() 213 | println("Elapsed time: " + (t1 - t0) + "ns") 214 | 215 | 216 | println("Test 1") 217 | t0 = System.nanoTime() 218 | 219 | val rdd = cc.sql("SELECT user_local_id, user_screen_name from twitter.user_filtered LIMIT 100").persist() 220 | for (row <- rdd) { 221 | collectionVertices += ((row(0).toString.toLong, row(1).toString)) 222 | } 223 | rdd.unpersist() 224 | t1 = System.nanoTime() 225 | println("Elapsed time: " + (t1 - t0) + "ns") 226 | 227 | 228 | 229 | println("Test 2") 230 | t0 = System.nanoTime() 231 | val rdd2 = cc.sql("SELECT user_local_id, user_screen_name from twitter.user_filtered limit 10000").cache() 232 | for (row <- rdd2) { 233 | collectionVertices += ((row(0).toString.toLong, row(1).toString)) 234 | } 235 | t1 = System.nanoTime() 236 | println("Elapsed time: " + (t1 - t0) + "ns") 237 | 238 | println("Test 3") 239 | t0 = System.nanoTime() 240 | 241 | for (row <- cc.sql("SELECT user_local_id, user_screen_name from twitter.user_filtered limit 10000")) { 242 | collectionVertices += ((row(0).toString.toLong, row(1).toString)) 243 | } 244 | t1 = System.nanoTime() 245 | println("Elapsed time: " + (t1 - t0) + "ns") 246 | 247 | 248 | 249 | 250 | 251 | 252 | println("f") 253 | // println(rdd.take(1)) 254 | println("f2") 255 | * 256 | /* 257 | println("Query 1 ok") 258 | */ 259 | // Save result to ArrayBuffer 260 | //if (query.collect().length != 0) { 261 | //collectionVertices += ((query.first().getString("user_local_id").toLong, query.first().getString("user_local_id").toString)) 262 | println(query.first().getString("user_local_id")) 263 | // } 264 | 265 | //collectionVertices.foreach(println(_)) 266 | 267 | println("Query 1 Collect ok") 268 | 269 | 270 | 271 | // Collection of edges (contains communications between users) 272 | val collectionEdge = ArrayBuffer[Edge[String]]() 273 | 274 | 275 | //query = sc.cassandraTable("twitter", "users_communicate").select("user_send_local_id", "user_dest_id", "tweet_id").toArray() 276 | 277 | println("Query 2 ok") 278 | // Save result to ArrayBuffer 279 | /*if (query.collect().length != 0) { 280 | collectionEdge += Edge(query.first().getString("user_send_local_id").toLong, query.first().getString("user_dest_id").toLong, query.first().getString("tweet_id").toString) 281 | }*/ 282 | 283 | //collectionEdge.foreach(println(_)) 284 | 285 | println("Query 2 Collect ok") 286 | 287 | // Convert vertices to RDD 288 | val VerticesRDD = sc.parallelize(collectionVertices) 289 | 290 | // Convert it to RDD 291 | val EdgeRDD = 
sc.parallelize(collectionEdge) 292 | 293 | println("Total vertices: " + collectionVertices.length) 294 | println("Total edges: " + collectionEdge.length) 295 | 296 | (VerticesRDD, EdgeRDD)*/ 297 | } 298 | } -------------------------------------------------------------------------------- /scala/FinalProject/src/main/scala/utils/CommunityUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.graphx._ 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.math._ 8 | import scala.reflect.ClassTag 9 | 10 | class CommunityUtils extends Logging { 11 | 12 | val RED = "\033[1;30m" 13 | val ENDC = "\033[0m" 14 | 15 | /** 16 | * splitCommunity 17 | * 18 | * Find and split communities in graph 19 | * 20 | * @param Graph[String,String] $graph - Graph element 21 | * @param RDD[(VertexId, (String))] $users - Vertices 22 | * @param Boolean $displayResult - if true, display println 23 | * @return ArrayBuffer[Graph[String,String]] - Contains one graph per community 24 | * 25 | */ 26 | def splitCommunity(graph: Graph[String, String], users: RDD[(VertexId, (String))], NBKCORE: Int, displayResult: Boolean): Graph[String, String] = { 27 | 28 | println(color("\nCall SplitCommunity", RED)) 29 | 30 | getKCoreGraph(graph, users, NBKCORE, displayResult).cache() 31 | } 32 | 33 | /** 34 | * Compute the k-core decomposition of the graph for all k <= kmax. This 35 | * uses the iterative pruning algorithm discussed by Alvarez-Hamelin et al. 36 | * in K-Core Decomposition: a Tool For the Visualization of Large Scale Networks 37 | * (see http://arxiv.org/abs/cs/0504107). 38 | * 39 | * @tparam VD the vertex attribute type (discarded in the computation) 40 | * @tparam ED the edge attribute type (preserved in the computation) 41 | * 42 | * @param graph the graph for which to compute the connected components 43 | * @param kmax the maximum value of k to decompose the graph 44 | * 45 | * @return a graph where the vertex attribute is the minimum of 46 | * kmax or the highest value k for which that vertex was a member of 47 | * the k-core. 48 | * 49 | * @note This method has the advantage of returning not just a single kcore of the 50 | * graph but will yield all the cores for k > kmin. 
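 *
 * A small usage sketch (the toy vertices, edge labels and the `sc` SparkContext are
 * assumptions made purely for illustration):
 * {{{
 *   val users = sc.parallelize(Seq((1L, "alice"), (2L, "bob"), (3L, "carol")))
 *   val edges = sc.parallelize(Seq(Edge(1L, 2L, "t1"), Edge(2L, 3L, "t2"), Edge(3L, 1L, "t3")))
 *   val graph = Graph(users, edges)
 *
 *   // keep only the vertices that survive the 2-core
 *   val core2 = new CommunityUtils().getKCoreGraph(graph, users, 2, displayResult = false)
 * }}}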
51 | */ 52 | def getKCoreGraph[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], 53 | users: RDD[(VertexId, (String))], 54 | kmin: Int, 55 | displayResult: Boolean): Graph[String, ED] = { 56 | 57 | // Graph[(Int, Boolean), ED] - boolean indicates whether it is active or not 58 | var g = graph.cache().outerJoinVertices(graph.degrees)((vid, oldData, newData) => newData.getOrElse(0)).cache() 59 | 60 | println(color("\nCall KCoreDecomposition", RED)) 61 | 62 | g = computeCurrentKCore(g, kmin).cache() 63 | 64 | val v = g.vertices.filter { case (vid, vd) => vd >= kmin }.cache() 65 | 66 | // Display informations 67 | if (displayResult) { 68 | val degrees = graph.degrees 69 | val numVertices = degrees.count() 70 | val testK = kmin 71 | val vCount = g.vertices.filter { case (vid, vd) => vd >= kmin }.count() 72 | val eCount = g.triplets.map { t => t.srcAttr >= testK && t.dstAttr >= testK }.count() 73 | 74 | logWarning(s"Number of vertices: $numVertices") 75 | logWarning(s"Degree sample: ${degrees.take(10).mkString(", ")}") 76 | logWarning(s"Degree distribution: " + degrees.map { case (vid, data) => (data, 1) }.reduceByKey(_ + _).collect().mkString(", ")) 77 | logWarning(s"Degree distribution: " + degrees.map { case (vid, data) => (data, 1) }.reduceByKey(_ + _).take(10).mkString(", ")) 78 | logWarning(s"K=$kmin, V=$vCount, E=$eCount") 79 | } 80 | 81 | // Create new RDD users 82 | val newUser = users.join(v).map { 83 | case (id, (username, rank)) => (id, username) 84 | } 85 | 86 | // Create a new graph 87 | val gra = Graph(newUser, g.edges) 88 | 89 | // Remove missing vertices as well as the edges to connected to them 90 | gra.subgraph(vpred = (id, username) => username != null).cache() 91 | } 92 | 93 | def computeCurrentKCore[ED: ClassTag](graph: Graph[Int, ED], k: Int) = { 94 | println("Computing kcore for k=" + k) 95 | def sendMsg(et: EdgeTriplet[Int, ED]): Iterator[(VertexId, Int)] = { 96 | if (et.srcAttr < 0 || et.dstAttr < 0) { 97 | // if either vertex has already been turned off we do nothing 98 | Iterator.empty 99 | } else if (et.srcAttr < k && et.dstAttr < k) { 100 | // tell both vertices to turn off but don't need change count value 101 | Iterator((et.srcId, -1), (et.dstId, -1)) 102 | 103 | } else if (et.srcAttr < k) { 104 | // if src is being pruned, tell dst to subtract from vertex count 105 | Iterator((et.srcId, -1), (et.dstId, 1)) 106 | 107 | } else if (et.dstAttr < k) { 108 | // if dst is being pruned, tell src to subtract from vertex count 109 | Iterator((et.dstId, -1), (et.srcId, 1)) 110 | 111 | } else { 112 | Iterator.empty 113 | } 114 | } 115 | 116 | // subtracts removed neighbors from neighbor count and tells vertex whether it was turned off or not 117 | def mergeMsg(m1: Int, m2: Int): Int = { 118 | if (m1 < 0 || m2 < 0) { 119 | -1 120 | } else { 121 | m1 + m2 122 | } 123 | } 124 | 125 | def vProg(vid: VertexId, data: Int, update: Int): Int = { 126 | if (update < 0) { 127 | // if the vertex has turned off, keep it turned off 128 | -1 129 | } else { 130 | // subtract the number of neighbors that have turned off this round from 131 | // the count of active vertices 132 | // TODO(crankshaw) can we ever have the case data < update? 
133 | max(data - update, 0) 134 | } 135 | } 136 | 137 | // Note that initial message should have no effect 138 | Pregel(graph, 0)(vProg, sendMsg, mergeMsg) 139 | } 140 | 141 | 142 | /** 143 | * @constructor time 144 | * 145 | * timer for profiling block 146 | * 147 | * @param R $block - Block executed 148 | * @return Unit 149 | */ 150 | def time[R](block: => R): R = { 151 | val t0 = System.nanoTime() 152 | val result = block // call-by-name 153 | val t1 = System.nanoTime() 154 | println("Elapsed time: " + (t1 - t0) / 1000000000.0 + " seconds") 155 | result 156 | } 157 | 158 | def subgraphCommunities(graph: Graph[String, String], users: RDD[(VertexId, (String))], displayResult: Boolean): (Array[Graph[String, String]], Array[Long]) = { 159 | 160 | println(color("\nCall subgraphCommunities", RED)) 161 | 162 | // Find the connected components 163 | val cc = time { 164 | graph.connectedComponents().vertices.cache() 165 | } 166 | 167 | // Join the connected components with the usernames and id 168 | // The result is an RDD not a Graph 169 | val ccByUsername = users.join(cc).map { 170 | case (id, (username, cci)) => (id, username, cci) 171 | }.cache() 172 | 173 | // Print the result 174 | val lowerIDPerCommunity = ccByUsername.map { case (id, username, cci) => cci }.distinct().cache() 175 | 176 | // Result will be stored in an array 177 | //var result = new ArrayBuffer[Graph[String, String]]() 178 | println("--------------------------") 179 | println("Total community found: " + lowerIDPerCommunity.count()) 180 | println("--------------------------") 181 | 182 | 183 | val collectIDsCommunity = lowerIDPerCommunity.collect() 184 | 185 | val result = collectIDsCommunity.map(colID => Graph(ccByUsername.filter { 186 | _._3 == colID 187 | }.map { case (id, username, cc) => (id, username) }, graph.edges).subgraph(vpred = (id, username) => username != null).cache()) 188 | 189 | // Display communities 190 | if (displayResult) { 191 | println("\nCommunities found " + result.length) 192 | for (community <- result) { 193 | println("-----------------------") 194 | community.edges.collect().foreach(println(_)) 195 | community.vertices.collect().foreach(println(_)) 196 | } 197 | } 198 | 199 | cc.unpersist() 200 | lowerIDPerCommunity.unpersist() 201 | 202 | (result, collectIDsCommunity) 203 | } 204 | 205 | /** 206 | * getTriangleCount 207 | * 208 | * Compute the number of triangles passing through each vertex. 
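 *
 * Sketch of the underlying GraphX call (variable names are illustrative; the graph must be
 * partitioned before counting, as done in the method body below):
 * {{{
 *   val triangles = graph.partitionBy(PartitionStrategy.RandomVertexCut).triangleCount().vertices
 *   triangles.collect().foreach { case (id, n) => println(id + " is in " + n + " triangle(s)") }
 * }}}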
209 | * 210 | * @param Graph[String,String] $graph - Graph element 211 | * @param RDD[(VertexId, (String))] $users - Vertices 212 | * @return Unit 213 | * 214 | * @see [[org.apache.spark.graphx.lib.TriangleCount$#run]] 215 | */ 216 | def getTriangleCount(graph: Graph[String, String], users: RDD[(VertexId, (String))]): Unit = { 217 | 218 | println(color("\nCall getTriangleCount", RED)) 219 | 220 | // Sort edges ID srcID < dstID 221 | val edges = graph.edges.map { e => 222 | if (e.srcId < e.dstId) { 223 | Edge(e.srcId, e.dstId, e.attr) 224 | } 225 | else { 226 | Edge(e.dstId, e.srcId, e.attr) 227 | } 228 | } 229 | 230 | // Temporary graph 231 | val newGraph = Graph(users, edges, "").cache() 232 | 233 | // Find the triangle count for each vertex 234 | // TriangleCount requires the graph to be partitioned 235 | val triCounts = newGraph.partitionBy(PartitionStrategy.RandomVertexCut).cache().triangleCount().vertices 236 | 237 | val triCountByUsername = users.join(triCounts).map { 238 | case (id, (username, rank)) => (id, username, rank) 239 | } 240 | 241 | println("Display triangle's sum for each user") 242 | triCountByUsername.foreach(println) 243 | 244 | println("\nTotal: " + triCountByUsername.map { case (id, username, rank) => rank }.distinct().count() + "\n") 245 | } 246 | 247 | /** 248 | * @constructor ConnectedComponents 249 | * 250 | * Compute the connected component membership of each vertex and return a graph with the vertex 251 | * value containing the lowest vertex id in the connected component containing that vertex. 252 | * 253 | * @param Graph[String,String] $graph - Graph element 254 | * @param RDD[(VertexId, (String))] $users - Vertices 255 | * @return Unit 256 | * 257 | * @see [[org.apache.spark.graphx.lib.ConnectedComponents$#run]] 258 | */ 259 | def cc(graph: Graph[String, String], users: RDD[(VertexId, (String))]): Unit = { 260 | println(color("\nCall ConnectedComponents", RED)) 261 | 262 | // Find the connected components 263 | val cc = graph.connectedComponents().vertices 264 | 265 | // Join the connected components with the usernames and id 266 | val ccByUsername = users.join(cc).map { 267 | case (id, (username, cc)) => (id, username, cc) 268 | } 269 | // Print the result 270 | println(ccByUsername.collect().sortBy(_._3).mkString("\n")) 271 | 272 | println("\nTotal groups: " + ccByUsername.map { case (id, username, cc) => cc }.distinct().count() + "\n") 273 | } 274 | 275 | /** 276 | * @constructor StronglyConnectedComponents 277 | * 278 | * Compute the strongly connected component (SCC) of each vertex and return a graph with the 279 | * vertex value containing the lowest vertex id in the SCC containing that vertex. 
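 *
 * Equivalent direct call, as a sketch (variable names are illustrative):
 * {{{
 *   val scc = graph.stronglyConnectedComponents(5).vertices
 *   scc.collect().foreach { case (member, leader) => println(member + " is in the group of " + leader) }
 * }}}
 * (Note: the method body below currently passes a fixed 5 to stronglyConnectedComponents
 * rather than its `iteration` argument.)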
280 | * 281 | * Display edges's membership and total groups 282 | * 283 | * @param Graph[String,String] $graph - Graph element 284 | * @param Int $iteration - Number of iteration 285 | * @return Unit 286 | */ 287 | def scc(graph: Graph[String, String], iteration: Int): Unit = { 288 | 289 | println(color("\nCall StronglyConnectedComponents : iteration : " + iteration, RED)) 290 | val sccGraph = graph.stronglyConnectedComponents(5) 291 | 292 | val connectedGraph = sccGraph.vertices.map { 293 | case (member, leaderGroup) => s"$member is in the group of $leaderGroup's edge" 294 | } 295 | 296 | val totalGroups = sccGraph.vertices.map { 297 | case (member, leaderGroup) => leaderGroup 298 | } 299 | 300 | connectedGraph.collect().foreach(println) 301 | 302 | println("\nTotal groups: " + totalGroups.distinct().count() + "\n") 303 | } 304 | 305 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 306 | } -------------------------------------------------------------------------------- /scala/FinalProject/src/main/scala/utils/GraphUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | // To make some of the examples work we will also need RDD 4 | 5 | import org.apache.spark.graphx._ 6 | import org.apache.spark.rdd.RDD 7 | 8 | 9 | class GraphUtils extends serializable { 10 | 11 | val RED = "\033[1;30m" 12 | val ENDC = "\033[0m" 13 | private val defaultSeed = 0xadc83b19L 14 | 15 | /** 16 | * @constructor murmurHash64A 17 | * 18 | * @param 19 | * @param 20 | * @return Long 21 | * 22 | */ 23 | def murmurHash64A(data: Seq[Byte], seed: Long = defaultSeed): Long = { 24 | val m = 0xc6a4a7935bd1e995L 25 | val r = 47 26 | 27 | val f: Long => Long = m.* 28 | val g: Long => Long = x => x ^ (x >>> r) 29 | 30 | val h = data.grouped(8).foldLeft(seed ^ f(data.length)) { case (y, xs) => 31 | val k = xs.foldRight(0L)((b, x) => (x << 8) + (b & 0xff)) 32 | val j: Long => Long = if (xs.length == 8) f compose g compose f else identity 33 | f(y ^ j(k)) 34 | } 35 | (g compose f compose g)(h) 36 | } 37 | 38 | /** 39 | * @constructor getPageRank 40 | * 41 | * Run PageRank for a fixed number of iterations returning a graph with vertex attributes 42 | * containing the PageRank and edge attributes the normalized edge weight. 
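 *
 * Usage sketch (the tolerance value and variable names are illustrative):
 * {{{
 *   val ranks = graph.pageRank(0.0001).vertices
 *   val ranked = users.join(ranks).map { case (id, (name, rank)) => (name, rank) }
 *   ranked.collect().sortBy(-_._2).foreach(println)
 * }}}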
43 | * 44 | * @param Graph[String,String] $graph - Graph element 45 | * @param RDD[(VertexId, (String))] $users - Vertices 46 | * @return Unit 47 | * 48 | * @see [[org.apache.spark.graphx.lib.PageRank$#run]] 49 | */ 50 | def getPageRank(graph: Graph[String, String], users: RDD[(VertexId, (String))]): Unit = { 51 | 52 | println(color("\nCall getPageRank", RED)) 53 | 54 | val ranks = graph.pageRank(0.00001).vertices 55 | 56 | val ranksByUsername = users.join(ranks).map { 57 | case (id, (username, rank)) => (id, username, rank) 58 | } 59 | 60 | // Print the result descending 61 | println(ranksByUsername.collect().sortBy(_._3).reverse.mkString("\n")) 62 | } 63 | 64 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 65 | 66 | /** 67 | * @constructor inAndOutDegrees 68 | * 69 | * @param Graph[String,String] $graph - Graph element 70 | * @return Unit 71 | * 72 | */ 73 | def inAndOutDegrees(graph: Graph[String, String]): Unit = { 74 | 75 | println(color("\nCall inAndOutDegrees", RED)) 76 | 77 | // Create User class 78 | case class User(name: String, // Username 79 | inDeg: Int, // Received tweets 80 | outDeg: Int) // Sent tweets 81 | 82 | // Create user Graph 83 | // def mapVertices[VD2](map: (VertexID, VD) => VD2): Graph[VD2, ED] 84 | val initialUserGraph: Graph[User, String] = graph.mapVertices { 85 | case (id, (name)) => User(name, 0, 0) 86 | } 87 | 88 | //initialUserGraph.edges.collect.foreach(println(_)) 89 | 90 | 91 | // Fill in the degree informations (out and in degrees) 92 | val userGraph = initialUserGraph.outerJoinVertices(initialUserGraph.inDegrees) { 93 | case (id, u, inDegOpt) => User(u.name, inDegOpt.getOrElse(0), u.outDeg) 94 | }.outerJoinVertices(initialUserGraph.outDegrees) { 95 | case (id, u, outDegOpt) => User(u.name, u.inDeg, outDegOpt.getOrElse(0)) 96 | } 97 | 98 | // Display the userGraph 99 | userGraph.vertices.foreach { 100 | case (id, u) => println(s"User $id is called ${u.name} and received ${u.inDeg} tweets and send ${u.outDeg}.") 101 | } 102 | } 103 | } -------------------------------------------------------------------------------- /scala/FinalProject/src/main/scala/utils/MllibUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import org.apache.spark.mllib.clustering._ 4 | import org.apache.spark.mllib.linalg.{Vector, Vectors} 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.collection.mutable 8 | 9 | /** 10 | * Topic models automatically infer the topics discussed in a collection of documents. These topics can be used 11 | * to summarize and organize documents, or used for featurization and dimensionality reduction in later stages 12 | * of a Machine Learning (ML) pipeline. 13 | * 14 | * LDA is not given topics, so it must infer them from raw text. LDA defines a topic as a distribution over words. 15 | */ 16 | class MllibUtils { 17 | 18 | // Terminal Color 19 | val RED = "\033[1;30m" 20 | val ENDC = "\033[0m" 21 | 22 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 23 | 24 | def createdoc(tokenizedCorpus: RDD[String]): ((Seq[(Long, Vector)], Array[String], Map[String, Int], Array[String])) = { 25 | 26 | println(color("\nCall createdoc", RED)) 27 | 28 | // Choose the vocabulary. 
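    // Note: termCounts below is sorted by descending frequency, so takeRight(length - numStopwords)
    // keeps everything except the `numStopwords` most frequent terms; in other words, the most
    // common words are treated as stopwords and dropped from the vocabulary.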
29 | // termCounts: Sorted list of (term, termCount) pairs 30 | val termCounts: Array[(String, Long)] = 31 | tokenizedCorpus.map(_ -> 1L).reduceByKey(_ + _).collect().sortBy(-_._2) 32 | 33 | // vocabArray: Chosen vocab (removing common terms) 34 | val numStopwords = 20 35 | val vocabArray: Array[String] = 36 | termCounts.takeRight(termCounts.length - numStopwords).map(_._1) 37 | 38 | // vocab: Map term -> term index 39 | val vocab: Map[String, Int] = vocabArray.zipWithIndex.toMap 40 | 41 | val tokenCollected = tokenizedCorpus.collect() 42 | 43 | 44 | // MAP : [ Word ID , VECTOR [vocab.size, WordFrequency]] 45 | val documents: Map[Long, Vector] = vocab.map { case (tokens, id) => 46 | 47 | val counts = new mutable.HashMap[Int, Double]() 48 | 49 | // Word ID 50 | val idx = vocab(tokens) 51 | 52 | // Count word occurancy 53 | counts(idx) = counts.getOrElse(idx, 0.0) + tokenCollected.count(_ == tokens) 54 | 55 | // Return word ID and Vector 56 | (id.toLong, Vectors.sparse(vocab.size, counts.toSeq)) 57 | } 58 | 59 | (documents.toSeq, tokenizedCorpus.collect(), vocab, tokenizedCorpus.collect()) 60 | } 61 | 62 | 63 | def cosineSimilarity(tokenizedCorpus: RDD[String], vocab: Map[String, Int], tokenizedTweet: Array[String]): (Seq[(Long, Vector)]) = { 64 | 65 | println(color("\nCall cosineSimilarity", RED)) 66 | 67 | val document: Map[Long, Vector] = vocab.map { case (tokens, id) => 68 | 69 | val counts2 = new mutable.HashMap[Int, Double]() 70 | 71 | // Word ID 72 | val idx = vocab(tokens) 73 | 74 | // Count word occurancy 75 | counts2(idx) = counts2.getOrElse(idx, 0.0) + tokenizedTweet.count(_ == tokens).toDouble 76 | 77 | // Return word ID and Vector 78 | (id.toLong, Vectors.sparse(vocab.size, counts2.toSeq)) 79 | } 80 | 81 | document.toSeq 82 | } 83 | 84 | /** 85 | * @constructor findTopics 86 | * 87 | * Set currentTweet attribut and add the new tweet to the dictionnary 88 | * 89 | * @param LDAModel $ldaModel - LDA Model (LocalModel) 90 | * @param Array[String] $vocabArray - Contains all distinct words set to LDA 91 | * @param Int $numWordsByTopics - 92 | * @param Boolean $displayResult - Display result in console 93 | * 94 | * @return LDAModel 95 | */ 96 | def findTopics(ldaModel: LDAModel, vocabArray: Array[String], T: String, SG: Int, numWordsByTopics: Int, displayResult: Boolean): Seq[(String, String, String, String)] = { 97 | 98 | println(color("\nCall findTopics", RED)) 99 | 100 | println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize + " words):") 101 | 102 | val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = numWordsByTopics) 103 | 104 | var it = 0 105 | var seqC = List[(String, String, String, String)]() 106 | 107 | // Print topics, showing top-weighted x terms for each topic. 
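    // describeTopics returns one (termIndices, termWeights) pair per topic: the indices point
    // into vocabArray and the weights are the per-topic term probabilities.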
108 | topicIndices.foreach { case (terms, termWeights) => 109 | 110 | if (displayResult) 111 | println("TOPICS:") 112 | 113 | val tabTopics = terms.zip(termWeights).map(vector => vocabArray(vector._1.toInt).toString).mkString(";") 114 | 115 | if (displayResult) { 116 | terms.zip(termWeights).foreach { case (term, weight) => 117 | println(s"${vocabArray(term.toInt)}\t\t$weight") 118 | } 119 | } 120 | 121 | seqC = seqC :+(T, SG.toString, it.toString, tabTopics) 122 | 123 | println("T: " + T + " SG: " + SG + "TopicN: " + it + " c: " + tabTopics) 124 | it += 1 125 | 126 | if (displayResult) 127 | println() 128 | 129 | } 130 | seqC.toSeq 131 | } 132 | } -------------------------------------------------------------------------------- /scala/FinalProject/src/main/scala/utils/RDDUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | import scala.collection.mutable.ArrayBuffer 6 | 7 | // To make some of the examples work we will also need RDD 8 | 9 | import org.apache.spark.graphx._ 10 | import org.apache.spark.rdd.RDD 11 | 12 | 13 | class RDDUtils { 14 | 15 | val RED = "\033[1;30m" 16 | val ENDC = "\033[0m" 17 | 18 | /** 19 | * @constructor ArrayToVertices 20 | * 21 | * Convert ArrayBuffer to RDD containing Vertices 22 | * 23 | * @param SparkContext - $sc - SparkContext 24 | * @param ArrayBuffer[(Long, (String))] - $collection - Contains vertices 25 | * 26 | * @return RDD[Edge[String]] - RDD of vertices 27 | */ 28 | def ArrayToVertices(sc: SparkContext, collection: ArrayBuffer[(Long, (String))]): RDD[(VertexId, (String))] = { 29 | sc.parallelize(collection) 30 | } 31 | 32 | /** 33 | * @constructor ArrayToEdges 34 | * 35 | * Convert ArrayBuffer to RDD containing Edges 36 | * 37 | * @param SparkContext - $sc - SparkContext 38 | * @param ArrayBuffer[Edge[String]] - $collection - Contains edges 39 | * 40 | * @return RDD[Edge[String]] - RDD of edges 41 | */ 42 | def ArrayToEdges(sc: SparkContext, collection: ArrayBuffer[Edge[String]]): RDD[Edge[String]] = { 43 | sc.parallelize(collection) 44 | } 45 | 46 | /** 47 | * @constructor findUserByIDInGraph 48 | * 49 | * find user ID with username 50 | * 51 | * @param Graph[String,String] $graph - Graph element 52 | * @param Int $userID - User id 53 | * @return String - if success : username | failure : "user not found" 54 | */ 55 | def findUserNameByIDInGraph(graph: Graph[String, String], userID: Int): String = { 56 | println(color("\nCall : findUserNameWithID", RED)) 57 | 58 | graph.vertices.filter { case (id, name) => id.toString equals userID.toString }.collect().foreach { 59 | (e: (org.apache.spark.graphx.VertexId, String)) => return e._2 60 | } 61 | "user not found" 62 | } 63 | 64 | /** 65 | * @constructor findUserIDByNameInGraph 66 | * 67 | * find username with id 68 | * 69 | * @param Graph[String,String] $graph - Graph element 70 | * @param String $userName - Username 71 | * @return String - if success : id found | failure : "0" 72 | */ 73 | def findUserIDByNameInGraph(graph: Graph[String, String], userName: String): String = { 74 | println(color("\nCall : findUserIDWithName", RED)) 75 | 76 | graph.vertices.filter(_._2 == userName).collect().foreach { 77 | (e: (org.apache.spark.graphx.VertexId, String)) => return e._1.toString 78 | } 79 | "0" 80 | } 81 | 82 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 83 | 84 | /** 85 | * @constructor displayAllCommunications 86 | * 87 | * display all communications between 
users 88 | * 89 | * @param Graph[String,String] $graph - Graph element 90 | * @return Unit 91 | */ 92 | def displayAllCommunications(graph: Graph[String, String]): Unit = { 93 | 94 | println(color("\nCall : displayAllCommunications", RED)) 95 | println("Users communications: ") 96 | 97 | val facts: RDD[String] = graph.triplets.map(triplet => triplet.srcAttr + " communicate with " + 98 | triplet.dstAttr + " with tweet id " + triplet.attr) 99 | 100 | facts.collect().foreach(println(_)) 101 | } 102 | } -------------------------------------------------------------------------------- /scala/FindCommunities/build.sbt: -------------------------------------------------------------------------------- 1 | name := "FindCommunities" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.5" 6 | 7 | /*libraryDependencies ++= Seq( 8 | "org.apache.spark" %% "spark-core" % "1.3.0" % "provided", 9 | "org.apache.spark" %% "spark-graphx" % "1.3.0" % "provided", 10 | "org.apache.spark" %% "spark-mllib" % "1.3.0" % "provided") 11 | 12 | libraryDependencies += "org.apache.spark" % "spark-streaming-twitter_2.10" % "1.3.0" 13 | 14 | libraryDependencies += "com.datastax.spark" %% "spark-cassandra-connector" % "1.3.0-M1"*/ 15 | 16 | libraryDependencies ++= Seq( 17 | "org.apache.spark" %% "spark-core" % "1.4.0" % "provided", 18 | "org.apache.spark" %% "spark-graphx" % "1.4.0" % "provided", 19 | "org.apache.spark" %% "spark-mllib" % "1.4.0" % "provided") 20 | 21 | libraryDependencies += "org.apache.spark" % "spark-streaming-twitter_2.10" % "1.4.0" 22 | 23 | libraryDependencies += "com.datastax.spark" %% "spark-cassandra-connector" % "1.4.0-M1" 24 | 25 | //libraryDependencies += "com.google.code.gson" % "gson" % "2.3" 26 | 27 | //libraryDependencies += "com.github.fommil.netlib" % "all" % "1.1.2" pomOnly() 28 | 29 | // http://stackoverflow.com/questions/28459333/how-to-build-an-uber-jar-fat-jar-using-sbt-within-intellij-idea 30 | // META-INF discarding 31 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => 32 | { 33 | case PathList("META-INF", xs @ _*) => MergeStrategy.discard 34 | case x => MergeStrategy.first 35 | } 36 | } 37 | 38 | resolvers ++= Seq( 39 | // "JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/", 40 | // "Spray Repository" at "http://repo.spray.cc/", 41 | // "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", 42 | // "Akka Repository" at "http://repo.akka.io/releases/", 43 | // "Twitter4J Repository" at "http://twitter4j.org/maven2/", 44 | // "Apache HBase" at "https://repository.apache.org/content/repositories/releases", 45 | // "Twitter Maven Repo" at "http://maven.twttr.com/", 46 | // "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools", 47 | // "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/", 48 | // "Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/" 49 | // "Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven", 50 | // Resolver.sonatypeRepo("public") 51 | ) -------------------------------------------------------------------------------- /scala/FindCommunities/launch.sh: -------------------------------------------------------------------------------- 1 | date >> log.log 2 | 3 | spark-submit --class FindCommunities /home/mcaraccio/TB_2015/scala/FindCommunities/target/scala-2.10/FindCommunities-assembly-1.0.jar 4 | 5 | # | tee log.log -a 6 | 7 | -------------------------------------------------------------------------------- 
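// How the pieces above fit together: MllibUtils builds a bag-of-words corpus and FindCommunities
// (below) feeds it to MLlib's LDA. A condensed, self-contained sketch of that flow is given here;
// the corpus contents, topic count, iteration count and object name are illustrative only, not the
// project's tuned values.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.LDA
import org.apache.spark.mllib.linalg.Vectors

object LdaSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("LdaSketch").setMaster("local[2]"))

    // Toy corpus: one string per document
    val corpus = Seq("spark graphx community detection", "lda topic model spark")
    val tokenized = corpus.map(_.toLowerCase.split("\\s+").toSeq)

    // Vocabulary: term -> index, plus the reverse mapping for printing
    val vocab = tokenized.flatten.distinct.zipWithIndex.toMap
    val indexToTerm = vocab.map(_.swap)

    // One sparse term-frequency vector per document, keyed by a document id
    val docs = sc.parallelize(tokenized.zipWithIndex.map { case (tokens, id) =>
      val counts = tokens.groupBy(identity).map { case (t, ts) => (vocab(t), ts.size.toDouble) }
      (id.toLong, Vectors.sparse(vocab.size, counts.toSeq))
    })

    // Train LDA and print the top terms of each topic
    val ldaModel = new LDA().setK(2).setMaxIterations(20).run(docs)
    ldaModel.describeTopics(maxTermsPerTopic = 3).foreach { case (terms, weights) =>
      println(terms.map(indexToTerm).zip(weights).mkString(", "))
    }

    sc.stop()
  }
}

--------------------------------------------------------------------------------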
/scala/FindCommunities/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") -------------------------------------------------------------------------------- /scala/FindCommunities/src/main/scala/FindCommunities.scala: -------------------------------------------------------------------------------- 1 | // ///////////////////////////////////////////////////////////////////////////////////////////////////////////// 2 | // Author : Michael Caraccio 3 | // Project title : Détection et analyse de communauté Twitter 4 | // ///////////////////////////////////////////////////////////////////////////////////////////////////////////// 5 | 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.graphx._ 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.storage.StorageLevel 10 | import org.apache.spark.streaming._ 11 | import org.apache.spark.streaming.twitter.TwitterUtils 12 | import utils._ 13 | 14 | import scala.collection.mutable 15 | import scala.collection.mutable.{ArrayBuffer, ListBuffer} 16 | import scala.math._ 17 | import scala.reflect.ClassTag 18 | 19 | //Log4J 20 | import org.apache.log4j.{Level, Logger} 21 | 22 | // Cassandra 23 | import com.datastax.spark.connector._ 24 | 25 | // Regex 26 | import scala.util.matching.Regex 27 | 28 | // MLlib 29 | import org.apache.spark.mllib.clustering.{LDA, _} 30 | import org.apache.spark.mllib.linalg.{Vector, Vectors} 31 | 32 | 33 | object FindCommunities { 34 | 35 | // ///////////////////////////////////////////////////////////////////////////////////////////////////////////// 36 | // CONSTANT 37 | // ///////////////////////////////////////////////////////////////////////////////////////////////////////////// 38 | 39 | var MIN_VERTICES_PER_COMMUNITIES = 6 // Limit - Minimum vertices per communities 40 | var MIN_WORD_LENGTH = 3 // Minimum word length in tweet 41 | var NBKCORE = 6 // Number of core - K Core Decomposition algorithm 42 | var BATCH_SIZE = 900 // Batch size (in seconds) 43 | var CLEAN_GRAPH_MOD = 4 // Clean stockGraph every CLEAN_GRAPH_MOD 44 | var CLEAN_GRAPH_NBKCORE = 2 // When clean graph is called, k-core decomposition is called 45 | 46 | val defaultSeed = 0xadc83b19L // Seed for murmurhash - Do not change this value 47 | 48 | var dictionnary = new ArrayBuffer[String]() // Store tweets 49 | var ldaModel: LDAModel = null // LDA Model 50 | var lda: LDA = null // LDA object 51 | var stockGraph: Graph[String, String] = null // Store every edges and vertices received by Twitter 52 | var currentTweets: String = "" 53 | 54 | var counter = 1 // Perid 55 | 56 | val RED = "\033[1;30m" // Terminal color RED 57 | val ENDC = "\033[0m" // Terminal end character 58 | 59 | 60 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 61 | 62 | 63 | def main(args: Array[String]) { 64 | 65 | val ru = new RDDUtils // Manipulate RDD class 66 | val tc = new TwitterConfig // Login and password for Twitter 67 | 68 | // LDA parameters 69 | val topicSmoothing = 1.2 70 | val termSmoothing = 1.2 71 | val numTopics = 10 72 | val numIterations = 50 73 | val numWordsByTopics = 12 74 | 75 | // Display only error messages 76 | Logger.getLogger("org").setLevel(Level.ERROR) 77 | Logger.getLogger("akka").setLevel(Level.ERROR) 78 | Logger.getLogger("org.apache.spark").setLevel(Level.ERROR) 79 | Logger.getLogger("org.apache.spark.storage.BlockManager").setLevel(Level.ERROR) 80 | 81 | // Spark configuration 82 | val 
sparkConf = new SparkConf(true) 83 | .setAppName("FindCommunities") 84 | .setMaster("local[4]") 85 | .set("spark.akka.frameSize", "1000") 86 | .set("spark.streaming.receiver.maxRate", "0") // no limit on the rate 87 | .set("spark.task.maxFailures", "30000") 88 | .set("spark.akka.timeout", "180") 89 | .set("spark.network.timeout", "180") 90 | .set("spark.driver.cores", "4") 91 | .set("spark.driver.memory", "16g") 92 | .set("spark.executor.memory", "16g") 93 | .set("spark.shuffle.memoryFraction", "0.7") 94 | .set("spark.driver.maxResultSize", "0") // no limit 95 | .set("spark.cassandra.connection.host", "157.26.83.16") // Link to Cassandra 96 | .set("spark.cassandra.auth.username", "cassandra") 97 | .set("spark.cassandra.auth.password", "cassandra"); 98 | 99 | // Set the system properties so that Twitter4j library used by twitter stream 100 | // can use them to generate OAuth credentials 101 | System.setProperty("twitter4j.oauth.consumerKey", tc.getconsumerKey()) 102 | System.setProperty("twitter4j.oauth.consumerSecret", tc.getconsumerSecret()) 103 | System.setProperty("twitter4j.oauth.accessToken", tc.getaccessToken()) 104 | System.setProperty("twitter4j.oauth.accessTokenSecret", tc.getaccessTokenSecret()) 105 | System.setProperty("twitter4j.http.connectionTimeout", "200000") 106 | System.setProperty("twitter4j.http.retryCount", "30") 107 | System.setProperty("twitter4j.http.retryIntervalSecs", "2") 108 | 109 | 110 | 111 | println("\n\n**************************************************************") 112 | println("****************** FindCommunities ***************") 113 | println("**************************************************************\n") 114 | 115 | val words = Array(" @") // Filters tweet stream by words 116 | 117 | // Pattern used to find users and filter tweets 118 | val pattern = new Regex("\\@\\w{3,}") 119 | val patternURL = new Regex("(http|ftp|https)://[A-Za-z0-9-_]+.[A-Za-z0-9-_:%&?/.=]+") 120 | val patternSmiley = new Regex("((?::|;|=)(?:-)?(?:\\)|D|P|3|O))") 121 | val patternCommonWords = new Regex("\\b(that|have|with|this|from|they|would|there|their|what|about|which|when|make|like|time|just|know|take|into|year|your|good|some|could|them|other|than|then|look|only|come|over|think|also|back|after|work|first|well|even|want|because|these|give|most|http|https|fpt)\\b") 122 | 123 | // Streaming context -> batch size 124 | val ssc = new StreamingContext(sparkConf, Seconds(BATCH_SIZE)) 125 | 126 | val stream = TwitterUtils.createStream(ssc, None, words) 127 | 128 | // filter for english user only 129 | stream.filter(a => a.getUser.getLang.equals("en") || a.getUser.getLang.equals("en-GB")) 130 | 131 | // Group into larger batches 132 | val streamBatch = stream.window(Seconds(BATCH_SIZE), Seconds(BATCH_SIZE)) 133 | 134 | // Init SparkContext 135 | val sc = ssc.sparkContext 136 | 137 | /** 138 | * LDA CREATED FROM CASSANDRA 139 | * Date comes from old tweets 140 | */ 141 | println("\n*******************************************") 142 | println("Create corpus from Cassandra") 143 | println("*******************************************\n") 144 | 145 | // Get every tweets 146 | val rdd = sc.cassandraTable("twitter", "tweet_filtered").cache() 147 | 148 | rdd.select("tweet_text").as((i: String) => i).collect().foreach(x => { 149 | 150 | val preText = patternCommonWords.replaceAllIn(x.toLowerCase, "") 151 | 152 | val tweet = preText 153 | .toLowerCase.split("\\s") 154 | .filter(_.length > MIN_WORD_LENGTH) 155 | .filter(_.forall(java.lang.Character.isAlphabetic(_))) 156 | 157 | if 
(tweet.length > 0) { 158 | for (t <- tweet) { 159 | dictionnary += t 160 | } 161 | } 162 | }) 163 | 164 | 165 | // Create RDD 166 | val dictRDDInit = sc.parallelize(dictionnary).cache() 167 | 168 | // Init LDA 169 | lda = new LDA() 170 | .setK(numTopics) 171 | .setDocConcentration(topicSmoothing) 172 | .setTopicConcentration(termSmoothing) 173 | .setMaxIterations(numIterations) 174 | .setOptimizer("online") // works with Apache Spark 1.4 only 175 | 176 | // Create documents for LDA 177 | val (res1: RDD[(Long, Vector)], vocab: Map[String, Int]) = time { 178 | createdoc(dictRDDInit) 179 | } 180 | 181 | dictRDDInit.unpersist() 182 | 183 | if (!res1.isEmpty()) { 184 | // Start LDA 185 | println("LDA Started") 186 | time { 187 | ldaModel = lda.run(res1.persist(StorageLevel.MEMORY_AND_DISK_SER)) 188 | } 189 | println("LDA Finished\n") 190 | } 191 | res1.unpersist() 192 | 193 | 194 | 195 | // ///////////////////////////////////////////////////////////////////////////////////////////////////////////// 196 | // STREAM OBJECT 197 | // ///////////////////////////////////////////////////////////////////////////////////////////////////////////// 198 | 199 | 200 | // Stream about users 201 | val usersStream = streamBatch.map { status => ( 202 | status.getUser.getId.toString, 203 | abs(murmurHash64A(status.getUser.getScreenName.getBytes)), 204 | status.getUser.getName, 205 | status.getUser.getLang, 206 | status.getUser.getFollowersCount.toString, 207 | status.getUser.getFriendsCount.toString, 208 | status.getUser.getScreenName, 209 | status.getUser.getStatusesCount.toString) 210 | } 211 | 212 | // Stream about communication between two users 213 | val commStream = streamBatch.map { status => ( 214 | status.getId, 215 | status.getUser.getId.toString, 216 | status.getUser.getScreenName, 217 | if (pattern.findFirstIn(status.getText).isEmpty) { 218 | "" 219 | } 220 | else { 221 | pattern.findFirstIn(status.getText).getOrElse("@MichaelCaraccio").tail 222 | }, 223 | status.getText 224 | ) 225 | } 226 | 227 | // Stream about tweets 228 | val tweetsStream = streamBatch.map { status => ( 229 | status.getId.toString, 230 | status.getUser.getId.toString, 231 | abs(murmurHash64A(status.getUser.getScreenName.getBytes)), 232 | new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss").format(status.getCreatedAt), 233 | status.getRetweetCount.toString, 234 | status.getText 235 | ) 236 | } 237 | 238 | 239 | // ///////////////////////////////////////////////////////////////////////////////////////////////////////////// 240 | // STREAMING PART 241 | // Following code is called every batch interval 242 | // ///////////////////////////////////////////////////////////////////////////////////////////////////////////// 243 | 244 | println("*******************************************") 245 | println("Streaming started") 246 | println("*******************************************\n") 247 | 248 | // ************************************************************ 249 | // Save tweet's informations into Cassandra 250 | // ************************************************************ 251 | tweetsStream.foreachRDD(rdd => { 252 | 253 | rdd.persist(StorageLevel.MEMORY_AND_DISK) 254 | 255 | // For each tweets in RDD 256 | val seqtweetsStream = rdd.collect().map(a => (a._1, a._2, a._3.toString, a._4, a._5, patternSmiley.replaceAllIn(patternURL.replaceAllIn(a._6, ""), ""))).toList 257 | 258 | 259 | sc.parallelize(seqtweetsStream).saveToCassandra( 260 | "twitter", 261 | "tweet_filtered", 262 | SomeColumns("tweet_id", 263 | "user_twitter_id", 264 | 
"user_local_id", 265 | "tweet_create_at", 266 | "tweet_retweet", 267 | "tweet_text" 268 | )) 269 | 270 | // reset 271 | rdd.unpersist() 272 | }) 273 | 274 | // ************************************************************ 275 | // Save user's informations in Cassandra 276 | // ************************************************************ 277 | usersStream.persist(StorageLevel.MEMORY_AND_DISK).foreachRDD(rdd => { 278 | rdd.saveToCassandra("twitter", "user_filtered", SomeColumns("user_twitter_id", "user_local_id", "user_name", "user_lang", "user_follow_count", "user_friends_count", "user_screen_name", "user_status_count")) 279 | }) 280 | 281 | // ************************************************************ 282 | // Save communication's informations in Cassandra 283 | // ************************************************************ 284 | commStream.persist(StorageLevel.MEMORY_AND_DISK).foreachRDD(rdd => { 285 | 286 | // Timer 287 | val t00 = System.nanoTime() 288 | 289 | // Collection of vertices (contains users) 290 | var collectionVertices = new ArrayBuffer[(Long, String)]() 291 | 292 | // Collection of edges (contains communications between users) 293 | var collectionEdge = new ArrayBuffer[Edge[String]]() 294 | 295 | val seqcommStream = new ListBuffer[(String, String, String, String)]() 296 | 297 | rdd.persist(StorageLevel.MEMORY_AND_DISK) 298 | 299 | /** 300 | * Enregistrement des messages dans cassandra 301 | */ 302 | 303 | val textBuffer = rdd.collect().map { g => g._1 -> g._5 }.toMap 304 | 305 | // For each tweets in RDD 306 | for (item <- rdd.collect()) { 307 | 308 | // Avoid single @ in message, english only 309 | if (item._4.nonEmpty) { 310 | 311 | // Sender ID 312 | val sendID: Long = abs(murmurHash64A(item._3.getBytes)) 313 | 314 | // Sender 315 | collectionVertices += ((sendID, item._3)) 316 | 317 | // For each dest in tweet 318 | pattern.findAllIn(item._5).foreach { destName => { 319 | 320 | val user_dest_name = destName.drop(1) 321 | 322 | // Generate Hash 323 | val destID: Long = abs(murmurHash64A(user_dest_name.getBytes)) 324 | 325 | if (sendID != destID) { 326 | // Create each users and edges 327 | collectionVertices += ((destID, user_dest_name)) 328 | collectionEdge += Edge(sendID, destID, item._1.toString) 329 | 330 | seqcommStream.append((item._1.toString, item._2, sendID.toString, destID.toString)) 331 | } 332 | } 333 | } 334 | } 335 | } 336 | 337 | 338 | sc.parallelize(seqcommStream).saveToCassandra( 339 | "twitter", 340 | "users_communicate", 341 | SomeColumns( 342 | "tweet_id", 343 | "user_send_twitter_id", 344 | "user_send_local_id", 345 | "user_dest_id")) 346 | 347 | // reset 348 | seqcommStream.clear() 349 | 350 | 351 | /** 352 | * Initialisation du graph 353 | */ 354 | 355 | // Empty graph at first launch 356 | if (stockGraph == null) { 357 | 358 | // Convert vertices to RDD 359 | val VerticesRDD = ru ArrayToVertices(sc, collectionVertices) 360 | 361 | // Convert it to RDD 362 | val EdgeRDD = ru ArrayToEdges(sc, collectionEdge) 363 | 364 | stockGraph = Graph(VerticesRDD, EdgeRDD) 365 | stockGraph.unpersist() 366 | stockGraph.persist(StorageLevel.MEMORY_AND_DISK) 367 | } 368 | 369 | /** 370 | * Ajout des nouveaux Edges et Vertices dans le graph principal 371 | */ 372 | 373 | time { 374 | stockGraph = Graph(stockGraph.vertices.union(sc.parallelize(collectionVertices)), stockGraph.edges.union(sc.parallelize(collectionEdge))) 375 | } 376 | 377 | collectionVertices = new ArrayBuffer[(Long, String)]() 378 | collectionEdge = new ArrayBuffer[Edge[String]]() 379 | 380 | 381 | 
/** 382 | * Split main Graph in multiples communities 383 | */ 384 | 385 | if (counter % CLEAN_GRAPH_MOD == 0) { 386 | println("################################################") 387 | println("Clean stockgraph") 388 | println("Before cleaning (edges): " + stockGraph.edges.count()) 389 | 390 | stockGraph = time { 391 | splitCommunity(stockGraph, stockGraph.vertices, CLEAN_GRAPH_NBKCORE, displayResult = false) 392 | } 393 | println("After cleaning (edges): " + stockGraph.edges.count()) 394 | println("################################################") 395 | } 396 | 397 | val communityGraph = time { 398 | splitCommunity(stockGraph, stockGraph.vertices, NBKCORE, displayResult = false) 399 | } 400 | 401 | communityGraph.cache() 402 | 403 | var (subgraphs, commIDs) = time { 404 | subgraphCommunities(communityGraph, stockGraph.vertices, displayResult = false) 405 | } 406 | 407 | communityGraph.unpersist() 408 | 409 | /** 410 | * LDA 411 | */ 412 | 413 | // We only care about subgraph bigger than MIN_VERTICES_PER_COMMUNITIES 414 | subgraphs = time { 415 | subgraphs.filter(_.vertices.count() >= MIN_VERTICES_PER_COMMUNITIES) 416 | } 417 | 418 | 419 | currentTweets = "" 420 | for (i <- subgraphs.indices) { 421 | 422 | // Messages will be stored in an array 423 | val result = subgraphs(i).edges.collect().map(message => textBuffer.getOrElse(message.attr.toLong, "").replaceAll("[!?.,:;<>)(]", " ")) 424 | 425 | result.foreach(x => { 426 | 427 | val preText = patternCommonWords.replaceAllIn(x.toLowerCase, "") 428 | 429 | val tweet = preText 430 | .toLowerCase.split("\\s") 431 | .filter(_.length > MIN_WORD_LENGTH) 432 | .filter(_.forall(java.lang.Character.isAlphabetic(_))) 433 | 434 | if (tweet.nonEmpty) { 435 | for (t <- tweet) { 436 | dictionnary += t 437 | } 438 | } 439 | }) 440 | } 441 | 442 | 443 | // Create document 444 | println("Create document") 445 | val dictRDD = sc.parallelize(dictionnary).persist(StorageLevel.MEMORY_AND_DISK) 446 | 447 | val (res1: RDD[(Long, Vector)], vocab: Map[String, Int]) = time { 448 | createdoc(dictRDD) 449 | } 450 | 451 | 452 | // Start LDA 453 | println("LDA Started") 454 | ldaModel = lda.run(res1.persist(StorageLevel.MEMORY_AND_DISK_SER)) 455 | 456 | res1.unpersist() 457 | var seqC: Seq[(String, String, String, String)] = time { 458 | findTopics(ldaModel, dictionnary.toArray, counter.toString, 0, numWordsByTopics, displayResult = true) 459 | } 460 | 461 | seqC = seqC.map(a => (counter.toString, a._2, a._3, a._4)) 462 | 463 | 464 | //Save to cassandra 465 | sc.parallelize(seqC).saveToCassandra( 466 | "twitter", 467 | "lda", 468 | SomeColumns("t", 469 | "sg", 470 | "n_topic", 471 | "words" 472 | )) 473 | 474 | println("LDA Finished") 475 | 476 | 477 | var cpt = 0 478 | 479 | for (i <- subgraphs.indices) { 480 | 481 | println("\n\n:::::::::::::::::::::::::::::::::::") 482 | println("::::: Community N°" + i + " T: " + counter + " SG: " + cpt) 483 | println(":::::::::::::::::::::::::::::::::::") 484 | 485 | // Timer 486 | val t0 = System.nanoTime() 487 | 488 | // Current subgraph 489 | val sub = subgraphs(i).cache() 490 | 491 | val verticesCount = sub.vertices.count() 492 | 493 | println("Number of users in community : " + verticesCount) 494 | 495 | // Messages will be stored in an array 496 | val result = sub.edges.collect().map(message => textBuffer.getOrElse(message.attr.toLong, "").replaceAll("[!?.,:;<>)(]", " ")) 497 | 498 | /** 499 | * If there's a new tweet in a community -> LDA 500 | */ 501 | 502 | 503 | if (result.nonEmpty) { 504 | 505 | println("Words in 
current tweet: " + result.length) 506 | 507 | currentTweets = "" 508 | result.foreach(x => { 509 | 510 | val preText = patternCommonWords.replaceAllIn(x.toLowerCase, "") 511 | 512 | val tweet = preText 513 | .toLowerCase.split("\\s") 514 | .filter(_.length > MIN_WORD_LENGTH) 515 | .filter(_.forall(java.lang.Character.isAlphabetic(_))) 516 | 517 | currentTweets = currentTweets.concat(tweet.mkString(" ")) 518 | 519 | }) 520 | 521 | 522 | println("Call cosineSimilarity") 523 | val tabcosine: ArrayBuffer[Double] = cosineSimilarity(vocab, dictionnary.toArray.distinct, currentTweets.split(" ")) 524 | println("outside cosineSimilarity") 525 | 526 | // Pour chaques edges . On crée un Seq qui contient le futur record pour cassandra 527 | var seqcommunities = sub.edges.map(message => (counter.toString, verticesCount.toString, cpt.toString, commIDs(cpt).toString, message.srcId.toString, message.dstId.toString, message.attr, tabcosine.mkString(";"))).collect() 528 | 529 | // Petit problème avec le counter qui ne se met pas a jour dans la method au dessus 530 | seqcommunities = seqcommunities.map(a => (counter.toString, a._2, a._3, a._4, a._5, a._6, a._7, a._8)) 531 | 532 | // Save to cassandra 533 | sc.parallelize(seqcommunities.toSeq).saveToCassandra( 534 | "twitter", 535 | "communities", 536 | SomeColumns("t", 537 | "nbv", 538 | "sg", 539 | "com_id", 540 | "src_id", 541 | "dst_id", 542 | "attr", 543 | "lda" 544 | )) 545 | } else { 546 | println("LDA wont process current document because it does not contains any words") 547 | } 548 | 549 | cpt += 1 550 | 551 | 552 | val t1 = System.nanoTime() 553 | println("SubGraph N°: " + cpt + " processed in " + (t1 - t0) / 1000000000.0 + " seconds") 554 | } 555 | 556 | counter += 1 557 | 558 | val t11 = System.nanoTime() 559 | println("------------------------------------------------------------") 560 | println("BATCH FINISHED") 561 | println("Processed in " + (t11 - t00) / 1000000000.0 + " seconds") 562 | println("------------------------------------------------------------") 563 | }) 564 | 565 | ssc.start() 566 | ssc.awaitTermination() 567 | } 568 | 569 | /** 570 | * @constructor murmurHash64A 571 | * 572 | * Murmur is a family of good general purpose hashing functions, suitable for non-cryptographic usage. As stated by Austin Appleby, MurmurHash provides the following benefits: 573 | * - good distribution (passing chi-squared tests for practically all keysets & bucket sizes. 574 | * - good avalanche behavior (max bias of 0.5%). 575 | * - good collision resistance (passes Bob Jenkin's frog.c torture-test. No collisions possible for 4-byte keys, no small (1- to 7-bit) differentials). 576 | * - great performance on Intel/AMD hardware, good tradeoff between hash quality and CPU consumption. 
577 | * 578 | * Source : http://stackoverflow.com/questions/11899616/murmurhash-what-is-it 579 | * 580 | * @param Seq[Byte] - $data 581 | * @param Long - $seed 582 | * @return Long - Return hash 583 | * 584 | */ 585 | def murmurHash64A(data: Seq[Byte], seed: Long = defaultSeed): Long = { 586 | val m = 0xc6a4a7935bd1e995L 587 | val r = 47 588 | 589 | val f: Long => Long = m.* 590 | val g: Long => Long = x => x ^ (x >>> r) 591 | 592 | val h = data.grouped(8).foldLeft(seed ^ f(data.length)) { case (y, xs) => 593 | val k = xs.foldRight(0L)((b, x) => (x << 8) + (b & 0xff)) 594 | val j: Long => Long = if (xs.length == 8) f compose g compose f else identity 595 | f(y ^ j(k)) 596 | } 597 | (g compose f compose g)(h) 598 | } 599 | 600 | /** 601 | * @constructor time 602 | * 603 | * timer for profiling block 604 | * 605 | * @param R $block - Block executed 606 | * @return Unit 607 | */ 608 | def time[R](block: => R): R = { 609 | val t0 = System.nanoTime() 610 | val result = block // call-by-name 611 | val t1 = System.nanoTime() 612 | println("Elapsed time: " + (t1 - t0) / 1000000000.0 + " seconds") 613 | result 614 | } 615 | 616 | /** 617 | * This method takes 2 equal length arrays of integers 618 | * It returns a double representing similarity of the 2 arrays 619 | * 0.9925 would be 99.25% similar 620 | * (x dot y) / ||X|| ||Y|| 621 | * 622 | * @param x 623 | * @param y 624 | * @return cosine similarity 625 | */ 626 | def cosineSimilarity(x: ArrayBuffer[Double], y: ArrayBuffer[Double]): Double = { 627 | require(x.length == y.length) 628 | 629 | if (magnitude(x) == 0.0 || magnitude(y) == 0.0) 630 | return 0.0 631 | 632 | dotProduct(x, y) / (magnitude(x) * magnitude(y)) 633 | } 634 | 635 | /** 636 | * Return the dot product of the 2 arrays 637 | * e.g. (a[0]*b[0])+(a[1]*a[2]) 638 | * 639 | * @param x 640 | * @param y 641 | * @return 642 | */ 643 | def dotProduct(x: ArrayBuffer[Double], y: ArrayBuffer[Double]): Double = { 644 | (for ((a, b) <- x zip y) yield a * b) sum 645 | } 646 | 647 | /** 648 | * We multiply each element, sum it, then square root the result. 649 | * 650 | * @param x 651 | * @return the magnitude of an array 652 | */ 653 | def magnitude(x: ArrayBuffer[Double]): Double = { 654 | math.sqrt(x map (i => i * i) sum) 655 | } 656 | 657 | 658 | def splitCommunity(graph: Graph[String, String], users: RDD[(VertexId, (String))], NBKCORE: Int, displayResult: Boolean): Graph[String, String] = { 659 | 660 | println(color("\nCall SplitCommunity", RED)) 661 | 662 | getKCoreGraph(graph, users, NBKCORE, displayResult).cache() 663 | } 664 | 665 | /** 666 | * Compute the k-core decomposition of the graph for all k <= kmax. This 667 | * uses the iterative pruning algorithm discussed by Alvarez-Hamelin et al. 668 | * in K-Core Decomposition: a Tool For the Visualization of Large Scale Networks 669 | * (see http://arxiv.org/abs/cs/0504107). 670 | * 671 | * @tparam VD the vertex attribute type (discarded in the computation) 672 | * @tparam ED the edge attribute type (preserved in the computation) 673 | * 674 | * @param graph the graph for which to compute the connected components 675 | * @param kmax the maximum value of k to decompose the graph 676 | * 677 | * @return a graph where the vertex attribute is the minimum of 678 | * kmax or the highest value k for which that vertex was a member of 679 | * the k-core. 680 | * 681 | * @note This method has the advantage of returning not just a single kcore of the 682 | * graph but will yield all the cores for k > kmin. 
683 | */ 684 | def getKCoreGraph[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], 685 | users: RDD[(VertexId, (String))], 686 | kmin: Int, 687 | displayResult: Boolean): Graph[String, ED] = { 688 | 689 | // Graph[(Int, Boolean), ED] - boolean indicates whether it is active or not 690 | var g = graph.cache().outerJoinVertices(graph.degrees)((vid, oldData, newData) => newData.getOrElse(0)).cache() 691 | 692 | println(color("\nCall KCoreDecomposition", RED)) 693 | 694 | g = computeCurrentKCore(g, kmin).cache() 695 | 696 | val v = g.vertices.filter { case (vid, vd) => vd >= kmin }.cache() 697 | 698 | // Create new RDD users 699 | val newUser = users.join(v).map { 700 | case (id, (username, rank)) => (id, username) 701 | } 702 | 703 | // Create a new graph 704 | val gra = Graph(newUser, g.edges) 705 | 706 | // Remove missing vertices as well as the edges to connected to them 707 | gra.subgraph(vpred = (id, username) => username != null).cache() 708 | } 709 | 710 | def computeCurrentKCore[ED: ClassTag](graph: Graph[Int, ED], k: Int) = { 711 | println("Computing kcore for k=" + k) 712 | def sendMsg(et: EdgeTriplet[Int, ED]): Iterator[(VertexId, Int)] = { 713 | if (et.srcAttr < 0 || et.dstAttr < 0) { 714 | // if either vertex has already been turned off we do nothing 715 | Iterator.empty 716 | } else if (et.srcAttr < k && et.dstAttr < k) { 717 | // tell both vertices to turn off but don't need change count value 718 | Iterator((et.srcId, -1), (et.dstId, -1)) 719 | 720 | } else if (et.srcAttr < k) { 721 | // if src is being pruned, tell dst to subtract from vertex count 722 | Iterator((et.srcId, -1), (et.dstId, 1)) 723 | 724 | } else if (et.dstAttr < k) { 725 | // if dst is being pruned, tell src to subtract from vertex count 726 | Iterator((et.dstId, -1), (et.srcId, 1)) 727 | 728 | } else { 729 | Iterator.empty 730 | } 731 | } 732 | 733 | // subtracts removed neighbors from neighbor count and tells vertex whether it was turned off or not 734 | def mergeMsg(m1: Int, m2: Int): Int = { 735 | if (m1 < 0 || m2 < 0) { 736 | -1 737 | } else { 738 | m1 + m2 739 | } 740 | } 741 | 742 | def vProg(vid: VertexId, data: Int, update: Int): Int = { 743 | if (update < 0) { 744 | // if the vertex has turned off, keep it turned off 745 | -1 746 | } else { 747 | // subtract the number of neighbors that have turned off this round from 748 | // the count of active vertices 749 | // TODO(crankshaw) can we ever have the case data < update? 750 | max(data - update, 0) 751 | } 752 | } 753 | 754 | // Note that initial message should have no effect 755 | Pregel(graph, 0)(vProg, sendMsg, mergeMsg) 756 | } 757 | 758 | 759 | /** 760 | * SubGraphCommunities is used to find communities in a graph 761 | * 762 | * Steps : 763 | * 1. Connected Compoenents 764 | * 2. Collect subgraphs id's 765 | * 3. Add subgraph to array 766 | * 4. 
Return array of communities 767 | * 768 | * @param graph the graph for which to compute the connected components 769 | * @param users RDD containing users - used to associate edges and vertices 770 | * @param boolean displayResult 771 | * 772 | * @return an Array of graph (which contains subgraph) and communities ids 773 | */ 774 | def subgraphCommunities(graph: Graph[String, String], users: RDD[(VertexId, (String))], displayResult: Boolean): (Array[Graph[String, String]], Array[Long]) = { 775 | 776 | println(color("\nCall subgraphCommunities", RED)) 777 | 778 | // Find the connected components 779 | val cc = time { 780 | graph.connectedComponents().vertices.cache() 781 | } 782 | 783 | // Join the connected components with the usernames and id 784 | // The result is an RDD not a Graph 785 | val ccByUsername = users.join(cc).map { 786 | case (id, (username, cci)) => (id, username, cci) 787 | }.cache() 788 | 789 | val lowerIDPerCommunity = ccByUsername.map { case (id, username, cci) => cci }.distinct().cache() 790 | 791 | // Result will be stored in an array 792 | println("--------------------------") 793 | println("Total community found: " + lowerIDPerCommunity.count()) 794 | println("--------------------------") 795 | 796 | 797 | val collectIDsCommunity = lowerIDPerCommunity.collect() 798 | 799 | val result = collectIDsCommunity.map(colID => Graph(ccByUsername.filter { 800 | _._3 == colID 801 | }.map { case (id, username, cc) => (id, username) }, graph.edges).subgraph(vpred = (id, username) => username != null).cache()) 802 | 803 | // Display communities 804 | if (displayResult) { 805 | println("\nCommunities found " + result.length) 806 | for (community <- result) { 807 | println("-----------------------") 808 | community.edges.collect().foreach(println(_)) 809 | community.vertices.collect().foreach(println(_)) 810 | } 811 | } 812 | 813 | cc.unpersist() 814 | lowerIDPerCommunity.unpersist() 815 | 816 | (result, collectIDsCommunity) 817 | } 818 | 819 | /** 820 | * CreateDoc generate document for LDA 821 | * 822 | * Steps : 823 | * 1. Get tweets 824 | * 2. Split into sequences 825 | * 3. Counts terms occurency 826 | * 4. Create vocab array with unique words 827 | * 5. Create documents (RDD) containing vector and word id 828 | * 829 | * @param RDD tweets 830 | * 831 | * @return documents (RDD) ready to use 832 | * array of tweets 833 | */ 834 | def createdoc(tokenizedCorpus: RDD[String]): ((RDD[(Long, Vector)], Map[String, Int])) = { 835 | 836 | println(color("\nCall createdoc", RED)) 837 | 838 | // Split each document into a sequence of terms (words) 839 | val tokenized: RDD[Seq[String]] = 840 | tokenizedCorpus.map(_.toLowerCase.split("\\s")).map(_.filter(_.length > 3)) 841 | 842 | // Choose the vocabulary. 
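    // (Descriptive note: every distinct term is kept and indexed here; frequent English
    //  words are assumed to have been stripped upstream by patternCommonWords, so no
    //  additional stop-word trimming is applied at this stage.)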
843 | // termCounts: Sorted list of (term, termCount) pairs 844 | val termCounts: RDD[(String, Long)] = 845 | tokenized.flatMap(_.map(_ -> 1L)).reduceByKey(_ + _).sortBy(_._2) //.collect().sortBy(-_._2) 846 | 847 | // vocabArray: Chosen vocab (removing common terms) 848 | val vocabArray: Array[String] = termCounts.map(a => a._1).collect() 849 | 850 | // vocab: Map term -> term index 851 | val vocab: Map[String, Int] = vocabArray.zipWithIndex.toMap 852 | 853 | // Convert documents into term count vectors 854 | val documents: RDD[(Long, Vector)] = 855 | tokenized.zipWithIndex.map { case (tokens, id) => 856 | val counts = new mutable.HashMap[Int, Double]() 857 | tokens.foreach { term => 858 | if (vocab.contains(term)) { 859 | val idx = vocab(term) 860 | counts(idx) = counts.getOrElse(idx, 0.0) + 1.0 861 | } 862 | } 863 | (id, Vectors.sparse(vocab.size, counts.toSeq)) 864 | } 865 | 866 | (documents, vocab) 867 | } 868 | 869 | 870 | def cosineSimilarity(vocab: Map[String, Int], vocabArray: Array[String], tokenizedTweet: Array[String]): ArrayBuffer[Double] = { 871 | 872 | println(color("\nCall cosineSimilarity", RED)) 873 | 874 | var tab1 = new ArrayBuffer[Double]() 875 | var tab2 = new ArrayBuffer[Double]() 876 | var tabcosine = new ArrayBuffer[Double]() 877 | 878 | ldaModel.describeTopics().foreach { case (terms, termWeights) => 879 | terms.zip(termWeights).foreach { case (term, weight) => 880 | 881 | tab1 += tokenizedTweet.count(_ == vocabArray(term.toInt)) 882 | tab2 += weight.toDouble 883 | } 884 | 885 | // Store every cosine similarity 886 | tabcosine += cosineSimilarity(tab1, tab2) 887 | } 888 | tabcosine 889 | } 890 | 891 | /** 892 | * @constructor findTopics 893 | * 894 | * Set currentTweet attribut and add the new tweet to the dictionnary 895 | * 896 | * @param LDAModel $ldaModel - LDA Model (LocalModel) 897 | * @param Array[String] $vocabArray - Contains all distinct words set to LDA 898 | * @param Int $numWordsByTopics - 899 | * @param Boolean $displayResult - Display result in console 900 | * 901 | * @return Seq 902 | */ 903 | def findTopics(ldaModel: LDAModel, vocabArray: Array[String], T: String, SG: Int, numWordsByTopics: Int, displayResult: Boolean): Seq[(String, String, String, String)] = { 904 | 905 | println(color("\nCall findTopics", RED)) 906 | 907 | println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize + " words):") 908 | 909 | val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = numWordsByTopics) 910 | 911 | var it = 0 912 | var seqC = List[(String, String, String, String)]() 913 | 914 | // Print topics, showing top-weighted x terms for each topic. 
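    // describeTopics returns one (termIndices, termWeights) pair per topic: the indices
    // point back into vocabArray and the weights are the terms' probabilities in that
    // topic's word distribution. Example with hypothetical values: terms = Array(12, 4, 87),
    // termWeights = Array(0.041, 0.036, 0.029); vocabArray(12) recovers the word itself.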
915 | topicIndices.foreach { case (terms, termWeights) => 916 | 917 | if (displayResult) 918 | println("TOPICS:") 919 | 920 | val tabTopics = terms.zip(termWeights).map(vector => vocabArray(vector._1.toInt).toString).mkString(";") 921 | 922 | if (displayResult) { 923 | terms.zip(termWeights).foreach { case (term, weight) => 924 | println(s"${vocabArray(term.toInt)}\t\t$weight") 925 | } 926 | } 927 | 928 | seqC = seqC :+(T, SG.toString, it.toString, tabTopics) 929 | 930 | println("T: " + T + " SG: " + SG + "TopicN: " + it + " c: " + tabTopics) 931 | it += 1 932 | 933 | if (displayResult) 934 | println() 935 | 936 | } 937 | seqC.toSeq 938 | } 939 | } -------------------------------------------------------------------------------- /scala/FindCommunities/src/main/scala/utils/CassandraUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import scala.collection.mutable.ArrayBuffer 4 | 5 | // Enable Cassandra-specific functions on the StreamingContext, DStream and RDD: 6 | 7 | import com.datastax.spark.connector._ 8 | 9 | // To make some of the examples work we will also need RDD 10 | 11 | import org.apache.spark.SparkContext 12 | import org.apache.spark.graphx._ 13 | import org.apache.spark.rdd.RDD 14 | import org.apache.spark.sql.cassandra.CassandraSQLContext 15 | 16 | //@SerialVersionUID(100L) 17 | class CassandraUtils /*extends Serializable*/ { 18 | 19 | val RED = "\033[1;30m" 20 | val ENDC = "\033[0m" 21 | 22 | /** 23 | * @constructor getTweetContentFromID 24 | * 25 | * Return tweet content 26 | * 27 | * @param SparkContext sc - SparkContext 28 | * @param String $id - tweet id 29 | * @return Unit 30 | */ 31 | def getTweetContentFromID(sc: SparkContext, id: String): String = { 32 | 33 | println(color("\nCall getTweetContentFromID", RED)) 34 | 35 | val query = sc.cassandraTable("twitter", "tweet_filtered").select("tweet_text").where("tweet_id = ?", id) 36 | 37 | if (query.collect().length != 0) { 38 | query.first().getString("tweet_text") 39 | } 40 | else 41 | "Tweet not found" 42 | } 43 | 44 | /** 45 | * @constructor getTweetsIDFromUser 46 | * 47 | * Return tweet id 48 | * 49 | * @param SparkContext sc - SparkContext 50 | * @param String $id - user (sender) id 51 | * @return Unit 52 | */ 53 | def getTweetsIDFromUser(sc: SparkContext, id: String): ArrayBuffer[String] = { 54 | 55 | println(color("\nCall getTweetsIDFromUser", RED)) 56 | println("Tweets found:") 57 | 58 | val query = sc.cassandraTable("twitter", "users_communicate").select("tweet_id").where("user_send_local_id = ?", id) 59 | 60 | // Result will be stored in an array 61 | var result = ArrayBuffer[String]() 62 | 63 | if (query.collect().length != 0) { 64 | result += query.first().getString("tweet_id") 65 | } 66 | 67 | // Display result 68 | result.foreach(println(_)) 69 | 70 | // Return 71 | result 72 | } 73 | 74 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 75 | 76 | /** 77 | * @constructor getTweetsContentFromEdge 78 | * 79 | * Return an array of tweets content for a given Graph 80 | * 81 | * @param SparkContext sc - SparkContext 82 | * @param RDD[Edge[String]] $edge - graph's edge 83 | * @return Unit 84 | */ 85 | def getTweetsContentFromEdge(sc: SparkContext, edge: RDD[Edge[String]], displayResult: Boolean): RDD[String] = { 86 | 87 | println(color("\nCall getTweetsContentFromEdge", RED)) 88 | 89 | // Get the tweets ID for every communication 90 | val tweetsID = edge.flatMap({ 91 | case Edge(idSend, idExp, idTweet) => Seq(idTweet) 
92 | }) 93 | 94 | // Result will be stored in an array 95 | var result = ArrayBuffer[String]() 96 | 97 | // Queries 98 | for (tweet <- tweetsID.collect()) { 99 | val query = sc.cassandraTable("twitter", "tweet_filtered").select("tweet_text").where("tweet_id = ?", tweet) 100 | 101 | if (query.collect().length != 0) { 102 | result += query.first().getString("tweet_text") 103 | } 104 | } 105 | 106 | // Display results 107 | if (displayResult) { 108 | result.foreach(println(_)) 109 | } 110 | 111 | 112 | // return 113 | sc.parallelize(result) 114 | } 115 | 116 | /*def getAllTweetsText(sc: SparkContext): ArrayBuffer[String] = { 117 | val rdd = sc.cassandraTable("twitter", "tweet_filtered2").select("tweet_text").cache() 118 | 119 | var dictionnary = new ArrayBuffer[String] 120 | 121 | println("Tweets by tweets -> Create documents and vocabulary") 122 | rdd.select("tweet_text").as((i: String) => i).foreach(x => { 123 | 124 | val tweet = x 125 | .toLowerCase.split("\\s") 126 | .filter(_.length > 3) 127 | .filter(_.forall(java.lang.Character.isLetter)).mkString(" ") 128 | 129 | if (tweet.length > 1) 130 | dictionnary += tweet 131 | }) 132 | }*/ 133 | 134 | // (RDD[(VertexId, (String))], RDD[Edge[String]]) 135 | def getAllCommunicationsToGraph(sc: SparkContext): Graph[String, String] = { 136 | println(color("\nCall getAllCommunications", RED)) 137 | 138 | 139 | /* val users: RDD[(VertexId, (String))] = 140 | sc.parallelize(List( 141 | (2732329846L, "Michael"), 142 | (132988448L, "David"), 143 | (473822999L, "Sarah"), 144 | (2932436311L, "Jean"), 145 | (2249679902L, "Raphael"), 146 | (601389784L, "Lucie"), 147 | (2941487254L, "Harold"), 148 | (1192483885L, "Pierre"), 149 | (465776805L, "Christophe"), 150 | (838147628L, "Zoe"), 151 | (2564641105L, "Fabien"), 152 | (1518391292L, "Nicolas") 153 | ))*/ 154 | 155 | 156 | // Collection of vertices (contains users) 157 | // val collectionVertices = ListBuffer[(Long, String)]() 158 | 159 | 160 | // val users: RDD[(VertexId, (String))] = sc.parallelize(collectionVertices) 161 | 162 | 163 | //val con = sc.cassandraTable("twitter", "user_filtered") 164 | //con.toArray.foreach(println) 165 | /*println("Test -1") 166 | 167 | var t0 = System.nanoTime() 168 | for (row <- query) { 169 | 170 | } 171 | 172 | var t1 = System.nanoTime() 173 | println("Elapsed time: " + (t1 - t0) + "ns")*/ 174 | 175 | // val query = sc.cassandraTable("twitter", "user_filtered").select("user_local_id", "user_screen_name") 176 | 177 | 178 | /*val con = query.map{ 179 | case result => (result._1, result._2) 180 | }*/ 181 | val cc = new CassandraSQLContext(sc) 182 | 183 | println("Test 0") 184 | var t0 = System.nanoTime() 185 | val rdd0 = cc.sql("SELECT user_local_id, user_screen_name from twitter.user_filtered") 186 | 187 | val pelo = rdd0.map(p => (p(0).toString.toLong, p(1).toString)).cache() 188 | 189 | val rdd1 = cc.sql("SELECT tweet_id, user_send_local_id, user_dest_id from twitter.users_communicate") 190 | 191 | val pelo2 = rdd1.map(p => Edge(p(1).toString.toLong, p(2).toString.toLong, p(0).toString)).cache() 192 | 193 | Graph(pelo, pelo2) 194 | 195 | /*println("okkk") 196 | 197 | graphh.vertices.foreach(println(_)) 198 | 199 | 200 | //pelo.foreach(println(_)) 201 | 202 | println("After collecting") 203 | 204 | rdd0.show() 205 | 206 | for (row <- rdd0) { 207 | //println(row(0)) 208 | 209 | collectionVertices += ((row(0).toString.toLong, row(1).toString)) 210 | //collectionVertices.append((row(0).toString.toLong, row(1).toString)) 211 | } 212 | var t1 = System.nanoTime() 213 | 
println("Elapsed time: " + (t1 - t0) + "ns") 214 | 215 | 216 | println("Test 1") 217 | t0 = System.nanoTime() 218 | 219 | val rdd = cc.sql("SELECT user_local_id, user_screen_name from twitter.user_filtered LIMIT 100").persist() 220 | for (row <- rdd) { 221 | collectionVertices += ((row(0).toString.toLong, row(1).toString)) 222 | } 223 | rdd.unpersist() 224 | t1 = System.nanoTime() 225 | println("Elapsed time: " + (t1 - t0) + "ns") 226 | 227 | 228 | 229 | println("Test 2") 230 | t0 = System.nanoTime() 231 | val rdd2 = cc.sql("SELECT user_local_id, user_screen_name from twitter.user_filtered limit 10000").cache() 232 | for (row <- rdd2) { 233 | collectionVertices += ((row(0).toString.toLong, row(1).toString)) 234 | } 235 | t1 = System.nanoTime() 236 | println("Elapsed time: " + (t1 - t0) + "ns") 237 | 238 | println("Test 3") 239 | t0 = System.nanoTime() 240 | 241 | for (row <- cc.sql("SELECT user_local_id, user_screen_name from twitter.user_filtered limit 10000")) { 242 | collectionVertices += ((row(0).toString.toLong, row(1).toString)) 243 | } 244 | t1 = System.nanoTime() 245 | println("Elapsed time: " + (t1 - t0) + "ns") 246 | 247 | 248 | 249 | 250 | 251 | 252 | println("f") 253 | // println(rdd.take(1)) 254 | println("f2") 255 | * 256 | /* 257 | println("Query 1 ok") 258 | */ 259 | // Save result to ArrayBuffer 260 | //if (query.collect().length != 0) { 261 | //collectionVertices += ((query.first().getString("user_local_id").toLong, query.first().getString("user_local_id").toString)) 262 | println(query.first().getString("user_local_id")) 263 | // } 264 | 265 | //collectionVertices.foreach(println(_)) 266 | 267 | println("Query 1 Collect ok") 268 | 269 | 270 | 271 | // Collection of edges (contains communications between users) 272 | val collectionEdge = ArrayBuffer[Edge[String]]() 273 | 274 | 275 | //query = sc.cassandraTable("twitter", "users_communicate").select("user_send_local_id", "user_dest_id", "tweet_id").toArray() 276 | 277 | println("Query 2 ok") 278 | // Save result to ArrayBuffer 279 | /*if (query.collect().length != 0) { 280 | collectionEdge += Edge(query.first().getString("user_send_local_id").toLong, query.first().getString("user_dest_id").toLong, query.first().getString("tweet_id").toString) 281 | }*/ 282 | 283 | //collectionEdge.foreach(println(_)) 284 | 285 | println("Query 2 Collect ok") 286 | 287 | // Convert vertices to RDD 288 | val VerticesRDD = sc.parallelize(collectionVertices) 289 | 290 | // Convert it to RDD 291 | val EdgeRDD = sc.parallelize(collectionEdge) 292 | 293 | println("Total vertices: " + collectionVertices.length) 294 | println("Total edges: " + collectionEdge.length) 295 | 296 | (VerticesRDD, EdgeRDD)*/ 297 | } 298 | } -------------------------------------------------------------------------------- /scala/FindCommunities/src/main/scala/utils/CommunityUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.graphx._ 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.math._ 8 | import scala.reflect.ClassTag 9 | 10 | class CommunityUtils extends Logging { 11 | 12 | val RED = "\033[1;30m" 13 | val ENDC = "\033[0m" 14 | 15 | /** 16 | * splitCommunity 17 | * 18 | * Find and split communities in graph 19 | * 20 | * @param Graph[String,String] $graph - Graph element 21 | * @param RDD[(VertexId, (String))] $users - Vertices 22 | * @param Boolean $displayResult - if true, display println 23 | * @return ArrayBuffer[Graph[String,String]] - 
Contains one graph per community 24 | * 25 | */ 26 | def splitCommunity(graph: Graph[String, String], users: RDD[(VertexId, (String))], NBKCORE: Int, displayResult: Boolean): Graph[String, String] = { 27 | 28 | println(color("\nCall SplitCommunity", RED)) 29 | 30 | getKCoreGraph(graph, users, NBKCORE, displayResult).cache() 31 | } 32 | 33 | /** 34 | * Compute the k-core decomposition of the graph for all k <= kmax. This 35 | * uses the iterative pruning algorithm discussed by Alvarez-Hamelin et al. 36 | * in K-Core Decomposition: a Tool For the Visualization of Large Scale Networks 37 | * (see http://arxiv.org/abs/cs/0504107). 38 | * 39 | * @tparam VD the vertex attribute type (discarded in the computation) 40 | * @tparam ED the edge attribute type (preserved in the computation) 41 | * 42 | * @param graph the graph for which to compute the connected components 43 | * @param kmax the maximum value of k to decompose the graph 44 | * 45 | * @return a graph where the vertex attribute is the minimum of 46 | * kmax or the highest value k for which that vertex was a member of 47 | * the k-core. 48 | * 49 | * @note This method has the advantage of returning not just a single kcore of the 50 | * graph but will yield all the cores for k > kmin. 51 | */ 52 | def getKCoreGraph[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], 53 | users: RDD[(VertexId, (String))], 54 | kmin: Int, 55 | displayResult: Boolean): Graph[String, ED] = { 56 | 57 | // Graph[(Int, Boolean), ED] - boolean indicates whether it is active or not 58 | var g = graph.cache().outerJoinVertices(graph.degrees)((vid, oldData, newData) => newData.getOrElse(0)).cache() 59 | 60 | println(color("\nCall KCoreDecomposition", RED)) 61 | 62 | g = computeCurrentKCore(g, kmin).cache() 63 | 64 | val v = g.vertices.filter { case (vid, vd) => vd >= kmin }.cache() 65 | 66 | // Display informations 67 | if (displayResult) { 68 | val degrees = graph.degrees 69 | val numVertices = degrees.count() 70 | val testK = kmin 71 | val vCount = g.vertices.filter { case (vid, vd) => vd >= kmin }.count() 72 | val eCount = g.triplets.map { t => t.srcAttr >= testK && t.dstAttr >= testK }.count() 73 | 74 | logWarning(s"Number of vertices: $numVertices") 75 | logWarning(s"Degree sample: ${degrees.take(10).mkString(", ")}") 76 | logWarning(s"Degree distribution: " + degrees.map { case (vid, data) => (data, 1) }.reduceByKey(_ + _).collect().mkString(", ")) 77 | logWarning(s"Degree distribution: " + degrees.map { case (vid, data) => (data, 1) }.reduceByKey(_ + _).take(10).mkString(", ")) 78 | logWarning(s"K=$kmin, V=$vCount, E=$eCount") 79 | } 80 | 81 | // Create new RDD users 82 | val newUser = users.join(v).map { 83 | case (id, (username, rank)) => (id, username) 84 | } 85 | 86 | // Create a new graph 87 | val gra = Graph(newUser, g.edges) 88 | 89 | // Remove missing vertices as well as the edges to connected to them 90 | gra.subgraph(vpred = (id, username) => username != null).cache() 91 | } 92 | 93 | def computeCurrentKCore[ED: ClassTag](graph: Graph[Int, ED], k: Int) = { 94 | println("Computing kcore for k=" + k) 95 | def sendMsg(et: EdgeTriplet[Int, ED]): Iterator[(VertexId, Int)] = { 96 | if (et.srcAttr < 0 || et.dstAttr < 0) { 97 | // if either vertex has already been turned off we do nothing 98 | Iterator.empty 99 | } else if (et.srcAttr < k && et.dstAttr < k) { 100 | // tell both vertices to turn off but don't need change count value 101 | Iterator((et.srcId, -1), (et.dstId, -1)) 102 | 103 | } else if (et.srcAttr < k) { 104 | // if src is being pruned, tell dst 
to subtract from vertex count 105 | Iterator((et.srcId, -1), (et.dstId, 1)) 106 | 107 | } else if (et.dstAttr < k) { 108 | // if dst is being pruned, tell src to subtract from vertex count 109 | Iterator((et.dstId, -1), (et.srcId, 1)) 110 | 111 | } else { 112 | Iterator.empty 113 | } 114 | } 115 | 116 | // subtracts removed neighbors from neighbor count and tells vertex whether it was turned off or not 117 | def mergeMsg(m1: Int, m2: Int): Int = { 118 | if (m1 < 0 || m2 < 0) { 119 | -1 120 | } else { 121 | m1 + m2 122 | } 123 | } 124 | 125 | def vProg(vid: VertexId, data: Int, update: Int): Int = { 126 | if (update < 0) { 127 | // if the vertex has turned off, keep it turned off 128 | -1 129 | } else { 130 | // subtract the number of neighbors that have turned off this round from 131 | // the count of active vertices 132 | // TODO(crankshaw) can we ever have the case data < update? 133 | max(data - update, 0) 134 | } 135 | } 136 | 137 | // Note that initial message should have no effect 138 | Pregel(graph, 0)(vProg, sendMsg, mergeMsg) 139 | } 140 | 141 | 142 | /** 143 | * @constructor time 144 | * 145 | * timer for profiling block 146 | * 147 | * @param R $block - Block executed 148 | * @return Unit 149 | */ 150 | def time[R](block: => R): R = { 151 | val t0 = System.nanoTime() 152 | val result = block // call-by-name 153 | val t1 = System.nanoTime() 154 | println("Elapsed time: " + (t1 - t0) / 1000000000.0 + " seconds") 155 | result 156 | } 157 | 158 | def subgraphCommunities(graph: Graph[String, String], users: RDD[(VertexId, (String))], displayResult: Boolean): (Array[Graph[String, String]], Array[Long]) = { 159 | 160 | println(color("\nCall subgraphCommunities", RED)) 161 | 162 | // Find the connected components 163 | val cc = time { 164 | graph.connectedComponents().vertices.cache() 165 | } 166 | 167 | // Join the connected components with the usernames and id 168 | // The result is an RDD not a Graph 169 | val ccByUsername = users.join(cc).map { 170 | case (id, (username, cci)) => (id, username, cci) 171 | }.cache() 172 | 173 | // Print the result 174 | val lowerIDPerCommunity = ccByUsername.map { case (id, username, cci) => cci }.distinct().cache() 175 | 176 | // Result will be stored in an array 177 | //var result = new ArrayBuffer[Graph[String, String]]() 178 | println("--------------------------") 179 | println("Total community found: " + lowerIDPerCommunity.count()) 180 | println("--------------------------") 181 | 182 | 183 | val collectIDsCommunity = lowerIDPerCommunity.collect() 184 | 185 | val result = collectIDsCommunity.map(colID => Graph(ccByUsername.filter { 186 | _._3 == colID 187 | }.map { case (id, username, cc) => (id, username) }, graph.edges).subgraph(vpred = (id, username) => username != null).cache()) 188 | 189 | // Display communities 190 | if (displayResult) { 191 | println("\nCommunities found " + result.length) 192 | for (community <- result) { 193 | println("-----------------------") 194 | community.edges.collect().foreach(println(_)) 195 | community.vertices.collect().foreach(println(_)) 196 | } 197 | } 198 | 199 | cc.unpersist() 200 | lowerIDPerCommunity.unpersist() 201 | 202 | (result, collectIDsCommunity) 203 | } 204 | 205 | /** 206 | * getTriangleCount 207 | * 208 | * Compute the number of triangles passing through each vertex. 
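   * A triangle is a set of three mutually connected vertices: for example, the edges
   * A-B, B-C and C-A give each of A, B and C a triangle count of 1.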
209 | * 210 | * @param Graph[String,String] $graph - Graph element 211 | * @param RDD[(VertexId, (String))] $users - Vertices 212 | * @return Unit 213 | * 214 | * @see [[org.apache.spark.graphx.lib.TriangleCount$#run]] 215 | */ 216 | def getTriangleCount(graph: Graph[String, String], users: RDD[(VertexId, (String))]): Unit = { 217 | 218 | println(color("\nCall getTriangleCount", RED)) 219 | 220 | // Sort edges ID srcID < dstID 221 | val edges = graph.edges.map { e => 222 | if (e.srcId < e.dstId) { 223 | Edge(e.srcId, e.dstId, e.attr) 224 | } 225 | else { 226 | Edge(e.dstId, e.srcId, e.attr) 227 | } 228 | } 229 | 230 | // Temporary graph 231 | val newGraph = Graph(users, edges, "").cache() 232 | 233 | // Find the triangle count for each vertex 234 | // TriangleCount requires the graph to be partitioned 235 | val triCounts = newGraph.partitionBy(PartitionStrategy.RandomVertexCut).cache().triangleCount().vertices 236 | 237 | val triCountByUsername = users.join(triCounts).map { 238 | case (id, (username, rank)) => (id, username, rank) 239 | } 240 | 241 | println("Display triangle's sum for each user") 242 | triCountByUsername.foreach(println) 243 | 244 | println("\nTotal: " + triCountByUsername.map { case (id, username, rank) => rank }.distinct().count() + "\n") 245 | } 246 | 247 | /** 248 | * @constructor ConnectedComponents 249 | * 250 | * Compute the connected component membership of each vertex and return a graph with the vertex 251 | * value containing the lowest vertex id in the connected component containing that vertex. 252 | * 253 | * @param Graph[String,String] $graph - Graph element 254 | * @param RDD[(VertexId, (String))] $users - Vertices 255 | * @return Unit 256 | * 257 | * @see [[org.apache.spark.graphx.lib.ConnectedComponents$#run]] 258 | */ 259 | def cc(graph: Graph[String, String], users: RDD[(VertexId, (String))]): Unit = { 260 | println(color("\nCall ConnectedComponents", RED)) 261 | 262 | // Find the connected components 263 | val cc = graph.connectedComponents().vertices 264 | 265 | // Join the connected components with the usernames and id 266 | val ccByUsername = users.join(cc).map { 267 | case (id, (username, cc)) => (id, username, cc) 268 | } 269 | // Print the result 270 | println(ccByUsername.collect().sortBy(_._3).mkString("\n")) 271 | 272 | println("\nTotal groups: " + ccByUsername.map { case (id, username, cc) => cc }.distinct().count() + "\n") 273 | } 274 | 275 | /** 276 | * @constructor StronglyConnectedComponents 277 | * 278 | * Compute the strongly connected component (SCC) of each vertex and return a graph with the 279 | * vertex value containing the lowest vertex id in the SCC containing that vertex. 
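   * (In a strongly connected component every vertex can reach every other vertex by
   * following directed edges. Note: the implementation below passes a fixed value (5)
   * to stronglyConnectedComponents rather than the $iteration parameter.)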
280 | * 281 | * Display edges's membership and total groups 282 | * 283 | * @param Graph[String,String] $graph - Graph element 284 | * @param Int $iteration - Number of iteration 285 | * @return Unit 286 | */ 287 | def scc(graph: Graph[String, String], iteration: Int): Unit = { 288 | 289 | println(color("\nCall StronglyConnectedComponents : iteration : " + iteration, RED)) 290 | val sccGraph = graph.stronglyConnectedComponents(5) 291 | 292 | val connectedGraph = sccGraph.vertices.map { 293 | case (member, leaderGroup) => s"$member is in the group of $leaderGroup's edge" 294 | } 295 | 296 | val totalGroups = sccGraph.vertices.map { 297 | case (member, leaderGroup) => leaderGroup 298 | } 299 | 300 | connectedGraph.collect().foreach(println) 301 | 302 | println("\nTotal groups: " + totalGroups.distinct().count() + "\n") 303 | } 304 | 305 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 306 | } -------------------------------------------------------------------------------- /scala/FindCommunities/src/main/scala/utils/GraphUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | // To make some of the examples work we will also need RDD 4 | 5 | import org.apache.spark.graphx._ 6 | import org.apache.spark.rdd.RDD 7 | 8 | 9 | class GraphUtils extends serializable { 10 | 11 | val RED = "\033[1;30m" 12 | val ENDC = "\033[0m" 13 | private val defaultSeed = 0xadc83b19L 14 | 15 | /** 16 | * @constructor murmurHash64A 17 | * 18 | * @param 19 | * @param 20 | * @return Long 21 | * 22 | */ 23 | def murmurHash64A(data: Seq[Byte], seed: Long = defaultSeed): Long = { 24 | val m = 0xc6a4a7935bd1e995L 25 | val r = 47 26 | 27 | val f: Long => Long = m.* 28 | val g: Long => Long = x => x ^ (x >>> r) 29 | 30 | val h = data.grouped(8).foldLeft(seed ^ f(data.length)) { case (y, xs) => 31 | val k = xs.foldRight(0L)((b, x) => (x << 8) + (b & 0xff)) 32 | val j: Long => Long = if (xs.length == 8) f compose g compose f else identity 33 | f(y ^ j(k)) 34 | } 35 | (g compose f compose g)(h) 36 | } 37 | 38 | /** 39 | * @constructor getPageRank 40 | * 41 | * Run PageRank for a fixed number of iterations returning a graph with vertex attributes 42 | * containing the PageRank and edge attributes the normalized edge weight. 
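   * Note: the call below uses graph.pageRank(0.00001), the tolerance-based variant that
   * iterates until the ranks converge, rather than a fixed number of iterations.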
43 | * 44 | * @param Graph[String,String] $graph - Graph element 45 | * @param RDD[(VertexId, (String))] $users - Vertices 46 | * @return Unit 47 | * 48 | * @see [[org.apache.spark.graphx.lib.PageRank$#run]] 49 | */ 50 | def getPageRank(graph: Graph[String, String], users: RDD[(VertexId, (String))]): Unit = { 51 | 52 | println(color("\nCall getPageRank", RED)) 53 | 54 | val ranks = graph.pageRank(0.00001).vertices 55 | 56 | val ranksByUsername = users.join(ranks).map { 57 | case (id, (username, rank)) => (id, username, rank) 58 | } 59 | 60 | // Print the result descending 61 | println(ranksByUsername.collect().sortBy(_._3).reverse.mkString("\n")) 62 | } 63 | 64 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 65 | 66 | /** 67 | * @constructor inAndOutDegrees 68 | * 69 | * @param Graph[String,String] $graph - Graph element 70 | * @return Unit 71 | * 72 | */ 73 | def inAndOutDegrees(graph: Graph[String, String]): Unit = { 74 | 75 | println(color("\nCall inAndOutDegrees", RED)) 76 | 77 | // Create User class 78 | case class User(name: String, // Username 79 | inDeg: Int, // Received tweets 80 | outDeg: Int) // Sent tweets 81 | 82 | // Create user Graph 83 | // def mapVertices[VD2](map: (VertexID, VD) => VD2): Graph[VD2, ED] 84 | val initialUserGraph: Graph[User, String] = graph.mapVertices { 85 | case (id, (name)) => User(name, 0, 0) 86 | } 87 | 88 | //initialUserGraph.edges.collect.foreach(println(_)) 89 | 90 | 91 | // Fill in the degree informations (out and in degrees) 92 | val userGraph = initialUserGraph.outerJoinVertices(initialUserGraph.inDegrees) { 93 | case (id, u, inDegOpt) => User(u.name, inDegOpt.getOrElse(0), u.outDeg) 94 | }.outerJoinVertices(initialUserGraph.outDegrees) { 95 | case (id, u, outDegOpt) => User(u.name, u.inDeg, outDegOpt.getOrElse(0)) 96 | } 97 | 98 | // Display the userGraph 99 | userGraph.vertices.foreach { 100 | case (id, u) => println(s"User $id is called ${u.name} and received ${u.inDeg} tweets and send ${u.outDeg}.") 101 | } 102 | } 103 | } -------------------------------------------------------------------------------- /scala/FindCommunities/src/main/scala/utils/MllibUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import org.apache.spark.mllib.clustering._ 4 | import org.apache.spark.mllib.linalg.{Vector, Vectors} 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.collection.mutable 8 | 9 | /** 10 | * Topic models automatically infer the topics discussed in a collection of documents. These topics can be used 11 | * to summarize and organize documents, or used for featurization and dimensionality reduction in later stages 12 | * of a Machine Learning (ML) pipeline. 13 | * 14 | * LDA is not given topics, so it must infer them from raw text. LDA defines a topic as a distribution over words. 15 | */ 16 | class MllibUtils { 17 | 18 | // Terminal Color 19 | val RED = "\033[1;30m" 20 | val ENDC = "\033[0m" 21 | 22 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 23 | 24 | def createdoc(tokenizedCorpus: RDD[String]): ((Seq[(Long, Vector)], Array[String], Map[String, Int], Array[String])) = { 25 | 26 | println(color("\nCall createdoc", RED)) 27 | 28 | // Choose the vocabulary. 
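    // (Descriptive note: terms are sorted by descending frequency below and the
    //  numStopwords most frequent terms are dropped, as a rough stop-word filter.)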
29 | // termCounts: Sorted list of (term, termCount) pairs 30 | val termCounts: Array[(String, Long)] = 31 | tokenizedCorpus.map(_ -> 1L).reduceByKey(_ + _).collect().sortBy(-_._2) 32 | 33 | // vocabArray: Chosen vocab (removing common terms) 34 | val numStopwords = 20 35 | val vocabArray: Array[String] = 36 | termCounts.takeRight(termCounts.length - numStopwords).map(_._1) 37 | 38 | // vocab: Map term -> term index 39 | val vocab: Map[String, Int] = vocabArray.zipWithIndex.toMap 40 | 41 | val tokenCollected = tokenizedCorpus.collect() 42 | 43 | 44 | // MAP : [ Word ID , VECTOR [vocab.size, WordFrequency]] 45 | val documents: Map[Long, Vector] = vocab.map { case (tokens, id) => 46 | 47 | val counts = new mutable.HashMap[Int, Double]() 48 | 49 | // Word ID 50 | val idx = vocab(tokens) 51 | 52 | // Count word occurancy 53 | counts(idx) = counts.getOrElse(idx, 0.0) + tokenCollected.count(_ == tokens) 54 | 55 | // Return word ID and Vector 56 | (id.toLong, Vectors.sparse(vocab.size, counts.toSeq)) 57 | } 58 | 59 | (documents.toSeq, tokenizedCorpus.collect(), vocab, tokenizedCorpus.collect()) 60 | } 61 | 62 | 63 | def cosineSimilarity(tokenizedCorpus: RDD[String], vocab: Map[String, Int], tokenizedTweet: Array[String]): (Seq[(Long, Vector)]) = { 64 | 65 | println(color("\nCall cosineSimilarity", RED)) 66 | 67 | val document: Map[Long, Vector] = vocab.map { case (tokens, id) => 68 | 69 | val counts2 = new mutable.HashMap[Int, Double]() 70 | 71 | // Word ID 72 | val idx = vocab(tokens) 73 | 74 | // Count word occurancy 75 | counts2(idx) = counts2.getOrElse(idx, 0.0) + tokenizedTweet.count(_ == tokens).toDouble 76 | 77 | // Return word ID and Vector 78 | (id.toLong, Vectors.sparse(vocab.size, counts2.toSeq)) 79 | } 80 | 81 | document.toSeq 82 | } 83 | 84 | /** 85 | * @constructor findTopics 86 | * 87 | * Set currentTweet attribut and add the new tweet to the dictionnary 88 | * 89 | * @param LDAModel $ldaModel - LDA Model (LocalModel) 90 | * @param Array[String] $vocabArray - Contains all distinct words set to LDA 91 | * @param Int $numWordsByTopics - 92 | * @param Boolean $displayResult - Display result in console 93 | * 94 | * @return LDAModel 95 | */ 96 | def findTopics(ldaModel: LDAModel, vocabArray: Array[String], T: String, SG: Int, numWordsByTopics: Int, displayResult: Boolean): Seq[(String, String, String, String)] = { 97 | 98 | println(color("\nCall findTopics", RED)) 99 | 100 | println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize + " words):") 101 | 102 | val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = numWordsByTopics) 103 | 104 | var it = 0 105 | var seqC = List[(String, String, String, String)]() 106 | 107 | // Print topics, showing top-weighted x terms for each topic. 
108 | topicIndices.foreach { case (terms, termWeights) => 109 | 110 | if (displayResult) 111 | println("TOPICS:") 112 | 113 | val tabTopics = terms.zip(termWeights).map(vector => vocabArray(vector._1.toInt).toString).mkString(";") 114 | 115 | if (displayResult) { 116 | terms.zip(termWeights).foreach { case (term, weight) => 117 | println(s"${vocabArray(term.toInt)}\t\t$weight") 118 | } 119 | } 120 | 121 | seqC = seqC :+(T, SG.toString, it.toString, tabTopics) 122 | 123 | println("T: " + T + " SG: " + SG + "TopicN: " + it + " c: " + tabTopics) 124 | it += 1 125 | 126 | if (displayResult) 127 | println() 128 | 129 | } 130 | seqC.toSeq 131 | } 132 | } -------------------------------------------------------------------------------- /scala/FindCommunities/src/main/scala/utils/RDDUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | import scala.collection.mutable.ArrayBuffer 6 | 7 | // To make some of the examples work we will also need RDD 8 | 9 | import org.apache.spark.graphx._ 10 | import org.apache.spark.rdd.RDD 11 | 12 | 13 | class RDDUtils { 14 | 15 | val RED = "\033[1;30m" 16 | val ENDC = "\033[0m" 17 | 18 | /** 19 | * @constructor ArrayToVertices 20 | * 21 | * Convert ArrayBuffer to RDD containing Vertices 22 | * 23 | * @param SparkContext - $sc - SparkContext 24 | * @param ArrayBuffer[(Long, (String))] - $collection - Contains vertices 25 | * 26 | * @return RDD[Edge[String]] - RDD of vertices 27 | */ 28 | def ArrayToVertices(sc: SparkContext, collection: ArrayBuffer[(Long, (String))]): RDD[(VertexId, (String))] = { 29 | sc.parallelize(collection) 30 | } 31 | 32 | /** 33 | * @constructor ArrayToEdges 34 | * 35 | * Convert ArrayBuffer to RDD containing Edges 36 | * 37 | * @param SparkContext - $sc - SparkContext 38 | * @param ArrayBuffer[Edge[String]] - $collection - Contains edges 39 | * 40 | * @return RDD[Edge[String]] - RDD of edges 41 | */ 42 | def ArrayToEdges(sc: SparkContext, collection: ArrayBuffer[Edge[String]]): RDD[Edge[String]] = { 43 | sc.parallelize(collection) 44 | } 45 | 46 | /** 47 | * @constructor findUserByIDInGraph 48 | * 49 | * find user ID with username 50 | * 51 | * @param Graph[String,String] $graph - Graph element 52 | * @param Int $userID - User id 53 | * @return String - if success : username | failure : "user not found" 54 | */ 55 | def findUserNameByIDInGraph(graph: Graph[String, String], userID: Int): String = { 56 | println(color("\nCall : findUserNameWithID", RED)) 57 | 58 | graph.vertices.filter { case (id, name) => id.toString equals userID.toString }.collect().foreach { 59 | (e: (org.apache.spark.graphx.VertexId, String)) => return e._2 60 | } 61 | "user not found" 62 | } 63 | 64 | /** 65 | * @constructor findUserIDByNameInGraph 66 | * 67 | * find username with id 68 | * 69 | * @param Graph[String,String] $graph - Graph element 70 | * @param String $userName - Username 71 | * @return String - if success : id found | failure : "0" 72 | */ 73 | def findUserIDByNameInGraph(graph: Graph[String, String], userName: String): String = { 74 | println(color("\nCall : findUserIDWithName", RED)) 75 | 76 | graph.vertices.filter(_._2 == userName).collect().foreach { 77 | (e: (org.apache.spark.graphx.VertexId, String)) => return e._1.toString 78 | } 79 | "0" 80 | } 81 | 82 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 83 | 84 | /** 85 | * @constructor displayAllCommunications 86 | * 87 | * display all communications between 
users 88 | * 89 | * @param Graph[String,String] $graph - Graph element 90 | * @return Unit 91 | */ 92 | def displayAllCommunications(graph: Graph[String, String]): Unit = { 93 | 94 | println(color("\nCall : displayAllCommunications", RED)) 95 | println("Users communications: ") 96 | 97 | val facts: RDD[String] = graph.triplets.map(triplet => triplet.srcAttr + " communicate with " + 98 | triplet.dstAttr + " with tweet id " + triplet.attr) 99 | 100 | facts.collect().foreach(println(_)) 101 | } 102 | } -------------------------------------------------------------------------------- /scala/GraphxTesting/build.sbt: -------------------------------------------------------------------------------- 1 | name := "GraphxTesting" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.5" 6 | 7 | libraryDependencies ++= Seq( 8 | "org.apache.spark" %% "spark-core" % "1.4.0" % "provided", 9 | "org.apache.spark" %% "spark-graphx" % "1.4.0" % "provided", 10 | "org.apache.spark" %% "spark-mllib" % "1.4.0" % "provided") 11 | 12 | libraryDependencies += "com.datastax.spark" %% "spark-cassandra-connector" % "1.4.0-M1" 13 | 14 | libraryDependencies += "com.github.fommil.netlib" % "all" % "1.1.2" pomOnly() -------------------------------------------------------------------------------- /scala/GraphxTesting/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") -------------------------------------------------------------------------------- /scala/GraphxTesting/src/main/scala/GraphxTesting.scala: -------------------------------------------------------------------------------- 1 | import org.apache.log4j.{Level, Logger} 2 | import org.apache.spark.graphx._ 3 | import org.apache.spark.mllib.clustering.LDA 4 | import org.apache.spark.mllib.linalg.Vector 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | import utils._ 7 | 8 | import scala.collection.mutable.ArrayBuffer 9 | import scala.math._ 10 | 11 | // To make some of the examples work we will also need RDD 12 | 13 | import org.apache.spark.rdd.RDD 14 | 15 | // Useful links 16 | // http://ampcamp.berkeley.edu/big-data-mini-course/graph-analytics-with-graphx.html 17 | // https://spark.apache.org/docs/latest/graphx-programming-guide.html 18 | 19 | object GraphxTesting { 20 | 21 | val RED = "\033[1;30m" 22 | val ENDC = "\033[0m" 23 | 24 | def main(args: Array[String]) { 25 | 26 | println("\n\n**************************************************************") 27 | println("****************** GraphxTesting ******************") 28 | println("**************************************************************\n") 29 | 30 | val cu = new CassandraUtils 31 | val comUtils = new CommunityUtils 32 | val gu = new GraphUtils 33 | val ru = new RDDUtils 34 | 35 | // Display only warning and infos messages 36 | Logger.getLogger("org").setLevel(Level.ERROR) 37 | Logger.getLogger("akka").setLevel(Level.ERROR) 38 | 39 | // Not displaying infos messages 40 | //Logger.getLogger("org").setLevel(Level.OFF) 41 | //Logger.getLogger("akka").setLevel(Level.OFF) 42 | 43 | // Spark configuration 44 | val sparkConf = new SparkConf(true) 45 | .setMaster("local[2]") 46 | .setAppName("GraphxTesting") 47 | .set("spark.cassandra.connection.host", "127.0.0.1") // Link to Cassandra 48 | 49 | // Init SparkContext 50 | val sc = new SparkContext(sparkConf) 51 | 52 | // Create Vertices and Edges 53 | val (users, relationships, defaultUser) = initGraph(sc) 54 | 55 | // Build the initial Graph 56 | val 
graph = Graph(users, relationships, defaultUser).cache() 57 | 58 | /* 59 | 60 | println("\n**************************************************************") 61 | println(" TEST METHODS ") 62 | println("**************************************************************") 63 | 64 | println("\n--------------------------------------------------------------") 65 | println("Operations on tweets") 66 | println("--------------------------------------------------------------\n") 67 | 68 | // See who communicates with who 69 | time { ru displayAllCommunications(graph) } 70 | 71 | // Let's find user id 72 | val id = time { ru findUserIDByNameInGraph(graph, "Michael") } 73 | println("ID for user Michael is : " + id.toString) 74 | 75 | // Find username with user ID 76 | val name = time { ru findUserNameByIDInGraph(graph, 1) } 77 | println("Name for id 1 is : " + name.toString) 78 | 79 | // get tweet content with tweet ID 80 | var resultGetTweetContentFromID = time { cu getTweetContentFromID(sc,"606461329357045760") } 81 | println(resultGetTweetContentFromID) 82 | 83 | // this one does not exist 84 | resultGetTweetContentFromID = time { cu getTweetContentFromID(sc,"604230254979346433") } 85 | println(resultGetTweetContentFromID) 86 | 87 | // Get tweets from user 88 | val resultGetTweetsIDFromUser = time { cu getTweetsIDFromUser(sc,"209144549") } 89 | resultGetTweetsIDFromUser.foreach(println(_)) 90 | 91 | // Count in and out degrees 92 | //time { gu inAndOutDegrees(graph) } 93 | 94 | 95 | println("\n--------------------------------------------------------------") 96 | println("Community detection") 97 | println("--------------------------------------------------------------\n") 98 | 99 | // Call ConnectedComponents 100 | time { comUtils cc(graph, users) } 101 | 102 | // Call StronglyConnectedComponents 103 | time { comUtils scc(graph, 1) } 104 | 105 | // Get triangle Count 106 | time { comUtils getTriangleCount(graph, users) } 107 | 108 | // Get PageRank 109 | time { gu getPageRank(graph, users) } 110 | 111 | // K-Core decomposition 112 | time { comUtils getKCoreGraph(graph, users, 4, true) } 113 | 114 | // LabelPropagation 115 | val graphLabelPropagation = time { LabelPropagation.run(graph, 4).cache() } 116 | 117 | println("VERTICES") 118 | graphLabelPropagation.vertices.collect.foreach(println(_)) 119 | 120 | val labelVertices = graphLabelPropagation.vertices 121 | 122 | val displayVertices = users.join(labelVertices).map { 123 | case (id, (username, rank)) => (id, username, rank) 124 | } 125 | println("VERTICES NAMED") 126 | 127 | // Print the result descending 128 | println(displayVertices.collect().sortBy(_._3).reverse.mkString("\n")) 129 | println("EDGES") 130 | 131 | graphLabelPropagation.edges.collect.foreach(println(_)) 132 | 133 | 134 | println("\n**************************************************************") 135 | println(" FIRST EXAMPLE ") 136 | println("**************************************************************") 137 | 138 | 139 | println("\n--------------------------------------------------------------") 140 | println("First Step - K-Core Decomposition algorithm") 141 | println("--------------------------------------------------------------") 142 | 143 | // K-Core decomposition 144 | val graph_2 = time { comUtils getKCoreGraph(graph, users, 5, false) }.cache() 145 | 146 | graph_2.edges.collect.foreach(println(_)) 147 | graph_2.vertices.collect.foreach(println(_)) 148 | 149 | println("\n--------------------------------------------------------------") 150 | println("Second Step - Connected 
Components algorithm") 151 | println("--------------------------------------------------------------") 152 | 153 | // Call ConnectedComponents 154 | time { comUtils cc(graph_2, graph_2.vertices) } 155 | 156 | println("\n--------------------------------------------------------------") 157 | println("Third Step - Get Tweets from Edges") 158 | println("--------------------------------------------------------------") 159 | 160 | val corpusWords = time { cu getTweetsContentFromEdge(sc, graph_2.edges, true) } 161 | corpusWords.foreach(println(_)) 162 | 163 | /*println("\n--------------------------------------------------------------") 164 | println("Fourth Step - LDA Algorithm") 165 | println("--------------------------------------------------------------") 166 | 167 | val nTopics = 10 168 | val nIterations = 10 169 | val nWordsByTopics = 10 170 | val nStopwords = 20 171 | time { mu getLDA(sc, corpusWords, nTopics, nIterations, nWordsByTopics, nStopwords, true) }*/ 172 | 173 | */ 174 | 175 | 176 | println("\n**************************************************************") 177 | println(" SECOND EXAMPLE ") 178 | println("**************************************************************") 179 | 180 | println("\n--------------------------------------------------------------") 181 | println("First Step - Split community : \n" + 182 | "\t Connected Components algorithm to find different\n" + 183 | "\t communities") 184 | println("--------------------------------------------------------------") 185 | 186 | //time { comUtils cc(graph, graph.vertices) } 187 | 188 | val subGraphes = time { 189 | comUtils splitCommunity(graph, users, false) 190 | } 191 | 192 | println("\n--------------------------------------------------------------") 193 | println("Second Step - Calculate LDA for every communities\n" + 194 | "\t 1. Get Tweets from Edges\n" + 195 | "\t 2. 
LDA Algorithm") 196 | println("--------------------------------------------------------------") 197 | var iComm = 1 198 | //for (community <- subGraphes){ 199 | println("--------------------------") 200 | println("Community : " + iComm) 201 | println("--------------------------") 202 | //community.edges.collect().foreach(println(_)) 203 | //community.vertices.collect().foreach(println(_)) 204 | 205 | println("--------------------------") 206 | println("Get Tweets from Edges") 207 | println("--------------------------") 208 | //val corpus = time { cu getTweetsContentFromEdge(sc, community.edges, false) } 209 | 210 | println("--------------------------") 211 | println("LDA Algorithm") 212 | println("--------------------------") 213 | val numTopics = 5 214 | val numIterations = 10 215 | val numWordsByTopics = 5 216 | val numStopwords = 0 217 | 218 | // Initialize LDA 219 | println(color("\nCall InitLDA", RED)) 220 | 221 | val topicSmoothing = 1.2 222 | val termSmoothing = 1.2 223 | 224 | // Set LDA parameters 225 | val lda = new LDA() 226 | .setOptimizer("online") 227 | .setK(numTopics) 228 | .setDocConcentration(topicSmoothing) 229 | .setTopicConcentration(termSmoothing) 230 | .setMaxIterations(numIterations) 231 | 232 | // Create documents 233 | var firstDoc = ArrayBuffer[String]() 234 | firstDoc += "Concentration parameter commonly named for the prior placed" 235 | 236 | // Init LDA 237 | val mu = new MllibUtils(lda, sc, firstDoc, firstDoc) 238 | 239 | // First tweet 240 | mu newTweet ("Concentration distributions topics Concentration") 241 | 242 | // Get documents and word's array 243 | val (newdoc: RDD[(Long, Vector)], newvocabArray) = time { 244 | mu createDocuments(sc, 0) 245 | } 246 | 247 | var ldaModel = lda.run(newdoc) 248 | 249 | // Find topics 250 | ldaModel = time { 251 | mu findTopics(ldaModel, newvocabArray, numWordsByTopics, true) 252 | } 253 | 254 | // Second tweet 255 | mu newTweet ("October arrived, spreading a damp chill") 256 | 257 | val (newdoc2: RDD[(Long, Vector)], newvocabArray2) = time { 258 | mu createDocuments(sc, 0) 259 | } 260 | 261 | ldaModel = lda.run(newdoc2) 262 | 263 | // Find 264 | ldaModel = time { 265 | mu findTopics(ldaModel, newvocabArray2, numWordsByTopics, true) 266 | } 267 | 268 | 269 | iComm += 1 270 | //} 271 | 272 | // Generate Vertices 273 | val collectionVertices = ArrayBuffer[(Long, String)]() 274 | collectionVertices += ((2732329846L, "Michael")) 275 | collectionVertices += ((132988448L, "Jean")) 276 | 277 | // Convert it to RDD 278 | val VerticesRDD = ru ArrayToVertices(sc, collectionVertices) 279 | 280 | // Generate Hash 281 | val random = abs(gu murmurHash64A ("MichaelCaraccio".getBytes)) 282 | 283 | // Add edges 284 | val collectionEdge = ArrayBuffer[Edge[String]]() 285 | collectionEdge += Edge(random, 132988448L, "606460188367974400") 286 | collectionEdge += Edge(2732329846L, 2941487254L, "606461336986386435") 287 | collectionEdge += Edge(2732329846L, 601389784L, "606461384767897600") 288 | 289 | // Convert it to RDD 290 | val EdgeRDD = ru ArrayToEdges(sc, collectionEdge) 291 | 292 | // Create Graph 293 | val testGraph = Graph(VerticesRDD, EdgeRDD) 294 | 295 | testGraph.vertices.collect.foreach(println(_)) 296 | testGraph.edges.collect.foreach(println(_)) 297 | } 298 | 299 | /** 300 | * @constructor time 301 | * 302 | * timer for profiling block 303 | * 304 | * @param R $block - Block executed 305 | * @return Unit 306 | */ 307 | def time[R](block: => R): R = { 308 | val t0 = System.nanoTime() 309 | val result = block // call-by-name 310 | 
val t1 = System.nanoTime() 311 | println("Elapsed time: " + (t1 - t0) / 1000000000.0 + " seconds") 312 | result 313 | } 314 | 315 | /** 316 | * @constructor initGraph 317 | * 318 | * init data - construct graph and populate it 319 | * 320 | * @param SparkContext $sc - Sparkcontext 321 | * @return RDD[(VertexId, (String))] - users (Vertices) 322 | * RDD[Edge[String]] - relationship (Edges) 323 | * String - default user 324 | */ 325 | def initGraph(sc: SparkContext): (RDD[(VertexId, (String))], RDD[Edge[String]], String) = { 326 | println(color("\nCall : initGraph", RED)) 327 | 328 | // Create an RDD for the vertices 329 | val users: RDD[(VertexId, (String))] = 330 | sc.parallelize(Array( 331 | (2732329846L, "Michael"), 332 | (132988448L, "David"), 333 | (473822999L, "Sarah"), 334 | (2932436311L, "Jean"), 335 | (2249679902L, "Raphael"), 336 | (601389784L, "Lucie"), 337 | (2941487254L, "Harold"), 338 | (1192483885L, "Pierre"), 339 | (465776805L, "Christophe"), 340 | (838147628L, "Zoe"), 341 | (2564641105L, "Fabien"), 342 | (1518391292L, "Nicolas") 343 | )) 344 | 345 | // Create an RDD for edges 346 | val relationships: RDD[Edge[String]] = 347 | sc.parallelize(Array( 348 | Edge(2732329846L, 132988448L, "608919340121870338"), 349 | Edge(2732329846L, 2941487254L, "608919742347264000"), 350 | Edge(2732329846L, 601389784L, "608918664549687299"), 351 | Edge(601389784L, 2732329846L, "608918165117104129"), 352 | Edge(2941487254L, 1192483885L, "608921008020566016"), 353 | Edge(2941487254L, 132988448L, "608920341084258304"), 354 | Edge(132988448L, 838147628L, "608919327694270464"), 355 | Edge(838147628L, 132988448L, "608919807887552513"), 356 | Edge(838147628L, 473822999L, "608919870277869568"), 357 | Edge(465776805L, 2941487254L, "608920678117597184"), 358 | Edge(465776805L, 601389784L, "608917990365499392"), 359 | Edge(465776805L, 2249679902L, "608918336643039232"), 360 | Edge(2249679902L, 465776805L, "608919570796163072"), 361 | Edge(2932436311L, 465776805L, "608921304377475073"), 362 | Edge(1192483885L, 2941487254L, "608921260387610624"), 363 | Edge(465776805L, 2941487254L, "608918707797110784"), 364 | Edge(601389784L, 2732329846L, "608919779542339584"), 365 | Edge(2932436311L, 465776805L, "608917272883789824"), 366 | Edge(2941487254L, 465776805L, "608920374680506368"), 367 | Edge(2941487254L, 1192483885L, "608920849664450560"), 368 | Edge(2941487254L, 1192483885L, "608917634822733824"), 369 | Edge(1192483885L, 2941487254L, "608920742990868480"), 370 | Edge(1192483885L, 2941487254L, "608921092334354432"), 371 | Edge(2732329846L, 132988448L, "608917366538424320"), 372 | Edge(2941487254L, 132988448L, "608920981650976769"), 373 | Edge(132988448L, 2941487254L, "608920887639855104"), 374 | Edge(132988448L, 2941487254L, "608916751988867072"), 375 | Edge(132988448L, 2941487254L, "608919716137033730"), 376 | Edge(601389784L, 2732329846L, "608921306705354752"), 377 | Edge(601389784L, 2732329846L, "608918359913164801"), 378 | Edge(2732329846L, 2941487254L, "608920468985266176"), 379 | Edge(2732329846L, 2941487254L, "608918157806432257"), 380 | Edge(2564641105L, 1518391292L, "608918942086799360"), 381 | Edge(1518391292L, 2564641105L, "608921314104094720") 382 | )) 383 | 384 | // Define a default user in case there are relationship with missing user 385 | val defaultUser = "John Doe" 386 | 387 | (users, relationships, defaultUser) 388 | } 389 | 390 | /** 391 | * @constructor 392 | * 393 | * 394 | * 395 | * @param 396 | * @return 397 | */ 398 | /*def isVerticeInGraph(): Unit ={ 399 | 400 | }*/ 401 | 402 | def 
color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 403 | } -------------------------------------------------------------------------------- /scala/GraphxTesting/src/main/scala/utils/CassandraUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | import scala.collection.mutable.ArrayBuffer 6 | 7 | // Enable Cassandra-specific functions on the StreamingContext, DStream and RDD: 8 | 9 | import com.datastax.spark.connector._ 10 | 11 | // To make some of the examples work we will also need RDD 12 | 13 | import org.apache.spark.graphx._ 14 | import org.apache.spark.rdd.RDD 15 | 16 | 17 | class CassandraUtils { 18 | 19 | val RED = "\033[1;30m" 20 | val ENDC = "\033[0m" 21 | 22 | /** 23 | * @constructor getTweetContentFromID 24 | * 25 | * Return tweet content 26 | * 27 | * @param SparkContext sc - SparkContext 28 | * @param String $id - tweet id 29 | * @return Unit 30 | */ 31 | def getTweetContentFromID(sc: SparkContext, id: String): String = { 32 | 33 | println(color("\nCall getTweetContentFromID", RED)) 34 | 35 | val query = sc.cassandraTable("twitter", "tweet_filtered").select("tweet_text").where("tweet_id = ?", id) 36 | 37 | if (query.collect().length != 0) { 38 | query.first().getString("tweet_text") 39 | } 40 | else 41 | "Tweet not found" 42 | } 43 | 44 | /** 45 | * @constructor getTweetsIDFromUser 46 | * 47 | * Return tweet id 48 | * 49 | * @param SparkContext sc - SparkContext 50 | * @param String $id - user (sender) id 51 | * @return Unit 52 | */ 53 | def getTweetsIDFromUser(sc: SparkContext, id: String): ArrayBuffer[String] = { 54 | 55 | println(color("\nCall getTweetsIDFromUser", RED)) 56 | println("Tweets found:") 57 | 58 | val query = sc.cassandraTable("twitter", "users_communicate").select("tweet_id").where("user_send_local_id = ?", id) 59 | 60 | // Result will be stored in an array 61 | var result = ArrayBuffer[String]() 62 | 63 | if (query.collect().length != 0) { 64 | result += query.first().getString("tweet_id") 65 | } 66 | 67 | // Display result 68 | result.foreach(println(_)) 69 | 70 | // Return 71 | result 72 | } 73 | 74 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 75 | 76 | /** 77 | * @constructor getTweetsContentFromEdge 78 | * 79 | * Return an array of tweets content for a given Graph 80 | * 81 | * @param SparkContext sc - SparkContext 82 | * @param RDD[Edge[String]] $edge - graph's edge 83 | * @return Unit 84 | */ 85 | def getTweetsContentFromEdge(sc: SparkContext, edge: RDD[Edge[String]], displayResult: Boolean): RDD[String] = { 86 | 87 | println(color("\nCall getTweetsContentFromEdge", RED)) 88 | 89 | // Get the tweets ID for every communication 90 | val tweetsID = edge.flatMap({ 91 | case Edge(idSend, idExp, idTweet) => Seq(idTweet) 92 | }) 93 | 94 | // Result will be stored in an array 95 | var result = ArrayBuffer[String]() 96 | 97 | // Queries 98 | for (tweet <- tweetsID.collect()) { 99 | val query = sc.cassandraTable("twitter", "tweet_filtered").select("tweet_text").where("tweet_id = ?", tweet) 100 | 101 | if (query.collect().length != 0) { 102 | result += query.first().getString("tweet_text") 103 | } 104 | } 105 | 106 | // Display results 107 | if (displayResult) { 108 | result.foreach(println(_)) 109 | } 110 | 111 | // return 112 | sc.parallelize(result) 113 | } 114 | } -------------------------------------------------------------------------------- 
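A minimal, hypothetical sketch of how CassandraUtils is meant to be driven (the object name, tweet id and user id are placeholders; it assumes a local Cassandra node exposing the same twitter keyspace used throughout this repository, mirroring the setup in GraphxTesting.scala):

import org.apache.spark.{SparkConf, SparkContext}
import utils.CassandraUtils

object CassandraUtilsSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical driver: same local Spark / Cassandra settings as GraphxTesting.scala
    val sc = new SparkContext(new SparkConf(true)
      .setMaster("local[2]")
      .setAppName("CassandraUtilsSketch")
      .set("spark.cassandra.connection.host", "127.0.0.1"))

    val cu = new CassandraUtils

    // Look up one tweet's text by id, then every tweet id sent by a given user
    println(cu.getTweetContentFromID(sc, "606461329357045760"))
    cu.getTweetsIDFromUser(sc, "209144549").foreach(println)

    sc.stop()
  }
}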
/scala/GraphxTesting/src/main/scala/utils/CommunityUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.graphx._ 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.collection.mutable.ArrayBuffer 8 | import scala.math._ 9 | import scala.reflect.ClassTag 10 | 11 | class CommunityUtils extends Logging with Serializable { 12 | 13 | val RED = "\033[1;30m" 14 | val ENDC = "\033[0m" 15 | 16 | /** 17 | * splitCommunity 18 | * 19 | * Find and split communities in graph 20 | * 21 | * @param Graph[String,String] $graph - Graph element 22 | * @param RDD[(VertexId, (String))] $users - Vertices 23 | * @param Boolean $displayResult - if true, display println 24 | * @return ArrayBuffer[Graph[String,String]] - Contains one graph per community 25 | * 26 | */ 27 | def splitCommunity(graph: Graph[String, String], users: RDD[(VertexId, (String))], displayResult: Boolean): ArrayBuffer[Graph[String, String]] = { 28 | 29 | println(color("\nCall SplitCommunity", RED)) 30 | 31 | val graph_2 = getKCoreGraph(graph, users, 2, false).cache() 32 | 33 | // Find the connected components 34 | val cc = graph_2.connectedComponents().vertices 35 | 36 | // Join the connected components with the usernames and id 37 | // The result is an RDD not a Graph 38 | val ccByUsername = users.join(cc).map { 39 | case (id, (username, cc)) => (id, username, cc) 40 | } 41 | 42 | // Print the result 43 | val lowerIDPerCommunity = ccByUsername.map { case (id, username, cc) => cc }.distinct() 44 | 45 | // Result will be stored in an array 46 | var result = ArrayBuffer[Graph[String, String]]() 47 | println("--------------------------") 48 | println("Total community found: " + lowerIDPerCommunity.toArray.size) 49 | println("--------------------------") 50 | for (id <- lowerIDPerCommunity.toArray) { 51 | 52 | println("\nCommunity ID : " + id) 53 | 54 | val subGraphVertices = ccByUsername.filter { 55 | _._3 == id 56 | }.map { case (id, username, cc) => (id, username) } 57 | 58 | //subGraphVertices.foreach(println(_)) 59 | 60 | // Create a new graph 61 | // And remove missing vertices as well as the edges to connected to them 62 | var tempGraph = Graph(subGraphVertices, graph_2.edges).subgraph(vpred = (id, username) => username != null) 63 | 64 | result += tempGraph 65 | } 66 | 67 | // Display communities 68 | if (displayResult) { 69 | println("\nCommunities found " + result.size) 70 | for (community <- result) { 71 | println("-----------------------") 72 | //community.edges.collect().foreach(println(_)) 73 | community.vertices.collect().foreach(println(_)) 74 | } 75 | } 76 | 77 | result 78 | } 79 | 80 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 81 | 82 | /** 83 | * Compute the k-core decomposition of the graph for all k <= kmax. This 84 | * uses the iterative pruning algorithm discussed by Alvarez-Hamelin et al. 85 | * in K-Core Decomposition: a Tool For the Visualization of Large Scale Networks 86 | * (see http://arxiv.org/abs/cs/0504107). 
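 * Informally, the k-core is the maximal subgraph in which every remaining vertex has degree >= k;
 * the implementation below obtains it by iteratively pruning vertices whose degree drops below k
 * (the Pregel loop in computeCurrentKCore) until no further vertices can be removed.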
87 | * 88 | * @tparam VD the vertex attribute type (discarded in the computation) 89 | * @tparam ED the edge attribute type (preserved in the computation) 90 | * 91 | * @param graph the graph for which to compute the connected components 92 | * @param kmax the maximum value of k to decompose the graph 93 | * 94 | * @return a graph where the vertex attribute is the minimum of 95 | * kmax or the highest value k for which that vertex was a member of 96 | * the k-core. 97 | * 98 | * @note This method has the advantage of returning not just a single kcore of the 99 | * graph but will yield all the cores for k > kmin. 100 | */ 101 | def getKCoreGraph[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], 102 | users: RDD[(VertexId, (String))], 103 | kmin: Int, 104 | displayResult: Boolean): Graph[String, ED] = { 105 | 106 | // Graph[(Int, Boolean), ED] - boolean indicates whether it is active or not 107 | var g = graph.outerJoinVertices(graph.degrees)((vid, oldData, newData) => newData.getOrElse(0)).cache 108 | val degrees = graph.degrees 109 | 110 | println(color("\nCall KCoreDecomposition", RED)) 111 | 112 | g = computeCurrentKCore(g, kmin).cache 113 | val testK = kmin 114 | val vCount = g.vertices.filter { case (vid, vd) => vd >= kmin }.count() 115 | val eCount = g.triplets.map { t => t.srcAttr >= testK && t.dstAttr >= testK }.count() 116 | 117 | val v = g.vertices.filter { case (vid, vd) => vd >= kmin } 118 | 119 | // Display informations 120 | if (displayResult) { 121 | 122 | val numVertices = degrees.count 123 | 124 | logWarning(s"Number of vertices: $numVertices") 125 | logWarning(s"Degree sample: ${degrees.take(10).mkString(", ")}") 126 | logWarning(s"Degree distribution: " + degrees.map { case (vid, data) => (data, 1) }.reduceByKey((_ + _)).collect().mkString(", ")) 127 | logWarning(s"Degree distribution: " + degrees.map { case (vid, data) => (data, 1) }.reduceByKey((_ + _)).take(10).mkString(", ")) 128 | logWarning(s"K=$kmin, V=$vCount, E=$eCount") 129 | } 130 | 131 | // Create new RDD users 132 | val newUser = users.join(v).map { 133 | case (id, (username, rank)) => (id, username) 134 | } 135 | 136 | // Create a new graph 137 | val gra = Graph(newUser, g.edges) 138 | 139 | // Remove missing vertices as well as the edges to connected to them 140 | gra.subgraph(vpred = (id, username) => username != null) 141 | } 142 | 143 | def computeCurrentKCore[ED: ClassTag](graph: Graph[Int, ED], k: Int) = { 144 | //logWarning(s"Computing kcore for k=$k") 145 | def sendMsg(et: EdgeTriplet[Int, ED]): Iterator[(VertexId, Int)] = { 146 | if (et.srcAttr < 0 || et.dstAttr < 0) { 147 | // if either vertex has already been turned off we do nothing 148 | Iterator.empty 149 | } else if (et.srcAttr < k && et.dstAttr < k) { 150 | // tell both vertices to turn off but don't need change count value 151 | Iterator((et.srcId, -1), (et.dstId, -1)) 152 | 153 | } else if (et.srcAttr < k) { 154 | // if src is being pruned, tell dst to subtract from vertex count 155 | Iterator((et.srcId, -1), (et.dstId, 1)) 156 | 157 | } else if (et.dstAttr < k) { 158 | // if dst is being pruned, tell src to subtract from vertex count 159 | Iterator((et.dstId, -1), (et.srcId, 1)) 160 | 161 | } else { 162 | Iterator.empty 163 | } 164 | } 165 | 166 | // subtracts removed neighbors from neighbor count and tells vertex whether it was turned off or not 167 | def mergeMsg(m1: Int, m2: Int): Int = { 168 | if (m1 < 0 || m2 < 0) { 169 | -1 170 | } else { 171 | m1 + m2 172 | } 173 | } 174 | 175 | def vProg(vid: VertexId, data: Int, update: Int): Int 
= { 176 | if (update < 0) { 177 | // if the vertex has turned off, keep it turned off 178 | -1 179 | } else { 180 | // subtract the number of neighbors that have turned off this round from 181 | // the count of active vertices 182 | // TODO(crankshaw) can we ever have the case data < update? 183 | max(data - update, 0) 184 | } 185 | } 186 | 187 | // Note that initial message should have no effect 188 | Pregel(graph, 0)(vProg, sendMsg, mergeMsg) 189 | } 190 | 191 | /** 192 | * getTriangleCount 193 | * 194 | * Compute the number of triangles passing through each vertex. 195 | * 196 | * @param Graph[String,String] $graph - Graph element 197 | * @param RDD[(VertexId, (String))] $users - Vertices 198 | * @return Unit 199 | * 200 | * @see [[org.apache.spark.graphx.lib.TriangleCount$#run]] 201 | */ 202 | def getTriangleCount(graph: Graph[String, String], users: RDD[(VertexId, (String))]): Unit = { 203 | 204 | println(color("\nCall getTriangleCount", RED)) 205 | 206 | // Sort edges ID srcID < dstID 207 | val edges = graph.edges.map { e => 208 | if (e.srcId < e.dstId) { 209 | Edge(e.srcId, e.dstId, e.attr) 210 | } 211 | else { 212 | Edge(e.dstId, e.srcId, e.attr) 213 | } 214 | } 215 | 216 | // Temporary graph 217 | val newGraph = Graph(users, edges, "").cache() 218 | 219 | // Find the triangle count for each vertex 220 | // TriangleCount requires the graph to be partitioned 221 | val triCounts = newGraph.partitionBy(PartitionStrategy.RandomVertexCut).cache().triangleCount().vertices 222 | 223 | val triCountByUsername = users.join(triCounts).map { 224 | case (id, (username, rank)) => (id, username, rank) 225 | } 226 | 227 | println("Display triangle's sum for each user") 228 | triCountByUsername.foreach(println) 229 | 230 | println("\nTotal: " + triCountByUsername.map { case (id, username, rank) => rank }.distinct().count() + "\n") 231 | } 232 | 233 | /** 234 | * @constructor ConnectedComponents 235 | * 236 | * Compute the connected component membership of each vertex and return a graph with the vertex 237 | * value containing the lowest vertex id in the connected component containing that vertex. 238 | * 239 | * @param Graph[String,String] $graph - Graph element 240 | * @param RDD[(VertexId, (String))] $users - Vertices 241 | * @return Unit 242 | * 243 | * @see [[org.apache.spark.graphx.lib.ConnectedComponents$#run]] 244 | */ 245 | def cc(graph: Graph[String, String], users: RDD[(VertexId, (String))]): Unit = { 246 | println(color("\nCall ConnectedComponents", RED)) 247 | 248 | // Find the connected components 249 | val cc = graph.connectedComponents().vertices 250 | 251 | // Join the connected components with the usernames and id 252 | val ccByUsername = users.join(cc).map { 253 | case (id, (username, cc)) => (id, username, cc) 254 | } 255 | // Print the result 256 | println(ccByUsername.collect().sortBy(_._3).mkString("\n")) 257 | 258 | println("\nTotal groups: " + ccByUsername.map { case (id, username, cc) => cc }.distinct().count() + "\n") 259 | } 260 | 261 | /** 262 | * @constructor StronglyConnectedComponents 263 | * 264 | * Compute the strongly connected component (SCC) of each vertex and return a graph with the 265 | * vertex value containing the lowest vertex id in the SCC containing that vertex. 
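 * Note: although the method takes an $iteration parameter, the current body only uses it in the
 * log message and hard-codes graph.stronglyConnectedComponents(5).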
266 | * 267 | * Display edges's membership and total groups 268 | * 269 | * @param Graph[String,String] $graph - Graph element 270 | * @param Int $iteration - Number of iteration 271 | * @return Unit 272 | */ 273 | def scc(graph: Graph[String, String], iteration: Int): Unit = { 274 | 275 | println(color("\nCall StronglyConnectedComponents : iteration : " + iteration, RED)) 276 | val sccGraph = graph.stronglyConnectedComponents(5) 277 | 278 | val connectedGraph = sccGraph.vertices.map { 279 | case (member, leaderGroup) => s"$member is in the group of $leaderGroup's edge" 280 | } 281 | 282 | val totalGroups = sccGraph.vertices.map { 283 | case (member, leaderGroup) => leaderGroup 284 | } 285 | 286 | connectedGraph.collect.foreach(println) 287 | 288 | println("\nTotal groups: " + totalGroups.distinct().count() + "\n") 289 | } 290 | } -------------------------------------------------------------------------------- /scala/GraphxTesting/src/main/scala/utils/GraphUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | // To make some of the examples work we will also need RDD 4 | 5 | import org.apache.spark.graphx._ 6 | import org.apache.spark.rdd.RDD 7 | 8 | 9 | class GraphUtils { 10 | 11 | val RED = "\033[1;30m" 12 | val ENDC = "\033[0m" 13 | private val defaultSeed = 0xadc83b19L 14 | 15 | /** 16 | * @constructor murmurHash64A 17 | * 18 | * 19 | * @param 20 | * @param 21 | * @return Long 22 | * 23 | */ 24 | def murmurHash64A(data: Seq[Byte], seed: Long = defaultSeed): Long = { 25 | val m = 0xc6a4a7935bd1e995L 26 | val r = 47 27 | 28 | val f: Long => Long = m.* 29 | val g: Long => Long = x => x ^ (x >>> r) 30 | 31 | val h = data.grouped(8).foldLeft(seed ^ f(data.length)) { case (y, xs) => 32 | val k = xs.foldRight(0L)((b, x) => (x << 8) + (b & 0xff)) 33 | val j: Long => Long = if (xs.length == 8) f compose g compose f else identity 34 | f(y ^ j(k)) 35 | } 36 | (g compose f compose g)(h) 37 | } 38 | 39 | /** 40 | * @constructor getPageRank 41 | * 42 | * Run PageRank for a fixed number of iterations returning a graph with vertex attributes 43 | * containing the PageRank and edge attributes the normalized edge weight. 
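 * Note: the body below actually calls graph.pageRank(0.00001), GraphX's tolerance-based variant,
 * which iterates until the ranks change by less than the given tolerance rather than for a fixed
 * number of iterations.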
44 | * 45 | * @param Graph[String,String] $graph - Graph element 46 | * @param RDD[(VertexId, (String))] $users - Vertices 47 | * @return Unit 48 | * 49 | * @see [[org.apache.spark.graphx.lib.PageRank$#run]] 50 | */ 51 | def getPageRank(graph: Graph[String, String], users: RDD[(VertexId, (String))]): Unit = { 52 | 53 | println(color("\nCall getPageRank", RED)) 54 | 55 | val ranks = graph.pageRank(0.00001).vertices 56 | 57 | val ranksByUsername = users.join(ranks).map { 58 | case (id, (username, rank)) => (id, username, rank) 59 | } 60 | 61 | // Print the result descending 62 | println(ranksByUsername.collect().sortBy(_._3).reverse.mkString("\n")) 63 | } 64 | 65 | /** 66 | * @constructor inAndOutDegrees 67 | * 68 | * @param Graph[String,String] $graph - Graph element 69 | * @return Unit 70 | * 71 | */ 72 | def inAndOutDegrees(graph: Graph[String, String]): Unit = { 73 | 74 | println(color("\nCall inAndOutDegrees", RED)) 75 | 76 | // Create User class 77 | case class User(name: String, // Username 78 | inDeg: Int, // Received tweets 79 | outDeg: Int) // Sent tweets 80 | 81 | // Create user Graph 82 | // def mapVertices[VD2](map: (VertexID, VD) => VD2): Graph[VD2, ED] 83 | val initialUserGraph: Graph[User, String] = graph.mapVertices { 84 | case (id, (name)) => User(name, 0, 0) 85 | } 86 | 87 | //initialUserGraph.edges.collect.foreach(println(_)) 88 | 89 | 90 | // Fill in the degree informations (out and in degrees) 91 | val userGraph = initialUserGraph.outerJoinVertices(initialUserGraph.inDegrees) { 92 | case (id, u, inDegOpt) => User(u.name, inDegOpt.getOrElse(0), u.outDeg) 93 | }.outerJoinVertices(initialUserGraph.outDegrees) { 94 | case (id, u, outDegOpt) => User(u.name, u.inDeg, outDegOpt.getOrElse(0)) 95 | } 96 | 97 | // Display the userGraph 98 | userGraph.vertices.foreach { 99 | case (id, u) => println(s"User $id is called ${u.name} and received ${u.inDeg} tweets and send ${u.outDeg}.") 100 | } 101 | } 102 | 103 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 104 | } -------------------------------------------------------------------------------- /scala/GraphxTesting/src/main/scala/utils/MllibUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.mllib.clustering.{LDA, _} 5 | import org.apache.spark.mllib.linalg.{Vector, Vectors} 6 | import org.apache.spark.rdd.RDD 7 | 8 | import scala.collection.mutable 9 | import scala.collection.mutable.ArrayBuffer 10 | 11 | /** 12 | * Topic models automatically infer the topics discussed in a collection of documents. These topics can be used 13 | * to summarize and organize documents, or used for featurization and dimensionality reduction in later stages 14 | * of a Machine Learning (ML) pipeline. 15 | * 16 | * LDA is not given topics, so it must infer them from raw text. LDA defines a topic as a distribution over words. 
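 * In this helper the corpus is the growing dictionnary of tweets: newTweet() appends a tweet to it,
 * createDocuments() rebuilds the vocabulary from that corpus and counts the current tweet's terms
 * against it, and findTopics() prints the top-weighted terms of each topic learned by the LDA model.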
17 | */ 18 | class MllibUtils(_lda: LDA, _sc: SparkContext, _dictionnary: ArrayBuffer[String], _currentTweet: ArrayBuffer[String]) { 19 | 20 | // Text Color 21 | val RED = "\033[1;30m" 22 | val ENDC = "\033[0m" 23 | 24 | // LDA attributs 25 | var lda: LDA = _lda 26 | var dictionnary: ArrayBuffer[String] = _dictionnary 27 | var currentTweet: ArrayBuffer[String] = _currentTweet 28 | var currentTweetRDD: RDD[String] = _sc.parallelize(_dictionnary) 29 | var sc: SparkContext = _sc 30 | 31 | /** 32 | * @constructor newTweet 33 | * 34 | * Set currentTweet attribut and add the new tweet to the dictionnary 35 | * 36 | * @param String $newTweet - tweet content 37 | * 38 | * @return Unit 39 | */ 40 | def newTweet(newTweet: String): Unit = { 41 | 42 | // Delete old currentTweet 43 | currentTweet = new ArrayBuffer[String]() 44 | 45 | // Set new value 46 | currentTweet += newTweet 47 | 48 | // Convert it to RDD 49 | currentTweetRDD = sc.parallelize(currentTweet) 50 | 51 | // Add tweet to dictionnary 52 | addToDictionnary(newTweet) 53 | 54 | currentTweetRDD.collect.foreach(println(_)) 55 | } 56 | 57 | /** 58 | * @constructor addToDictionnary 59 | * 60 | * Add tweet content to the dictionnary. A dictionnary contains every words set to the LDA 61 | * 62 | * @param String $newTweet - tweet content 63 | * 64 | * @return Unit 65 | */ 66 | def addToDictionnary(newTweet: String): Unit = { 67 | dictionnary += newTweet 68 | } 69 | 70 | /** 71 | * @constructor findTopics 72 | * 73 | * Set currentTweet attribut and add the new tweet to the dictionnary 74 | * 75 | * @param LDAModel $ldaModel - LDA Model (LocalModel) 76 | * @param Array[String] $vocabArray - Contains all distinct words set to LDA 77 | * @param Int $numWordsByTopics - 78 | * @param Boolean $displayResult - Display result in console 79 | * 80 | * @return LDAModel 81 | */ 82 | def findTopics(ldaModel: LDAModel, vocabArray: Array[String], numWordsByTopics: Int, displayResult: Boolean): LDAModel = { 83 | 84 | println(color("\nCall findTopics", RED)) 85 | 86 | println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize + " words):") 87 | 88 | // Print topics, showing top-weighted x terms for each topic. 
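    // (a run would print, per topic, lines of the form "<term>\t\t<weight>",
    //  i.e. the term followed by the weight the LDA model assigned to it in that topic)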
89 | if (displayResult) { 90 | val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = numWordsByTopics) 91 | topicIndices.foreach { case (terms, termWeights) => 92 | println("TOPICS:") 93 | terms.zip(termWeights).foreach { case (term, weight) => 94 | println(s"${vocabArray(term.toInt)}\t\t$weight") 95 | } 96 | println() 97 | } 98 | } 99 | ldaModel 100 | } 101 | 102 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 103 | 104 | /** 105 | * @constructor createDocuments 106 | * 107 | * Set currentTweet attribut and add the new tweet to the dictionnary 108 | * 109 | * @param SparkContext $sc - LDA Model (LocalModel) 110 | * @param Int $numStopwords - Contains all distinct words set to LDA 111 | * 112 | * @return RDD[(Long, Vector)] and Array[String] : documentsRDD and array of vocabulary 113 | */ 114 | def createDocuments(sc: SparkContext, numStopwords: Int): (RDD[(Long, Vector)], Array[String]) = { 115 | 116 | println(color("\nCall createDocuments", RED)) 117 | 118 | val corpus: RDD[String] = sc.parallelize(dictionnary) 119 | 120 | // Split every tweets's text into terms (words) and then remove : 121 | // -> (a) non-alphabetic terms 122 | // -> (b) short terms with < 4 characters 123 | // -> (c) to lower 124 | val tokenizedCorpus: RDD[Seq[String]] = 125 | corpus.map(_.toLowerCase.split("\\s")).map(_.filter(_.length > 3).filter(_.forall(java.lang.Character.isLetter))) 126 | 127 | // Split tweet's text into terms (words) and then remove : 128 | // -> (a) non-alphabetic terms 129 | // -> (b) short terms with < 4 characters 130 | // -> (c) to lower 131 | val tokenizedTweet: RDD[Seq[String]] = 132 | currentTweetRDD.map(_.toLowerCase.split("\\s")).map(_.filter(_.length > 3).filter(_.forall(java.lang.Character.isLetter))) 133 | 134 | 135 | // Choose the vocabulary 136 | // termCounts: Sorted list of (term, termCount) pairs 137 | val termCounts: Array[(String, Long)] = tokenizedCorpus.flatMap(_.map(_ -> 1L)).reduceByKey(_ + _).collect().sortBy(-_._2) 138 | 139 | // vocabArray contains all distinct words 140 | val vocabArray: Array[String] = termCounts.takeRight(termCounts.size - numStopwords).map(_._1) 141 | 142 | 143 | // Map[String, Int] of words and theirs places in tweet 144 | val vocab: Map[String, Int] = vocabArray.zipWithIndex.toMap 145 | //vocab.foreach(println(_)) 146 | 147 | // MAP : [ Word ID , VECTOR [vocab.size, WordFrequency]] 148 | val documents: Map[Long, Vector] = 149 | vocab.map { case (tokens, id) => 150 | val counts = new mutable.HashMap[Int, Double]() 151 | 152 | // Word ID 153 | val idx = vocab(tokens) 154 | 155 | // Count word occurancy 156 | counts(idx) = counts.getOrElse(idx, 0.0) + tokenizedTweet.collect.flatten.count(_ == tokens) 157 | 158 | // Return word ID and Vector 159 | (id.toLong, Vectors.sparse(vocab.size, counts.toSeq)) 160 | } 161 | 162 | // Transform it to RDD 163 | val documentsRDD = sc.parallelize(documents.toSeq) 164 | 165 | // Display RDD 166 | documentsRDD.collect.foreach(println(_)) 167 | 168 | // Return 169 | (documentsRDD, vocabArray) 170 | } 171 | } -------------------------------------------------------------------------------- /scala/GraphxTesting/src/main/scala/utils/RDDUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | import scala.collection.mutable.ArrayBuffer 6 | 7 | // To make some of the examples work we will also need RDD 8 | import org.apache.spark.graphx._ 9 | import org.apache.spark.rdd.RDD 10 
| 11 | 12 | class RDDUtils { 13 | 14 | val RED = "\033[1;30m" 15 | val ENDC = "\033[0m" 16 | 17 | /** 18 | * @constructor ArrayToVertices 19 | * 20 | * Convert ArrayBuffer to RDD containing Vertices 21 | * 22 | * @param SparkContext - $sc - SparkContext 23 | * @param ArrayBuffer[(Long, (String))] - $collection - Contains vertices 24 | * 25 | * @return RDD[Edge[String]] - RDD of vertices 26 | */ 27 | def ArrayToVertices(sc: SparkContext, collection: ArrayBuffer[(Long, (String))]): RDD[(VertexId, (String))] = { 28 | sc.parallelize(collection) 29 | } 30 | 31 | /** 32 | * @constructor ArrayToEdges 33 | * 34 | * Convert ArrayBuffer to RDD containing Edges 35 | * 36 | * @param SparkContext - $sc - SparkContext 37 | * @param ArrayBuffer[Edge[String]] - $collection - Contains edges 38 | * 39 | * @return RDD[Edge[String]] - RDD of edges 40 | */ 41 | def ArrayToEdges(sc: SparkContext, collection: ArrayBuffer[Edge[String]]): RDD[Edge[String]] = { 42 | sc.parallelize(collection) 43 | } 44 | 45 | /** 46 | * @constructor findUserByIDInGraph 47 | * 48 | * find user ID with username 49 | * 50 | * @param Graph[String,String] $graph - Graph element 51 | * @param Int $userID - User id 52 | * @return String - if success : username | failure : "user not found" 53 | */ 54 | def findUserNameByIDInGraph(graph: Graph[String, String], userID: Int): String = { 55 | println(color("\nCall : findUserNameWithID", RED)) 56 | 57 | graph.vertices.filter { case (id, name) => id == userID }.collect.foreach { 58 | (e: (org.apache.spark.graphx.VertexId, String)) => return e._2 59 | } 60 | "user not found" 61 | } 62 | 63 | /** 64 | * @constructor findUserIDByNameInGraph 65 | * 66 | * find username with id 67 | * 68 | * @param Graph[String,String] $graph - Graph element 69 | * @param String $userName - Username 70 | * @return String - if success : id found | failure : "0" 71 | */ 72 | def findUserIDByNameInGraph(graph: Graph[String, String], userName: String): String = { 73 | println(color("\nCall : findUserIDWithName", RED)) 74 | 75 | graph.vertices.filter(_._2 == userName).collect.foreach { 76 | (e: (org.apache.spark.graphx.VertexId, String)) => return e._1.toString 77 | } 78 | "0" 79 | } 80 | 81 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 82 | 83 | /** 84 | * @constructor displayAllCommunications 85 | * 86 | * display all communications between users 87 | * 88 | * @param Graph[String,String] $graph - Graph element 89 | * @return Unit 90 | */ 91 | def displayAllCommunications(graph: Graph[String, String]): Unit = { 92 | 93 | println(color("\nCall : displayAllCommunications", RED)) 94 | println("Users communications: ") 95 | 96 | val facts: RDD[String] = graph.triplets.map(triplet => triplet.srcAttr + " communicate with " + 97 | triplet.dstAttr + " with tweet id " + triplet.attr) 98 | 99 | facts.collect.foreach(println(_)) 100 | } 101 | } -------------------------------------------------------------------------------- /scala/RDDFromCassandra/build.sbt: -------------------------------------------------------------------------------- 1 | name := "RDDFromCassandra" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies ++= Seq( 8 | "org.apache.spark" %% "spark-core" % "1.2.1" % "provided", 9 | "org.apache.spark" %% "spark-streaming" % "1.2.1" % "provided", 10 | "org.apache.spark" %% "spark-streaming-twitter" % "1.2.1") 11 | 12 | libraryDependencies += "org.twitter4j" % "twitter4j-stream" % "3.0.6" 13 | 14 | libraryDependencies += "org.twitter4j" % "twitter4j-core" 
% "3.0.6" 15 | 16 | libraryDependencies += "com.datastax.spark" %% "spark-cassandra-connector" % "1.2.1" -------------------------------------------------------------------------------- /scala/RDDFromCassandra/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") -------------------------------------------------------------------------------- /scala/RDDFromCassandra/src/main/scala/RDDFromCassandra.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.streaming.{Seconds, StreamingContext} 2 | import StreamingContext._ 3 | 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.SparkContext._ 6 | 7 | //import org.apache.spark.streaming.twitter 8 | //import org.apache.spark.streaming.twitter._ 9 | //import org.apache.spark.streaming.twitter.TwitterUtils 10 | 11 | import org.apache.spark.SparkConf 12 | 13 | //import org.apache.spark.streaming.dstream.DStream 14 | //import org.apache.spark.streaming.Seconds 15 | //import org.apache.spark.streaming.StreamingContext 16 | //import org.apache.spark.streaming.StreamingContext._ 17 | 18 | import collection.JavaConversions._ 19 | 20 | import org.apache.log4j.Logger 21 | import org.apache.log4j.Level 22 | 23 | // Enable Cassandra-specific functions on the StreamingContext, DStream and RDD: 24 | import com.datastax.spark.connector._ 25 | import com.datastax.spark.connector.streaming._ 26 | 27 | import scala.util.matching.Regex 28 | import org.apache.spark.rdd.RDD 29 | 30 | 31 | // Useful links 32 | // https://github.com/datastax/spark-cassandra-connector/blob/master/doc/0_quick_start.md 33 | // http://planetcassandra.org/getting-started-with-apache-spark-and-cassandra/ 34 | // https://bcomposes.wordpress.com/2013/02/09/using-twitter4j-with-scala-to-access-streaming-tweets/ 35 | // https://github.com/datastax/spark-cassandra-connector/blob/master/doc/5_saving.md 36 | 37 | object RDDFromCassandra { 38 | def main(args: Array[String]) { 39 | 40 | // Display only warning messages 41 | Logger.getLogger("org").setLevel(Level.ERROR) 42 | Logger.getLogger("akka").setLevel(Level.ERROR) 43 | 44 | val filters = args 45 | 46 | // Spark configuration 47 | val sparkConf = new SparkConf(true) 48 | .setMaster("local[4]") 49 | .setAppName("RDDFromCassandra") 50 | .set("spark.cassandra.connection.host", "127.0.0.1") // Add this line to link to Cassandra 51 | 52 | val sc = new SparkContext(sparkConf) 53 | 54 | val rdd = sc.cassandraTable("twitter", "users_communicate") 55 | 56 | rdd.toArray.foreach(println) 57 | 58 | } 59 | } -------------------------------------------------------------------------------- /scala/SaveCommunicationToCassandra/build.sbt: -------------------------------------------------------------------------------- 1 | name := "SaveCommunicationToCassandra" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.5" 6 | 7 | //resolvers += "Job Server Bintray" at "https://dl.bintray.com/spark-jobserver/maven" 8 | 9 | libraryDependencies ++= Seq( 10 | "org.apache.spark" %% "spark-core" % "1.4.0" % "provided", 11 | "org.apache.spark" %% "spark-streaming" % "1.4.0" % "provided", 12 | "org.apache.spark" %% "spark-streaming-twitter" % "1.2.1") 13 | 14 | //libraryDependencies += "org.twitter4j" % "twitter4j-stream" % "3.0.3" 15 | 16 | //libraryDependencies += "org.twitter4j" % "twitter4j-core" % "3.0.3" 17 | 18 | libraryDependencies += "com.datastax.spark" %% 
"spark-cassandra-connector" % "1.4.0-M1" 19 | 20 | //libraryDependencies += "spark.jobserver" %% "job-server-api" % "0.5.1" 21 | 22 | resolvers += "Akka Repository" at "http://repo.akka.io/releases/" -------------------------------------------------------------------------------- /scala/SaveCommunicationToCassandra/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") -------------------------------------------------------------------------------- /scala/SaveCommunicationToCassandra/src/main/scala/SaveCommunicationToCassandra.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.SparkContext 2 | import org.apache.spark.SparkContext._ 3 | 4 | import org.apache.spark._ 5 | import org.apache.spark.streaming._ 6 | 7 | 8 | import org.apache.spark.SparkContext._ 9 | import org.apache.spark.streaming.twitter._ 10 | import org.apache.spark.streaming.twitter 11 | import org.apache.spark.streaming.twitter.TwitterUtils 12 | import org.apache.spark.streaming.twitter.TwitterUtils._ 13 | import org.apache.spark.SparkConf 14 | 15 | import collection.JavaConversions._ 16 | 17 | import org.apache.log4j.Logger 18 | import org.apache.log4j.Level 19 | 20 | import scala.math._ 21 | 22 | // Enable Cassandra-specific functions on the StreamingContext, DStream and RDD: 23 | import com.datastax.spark.connector._ 24 | import com.datastax.spark.connector.streaming._ 25 | 26 | import scala.util.matching.Regex 27 | import org.apache.spark.rdd.RDD 28 | 29 | // Useful links 30 | // https://github.com/datastax/spark-cassandra-connector/blob/master/doc/0_quick_start.md 31 | // http://planetcassandra.org/getting-started-with-apache-spark-and-cassandra/ 32 | // https://bcomposes.wordpress.com/2013/02/09/using-twitter4j-with-scala-to-access-streaming-tweets/ 33 | // https://github.com/datastax/spark-cassandra-connector/blob/master/doc/5_saving.md 34 | 35 | object SaveCommunicationToCassandra{ 36 | 37 | private val defaultSeed = 0xadc83b19L 38 | 39 | /** 40 | * @constructor murmurHash64A 41 | * 42 | * 43 | * @param 44 | * @param 45 | * @return Long 46 | * 47 | */ 48 | def murmurHash64A(data: Seq[Byte], seed: Long = defaultSeed): Long = { 49 | val m = 0xc6a4a7935bd1e995L 50 | val r = 47 51 | 52 | val f: Long => Long = m.* 53 | val g: Long => Long = x => x ^ (x >>> r) 54 | 55 | val h = data.grouped(8).foldLeft(seed ^ f(data.length)) { case (y, xs) => 56 | val k = xs.foldRight(0L)((b, x) => (x << 8) + (b & 0xff)) 57 | val j: Long => Long = if (xs.length == 8) f compose g compose f else identity 58 | f(y ^ j(k)) 59 | } 60 | (g compose f compose g)(h) 61 | } 62 | 63 | def main(args: Array[String]) { 64 | 65 | // Display only warning and infos messages 66 | //Logger.getLogger("org").setLevel(Level.ERROR) 67 | //Logger.getLogger("akka").setLevel(Level.ERROR) 68 | 69 | // Not displaying infos messages 70 | Logger.getLogger("org").setLevel(Level.OFF) 71 | Logger.getLogger("akka").setLevel(Level.OFF) 72 | 73 | // Spark configuration 74 | val sparkConf = new SparkConf() 75 | .setMaster("local[2]") 76 | .setAppName("SaveCommunicationToCassandra") 77 | .set("spark.cassandra.connection.host", "127.0.0.1") // Link to Cassandra 78 | 79 | // Filters by words that contains @ 80 | val words = Array(" @") 81 | 82 | // Pattern used to find users 83 | val pattern = new Regex("\\@\\w{3,}") 84 | val patternURL = new Regex("(http|ftp|https)://[A-Za-z0-9-_]+.[A-Za-z0-9-_:%&?/.=]+") 
85 | val patternSmiley = new Regex("((?::|;|=)(?:-)?(?:\\)|D|P|3|O))") 86 | 87 | // First twitter instance : Used for stream 88 | /*val twitterstream = new TwitterFactory().getInstance() 89 | twitterstream.setOAuthConsumer("MCrQfOAttGZnIIkrqZ4lQA9gr", "5NnYhhGdfyqOE4pIXXdYkploCybQMzFJiQejZssK4a3mNdkCoa") 90 | twitterstream.setOAuthAccessToken(new AccessToken("237197078-6zwzHsuB3VY3psD5873hhU3KQ1lSVQlOXyBhDqpG", "UIMZ1aD06DObpKI741zC8wHZF8jkj1bh02Lqfl5cQ76Pl")) 91 | */ 92 | System.setProperty("twitter4j.http.retryCount", "3") 93 | System.setProperty("twitter4j.http.retryIntervalSecs", "10") 94 | System.setProperty("twitter4j.async.numThreads", "10") 95 | 96 | // Set the system properties so that Twitter4j library used by twitter stream 97 | // can use them to generat OAuth credentials 98 | System.setProperty("twitter4j.oauth.consumerKey", "MCrQfOAttGZnIIkrqZ4lQA9gr") 99 | System.setProperty("twitter4j.oauth.consumerSecret", "5NnYhhGdfyqOE4pIXXdYkploCybQMzFJiQejZssK4a3mNdkCoa") 100 | System.setProperty("twitter4j.oauth.accessToken", "237197078-6zwzHsuB3VY3psD5873hhU3KQ1lSVQlOXyBhDqpG") 101 | System.setProperty("twitter4j.oauth.accessTokenSecret", "UIMZ1aD06DObpKI741zC8wHZF8jkj1bh02Lqfl5cQ76Pl") 102 | 103 | val ssc = new StreamingContext(sparkConf, Seconds(2)) 104 | val stream = TwitterUtils.createStream(ssc, None) 105 | 106 | 107 | // Stream about users 108 | val usersStream = stream.map{status => ( 109 | status.getUser.getId.toString, 110 | abs(murmurHash64A(status.getUser.getScreenName.getBytes)), 111 | status.getUser.getName.toString, 112 | status.getUser.getLang, 113 | status.getUser.getFollowersCount.toString, 114 | status.getUser.getFriendsCount.toString, 115 | status.getUser.getScreenName, 116 | status.getUser.getStatusesCount.toString)} 117 | 118 | 119 | // Stream about communication between two users 120 | val commStream = stream.map{status => ( 121 | status.getId.toString, //tweet_id 122 | status.getUser.getId.toString, // user_send_twitter_ID 123 | abs(murmurHash64A(status.getUser.getScreenName.getBytes)), // user_send_local_ID 124 | if(pattern.findFirstIn(status.getText).isEmpty) 125 | { 126 | "" 127 | } 128 | else 129 | { 130 | pattern.findFirstIn(status.getText).getOrElse("@MichaelCaraccio").tail 131 | }, 132 | status.getText, 133 | status.getUser.getLang 134 | )} 135 | 136 | 137 | 138 | // Stream about tweets 139 | val tweetsStream = stream.map{status => ( 140 | status.getId.toString, 141 | status.getUser.getId.toString, 142 | abs(murmurHash64A(status.getUser.getScreenName.getBytes)), 143 | new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss").format(status.getCreatedAt), 144 | status.getRetweetCount.toString, 145 | status.getText 146 | )} 147 | 148 | 149 | // ************************************************************ 150 | // Save user's informations in Cassandra 151 | // ************************************************************ 152 | usersStream.foreachRDD(rdd => { 153 | rdd.saveToCassandra("twitter", "user_filtered", SomeColumns("user_twitter_id", "user_local_id", "user_name", "user_lang", "user_follow_count", "user_friends_count", "user_screen_name", "user_status_count")) 154 | 155 | println("Users saved : " + rdd.count()) 156 | }) 157 | 158 | // ************************************************************ 159 | // Save communication's informations in Cassandra 160 | // ************************************************************ 161 | commStream.foreachRDD(rdd => { 162 | // Getting current context 163 | val currentContext = rdd.context 164 | 165 | // RDD -> Array() 
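    // Note: collect() pulls the whole micro-batch back to the driver, and the loop below issues one
    // small Cassandra write per mention found; acceptable for this prototype, and the TODO further
    // down already points at batching those writes as the obvious optimisation.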
166 | val tabValues = rdd.collect() 167 | 168 | // For each tweets in RDD 169 | for(item <- tabValues.toArray) { 170 | 171 | // Avoid single @ in message 172 | if(item._4 != "" && (item._6 == "en" || item._6 == "en-gb")){ 173 | 174 | // Find multiple dest 175 | val matches = pattern.findAllIn(item._5).toArray 176 | 177 | // For each receiver in tweet 178 | matches.foreach{destName => { 179 | var user_dest_name = destName.drop(1) 180 | 181 | // TODO : Optimize save to cassandra with concatenate seq and save it when the loop is over 182 | val collection = currentContext.parallelize(Seq((item._1, item._2,item._3, abs(murmurHash64A(user_dest_name.getBytes))))) 183 | 184 | collection.saveToCassandra( 185 | "twitter", 186 | "users_communicate", 187 | SomeColumns( 188 | "tweet_id", 189 | "user_send_twitter_id", 190 | "user_send_local_id", 191 | "user_dest_id")) 192 | }} 193 | } 194 | } 195 | 196 | println("Comm saved : " + rdd.count()) 197 | }) 198 | 199 | 200 | // ************************************************************ 201 | // Save tweet's informations in Cassandra 202 | // ************************************************************ 203 | tweetsStream.foreachRDD(rdd => { 204 | 205 | // Getting current context 206 | val currentContext = rdd.context 207 | 208 | // RDD -> Array() 209 | val tabValues = rdd.collect() 210 | 211 | /*var test = rdd.map{status => (status._1, 212 | status._2, 213 | patternURL.replaceAllIn(status._3, ""), 214 | status._4, 215 | status._5, 216 | status._6, 217 | status._7)}*/ 218 | 219 | // For each tweets in RDD 220 | for(item <- tabValues.toArray) { 221 | 222 | // New tweet value 223 | var newTweet = patternURL.replaceAllIn(item._6, "") 224 | newTweet = patternSmiley.replaceAllIn(newTweet, "") 225 | 226 | val collection = currentContext.parallelize(Seq((item._1, item._2, item._3, item._4, item._5, newTweet))) 227 | 228 | collection.saveToCassandra( 229 | "twitter", 230 | "tweet_filtered", 231 | SomeColumns("tweet_id", 232 | "user_twitter_id", 233 | "user_local_id", 234 | "tweet_create_at", 235 | "tweet_retweet", 236 | "tweet_text" 237 | )) 238 | } 239 | 240 | println("Tweets saved : " + rdd.count()) 241 | }) 242 | 243 | ssc.start() 244 | ssc.awaitTermination() 245 | } 246 | } -------------------------------------------------------------------------------- /scala/ScalaTwitterStreaming/build.sbt: -------------------------------------------------------------------------------- 1 | name := "ScalaTwitterStreaming" 2 | 3 | version := "1.1" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies ++= Seq( 8 | "org.apache.spark" %% "spark-core" % "1.3.0" % "provided", 9 | "org.apache.spark" %% "spark-streaming" % "1.3.0" % "provided", 10 | "org.apache.spark" %% "spark-streaming-twitter" % "1.2.0") 11 | 12 | libraryDependencies += "org.twitter4j" % "twitter4j-stream" % "3.0.3" 13 | libraryDependencies += "org.twitter4j" % "twitter4j-core" % "3.0.3" -------------------------------------------------------------------------------- /scala/ScalaTwitterStreaming/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") 2 | -------------------------------------------------------------------------------- /scala/ScalaTwitterStreaming/src/main/scala/ScalaTwitterStreaming.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.streaming.{Seconds, StreamingContext} 2 | import StreamingContext._ 3 | import 
org.apache.spark.SparkContext._ 4 | import org.apache.spark.streaming.twitter._ 5 | import org.apache.spark.streaming.twitter 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.streaming.StreamingContext._ 8 | 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | import org.apache.spark.streaming.twitter.TwitterUtils 12 | 13 | import twitter4j.TwitterFactory 14 | import twitter4j.auth.AccessToken 15 | 16 | /** 17 | * Calculates popular hashtags (topics) over sliding 10 and 60 second windows from a Twitter 18 | * stream. The stream is instantiated with credentials and optionally filters supplied by the 19 | * command line arguments. 20 | * 21 | * Run this on your local machine with spark-submit; the master is set to local[2] in the code. 22 | * 23 | */ 24 | object ScalaTwitterStreaming { 25 | def main(args: Array[String]) { 26 | 27 | val filters = args // note: not passed to TwitterUtils.createStream below 28 | // Set the system properties so that the Twitter4j library used by the Twitter stream 29 | // can use them to generate OAuth credentials 30 | System.setProperty("twitter4j.oauth.consumerKey", "MCrQfOAttGZnIIkrqZ4lQA9gr") 31 | System.setProperty("twitter4j.oauth.consumerSecret", "5NnYhhGdfyqOE4pIXXdYkploCybQMzFJiQejZssK4a3mNdkCoa") 32 | System.setProperty("twitter4j.oauth.accessToken", "237197078-6zwzHsuB3VY3psD5873hhU3KQ1lSVQlOXyBhDqpG") 33 | System.setProperty("twitter4j.oauth.accessTokenSecret", "UIMZ1aD06DObpKI741zC8wHZF8jkj1bh02Lqfl5cQ76Pl") 34 | 35 | val sparkConf = new SparkConf().setMaster("local[2]").setAppName("ScalaTwitterStreaming") 36 | val ssc = new StreamingContext(sparkConf, Seconds(10)) 37 | val stream = TwitterUtils.createStream(ssc, None) 38 | 39 | val hashTags = stream.flatMap(status => status.getText.split(" ").filter(_.startsWith("#"))) 40 | 41 | val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60)) 42 | .map{case (topic, count) => (count, topic)} 43 | .transform(_.sortByKey(false)) 44 | 45 | val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10)) 46 | .map{case (topic, count) => (count, topic)} 47 | .transform(_.sortByKey(false)) 48 | 49 | 50 | // Print popular hashtags 51 | topCounts60.foreachRDD(rdd => { 52 | val topList = rdd.take(10) 53 | println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count())) 54 | topList.foreach{case (count, tag) => println("%s (%s tweets)".format(tag, count))} 55 | }) 56 | 57 | topCounts10.foreachRDD(rdd => { 58 | val topList = rdd.take(10) 59 | println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count())) 60 | topList.foreach{case (count, tag) => println("%s (%s tweets)".format(tag, count))} 61 | }) 62 | 63 | ssc.start() 64 | ssc.awaitTermination() 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /scala/SimpleAppUsingSBT/build.sbt: -------------------------------------------------------------------------------- 1 | name := "SimpleAppUsingSBT" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.2.1" 8 | 9 | libraryDependencies += "org.apache.spark" % "spark-streaming_2.10" % "1.2.1" 10 | 11 | libraryDependencies += "org.apache.spark" % "spark-streaming-twitter_2.10" % "1.2.1" -------------------------------------------------------------------------------- /scala/SimpleAppUsingSBT/src/main/scala/SimpleAppUsingSBT.scala: -------------------------------------------------------------------------------- 1 | /* SimpleAppUsingSBT.scala */ 2 | import org.apache.spark.SparkContext 3 |
import org.apache.spark.SparkContext._ 4 | import org.apache.spark.SparkConf 5 | 6 | object SimpleAppUsingSBT { 7 | def main(args: Array[String]) { 8 | val logFile = "/home/mcaraccio/spark-1.2.1-bin-hadoop2.4/README.md" // Should be some file on your system 9 | val conf = new SparkConf().setAppName("Simple Application Using SBT") 10 | val sc = new SparkContext(conf) 11 | val logData = sc.textFile(logFile, 2).cache() 12 | val numAs = logData.filter(line => line.contains("a")).count() 13 | val numBs = logData.filter(line => line.contains("b")).count() 14 | println("Lines with a: %s, Lines with b: %s".format(numAs, numBs)) 15 | } 16 | } -------------------------------------------------------------------------------- /visualization/d3.slider.css: -------------------------------------------------------------------------------- 1 | .d3-slider { 2 | position: relative; 3 | font-family: Verdana,Arial,sans-serif; 4 | font-size: 1.1em; 5 | border: 1px solid #aaaaaa; 6 | z-index: 2; 7 | } 8 | 9 | .d3-slider-horizontal { 10 | height: .8em; 11 | } 12 | 13 | .d3-slider-range { 14 | background:#2980b9; 15 | left:0px; 16 | right:0px; 17 | height: 0.8em; 18 | position: absolute; 19 | } 20 | 21 | .d3-slider-range-vertical { 22 | background:#2980b9; 23 | left:0px; 24 | right:0px; 25 | position: absolute; 26 | top:0; 27 | } 28 | 29 | .d3-slider-vertical { 30 | width: .8em; 31 | height: 100px; 32 | } 33 | 34 | .d3-slider-handle { 35 | position: absolute; 36 | width: 1.2em; 37 | height: 1.2em; 38 | border: 1px solid #d3d3d3; 39 | border-radius: 4px; 40 | background: #eee; 41 | background: linear-gradient(to bottom, #eee 0%, #ddd 100%); 42 | z-index: 3; 43 | } 44 | 45 | .d3-slider-handle:hover { 46 | border: 1px solid #999999; 47 | } 48 | 49 | .d3-slider-horizontal .d3-slider-handle { 50 | top: -.3em; 51 | margin-left: -.6em; 52 | } 53 | 54 | .d3-slider-axis { 55 | position: relative; 56 | z-index: 1; 57 | } 58 | 59 | .d3-slider-axis-bottom { 60 | top: .8em; 61 | } 62 | 63 | .d3-slider-axis-right { 64 | left: .8em; 65 | } 66 | 67 | .d3-slider-axis path { 68 | stroke-width: 0; 69 | fill: none; 70 | } 71 | 72 | .d3-slider-axis line { 73 | fill: none; 74 | stroke: #aaa; 75 | shape-rendering: crispEdges; 76 | } 77 | 78 | .d3-slider-axis text { 79 | font-size: 11px; 80 | } 81 | 82 | .d3-slider-vertical .d3-slider-handle { 83 | left: -.25em; 84 | margin-left: 0; 85 | margin-bottom: -.6em; 86 | } -------------------------------------------------------------------------------- /visualization/d3.slider.js: -------------------------------------------------------------------------------- 1 | /* 2 | D3.js Slider 3 | Inspired by jQuery UI Slider 4 | Copyright (c) 2013, Bjorn Sandvik - http://blog.thematicmapping.org 5 | BSD license: http://opensource.org/licenses/BSD-3-Clause 6 | */ 7 | (function (root, factory) { 8 | if (typeof define === 'function' && define.amd) { 9 | // AMD. Register as an anonymous module. 10 | define(['d3'], factory); 11 | } else if (typeof exports === 'object') { 12 | if (process.browser) { 13 | // Browserify. Import css too using cssify. 14 | require('./d3.slider.css'); 15 | } 16 | // Node. Does not work with strict CommonJS, but 17 | // only CommonJS-like environments that support module.exports, 18 | // like Node. 
19 | module.exports = factory(require('d3')); 20 | } else { 21 | // Browser globals (root is window) 22 | root.d3.slider = factory(root.d3); 23 | } 24 | }(this, function (d3) { 25 | return function module() { 26 | "use strict"; 27 | 28 | // Public variables width default settings 29 | var min = 0, 30 | max = 100, 31 | step = 0.01, 32 | animate = true, 33 | orientation = "horizontal", 34 | axis = false, 35 | margin = 50, 36 | value, 37 | active = 1, 38 | snap = false, 39 | scale; 40 | 41 | // Private variables 42 | var axisScale, 43 | dispatch = d3.dispatch("slide", "slideend"), 44 | formatPercent = d3.format(".2%"), 45 | tickFormat = d3.format(".0"), 46 | handle1, 47 | handle2 = null, 48 | divRange, 49 | sliderLength; 50 | 51 | function slider(selection) { 52 | selection.each(function() { 53 | 54 | // Create scale if not defined by user 55 | if (!scale) { 56 | scale = d3.scale.linear().domain([min, max]); 57 | } 58 | 59 | // Start value 60 | value = value || scale.domain()[0]; 61 | 62 | // DIV container 63 | var div = d3.select(this).classed("d3-slider d3-slider-" + orientation, true); 64 | 65 | var drag = d3.behavior.drag(); 66 | drag.on('dragend', function () { 67 | dispatch.slideend(d3.event, value); 68 | }) 69 | 70 | // Slider handle 71 | //if range slider, create two 72 | // var divRange; 73 | 74 | if (toType(value) == "array" && value.length == 2) { 75 | handle1 = div.append("a") 76 | .classed("d3-slider-handle", true) 77 | .attr("xlink:href", "#") 78 | .attr('id', "handle-one") 79 | .on("click", stopPropagation) 80 | .call(drag); 81 | handle2 = div.append("a") 82 | .classed("d3-slider-handle", true) 83 | .attr('id', "handle-two") 84 | .attr("xlink:href", "#") 85 | .on("click", stopPropagation) 86 | .call(drag); 87 | } else { 88 | handle1 = div.append("a") 89 | .classed("d3-slider-handle", true) 90 | .attr("xlink:href", "#") 91 | .attr('id', "handle-one") 92 | .on("click", stopPropagation) 93 | .call(drag); 94 | } 95 | 96 | // Horizontal slider 97 | if (orientation === "horizontal") { 98 | 99 | div.on("click", onClickHorizontal); 100 | 101 | if (toType(value) == "array" && value.length == 2) { 102 | divRange = d3.select(this).append('div').classed("d3-slider-range", true); 103 | 104 | handle1.style("left", formatPercent(scale(value[ 0 ]))); 105 | divRange.style("left", formatPercent(scale(value[ 0 ]))); 106 | drag.on("drag", onDragHorizontal); 107 | 108 | var width = 100 - parseFloat(formatPercent(scale(value[ 1 ]))); 109 | handle2.style("left", formatPercent(scale(value[ 1 ]))); 110 | divRange.style("right", width+"%"); 111 | drag.on("drag", onDragHorizontal); 112 | 113 | } else { 114 | handle1.style("left", formatPercent(scale(value))); 115 | drag.on("drag", onDragHorizontal); 116 | } 117 | 118 | sliderLength = parseInt(div.style("width"), 10); 119 | 120 | } else { // Vertical 121 | 122 | div.on("click", onClickVertical); 123 | drag.on("drag", onDragVertical); 124 | if (toType(value) == "array" && value.length == 2) { 125 | divRange = d3.select(this).append('div').classed("d3-slider-range-vertical", true); 126 | 127 | handle1.style("bottom", formatPercent(scale(value[ 0 ]))); 128 | divRange.style("bottom", formatPercent(scale(value[ 0 ]))); 129 | drag.on("drag", onDragVertical); 130 | 131 | var top = 100 - parseFloat(formatPercent(scale(value[ 1 ]))); 132 | handle2.style("bottom", formatPercent(scale(value[ 1 ]))); 133 | divRange.style("top", top+"%"); 134 | drag.on("drag", onDragVertical); 135 | 136 | } else { 137 | handle1.style("bottom", formatPercent(scale(value))); 138 | 
drag.on("drag", onDragVertical); 139 | } 140 | 141 | sliderLength = parseInt(div.style("height"), 10); 142 | 143 | } 144 | 145 | if (axis) { 146 | createAxis(div); 147 | } 148 | 149 | 150 | function createAxis(dom) { 151 | 152 | // Create axis if not defined by user 153 | if (typeof axis === "boolean") { 154 | 155 | axis = d3.svg.axis() 156 | .ticks(Math.round(sliderLength / 100)) 157 | .tickFormat(tickFormat) 158 | .orient((orientation === "horizontal") ? "bottom" : "right"); 159 | 160 | } 161 | 162 | // Copy slider scale to move from percentages to pixels 163 | axisScale = scale.ticks ? scale.copy().range([0, sliderLength]) : scale.copy().rangePoints([0, sliderLength], 0.5); 164 | axis.scale(axisScale); 165 | 166 | // Create SVG axis container 167 | var svg = dom.append("svg") 168 | .classed("d3-slider-axis d3-slider-axis-" + axis.orient(), true) 169 | .on("click", stopPropagation); 170 | 171 | var g = svg.append("g"); 172 | 173 | // Horizontal axis 174 | if (orientation === "horizontal") { 175 | 176 | svg.style("margin-left", -margin + "px"); 177 | 178 | svg.attr({ 179 | width: sliderLength + margin * 2, 180 | height: margin 181 | }); 182 | 183 | if (axis.orient() === "top") { 184 | svg.style("top", -margin + "px"); 185 | g.attr("transform", "translate(" + margin + "," + margin + ")"); 186 | } else { // bottom 187 | g.attr("transform", "translate(" + margin + ",0)"); 188 | } 189 | 190 | } else { // Vertical 191 | 192 | svg.style("top", -margin + "px"); 193 | 194 | svg.attr({ 195 | width: margin, 196 | height: sliderLength + margin * 2 197 | }); 198 | 199 | if (axis.orient() === "left") { 200 | svg.style("left", -margin + "px"); 201 | g.attr("transform", "translate(" + margin + "," + margin + ")"); 202 | } else { // right 203 | g.attr("transform", "translate(" + 0 + "," + margin + ")"); 204 | } 205 | 206 | } 207 | 208 | g.call(axis); 209 | 210 | } 211 | 212 | function onClickHorizontal() { 213 | if (toType(value) != "array") { 214 | var pos = Math.max(0, Math.min(sliderLength, d3.event.offsetX || d3.event.layerX)); 215 | moveHandle(scale.invert ? 216 | stepValue(scale.invert(pos / sliderLength)) 217 | : nearestTick(pos / sliderLength)); 218 | } 219 | } 220 | 221 | function onClickVertical() { 222 | if (toType(value) != "array") { 223 | var pos = sliderLength - Math.max(0, Math.min(sliderLength, d3.event.offsetY || d3.event.layerY)); 224 | moveHandle(scale.invert ? 225 | stepValue(scale.invert(pos / sliderLength)) 226 | : nearestTick(pos / sliderLength)); 227 | } 228 | } 229 | 230 | function onDragHorizontal() { 231 | if ( d3.event.sourceEvent.target.id === "handle-one") { 232 | active = 1; 233 | } else if ( d3.event.sourceEvent.target.id == "handle-two" ) { 234 | active = 2; 235 | } 236 | var pos = Math.max(0, Math.min(sliderLength, d3.event.x)); 237 | moveHandle(scale.invert ? 238 | stepValue(scale.invert(pos / sliderLength)) 239 | : nearestTick(pos / sliderLength)); 240 | } 241 | 242 | function onDragVertical() { 243 | if ( d3.event.sourceEvent.target.id === "handle-one") { 244 | active = 1; 245 | } else if ( d3.event.sourceEvent.target.id == "handle-two" ) { 246 | active = 2; 247 | } 248 | var pos = sliderLength - Math.max(0, Math.min(sliderLength, d3.event.y)) 249 | moveHandle(scale.invert ? 
250 | stepValue(scale.invert(pos / sliderLength)) 251 | : nearestTick(pos / sliderLength)); 252 | } 253 | 254 | function stopPropagation() { 255 | d3.event.stopPropagation(); 256 | } 257 | 258 | }); 259 | 260 | } 261 | 262 | // Move slider handle on click/drag 263 | function moveHandle(newValue) { 264 | var currentValue = toType(value) == "array" && value.length == 2 ? value[active - 1]: value, 265 | oldPos = formatPercent(scale(stepValue(currentValue))), 266 | newPos = formatPercent(scale(stepValue(newValue))), 267 | position = (orientation === "horizontal") ? "left" : "bottom"; 268 | if (oldPos !== newPos) { 269 | 270 | if (toType(value) == "array" && value.length == 2) { 271 | value[ active - 1 ] = newValue; 272 | if (d3.event) { 273 | dispatch.slide(d3.event, value ); 274 | }; 275 | } else { 276 | if (d3.event) { 277 | dispatch.slide(d3.event.sourceEvent || d3.event, value = newValue); 278 | }; 279 | } 280 | 281 | if ( value[ 0 ] >= value[ 1 ] ) return; 282 | if ( active === 1 ) { 283 | if (toType(value) == "array" && value.length == 2) { 284 | (position === "left") ? divRange.style("left", newPos) : divRange.style("bottom", newPos); 285 | } 286 | 287 | if (animate) { 288 | handle1.transition() 289 | .styleTween(position, function() { return d3.interpolate(oldPos, newPos); }) 290 | .duration((typeof animate === "number") ? animate : 250); 291 | } else { 292 | handle1.style(position, newPos); 293 | } 294 | } else { 295 | 296 | var width = 100 - parseFloat(newPos); 297 | var top = 100 - parseFloat(newPos); 298 | 299 | (position === "left") ? divRange.style("right", width + "%") : divRange.style("top", top + "%"); 300 | 301 | if (animate) { 302 | handle2.transition() 303 | .styleTween(position, function() { return d3.interpolate(oldPos, newPos); }) 304 | .duration((typeof animate === "number") ? animate : 250); 305 | } else { 306 | handle2.style(position, newPos); 307 | } 308 | } 309 | } 310 | } 311 | 312 | // Calculate nearest step value 313 | function stepValue(val) { 314 | 315 | if (val === scale.domain()[0] || val === scale.domain()[1]) { 316 | return val; 317 | } 318 | 319 | var alignValue = val; 320 | if (snap) { 321 | alignValue = nearestTick(scale(val)); 322 | } else{ 323 | var valModStep = (val - scale.domain()[0]) % step; 324 | alignValue = val - valModStep; 325 | 326 | if (Math.abs(valModStep) * 2 >= step) { 327 | alignValue += (valModStep > 0) ? step : -step; 328 | } 329 | }; 330 | 331 | return alignValue; 332 | 333 | } 334 | 335 | // Find the nearest tick 336 | function nearestTick(pos) { 337 | var ticks = scale.ticks ? scale.ticks() : scale.domain(); 338 | var dist = ticks.map(function(d) {return pos - scale(d);}); 339 | var i = -1, 340 | index = 0, 341 | r = scale.ticks ? 
scale.range()[1] : scale.rangeExtent()[1]; 342 | do { 343 | i++; 344 | if (Math.abs(dist[i]) < r) { 345 | r = Math.abs(dist[i]); 346 | index = i; 347 | }; 348 | } while (dist[i] > 0 && i < dist.length - 1); 349 | 350 | return ticks[index]; 351 | }; 352 | 353 | // Return the type of an object 354 | function toType(v) { 355 | return ({}).toString.call(v).match(/\s([a-zA-Z]+)/)[1].toLowerCase(); 356 | }; 357 | 358 | // Getter/setter functions 359 | slider.min = function(_) { 360 | if (!arguments.length) return min; 361 | min = _; 362 | return slider; 363 | }; 364 | 365 | slider.max = function(_) { 366 | if (!arguments.length) return max; 367 | max = _; 368 | return slider; 369 | }; 370 | 371 | slider.step = function(_) { 372 | if (!arguments.length) return step; 373 | step = _; 374 | return slider; 375 | }; 376 | 377 | slider.animate = function(_) { 378 | if (!arguments.length) return animate; 379 | animate = _; 380 | return slider; 381 | }; 382 | 383 | slider.orientation = function(_) { 384 | if (!arguments.length) return orientation; 385 | orientation = _; 386 | return slider; 387 | }; 388 | 389 | slider.axis = function(_) { 390 | if (!arguments.length) return axis; 391 | axis = _; 392 | return slider; 393 | }; 394 | 395 | slider.margin = function(_) { 396 | if (!arguments.length) return margin; 397 | margin = _; 398 | return slider; 399 | }; 400 | 401 | slider.value = function(_) { 402 | if (!arguments.length) return value; 403 | if (value) { 404 | moveHandle(stepValue(_)); 405 | }; 406 | value = _; 407 | return slider; 408 | }; 409 | 410 | slider.snap = function(_) { 411 | if (!arguments.length) return snap; 412 | snap = _; 413 | return slider; 414 | }; 415 | 416 | slider.scale = function(_) { 417 | if (!arguments.length) return scale; 418 | scale = _; 419 | return slider; 420 | }; 421 | 422 | d3.rebind(slider, dispatch, "on"); 423 | 424 | return slider; 425 | 426 | } 427 | })); 428 | -------------------------------------------------------------------------------- /visualization/data.php: -------------------------------------------------------------------------------- 1 | <?php $cluster = Cassandra::cluster() ->withContactPoints('157.26.83.16') 11 | ->withPort(9042) 12 | ->build(); 13 | 14 | $keyspace = 'twitter'; 15 | $session = $cluster->connect($keyspace); // create session, optionally scoped to a keyspace 16 | 17 | // Default values 18 | $tValue = 1; 19 | $minVertices = 0; 20 | 21 | //*************************************/ 22 | // Change T (the analysed time period) 23 | //*************************************/ 24 | if (isset($_GET["value"])){ 25 | $tValue = $_GET["value"]; 26 | } 27 | 28 | //****************************************/ 29 | // Change minimum vertices in communities 30 | //****************************************/ 31 | if (isset($_GET["minVertices"])){ 32 | $minVertices = $_GET["minVertices"]; 33 | } 34 | 35 | $whereStatementForVertice = null; 36 | if($minVertices > 0){ 37 | $whereStatementForVertice = "and nbv >= $minVertices"; 38 | } 39 | 40 | //*************************************/ 41 | // Init 42 | //*************************************/ 43 | $data = new stdClass(); 44 | 45 | $data->nodes = array(); 46 | $data->links = array(); 47 | $data->lda = array(); 48 | $data->cosine = array(); 49 | 50 | $indices = array(); 51 | $groups = array(); 52 | $sg = array(); 53 | 54 | function myfunction($num) 55 | { 56 | return($num); 57 | } 58 | 59 | //*************************************/ 60 | // Get Source and Com ID -> Array 61 | //*************************************/ 62 | $statement = new Cassandra\SimpleStatement("SELECT src_id, com_id, sg
FROM twitter.communities where t = $tValue $whereStatementForVertice"); 63 | $result = $session->execute($statement); 64 | 65 | // Put all the indices into the arrays 66 | foreach ($result as $row) { 67 | $indices[] = (int) $row['src_id']; 68 | $groups[] = (int) $row['com_id']; 69 | $sg[] = (int) $row['sg']; 70 | } 71 | 72 | 73 | //*************************************/ 74 | // Get Destination and Com ID -> Array 75 | //*************************************/ 76 | $statement = new Cassandra\SimpleStatement("SELECT dst_id, com_id, sg FROM twitter.communities where t = $tValue $whereStatementForVertice"); 77 | $result = $session->execute($statement); 78 | 79 | foreach ($result as $row) { 80 | $indices[] = (int) $row['dst_id']; 81 | $groups[] = (int) $row['com_id']; 82 | $sg[] = (int) $row['sg']; 83 | } 84 | 85 | // Combine node and group 86 | $c = array_combine($indices, $groups); 87 | $csg = array_combine($indices, $sg); 88 | 89 | // Get unique list of nodes 90 | $map = array_map("myfunction", $indices); // identity map: effectively a copy of $indices 91 | $indices_unique = array_unique($map); 92 | $nodes_index = array(); 93 | 94 | 95 | //*************************************/ 96 | // Create nodes 97 | //*************************************/ 98 | foreach($indices_unique as $node){ 99 | $nodes_index[] = $node; 100 | $data->nodes[] = array("name" => (int) $node, "group" => (int) $c[$node], "sg" => (int) $csg[$node]); 101 | } 102 | 103 | 104 | //*************************************/ 105 | // Create links 106 | //*************************************/ 107 | $statement = new Cassandra\SimpleStatement("SELECT * FROM twitter.communities where t = $tValue $whereStatementForVertice"); 108 | $result = $session->execute($statement); 109 | 110 | foreach ($result as $row) { 111 | $data->links[] = array("source" => (int) array_search($row['src_id'],$nodes_index), "s" => (int) $row['src_id'], "d" => (int) $row['dst_id'], "target" => (int) array_search($row['dst_id'],$nodes_index), "value" => ((int) 1)); 112 | } 113 | 114 | //*************************************/ 115 | // Get LDA 116 | //*************************************/ 117 | $statement = new Cassandra\SimpleStatement("SELECT * FROM twitter.lda where t = $tValue"); 118 | $result = $session->execute($statement); 119 | 120 | foreach ($result as $row) { 121 | $data->lda[] = array("t" => (int) $row['t'], "sg" => (int) $row['sg'], "n_topic" => (int) $row['n_topic'], "words" => (string) $row['words']); 122 | } 123 | 124 | //*************************************/ 125 | // Get Cosine similarity 126 | //*************************************/ 127 | $statementComm = new Cassandra\SimpleStatement("SELECT * FROM twitter.communities where t = $tValue $whereStatementForVertice"); 128 | $resultComm = $session->execute($statementComm); 129 | 130 | foreach ($resultComm as $row) { 131 | $data->cosine[] = array("t" => (int) $row['t'], "sg" => (int) $row['sg'], "cosines" => (string) $row['lda']); 132 | } 133 | 134 | echo json_encode($data); 135 | ?> -------------------------------------------------------------------------------- /visualization/graph.html: --------------------------------------------------------------------------------
1-29 | [page head; the markup did not survive text extraction — lines 4-23 were presumably a style or script block] 30-43 | [page body; only the visible text survived: a "Period" slider (initial value 1) and a "Minimum vertices per communities" slider (initial value 0, with the hint "0 = no restriction")] 44-269 | [inline script; its content did not survive text extraction — presumably the slider wiring, the request to data.php and the D3 rendering of the community graph] 270 | --------------------------------------------------------------------------------