├── .gitignore ├── README.md ├── python └── network_wordcount.py ├── scala ├── ConnectToCassandra │ ├── build.sbt │ ├── project │ │ └── assembly.sbt │ └── src │ │ └── main │ │ └── scala │ │ └── ConnectToCassandra.scala ├── FinalProject │ ├── build.sbt │ ├── project │ │ └── assembly.sbt │ └── src │ │ └── main │ │ └── scala │ │ ├── FinalProject.scala │ │ └── utils │ │ ├── CassandraUtils.scala │ │ ├── CommunityUtils.scala │ │ ├── GraphUtils.scala │ │ ├── MllibUtils.scala │ │ └── RDDUtils.scala ├── FindCommunities │ ├── build.sbt │ ├── launch.sh │ ├── project │ │ └── assembly.sbt │ └── src │ │ └── main │ │ └── scala │ │ ├── FindCommunities.scala │ │ └── utils │ │ ├── CassandraUtils.scala │ │ ├── CommunityUtils.scala │ │ ├── GraphUtils.scala │ │ ├── MllibUtils.scala │ │ └── RDDUtils.scala ├── GraphxTesting │ ├── build.sbt │ ├── project │ │ └── assembly.sbt │ └── src │ │ └── main │ │ └── scala │ │ ├── GraphxTesting.scala │ │ └── utils │ │ ├── CassandraUtils.scala │ │ ├── CommunityUtils.scala │ │ ├── GraphUtils.scala │ │ ├── MllibUtils.scala │ │ └── RDDUtils.scala ├── RDDFromCassandra │ ├── build.sbt │ ├── project │ │ └── assembly.sbt │ └── src │ │ └── main │ │ └── scala │ │ └── RDDFromCassandra.scala ├── SaveCommunicationToCassandra │ ├── build.sbt │ ├── project │ │ └── assembly.sbt │ └── src │ │ └── main │ │ └── scala │ │ └── SaveCommunicationToCassandra.scala ├── ScalaTwitterStreaming │ ├── build.sbt │ ├── project │ │ └── assembly.sbt │ └── src │ │ └── main │ │ └── scala │ │ └── ScalaTwitterStreaming.scala └── SimpleAppUsingSBT │ ├── build.sbt │ └── src │ └── main │ └── scala │ └── SimpleAppUsingSBT.scala └── visualization ├── d3.slider.css ├── d3.slider.js ├── data.php └── graph.html /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | TwitterConfig.scala 5 | /TwitterConfig.scala 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Pycharm 11 | .idea/* 12 | .idea/ 13 | .metadata 14 | .metadata/* 15 | 16 | # Distribution / packaging 17 | .Python 18 | env/ 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | 58 | # Sphinx documentation 59 | docs/_build/ 60 | 61 | # PyBuilder 62 | target/ 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Community detection and LDA 2 | 3 | For further informations -> wiki 4 | -------------------------------------------------------------------------------- /python/network_wordcount.py: -------------------------------------------------------------------------------- 1 | __author__ = 'michaelcaraccio' 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. 
See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | """ 21 | Counts words in UTF8 encoded, '\n' delimited text received from the network every second. 22 | Usage: network_wordcount.py 23 | and describe the TCP server that Spark Streaming would connect to receive data. 24 | To run this on your local machine, you need to first run a Netcat server 25 | `$ nc -lk 9999` 26 | and then run the example 27 | `$ bin/spark-submit examples/src/main/python/streaming/network_wordcount.py localhost 9999` 28 | """ 29 | 30 | import sys 31 | 32 | from pyspark import SparkContext 33 | from pyspark.streaming import StreamingContext 34 | 35 | if __name__ == "__main__": 36 | if len(sys.argv) != 3: 37 | print >> sys.stderr, "Usage: network_wordcount.py " 38 | exit(-1) 39 | sc = SparkContext(appName="PythonStreamingNetworkWordCount") 40 | ssc = StreamingContext(sc, 1) 41 | 42 | lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2])) 43 | counts = lines.flatMap(lambda line: line.split(" "))\ 44 | .map(lambda word: (word, 1))\ 45 | .reduceByKey(lambda a, b: a+b) 46 | counts.pprint() 47 | 48 | ssc.start() 49 | ssc.awaitTermination() -------------------------------------------------------------------------------- /scala/ConnectToCassandra/build.sbt: -------------------------------------------------------------------------------- 1 | name := "ConnectToCassandra" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies ++= Seq( 8 | "org.apache.spark" %% "spark-core" % "1.2.0" % "provided", 9 | "org.apache.spark" %% "spark-streaming" % "1.2.0" % "provided", 10 | "org.apache.spark" %% "spark-streaming-twitter" % "1.2.1") 11 | 12 | libraryDependencies += "org.twitter4j" % "twitter4j-stream" % "3.0.6" 13 | 14 | libraryDependencies += "org.twitter4j" % "twitter4j-core" % "3.0.6" 15 | 16 | libraryDependencies += "com.datastax.spark" %% "spark-cassandra-connector" % "1.2.0-rc3" -------------------------------------------------------------------------------- /scala/ConnectToCassandra/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") -------------------------------------------------------------------------------- /scala/ConnectToCassandra/src/main/scala/ConnectToCassandra.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.streaming.{Seconds, StreamingContext} 2 | import StreamingContext._ 3 | 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.SparkContext._ 6 | 7 | import org.apache.spark.streaming.twitter 8 | import org.apache.spark.streaming.twitter._ 9 | import org.apache.spark.streaming.twitter.TwitterUtils 10 | 11 | import org.apache.spark.SparkConf 12 | 13 | import org.apache.spark.streaming.dstream.DStream 14 | 
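// A minimal Scala counterpart to python/network_wordcount.py above, shown as a sketch only:
// the object name, host and port are illustrative, and it reuses the Spark Streaming classes
// already imported in this file (SparkConf, StreamingContext, Seconds).
object ScalaNetworkWordCount {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("ScalaNetworkWordCount").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(1))

    // Read '\n'-delimited text from a TCP socket, then count words in each 1-second batch
    val lines = ssc.socketTextStream("localhost", 9999)
    val counts = lines.flatMap(_.split(" "))
                      .map(word => (word, 1))
                      .reduceByKey(_ + _)
    counts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}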
import org.apache.spark.streaming.Seconds 15 | import org.apache.spark.streaming.StreamingContext 16 | import org.apache.spark.streaming.StreamingContext._ 17 | 18 | import twitter4j.TwitterFactory 19 | import twitter4j.auth.AccessToken 20 | import twitter4j._ 21 | import collection.JavaConversions._ 22 | 23 | import org.apache.log4j.Logger 24 | import org.apache.log4j.Level 25 | 26 | import com.datastax.spark.connector._ 27 | import com.datastax.spark.connector.streaming._ 28 | 29 | import scala.util.matching.Regex 30 | 31 | 32 | // Useful links 33 | // https://github.com/datastax/spark-cassandra-connector/blob/master/doc/0_quick_start.md 34 | // http://planetcassandra.org/getting-started-with-apache-spark-and-cassandra/ 35 | // https://bcomposes.wordpress.com/2013/02/09/using-twitter4j-with-scala-to-access-streaming-tweets/ 36 | // https://github.com/datastax/spark-cassandra-connector/blob/master/doc/5_saving.md 37 | 38 | object ConnectToCassandra { 39 | def main(args: Array[String]) { 40 | 41 | // Display only warning messages 42 | Logger.getLogger("org").setLevel(Level.ERROR) 43 | Logger.getLogger("akka").setLevel(Level.ERROR) 44 | 45 | val filters = args 46 | 47 | // Spark configuration 48 | val sparkConf = new SparkConf(true) 49 | .setMaster("local[4]") 50 | .setAppName("ConnectToCassandra") 51 | .set("spark.cassandra.connection.host", "127.0.0.1") // Add this line to link to Cassandra 52 | 53 | // Filters by words that contains @ 54 | val words = Array("@") 55 | 56 | // Pattern used to find users 57 | val pattern = new Regex("\\@\\w+") 58 | 59 | // First twitter instance : Used for stream 60 | val twitterstream = new TwitterFactory().getInstance() 61 | twitterstream.setOAuthConsumer("MCrQfOAttGZnIIkrqZ4lQA9gr", "5NnYhhGdfyqOE4pIXXdYkploCybQMzFJiQejZssK4a3mNdkCoa") 62 | twitterstream.setOAuthAccessToken(new AccessToken("237197078-6zwzHsuB3VY3psD5873hhU3KQ1lSVQlOXyBhDqpG", "UIMZ1aD06DObpKI741zC8wHZF8jkj1bh02Lqfl5cQ76Pl")) 63 | System.setProperty("twitter4j.http.retryCount", "3"); 64 | System.setProperty("twitter4j.http.retryIntervalSecs", "10") 65 | System.setProperty("twitter4j.async.numThreads", "1"); 66 | 67 | val ssc = new StreamingContext(sparkConf, Seconds(1)) 68 | val stream = TwitterUtils.createStream(ssc, Option(twitterstream.getAuthorization()), words) 69 | 70 | // Second twitter instance : Used to query user's informations 71 | val twitter = new TwitterFactory().getInstance() 72 | twitter.setOAuthConsumer("Vb0BxXrK933CDEeQ3Myj69kkC", "q55rXOM8pQnnAyPrYhHh6LHK4IFHw0U01tfe6VDoleaxmvOL3B") 73 | twitter.setOAuthAccessToken(new AccessToken("237197078-iXi3ANEAUXNmoDbcbH3lvS93vDO6PvEQj3255ToL", "Skv8J9xcfhbKV2Lwddke2g7llTDwwh6S9QyAlNR6fanqY")) 74 | 75 | // Stream about users 76 | val usersStream = stream.map{status => (status.getUser.getId.toString, 77 | status.getUser.getName.toString, 78 | status.getUser.getLang, 79 | status.getUser.getFollowersCount.toString, 80 | status.getUser.getFriendsCount.toString, 81 | status.getUser.getScreenName, 82 | status.getUser.getStatusesCount.toString)} 83 | 84 | // Stream about tweets 85 | val tweetsStream = stream.map{status => (status.getId.toString, 86 | status.getUser.getId.toString, 87 | status.getUser.getName.toString, 88 | status.getText, 89 | 90 | if(pattern.findFirstIn(status.getText).isEmpty){ 91 | "" 92 | } 93 | else 94 | { 95 | twitterstream.showUser(pattern.findFirstIn(status.getText).getOrElse("@MichaelCaraccio").tail).getName 96 | }, 97 | 98 | if(pattern.findFirstIn(status.getText).isEmpty){ 99 | "" 100 | } 101 | else{ 102 | 
twitterstream.showUser(pattern.findFirstIn(status.getText).getOrElse("@MichaelCaraccio").tail).getId 103 | }, 104 | 105 | status.getRetweetCount.toString, 106 | new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss").format(status.getCreatedAt), 107 | 108 | Option(status.getGeoLocation) match { 109 | case Some(theValue) => 110 | status.getGeoLocation.getLongitude.toString 111 | case None => 112 | "" 113 | }, 114 | 115 | Option(status.getGeoLocation) match { 116 | case Some(theValue) => 117 | status.getGeoLocation.getLatitude.toString 118 | case None => 119 | "" 120 | } 121 | )} 122 | 123 | // Save user's informations in Cassandra 124 | usersStream.foreachRDD(rdd => { 125 | //rdd.saveToCassandra("twitter", "user_filtered", SomeColumns("user_id", "user_name", "user_lang", "user_follower_count", "user_friends_count", "user_screen_name", "user_status_count")) 126 | println("user added") 127 | }) 128 | 129 | // Save tweet's informations in Cassandra 130 | tweetsStream.foreachRDD(rdd => { 131 | //rdd.saveToCassandra("twitter", "tweet_filtered", SomeColumns("tweet_id", "user_id", "tweet_text", "tweet_retweet", "tweet_create_at", "user_longitude", "user_latitude")) 132 | 133 | 134 | /* val twitter = new TwitterFactory().getInstance 135 | val userName = twitter.getScreenName 136 | 137 | val statuses = twitter.getMentionsTimeline.take(2) 138 | 139 | statuses.foreach { status => { 140 | val statusAuthor = status.getUser.getScreenName 141 | val mentionedEntities = status.getUserMentionEntities.map(_.getScreenName).toList 142 | val participants = (statusAuthor :: mentionedEntities).toSet - userName 143 | val text = participants.map(p=>"@"+p).mkString(" ") + " OK." 144 | val reply = new StatusUpdate(text).inReplyToStatusId(status.getId) 145 | println("Replying: " + text) 146 | //twitter.updateStatus(reply) 147 | println("DAT BITCH" + mentionedEntities) 148 | println("DAT BITCH2" + reply) 149 | }}*/ 150 | 151 | 152 | 153 | rdd.foreach {r => { 154 | val sender_name = r._3 155 | val sender_id = r._2 156 | val tweet_text = r._4 157 | val dest_name = r._5 158 | val dest_id = r._6 159 | 160 | println("----------------------------------------------") 161 | println("Sender ID : " + sender_id) 162 | println("Sender Name : " + sender_name) 163 | println("Tweet : " + tweet_text) 164 | println("Dest name :" + dest_name) 165 | println("Dest ID : " + dest_id) 166 | println("----------------------------------------------") 167 | 168 | }} 169 | println("tweet added") 170 | }) 171 | 172 | ssc.start() 173 | ssc.awaitTermination() 174 | } 175 | } -------------------------------------------------------------------------------- /scala/FinalProject/build.sbt: -------------------------------------------------------------------------------- 1 | name := "FinalProject" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.5" 6 | 7 | libraryDependencies ++= Seq( 8 | "org.apache.spark" %% "spark-core" % "1.3.0" % "provided", 9 | "org.apache.spark" %% "spark-graphx" % "1.3.0" % "provided", 10 | //"org.apache.spark" %% "spark-streaming" % "1.3.0" % "provided", 11 | "org.apache.spark" %% "spark-mllib" % "1.3.0" % "provided"//, 12 | // "org.apache.commons" % "commons-lang3" % "3.3.2", 13 | /*"org.apache.spark" %% "spark-streaming-twitter" % "1.3.0"*/) 14 | 15 | //libraryDependencies += "org.apache.spark" % "spark-streaming_2.10" % "1.3.0" 16 | libraryDependencies += "org.apache.spark" % "spark-streaming-twitter_2.10" % "1.3.0" 17 | 18 | libraryDependencies += "com.datastax.spark" %% "spark-cassandra-connector" % "1.3.0-M1" 19 | 20 | 
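// The spark-cassandra-connector declared just above is what ConnectToCassandra.scala (and the
// FinalProject sources below) use to persist batches. A minimal sketch of that write path,
// assuming a SparkContext `sc` whose conf sets "spark.cassandra.connection.host" and a
// pre-existing twitter.user_filtered table; only a subset of the columns named in the
// commented-out saveToCassandra calls above is shown:
/*
import com.datastax.spark.connector._

val users = sc.parallelize(Seq(
  ("12345", "Michael", "en")            // user_id, user_name, user_lang (illustrative row)
))

// Write the tuples to Cassandra, mapping tuple positions onto the named columns
users.saveToCassandra("twitter", "user_filtered",
  SomeColumns("user_id", "user_name", "user_lang"))
*/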
//libraryDependencies += "com.google.code.gson" % "gson" % "2.3" 21 | 22 | //libraryDependencies += "com.github.fommil.netlib" % "all" % "1.1.2" pomOnly() 23 | 24 | // http://stackoverflow.com/questions/28459333/how-to-build-an-uber-jar-fat-jar-using-sbt-within-intellij-idea 25 | // META-INF discarding 26 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => 27 | { 28 | case PathList("META-INF", xs @ _*) => MergeStrategy.discard 29 | case x => MergeStrategy.first 30 | } 31 | } 32 | 33 | resolvers ++= Seq( 34 | // "JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/", 35 | // "Spray Repository" at "http://repo.spray.cc/", 36 | // "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", 37 | // "Akka Repository" at "http://repo.akka.io/releases/", 38 | // "Twitter4J Repository" at "http://twitter4j.org/maven2/", 39 | // "Apache HBase" at "https://repository.apache.org/content/repositories/releases", 40 | // "Twitter Maven Repo" at "http://maven.twttr.com/", 41 | // "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools", 42 | // "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/", 43 | // "Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/" 44 | // "Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven", 45 | // Resolver.sonatypeRepo("public") 46 | ) -------------------------------------------------------------------------------- /scala/FinalProject/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") -------------------------------------------------------------------------------- /scala/FinalProject/src/main/scala/utils/CassandraUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import scala.collection.mutable.ArrayBuffer 4 | 5 | // Enable Cassandra-specific functions on the StreamingContext, DStream and RDD: 6 | 7 | import com.datastax.spark.connector._ 8 | 9 | // To make some of the examples work we will also need RDD 10 | 11 | import org.apache.spark.SparkContext 12 | import org.apache.spark.graphx._ 13 | import org.apache.spark.rdd.RDD 14 | import org.apache.spark.sql.cassandra.CassandraSQLContext 15 | 16 | //@SerialVersionUID(100L) 17 | class CassandraUtils /*extends Serializable*/ { 18 | 19 | val RED = "\033[1;30m" 20 | val ENDC = "\033[0m" 21 | 22 | /** 23 | * @constructor getTweetContentFromID 24 | * 25 | * Return tweet content 26 | * 27 | * @param SparkContext sc - SparkContext 28 | * @param String $id - tweet id 29 | * @return Unit 30 | */ 31 | def getTweetContentFromID(sc: SparkContext, id: String): String = { 32 | 33 | println(color("\nCall getTweetContentFromID", RED)) 34 | 35 | val query = sc.cassandraTable("twitter", "tweet_filtered").select("tweet_text").where("tweet_id = ?", id) 36 | 37 | if (query.collect().length != 0) { 38 | query.first().getString("tweet_text") 39 | } 40 | else 41 | "Tweet not found" 42 | } 43 | 44 | /** 45 | * @constructor getTweetsIDFromUser 46 | * 47 | * Return tweet id 48 | * 49 | * @param SparkContext sc - SparkContext 50 | * @param String $id - user (sender) id 51 | * @return Unit 52 | */ 53 | def getTweetsIDFromUser(sc: SparkContext, id: String): ArrayBuffer[String] = { 54 | 55 | println(color("\nCall getTweetsIDFromUser", RED)) 56 | println("Tweets found:") 57 | 58 | val query = sc.cassandraTable("twitter", 
"users_communicate").select("tweet_id").where("user_send_local_id = ?", id) 59 | 60 | // Result will be stored in an array 61 | var result = ArrayBuffer[String]() 62 | 63 | if (query.collect().length != 0) { 64 | result += query.first().getString("tweet_id") 65 | } 66 | 67 | // Display result 68 | result.foreach(println(_)) 69 | 70 | // Return 71 | result 72 | } 73 | 74 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 75 | 76 | /** 77 | * @constructor getTweetsContentFromEdge 78 | * 79 | * Return an array of tweets content for a given Graph 80 | * 81 | * @param SparkContext sc - SparkContext 82 | * @param RDD[Edge[String]] $edge - graph's edge 83 | * @return Unit 84 | */ 85 | def getTweetsContentFromEdge(sc: SparkContext, edge: RDD[Edge[String]], displayResult: Boolean): RDD[String] = { 86 | 87 | println(color("\nCall getTweetsContentFromEdge", RED)) 88 | 89 | // Get the tweets ID for every communication 90 | val tweetsID = edge.flatMap({ 91 | case Edge(idSend, idExp, idTweet) => Seq(idTweet) 92 | }) 93 | 94 | // Result will be stored in an array 95 | var result = ArrayBuffer[String]() 96 | 97 | // Queries 98 | for (tweet <- tweetsID.collect()) { 99 | val query = sc.cassandraTable("twitter", "tweet_filtered").select("tweet_text").where("tweet_id = ?", tweet) 100 | 101 | if (query.collect().length != 0) { 102 | result += query.first().getString("tweet_text") 103 | } 104 | } 105 | 106 | // Display results 107 | if (displayResult) { 108 | result.foreach(println(_)) 109 | } 110 | 111 | 112 | // return 113 | sc.parallelize(result) 114 | } 115 | 116 | /*def getAllTweetsText(sc: SparkContext): ArrayBuffer[String] = { 117 | val rdd = sc.cassandraTable("twitter", "tweet_filtered2").select("tweet_text").cache() 118 | 119 | var dictionnary = new ArrayBuffer[String] 120 | 121 | println("Tweets by tweets -> Create documents and vocabulary") 122 | rdd.select("tweet_text").as((i: String) => i).foreach(x => { 123 | 124 | val tweet = x 125 | .toLowerCase.split("\\s") 126 | .filter(_.length > 3) 127 | .filter(_.forall(java.lang.Character.isLetter)).mkString(" ") 128 | 129 | if (tweet.length > 1) 130 | dictionnary += tweet 131 | }) 132 | }*/ 133 | 134 | // (RDD[(VertexId, (String))], RDD[Edge[String]]) 135 | def getAllCommunicationsToGraph(sc: SparkContext): Graph[String, String] = { 136 | println(color("\nCall getAllCommunications", RED)) 137 | 138 | 139 | /* val users: RDD[(VertexId, (String))] = 140 | sc.parallelize(List( 141 | (2732329846L, "Michael"), 142 | (132988448L, "David"), 143 | (473822999L, "Sarah"), 144 | (2932436311L, "Jean"), 145 | (2249679902L, "Raphael"), 146 | (601389784L, "Lucie"), 147 | (2941487254L, "Harold"), 148 | (1192483885L, "Pierre"), 149 | (465776805L, "Christophe"), 150 | (838147628L, "Zoe"), 151 | (2564641105L, "Fabien"), 152 | (1518391292L, "Nicolas") 153 | ))*/ 154 | 155 | 156 | // Collection of vertices (contains users) 157 | // val collectionVertices = ListBuffer[(Long, String)]() 158 | 159 | 160 | // val users: RDD[(VertexId, (String))] = sc.parallelize(collectionVertices) 161 | 162 | 163 | //val con = sc.cassandraTable("twitter", "user_filtered") 164 | //con.toArray.foreach(println) 165 | /*println("Test -1") 166 | 167 | var t0 = System.nanoTime() 168 | for (row <- query) { 169 | 170 | } 171 | 172 | var t1 = System.nanoTime() 173 | println("Elapsed time: " + (t1 - t0) + "ns")*/ 174 | 175 | // val query = sc.cassandraTable("twitter", "user_filtered").select("user_local_id", "user_screen_name") 176 | 177 | 178 | /*val con = query.map{ 179 | 
case result => (result._1, result._2) 180 | }*/ 181 | val cc = new CassandraSQLContext(sc) 182 | 183 | println("Test 0") 184 | var t0 = System.nanoTime() 185 | val rdd0 = cc.sql("SELECT user_local_id, user_screen_name from twitter.user_filtered") 186 | 187 | val pelo = rdd0.map(p => (p(0).toString.toLong, p(1).toString)).cache() 188 | 189 | val rdd1 = cc.sql("SELECT tweet_id, user_send_local_id, user_dest_id from twitter.users_communicate") 190 | 191 | val pelo2 = rdd1.map(p => Edge(p(1).toString.toLong, p(2).toString.toLong, p(0).toString)).cache() 192 | 193 | Graph(pelo, pelo2) 194 | 195 | /*println("okkk") 196 | 197 | graphh.vertices.foreach(println(_)) 198 | 199 | 200 | //pelo.foreach(println(_)) 201 | 202 | println("After collecting") 203 | 204 | rdd0.show() 205 | 206 | for (row <- rdd0) { 207 | //println(row(0)) 208 | 209 | collectionVertices += ((row(0).toString.toLong, row(1).toString)) 210 | //collectionVertices.append((row(0).toString.toLong, row(1).toString)) 211 | } 212 | var t1 = System.nanoTime() 213 | println("Elapsed time: " + (t1 - t0) + "ns") 214 | 215 | 216 | println("Test 1") 217 | t0 = System.nanoTime() 218 | 219 | val rdd = cc.sql("SELECT user_local_id, user_screen_name from twitter.user_filtered LIMIT 100").persist() 220 | for (row <- rdd) { 221 | collectionVertices += ((row(0).toString.toLong, row(1).toString)) 222 | } 223 | rdd.unpersist() 224 | t1 = System.nanoTime() 225 | println("Elapsed time: " + (t1 - t0) + "ns") 226 | 227 | 228 | 229 | println("Test 2") 230 | t0 = System.nanoTime() 231 | val rdd2 = cc.sql("SELECT user_local_id, user_screen_name from twitter.user_filtered limit 10000").cache() 232 | for (row <- rdd2) { 233 | collectionVertices += ((row(0).toString.toLong, row(1).toString)) 234 | } 235 | t1 = System.nanoTime() 236 | println("Elapsed time: " + (t1 - t0) + "ns") 237 | 238 | println("Test 3") 239 | t0 = System.nanoTime() 240 | 241 | for (row <- cc.sql("SELECT user_local_id, user_screen_name from twitter.user_filtered limit 10000")) { 242 | collectionVertices += ((row(0).toString.toLong, row(1).toString)) 243 | } 244 | t1 = System.nanoTime() 245 | println("Elapsed time: " + (t1 - t0) + "ns") 246 | 247 | 248 | 249 | 250 | 251 | 252 | println("f") 253 | // println(rdd.take(1)) 254 | println("f2") 255 | * 256 | /* 257 | println("Query 1 ok") 258 | */ 259 | // Save result to ArrayBuffer 260 | //if (query.collect().length != 0) { 261 | //collectionVertices += ((query.first().getString("user_local_id").toLong, query.first().getString("user_local_id").toString)) 262 | println(query.first().getString("user_local_id")) 263 | // } 264 | 265 | //collectionVertices.foreach(println(_)) 266 | 267 | println("Query 1 Collect ok") 268 | 269 | 270 | 271 | // Collection of edges (contains communications between users) 272 | val collectionEdge = ArrayBuffer[Edge[String]]() 273 | 274 | 275 | //query = sc.cassandraTable("twitter", "users_communicate").select("user_send_local_id", "user_dest_id", "tweet_id").toArray() 276 | 277 | println("Query 2 ok") 278 | // Save result to ArrayBuffer 279 | /*if (query.collect().length != 0) { 280 | collectionEdge += Edge(query.first().getString("user_send_local_id").toLong, query.first().getString("user_dest_id").toLong, query.first().getString("tweet_id").toString) 281 | }*/ 282 | 283 | //collectionEdge.foreach(println(_)) 284 | 285 | println("Query 2 Collect ok") 286 | 287 | // Convert vertices to RDD 288 | val VerticesRDD = sc.parallelize(collectionVertices) 289 | 290 | // Convert it to RDD 291 | val EdgeRDD = 
sc.parallelize(collectionEdge) 292 | 293 | println("Total vertices: " + collectionVertices.length) 294 | println("Total edges: " + collectionEdge.length) 295 | 296 | (VerticesRDD, EdgeRDD)*/ 297 | } 298 | } -------------------------------------------------------------------------------- /scala/FinalProject/src/main/scala/utils/CommunityUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.graphx._ 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.math._ 8 | import scala.reflect.ClassTag 9 | 10 | class CommunityUtils extends Logging { 11 | 12 | val RED = "\033[1;30m" 13 | val ENDC = "\033[0m" 14 | 15 | /** 16 | * splitCommunity 17 | * 18 | * Find and split communities in graph 19 | * 20 | * @param Graph[String,String] $graph - Graph element 21 | * @param RDD[(VertexId, (String))] $users - Vertices 22 | * @param Boolean $displayResult - if true, display println 23 | * @return ArrayBuffer[Graph[String,String]] - Contains one graph per community 24 | * 25 | */ 26 | def splitCommunity(graph: Graph[String, String], users: RDD[(VertexId, (String))], NBKCORE: Int, displayResult: Boolean): Graph[String, String] = { 27 | 28 | println(color("\nCall SplitCommunity", RED)) 29 | 30 | getKCoreGraph(graph, users, NBKCORE, displayResult).cache() 31 | } 32 | 33 | /** 34 | * Compute the k-core decomposition of the graph for all k <= kmax. This 35 | * uses the iterative pruning algorithm discussed by Alvarez-Hamelin et al. 36 | * in K-Core Decomposition: a Tool For the Visualization of Large Scale Networks 37 | * (see http://arxiv.org/abs/cs/0504107). 38 | * 39 | * @tparam VD the vertex attribute type (discarded in the computation) 40 | * @tparam ED the edge attribute type (preserved in the computation) 41 | * 42 | * @param graph the graph for which to compute the connected components 43 | * @param kmax the maximum value of k to decompose the graph 44 | * 45 | * @return a graph where the vertex attribute is the minimum of 46 | * kmax or the highest value k for which that vertex was a member of 47 | * the k-core. 48 | * 49 | * @note This method has the advantage of returning not just a single kcore of the 50 | * graph but will yield all the cores for k > kmin. 
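 *
 * A small usage sketch (the toy vertices, edge labels and the `sc` SparkContext are
 * assumptions made purely for illustration):
 * {{{
 *   val users = sc.parallelize(Seq((1L, "alice"), (2L, "bob"), (3L, "carol")))
 *   val edges = sc.parallelize(Seq(Edge(1L, 2L, "t1"), Edge(2L, 3L, "t2"), Edge(3L, 1L, "t3")))
 *   val graph = Graph(users, edges)
 *
 *   // keep only the vertices that survive the 2-core
 *   val core2 = new CommunityUtils().getKCoreGraph(graph, users, 2, displayResult = false)
 * }}}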
51 | */ 52 | def getKCoreGraph[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], 53 | users: RDD[(VertexId, (String))], 54 | kmin: Int, 55 | displayResult: Boolean): Graph[String, ED] = { 56 | 57 | // Graph[(Int, Boolean), ED] - boolean indicates whether it is active or not 58 | var g = graph.cache().outerJoinVertices(graph.degrees)((vid, oldData, newData) => newData.getOrElse(0)).cache() 59 | 60 | println(color("\nCall KCoreDecomposition", RED)) 61 | 62 | g = computeCurrentKCore(g, kmin).cache() 63 | 64 | val v = g.vertices.filter { case (vid, vd) => vd >= kmin }.cache() 65 | 66 | // Display informations 67 | if (displayResult) { 68 | val degrees = graph.degrees 69 | val numVertices = degrees.count() 70 | val testK = kmin 71 | val vCount = g.vertices.filter { case (vid, vd) => vd >= kmin }.count() 72 | val eCount = g.triplets.map { t => t.srcAttr >= testK && t.dstAttr >= testK }.count() 73 | 74 | logWarning(s"Number of vertices: $numVertices") 75 | logWarning(s"Degree sample: ${degrees.take(10).mkString(", ")}") 76 | logWarning(s"Degree distribution: " + degrees.map { case (vid, data) => (data, 1) }.reduceByKey(_ + _).collect().mkString(", ")) 77 | logWarning(s"Degree distribution: " + degrees.map { case (vid, data) => (data, 1) }.reduceByKey(_ + _).take(10).mkString(", ")) 78 | logWarning(s"K=$kmin, V=$vCount, E=$eCount") 79 | } 80 | 81 | // Create new RDD users 82 | val newUser = users.join(v).map { 83 | case (id, (username, rank)) => (id, username) 84 | } 85 | 86 | // Create a new graph 87 | val gra = Graph(newUser, g.edges) 88 | 89 | // Remove missing vertices as well as the edges to connected to them 90 | gra.subgraph(vpred = (id, username) => username != null).cache() 91 | } 92 | 93 | def computeCurrentKCore[ED: ClassTag](graph: Graph[Int, ED], k: Int) = { 94 | println("Computing kcore for k=" + k) 95 | def sendMsg(et: EdgeTriplet[Int, ED]): Iterator[(VertexId, Int)] = { 96 | if (et.srcAttr < 0 || et.dstAttr < 0) { 97 | // if either vertex has already been turned off we do nothing 98 | Iterator.empty 99 | } else if (et.srcAttr < k && et.dstAttr < k) { 100 | // tell both vertices to turn off but don't need change count value 101 | Iterator((et.srcId, -1), (et.dstId, -1)) 102 | 103 | } else if (et.srcAttr < k) { 104 | // if src is being pruned, tell dst to subtract from vertex count 105 | Iterator((et.srcId, -1), (et.dstId, 1)) 106 | 107 | } else if (et.dstAttr < k) { 108 | // if dst is being pruned, tell src to subtract from vertex count 109 | Iterator((et.dstId, -1), (et.srcId, 1)) 110 | 111 | } else { 112 | Iterator.empty 113 | } 114 | } 115 | 116 | // subtracts removed neighbors from neighbor count and tells vertex whether it was turned off or not 117 | def mergeMsg(m1: Int, m2: Int): Int = { 118 | if (m1 < 0 || m2 < 0) { 119 | -1 120 | } else { 121 | m1 + m2 122 | } 123 | } 124 | 125 | def vProg(vid: VertexId, data: Int, update: Int): Int = { 126 | if (update < 0) { 127 | // if the vertex has turned off, keep it turned off 128 | -1 129 | } else { 130 | // subtract the number of neighbors that have turned off this round from 131 | // the count of active vertices 132 | // TODO(crankshaw) can we ever have the case data < update? 
133 | max(data - update, 0) 134 | } 135 | } 136 | 137 | // Note that initial message should have no effect 138 | Pregel(graph, 0)(vProg, sendMsg, mergeMsg) 139 | } 140 | 141 | 142 | /** 143 | * @constructor time 144 | * 145 | * timer for profiling block 146 | * 147 | * @param R $block - Block executed 148 | * @return Unit 149 | */ 150 | def time[R](block: => R): R = { 151 | val t0 = System.nanoTime() 152 | val result = block // call-by-name 153 | val t1 = System.nanoTime() 154 | println("Elapsed time: " + (t1 - t0) / 1000000000.0 + " seconds") 155 | result 156 | } 157 | 158 | def subgraphCommunities(graph: Graph[String, String], users: RDD[(VertexId, (String))], displayResult: Boolean): (Array[Graph[String, String]], Array[Long]) = { 159 | 160 | println(color("\nCall subgraphCommunities", RED)) 161 | 162 | // Find the connected components 163 | val cc = time { 164 | graph.connectedComponents().vertices.cache() 165 | } 166 | 167 | // Join the connected components with the usernames and id 168 | // The result is an RDD not a Graph 169 | val ccByUsername = users.join(cc).map { 170 | case (id, (username, cci)) => (id, username, cci) 171 | }.cache() 172 | 173 | // Print the result 174 | val lowerIDPerCommunity = ccByUsername.map { case (id, username, cci) => cci }.distinct().cache() 175 | 176 | // Result will be stored in an array 177 | //var result = new ArrayBuffer[Graph[String, String]]() 178 | println("--------------------------") 179 | println("Total community found: " + lowerIDPerCommunity.count()) 180 | println("--------------------------") 181 | 182 | 183 | val collectIDsCommunity = lowerIDPerCommunity.collect() 184 | 185 | val result = collectIDsCommunity.map(colID => Graph(ccByUsername.filter { 186 | _._3 == colID 187 | }.map { case (id, username, cc) => (id, username) }, graph.edges).subgraph(vpred = (id, username) => username != null).cache()) 188 | 189 | // Display communities 190 | if (displayResult) { 191 | println("\nCommunities found " + result.length) 192 | for (community <- result) { 193 | println("-----------------------") 194 | community.edges.collect().foreach(println(_)) 195 | community.vertices.collect().foreach(println(_)) 196 | } 197 | } 198 | 199 | cc.unpersist() 200 | lowerIDPerCommunity.unpersist() 201 | 202 | (result, collectIDsCommunity) 203 | } 204 | 205 | /** 206 | * getTriangleCount 207 | * 208 | * Compute the number of triangles passing through each vertex. 
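 *
 * Sketch of the underlying GraphX call (variable names are illustrative; the graph must be
 * partitioned before counting, as done in the method body below):
 * {{{
 *   val triangles = graph.partitionBy(PartitionStrategy.RandomVertexCut).triangleCount().vertices
 *   triangles.collect().foreach { case (id, n) => println(id + " is in " + n + " triangle(s)") }
 * }}}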
209 | * 210 | * @param Graph[String,String] $graph - Graph element 211 | * @param RDD[(VertexId, (String))] $users - Vertices 212 | * @return Unit 213 | * 214 | * @see [[org.apache.spark.graphx.lib.TriangleCount$#run]] 215 | */ 216 | def getTriangleCount(graph: Graph[String, String], users: RDD[(VertexId, (String))]): Unit = { 217 | 218 | println(color("\nCall getTriangleCount", RED)) 219 | 220 | // Sort edges ID srcID < dstID 221 | val edges = graph.edges.map { e => 222 | if (e.srcId < e.dstId) { 223 | Edge(e.srcId, e.dstId, e.attr) 224 | } 225 | else { 226 | Edge(e.dstId, e.srcId, e.attr) 227 | } 228 | } 229 | 230 | // Temporary graph 231 | val newGraph = Graph(users, edges, "").cache() 232 | 233 | // Find the triangle count for each vertex 234 | // TriangleCount requires the graph to be partitioned 235 | val triCounts = newGraph.partitionBy(PartitionStrategy.RandomVertexCut).cache().triangleCount().vertices 236 | 237 | val triCountByUsername = users.join(triCounts).map { 238 | case (id, (username, rank)) => (id, username, rank) 239 | } 240 | 241 | println("Display triangle's sum for each user") 242 | triCountByUsername.foreach(println) 243 | 244 | println("\nTotal: " + triCountByUsername.map { case (id, username, rank) => rank }.distinct().count() + "\n") 245 | } 246 | 247 | /** 248 | * @constructor ConnectedComponents 249 | * 250 | * Compute the connected component membership of each vertex and return a graph with the vertex 251 | * value containing the lowest vertex id in the connected component containing that vertex. 252 | * 253 | * @param Graph[String,String] $graph - Graph element 254 | * @param RDD[(VertexId, (String))] $users - Vertices 255 | * @return Unit 256 | * 257 | * @see [[org.apache.spark.graphx.lib.ConnectedComponents$#run]] 258 | */ 259 | def cc(graph: Graph[String, String], users: RDD[(VertexId, (String))]): Unit = { 260 | println(color("\nCall ConnectedComponents", RED)) 261 | 262 | // Find the connected components 263 | val cc = graph.connectedComponents().vertices 264 | 265 | // Join the connected components with the usernames and id 266 | val ccByUsername = users.join(cc).map { 267 | case (id, (username, cc)) => (id, username, cc) 268 | } 269 | // Print the result 270 | println(ccByUsername.collect().sortBy(_._3).mkString("\n")) 271 | 272 | println("\nTotal groups: " + ccByUsername.map { case (id, username, cc) => cc }.distinct().count() + "\n") 273 | } 274 | 275 | /** 276 | * @constructor StronglyConnectedComponents 277 | * 278 | * Compute the strongly connected component (SCC) of each vertex and return a graph with the 279 | * vertex value containing the lowest vertex id in the SCC containing that vertex. 
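 *
 * Equivalent direct call, as a sketch (variable names are illustrative):
 * {{{
 *   val scc = graph.stronglyConnectedComponents(5).vertices
 *   scc.collect().foreach { case (member, leader) => println(member + " is in the group of " + leader) }
 * }}}
 * (Note: the method body below currently passes a fixed 5 to stronglyConnectedComponents
 * rather than its `iteration` argument.)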
280 | * 281 | * Display edges's membership and total groups 282 | * 283 | * @param Graph[String,String] $graph - Graph element 284 | * @param Int $iteration - Number of iteration 285 | * @return Unit 286 | */ 287 | def scc(graph: Graph[String, String], iteration: Int): Unit = { 288 | 289 | println(color("\nCall StronglyConnectedComponents : iteration : " + iteration, RED)) 290 | val sccGraph = graph.stronglyConnectedComponents(5) 291 | 292 | val connectedGraph = sccGraph.vertices.map { 293 | case (member, leaderGroup) => s"$member is in the group of $leaderGroup's edge" 294 | } 295 | 296 | val totalGroups = sccGraph.vertices.map { 297 | case (member, leaderGroup) => leaderGroup 298 | } 299 | 300 | connectedGraph.collect().foreach(println) 301 | 302 | println("\nTotal groups: " + totalGroups.distinct().count() + "\n") 303 | } 304 | 305 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 306 | } -------------------------------------------------------------------------------- /scala/FinalProject/src/main/scala/utils/GraphUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | // To make some of the examples work we will also need RDD 4 | 5 | import org.apache.spark.graphx._ 6 | import org.apache.spark.rdd.RDD 7 | 8 | 9 | class GraphUtils extends serializable { 10 | 11 | val RED = "\033[1;30m" 12 | val ENDC = "\033[0m" 13 | private val defaultSeed = 0xadc83b19L 14 | 15 | /** 16 | * @constructor murmurHash64A 17 | * 18 | * @param 19 | * @param 20 | * @return Long 21 | * 22 | */ 23 | def murmurHash64A(data: Seq[Byte], seed: Long = defaultSeed): Long = { 24 | val m = 0xc6a4a7935bd1e995L 25 | val r = 47 26 | 27 | val f: Long => Long = m.* 28 | val g: Long => Long = x => x ^ (x >>> r) 29 | 30 | val h = data.grouped(8).foldLeft(seed ^ f(data.length)) { case (y, xs) => 31 | val k = xs.foldRight(0L)((b, x) => (x << 8) + (b & 0xff)) 32 | val j: Long => Long = if (xs.length == 8) f compose g compose f else identity 33 | f(y ^ j(k)) 34 | } 35 | (g compose f compose g)(h) 36 | } 37 | 38 | /** 39 | * @constructor getPageRank 40 | * 41 | * Run PageRank for a fixed number of iterations returning a graph with vertex attributes 42 | * containing the PageRank and edge attributes the normalized edge weight. 
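 *
 * Usage sketch (the tolerance value and variable names are illustrative):
 * {{{
 *   val ranks = graph.pageRank(0.0001).vertices
 *   val ranked = users.join(ranks).map { case (id, (name, rank)) => (name, rank) }
 *   ranked.collect().sortBy(-_._2).foreach(println)
 * }}}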
43 | * 44 | * @param Graph[String,String] $graph - Graph element 45 | * @param RDD[(VertexId, (String))] $users - Vertices 46 | * @return Unit 47 | * 48 | * @see [[org.apache.spark.graphx.lib.PageRank$#run]] 49 | */ 50 | def getPageRank(graph: Graph[String, String], users: RDD[(VertexId, (String))]): Unit = { 51 | 52 | println(color("\nCall getPageRank", RED)) 53 | 54 | val ranks = graph.pageRank(0.00001).vertices 55 | 56 | val ranksByUsername = users.join(ranks).map { 57 | case (id, (username, rank)) => (id, username, rank) 58 | } 59 | 60 | // Print the result descending 61 | println(ranksByUsername.collect().sortBy(_._3).reverse.mkString("\n")) 62 | } 63 | 64 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 65 | 66 | /** 67 | * @constructor inAndOutDegrees 68 | * 69 | * @param Graph[String,String] $graph - Graph element 70 | * @return Unit 71 | * 72 | */ 73 | def inAndOutDegrees(graph: Graph[String, String]): Unit = { 74 | 75 | println(color("\nCall inAndOutDegrees", RED)) 76 | 77 | // Create User class 78 | case class User(name: String, // Username 79 | inDeg: Int, // Received tweets 80 | outDeg: Int) // Sent tweets 81 | 82 | // Create user Graph 83 | // def mapVertices[VD2](map: (VertexID, VD) => VD2): Graph[VD2, ED] 84 | val initialUserGraph: Graph[User, String] = graph.mapVertices { 85 | case (id, (name)) => User(name, 0, 0) 86 | } 87 | 88 | //initialUserGraph.edges.collect.foreach(println(_)) 89 | 90 | 91 | // Fill in the degree informations (out and in degrees) 92 | val userGraph = initialUserGraph.outerJoinVertices(initialUserGraph.inDegrees) { 93 | case (id, u, inDegOpt) => User(u.name, inDegOpt.getOrElse(0), u.outDeg) 94 | }.outerJoinVertices(initialUserGraph.outDegrees) { 95 | case (id, u, outDegOpt) => User(u.name, u.inDeg, outDegOpt.getOrElse(0)) 96 | } 97 | 98 | // Display the userGraph 99 | userGraph.vertices.foreach { 100 | case (id, u) => println(s"User $id is called ${u.name} and received ${u.inDeg} tweets and send ${u.outDeg}.") 101 | } 102 | } 103 | } -------------------------------------------------------------------------------- /scala/FinalProject/src/main/scala/utils/MllibUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import org.apache.spark.mllib.clustering._ 4 | import org.apache.spark.mllib.linalg.{Vector, Vectors} 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.collection.mutable 8 | 9 | /** 10 | * Topic models automatically infer the topics discussed in a collection of documents. These topics can be used 11 | * to summarize and organize documents, or used for featurization and dimensionality reduction in later stages 12 | * of a Machine Learning (ML) pipeline. 13 | * 14 | * LDA is not given topics, so it must infer them from raw text. LDA defines a topic as a distribution over words. 15 | */ 16 | class MllibUtils { 17 | 18 | // Terminal Color 19 | val RED = "\033[1;30m" 20 | val ENDC = "\033[0m" 21 | 22 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 23 | 24 | def createdoc(tokenizedCorpus: RDD[String]): ((Seq[(Long, Vector)], Array[String], Map[String, Int], Array[String])) = { 25 | 26 | println(color("\nCall createdoc", RED)) 27 | 28 | // Choose the vocabulary. 
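    // Note: termCounts below is sorted by descending frequency, so takeRight(length - numStopwords)
    // keeps everything except the `numStopwords` most frequent terms; in other words, the most
    // common words are treated as stopwords and dropped from the vocabulary.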
29 | // termCounts: Sorted list of (term, termCount) pairs 30 | val termCounts: Array[(String, Long)] = 31 | tokenizedCorpus.map(_ -> 1L).reduceByKey(_ + _).collect().sortBy(-_._2) 32 | 33 | // vocabArray: Chosen vocab (removing common terms) 34 | val numStopwords = 20 35 | val vocabArray: Array[String] = 36 | termCounts.takeRight(termCounts.length - numStopwords).map(_._1) 37 | 38 | // vocab: Map term -> term index 39 | val vocab: Map[String, Int] = vocabArray.zipWithIndex.toMap 40 | 41 | val tokenCollected = tokenizedCorpus.collect() 42 | 43 | 44 | // MAP : [ Word ID , VECTOR [vocab.size, WordFrequency]] 45 | val documents: Map[Long, Vector] = vocab.map { case (tokens, id) => 46 | 47 | val counts = new mutable.HashMap[Int, Double]() 48 | 49 | // Word ID 50 | val idx = vocab(tokens) 51 | 52 | // Count word occurancy 53 | counts(idx) = counts.getOrElse(idx, 0.0) + tokenCollected.count(_ == tokens) 54 | 55 | // Return word ID and Vector 56 | (id.toLong, Vectors.sparse(vocab.size, counts.toSeq)) 57 | } 58 | 59 | (documents.toSeq, tokenizedCorpus.collect(), vocab, tokenizedCorpus.collect()) 60 | } 61 | 62 | 63 | def cosineSimilarity(tokenizedCorpus: RDD[String], vocab: Map[String, Int], tokenizedTweet: Array[String]): (Seq[(Long, Vector)]) = { 64 | 65 | println(color("\nCall cosineSimilarity", RED)) 66 | 67 | val document: Map[Long, Vector] = vocab.map { case (tokens, id) => 68 | 69 | val counts2 = new mutable.HashMap[Int, Double]() 70 | 71 | // Word ID 72 | val idx = vocab(tokens) 73 | 74 | // Count word occurancy 75 | counts2(idx) = counts2.getOrElse(idx, 0.0) + tokenizedTweet.count(_ == tokens).toDouble 76 | 77 | // Return word ID and Vector 78 | (id.toLong, Vectors.sparse(vocab.size, counts2.toSeq)) 79 | } 80 | 81 | document.toSeq 82 | } 83 | 84 | /** 85 | * @constructor findTopics 86 | * 87 | * Set currentTweet attribut and add the new tweet to the dictionnary 88 | * 89 | * @param LDAModel $ldaModel - LDA Model (LocalModel) 90 | * @param Array[String] $vocabArray - Contains all distinct words set to LDA 91 | * @param Int $numWordsByTopics - 92 | * @param Boolean $displayResult - Display result in console 93 | * 94 | * @return LDAModel 95 | */ 96 | def findTopics(ldaModel: LDAModel, vocabArray: Array[String], T: String, SG: Int, numWordsByTopics: Int, displayResult: Boolean): Seq[(String, String, String, String)] = { 97 | 98 | println(color("\nCall findTopics", RED)) 99 | 100 | println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize + " words):") 101 | 102 | val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = numWordsByTopics) 103 | 104 | var it = 0 105 | var seqC = List[(String, String, String, String)]() 106 | 107 | // Print topics, showing top-weighted x terms for each topic. 
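    // describeTopics returns one (termIndices, termWeights) pair per topic: the indices point
    // into vocabArray and the weights are the per-topic term probabilities.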
108 | topicIndices.foreach { case (terms, termWeights) => 109 | 110 | if (displayResult) 111 | println("TOPICS:") 112 | 113 | val tabTopics = terms.zip(termWeights).map(vector => vocabArray(vector._1.toInt).toString).mkString(";") 114 | 115 | if (displayResult) { 116 | terms.zip(termWeights).foreach { case (term, weight) => 117 | println(s"${vocabArray(term.toInt)}\t\t$weight") 118 | } 119 | } 120 | 121 | seqC = seqC :+(T, SG.toString, it.toString, tabTopics) 122 | 123 | println("T: " + T + " SG: " + SG + "TopicN: " + it + " c: " + tabTopics) 124 | it += 1 125 | 126 | if (displayResult) 127 | println() 128 | 129 | } 130 | seqC.toSeq 131 | } 132 | } -------------------------------------------------------------------------------- /scala/FinalProject/src/main/scala/utils/RDDUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | import scala.collection.mutable.ArrayBuffer 6 | 7 | // To make some of the examples work we will also need RDD 8 | 9 | import org.apache.spark.graphx._ 10 | import org.apache.spark.rdd.RDD 11 | 12 | 13 | class RDDUtils { 14 | 15 | val RED = "\033[1;30m" 16 | val ENDC = "\033[0m" 17 | 18 | /** 19 | * @constructor ArrayToVertices 20 | * 21 | * Convert ArrayBuffer to RDD containing Vertices 22 | * 23 | * @param SparkContext - $sc - SparkContext 24 | * @param ArrayBuffer[(Long, (String))] - $collection - Contains vertices 25 | * 26 | * @return RDD[Edge[String]] - RDD of vertices 27 | */ 28 | def ArrayToVertices(sc: SparkContext, collection: ArrayBuffer[(Long, (String))]): RDD[(VertexId, (String))] = { 29 | sc.parallelize(collection) 30 | } 31 | 32 | /** 33 | * @constructor ArrayToEdges 34 | * 35 | * Convert ArrayBuffer to RDD containing Edges 36 | * 37 | * @param SparkContext - $sc - SparkContext 38 | * @param ArrayBuffer[Edge[String]] - $collection - Contains edges 39 | * 40 | * @return RDD[Edge[String]] - RDD of edges 41 | */ 42 | def ArrayToEdges(sc: SparkContext, collection: ArrayBuffer[Edge[String]]): RDD[Edge[String]] = { 43 | sc.parallelize(collection) 44 | } 45 | 46 | /** 47 | * @constructor findUserByIDInGraph 48 | * 49 | * find user ID with username 50 | * 51 | * @param Graph[String,String] $graph - Graph element 52 | * @param Int $userID - User id 53 | * @return String - if success : username | failure : "user not found" 54 | */ 55 | def findUserNameByIDInGraph(graph: Graph[String, String], userID: Int): String = { 56 | println(color("\nCall : findUserNameWithID", RED)) 57 | 58 | graph.vertices.filter { case (id, name) => id.toString equals userID.toString }.collect().foreach { 59 | (e: (org.apache.spark.graphx.VertexId, String)) => return e._2 60 | } 61 | "user not found" 62 | } 63 | 64 | /** 65 | * @constructor findUserIDByNameInGraph 66 | * 67 | * find username with id 68 | * 69 | * @param Graph[String,String] $graph - Graph element 70 | * @param String $userName - Username 71 | * @return String - if success : id found | failure : "0" 72 | */ 73 | def findUserIDByNameInGraph(graph: Graph[String, String], userName: String): String = { 74 | println(color("\nCall : findUserIDWithName", RED)) 75 | 76 | graph.vertices.filter(_._2 == userName).collect().foreach { 77 | (e: (org.apache.spark.graphx.VertexId, String)) => return e._1.toString 78 | } 79 | "0" 80 | } 81 | 82 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 83 | 84 | /** 85 | * @constructor displayAllCommunications 86 | * 87 | * display all communications between 
users 88 | * 89 | * @param Graph[String,String] $graph - Graph element 90 | * @return Unit 91 | */ 92 | def displayAllCommunications(graph: Graph[String, String]): Unit = { 93 | 94 | println(color("\nCall : displayAllCommunications", RED)) 95 | println("Users communications: ") 96 | 97 | val facts: RDD[String] = graph.triplets.map(triplet => triplet.srcAttr + " communicate with " + 98 | triplet.dstAttr + " with tweet id " + triplet.attr) 99 | 100 | facts.collect().foreach(println(_)) 101 | } 102 | } -------------------------------------------------------------------------------- /scala/FindCommunities/build.sbt: -------------------------------------------------------------------------------- 1 | name := "FindCommunities" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.5" 6 | 7 | /*libraryDependencies ++= Seq( 8 | "org.apache.spark" %% "spark-core" % "1.3.0" % "provided", 9 | "org.apache.spark" %% "spark-graphx" % "1.3.0" % "provided", 10 | "org.apache.spark" %% "spark-mllib" % "1.3.0" % "provided") 11 | 12 | libraryDependencies += "org.apache.spark" % "spark-streaming-twitter_2.10" % "1.3.0" 13 | 14 | libraryDependencies += "com.datastax.spark" %% "spark-cassandra-connector" % "1.3.0-M1"*/ 15 | 16 | libraryDependencies ++= Seq( 17 | "org.apache.spark" %% "spark-core" % "1.4.0" % "provided", 18 | "org.apache.spark" %% "spark-graphx" % "1.4.0" % "provided", 19 | "org.apache.spark" %% "spark-mllib" % "1.4.0" % "provided") 20 | 21 | libraryDependencies += "org.apache.spark" % "spark-streaming-twitter_2.10" % "1.4.0" 22 | 23 | libraryDependencies += "com.datastax.spark" %% "spark-cassandra-connector" % "1.4.0-M1" 24 | 25 | //libraryDependencies += "com.google.code.gson" % "gson" % "2.3" 26 | 27 | //libraryDependencies += "com.github.fommil.netlib" % "all" % "1.1.2" pomOnly() 28 | 29 | // http://stackoverflow.com/questions/28459333/how-to-build-an-uber-jar-fat-jar-using-sbt-within-intellij-idea 30 | // META-INF discarding 31 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => 32 | { 33 | case PathList("META-INF", xs @ _*) => MergeStrategy.discard 34 | case x => MergeStrategy.first 35 | } 36 | } 37 | 38 | resolvers ++= Seq( 39 | // "JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/", 40 | // "Spray Repository" at "http://repo.spray.cc/", 41 | // "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", 42 | // "Akka Repository" at "http://repo.akka.io/releases/", 43 | // "Twitter4J Repository" at "http://twitter4j.org/maven2/", 44 | // "Apache HBase" at "https://repository.apache.org/content/repositories/releases", 45 | // "Twitter Maven Repo" at "http://maven.twttr.com/", 46 | // "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools", 47 | // "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/", 48 | // "Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/" 49 | // "Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven", 50 | // Resolver.sonatypeRepo("public") 51 | ) -------------------------------------------------------------------------------- /scala/FindCommunities/launch.sh: -------------------------------------------------------------------------------- 1 | date >> log.log 2 | 3 | spark-submit --class FindCommunities /home/mcaraccio/TB_2015/scala/FindCommunities/target/scala-2.10/FindCommunities-assembly-1.0.jar 4 | 5 | # | tee log.log -a 6 | 7 | -------------------------------------------------------------------------------- 
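// How the pieces above fit together: MllibUtils builds a bag-of-words corpus and FindCommunities
// (below) feeds it to MLlib's LDA. A condensed, self-contained sketch of that flow is given here;
// the corpus contents, topic count, iteration count and object name are illustrative only, not the
// project's tuned values.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.LDA
import org.apache.spark.mllib.linalg.Vectors

object LdaSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("LdaSketch").setMaster("local[2]"))

    // Toy corpus: one string per document
    val corpus = Seq("spark graphx community detection", "lda topic model spark")
    val tokenized = corpus.map(_.toLowerCase.split("\\s+").toSeq)

    // Vocabulary: term -> index, plus the reverse mapping for printing
    val vocab = tokenized.flatten.distinct.zipWithIndex.toMap
    val indexToTerm = vocab.map(_.swap)

    // One sparse term-frequency vector per document, keyed by a document id
    val docs = sc.parallelize(tokenized.zipWithIndex.map { case (tokens, id) =>
      val counts = tokens.groupBy(identity).map { case (t, ts) => (vocab(t), ts.size.toDouble) }
      (id.toLong, Vectors.sparse(vocab.size, counts.toSeq))
    })

    // Train LDA and print the top terms of each topic
    val ldaModel = new LDA().setK(2).setMaxIterations(20).run(docs)
    ldaModel.describeTopics(maxTermsPerTopic = 3).foreach { case (terms, weights) =>
      println(terms.map(indexToTerm).zip(weights).mkString(", "))
    }

    sc.stop()
  }
}

--------------------------------------------------------------------------------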
/scala/FindCommunities/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") -------------------------------------------------------------------------------- /scala/FindCommunities/src/main/scala/FindCommunities.scala: -------------------------------------------------------------------------------- 1 | // ///////////////////////////////////////////////////////////////////////////////////////////////////////////// 2 | // Author : Michael Caraccio 3 | // Project title : Détection et analyse de communauté Twitter 4 | // ///////////////////////////////////////////////////////////////////////////////////////////////////////////// 5 | 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.graphx._ 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.storage.StorageLevel 10 | import org.apache.spark.streaming._ 11 | import org.apache.spark.streaming.twitter.TwitterUtils 12 | import utils._ 13 | 14 | import scala.collection.mutable 15 | import scala.collection.mutable.{ArrayBuffer, ListBuffer} 16 | import scala.math._ 17 | import scala.reflect.ClassTag 18 | 19 | //Log4J 20 | import org.apache.log4j.{Level, Logger} 21 | 22 | // Cassandra 23 | import com.datastax.spark.connector._ 24 | 25 | // Regex 26 | import scala.util.matching.Regex 27 | 28 | // MLlib 29 | import org.apache.spark.mllib.clustering.{LDA, _} 30 | import org.apache.spark.mllib.linalg.{Vector, Vectors} 31 | 32 | 33 | object FindCommunities { 34 | 35 | // ///////////////////////////////////////////////////////////////////////////////////////////////////////////// 36 | // CONSTANT 37 | // ///////////////////////////////////////////////////////////////////////////////////////////////////////////// 38 | 39 | var MIN_VERTICES_PER_COMMUNITIES = 6 // Limit - Minimum vertices per communities 40 | var MIN_WORD_LENGTH = 3 // Minimum word length in tweet 41 | var NBKCORE = 6 // Number of core - K Core Decomposition algorithm 42 | var BATCH_SIZE = 900 // Batch size (in seconds) 43 | var CLEAN_GRAPH_MOD = 4 // Clean stockGraph every CLEAN_GRAPH_MOD 44 | var CLEAN_GRAPH_NBKCORE = 2 // When clean graph is called, k-core decomposition is called 45 | 46 | val defaultSeed = 0xadc83b19L // Seed for murmurhash - Do not change this value 47 | 48 | var dictionnary = new ArrayBuffer[String]() // Store tweets 49 | var ldaModel: LDAModel = null // LDA Model 50 | var lda: LDA = null // LDA object 51 | var stockGraph: Graph[String, String] = null // Store every edges and vertices received by Twitter 52 | var currentTweets: String = "" 53 | 54 | var counter = 1 // Perid 55 | 56 | val RED = "\033[1;30m" // Terminal color RED 57 | val ENDC = "\033[0m" // Terminal end character 58 | 59 | 60 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 61 | 62 | 63 | def main(args: Array[String]) { 64 | 65 | val ru = new RDDUtils // Manipulate RDD class 66 | val tc = new TwitterConfig // Login and password for Twitter 67 | 68 | // LDA parameters 69 | val topicSmoothing = 1.2 70 | val termSmoothing = 1.2 71 | val numTopics = 10 72 | val numIterations = 50 73 | val numWordsByTopics = 12 74 | 75 | // Display only error messages 76 | Logger.getLogger("org").setLevel(Level.ERROR) 77 | Logger.getLogger("akka").setLevel(Level.ERROR) 78 | Logger.getLogger("org.apache.spark").setLevel(Level.ERROR) 79 | Logger.getLogger("org.apache.spark.storage.BlockManager").setLevel(Level.ERROR) 80 | 81 | // Spark configuration 82 | val 
sparkConf = new SparkConf(true) 83 | .setAppName("FindCommunities") 84 | .setMaster("local[4]") 85 | .set("spark.akka.frameSize", "1000") 86 | .set("spark.streaming.receiver.maxRate", "0") // no limit on the rate 87 | .set("spark.task.maxFailures", "30000") 88 | .set("spark.akka.timeout", "180") 89 | .set("spark.network.timeout", "180") 90 | .set("spark.driver.cores", "4") 91 | .set("spark.driver.memory", "16g") 92 | .set("spark.executor.memory", "16g") 93 | .set("spark.shuffle.memoryFraction", "0.7") 94 | .set("spark.driver.maxResultSize", "0") // no limit 95 | .set("spark.cassandra.connection.host", "157.26.83.16") // Link to Cassandra 96 | .set("spark.cassandra.auth.username", "cassandra") 97 | .set("spark.cassandra.auth.password", "cassandra"); 98 | 99 | // Set the system properties so that Twitter4j library used by twitter stream 100 | // can use them to generate OAuth credentials 101 | System.setProperty("twitter4j.oauth.consumerKey", tc.getconsumerKey()) 102 | System.setProperty("twitter4j.oauth.consumerSecret", tc.getconsumerSecret()) 103 | System.setProperty("twitter4j.oauth.accessToken", tc.getaccessToken()) 104 | System.setProperty("twitter4j.oauth.accessTokenSecret", tc.getaccessTokenSecret()) 105 | System.setProperty("twitter4j.http.connectionTimeout", "200000") 106 | System.setProperty("twitter4j.http.retryCount", "30") 107 | System.setProperty("twitter4j.http.retryIntervalSecs", "2") 108 | 109 | 110 | 111 | println("\n\n**************************************************************") 112 | println("****************** FindCommunities ***************") 113 | println("**************************************************************\n") 114 | 115 | val words = Array(" @") // Filters tweet stream by words 116 | 117 | // Pattern used to find users and filter tweets 118 | val pattern = new Regex("\\@\\w{3,}") 119 | val patternURL = new Regex("(http|ftp|https)://[A-Za-z0-9-_]+.[A-Za-z0-9-_:%&?/.=]+") 120 | val patternSmiley = new Regex("((?::|;|=)(?:-)?(?:\\)|D|P|3|O))") 121 | val patternCommonWords = new Regex("\\b(that|have|with|this|from|they|would|there|their|what|about|which|when|make|like|time|just|know|take|into|year|your|good|some|could|them|other|than|then|look|only|come|over|think|also|back|after|work|first|well|even|want|because|these|give|most|http|https|fpt)\\b") 122 | 123 | // Streaming context -> batch size 124 | val ssc = new StreamingContext(sparkConf, Seconds(BATCH_SIZE)) 125 | 126 | val stream = TwitterUtils.createStream(ssc, None, words) 127 | 128 | // filter for english user only 129 | stream.filter(a => a.getUser.getLang.equals("en") || a.getUser.getLang.equals("en-GB")) 130 | 131 | // Group into larger batches 132 | val streamBatch = stream.window(Seconds(BATCH_SIZE), Seconds(BATCH_SIZE)) 133 | 134 | // Init SparkContext 135 | val sc = ssc.sparkContext 136 | 137 | /** 138 | * LDA CREATED FROM CASSANDRA 139 | * Date comes from old tweets 140 | */ 141 | println("\n*******************************************") 142 | println("Create corpus from Cassandra") 143 | println("*******************************************\n") 144 | 145 | // Get every tweets 146 | val rdd = sc.cassandraTable("twitter", "tweet_filtered").cache() 147 | 148 | rdd.select("tweet_text").as((i: String) => i).collect().foreach(x => { 149 | 150 | val preText = patternCommonWords.replaceAllIn(x.toLowerCase, "") 151 | 152 | val tweet = preText 153 | .toLowerCase.split("\\s") 154 | .filter(_.length > MIN_WORD_LENGTH) 155 | .filter(_.forall(java.lang.Character.isAlphabetic(_))) 156 | 157 | if 
(tweet.length > 0) { 158 | for (t <- tweet) { 159 | dictionnary += t 160 | } 161 | } 162 | }) 163 | 164 | 165 | // Create RDD 166 | val dictRDDInit = sc.parallelize(dictionnary).cache() 167 | 168 | // Init LDA 169 | lda = new LDA() 170 | .setK(numTopics) 171 | .setDocConcentration(topicSmoothing) 172 | .setTopicConcentration(termSmoothing) 173 | .setMaxIterations(numIterations) 174 | .setOptimizer("online") // works with Apache Spark 1.4 only 175 | 176 | // Create documents for LDA 177 | val (res1: RDD[(Long, Vector)], vocab: Map[String, Int]) = time { 178 | createdoc(dictRDDInit) 179 | } 180 | 181 | dictRDDInit.unpersist() 182 | 183 | if (!res1.isEmpty()) { 184 | // Start LDA 185 | println("LDA Started") 186 | time { 187 | ldaModel = lda.run(res1.persist(StorageLevel.MEMORY_AND_DISK_SER)) 188 | } 189 | println("LDA Finished\n") 190 | } 191 | res1.unpersist() 192 | 193 | 194 | 195 | // ///////////////////////////////////////////////////////////////////////////////////////////////////////////// 196 | // STREAM OBJECT 197 | // ///////////////////////////////////////////////////////////////////////////////////////////////////////////// 198 | 199 | 200 | // Stream about users 201 | val usersStream = streamBatch.map { status => ( 202 | status.getUser.getId.toString, 203 | abs(murmurHash64A(status.getUser.getScreenName.getBytes)), 204 | status.getUser.getName, 205 | status.getUser.getLang, 206 | status.getUser.getFollowersCount.toString, 207 | status.getUser.getFriendsCount.toString, 208 | status.getUser.getScreenName, 209 | status.getUser.getStatusesCount.toString) 210 | } 211 | 212 | // Stream about communication between two users 213 | val commStream = streamBatch.map { status => ( 214 | status.getId, 215 | status.getUser.getId.toString, 216 | status.getUser.getScreenName, 217 | if (pattern.findFirstIn(status.getText).isEmpty) { 218 | "" 219 | } 220 | else { 221 | pattern.findFirstIn(status.getText).getOrElse("@MichaelCaraccio").tail 222 | }, 223 | status.getText 224 | ) 225 | } 226 | 227 | // Stream about tweets 228 | val tweetsStream = streamBatch.map { status => ( 229 | status.getId.toString, 230 | status.getUser.getId.toString, 231 | abs(murmurHash64A(status.getUser.getScreenName.getBytes)), 232 | new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss").format(status.getCreatedAt), 233 | status.getRetweetCount.toString, 234 | status.getText 235 | ) 236 | } 237 | 238 | 239 | // ///////////////////////////////////////////////////////////////////////////////////////////////////////////// 240 | // STREAMING PART 241 | // Following code is called every batch interval 242 | // ///////////////////////////////////////////////////////////////////////////////////////////////////////////// 243 | 244 | println("*******************************************") 245 | println("Streaming started") 246 | println("*******************************************\n") 247 | 248 | // ************************************************************ 249 | // Save tweet's informations into Cassandra 250 | // ************************************************************ 251 | tweetsStream.foreachRDD(rdd => { 252 | 253 | rdd.persist(StorageLevel.MEMORY_AND_DISK) 254 | 255 | // For each tweets in RDD 256 | val seqtweetsStream = rdd.collect().map(a => (a._1, a._2, a._3.toString, a._4, a._5, patternSmiley.replaceAllIn(patternURL.replaceAllIn(a._6, ""), ""))).toList 257 | 258 | 259 | sc.parallelize(seqtweetsStream).saveToCassandra( 260 | "twitter", 261 | "tweet_filtered", 262 | SomeColumns("tweet_id", 263 | "user_twitter_id", 264 | 
"user_local_id", 265 | "tweet_create_at", 266 | "tweet_retweet", 267 | "tweet_text" 268 | )) 269 | 270 | // reset 271 | rdd.unpersist() 272 | }) 273 | 274 | // ************************************************************ 275 | // Save user's informations in Cassandra 276 | // ************************************************************ 277 | usersStream.persist(StorageLevel.MEMORY_AND_DISK).foreachRDD(rdd => { 278 | rdd.saveToCassandra("twitter", "user_filtered", SomeColumns("user_twitter_id", "user_local_id", "user_name", "user_lang", "user_follow_count", "user_friends_count", "user_screen_name", "user_status_count")) 279 | }) 280 | 281 | // ************************************************************ 282 | // Save communication's informations in Cassandra 283 | // ************************************************************ 284 | commStream.persist(StorageLevel.MEMORY_AND_DISK).foreachRDD(rdd => { 285 | 286 | // Timer 287 | val t00 = System.nanoTime() 288 | 289 | // Collection of vertices (contains users) 290 | var collectionVertices = new ArrayBuffer[(Long, String)]() 291 | 292 | // Collection of edges (contains communications between users) 293 | var collectionEdge = new ArrayBuffer[Edge[String]]() 294 | 295 | val seqcommStream = new ListBuffer[(String, String, String, String)]() 296 | 297 | rdd.persist(StorageLevel.MEMORY_AND_DISK) 298 | 299 | /** 300 | * Enregistrement des messages dans cassandra 301 | */ 302 | 303 | val textBuffer = rdd.collect().map { g => g._1 -> g._5 }.toMap 304 | 305 | // For each tweets in RDD 306 | for (item <- rdd.collect()) { 307 | 308 | // Avoid single @ in message, english only 309 | if (item._4.nonEmpty) { 310 | 311 | // Sender ID 312 | val sendID: Long = abs(murmurHash64A(item._3.getBytes)) 313 | 314 | // Sender 315 | collectionVertices += ((sendID, item._3)) 316 | 317 | // For each dest in tweet 318 | pattern.findAllIn(item._5).foreach { destName => { 319 | 320 | val user_dest_name = destName.drop(1) 321 | 322 | // Generate Hash 323 | val destID: Long = abs(murmurHash64A(user_dest_name.getBytes)) 324 | 325 | if (sendID != destID) { 326 | // Create each users and edges 327 | collectionVertices += ((destID, user_dest_name)) 328 | collectionEdge += Edge(sendID, destID, item._1.toString) 329 | 330 | seqcommStream.append((item._1.toString, item._2, sendID.toString, destID.toString)) 331 | } 332 | } 333 | } 334 | } 335 | } 336 | 337 | 338 | sc.parallelize(seqcommStream).saveToCassandra( 339 | "twitter", 340 | "users_communicate", 341 | SomeColumns( 342 | "tweet_id", 343 | "user_send_twitter_id", 344 | "user_send_local_id", 345 | "user_dest_id")) 346 | 347 | // reset 348 | seqcommStream.clear() 349 | 350 | 351 | /** 352 | * Initialisation du graph 353 | */ 354 | 355 | // Empty graph at first launch 356 | if (stockGraph == null) { 357 | 358 | // Convert vertices to RDD 359 | val VerticesRDD = ru ArrayToVertices(sc, collectionVertices) 360 | 361 | // Convert it to RDD 362 | val EdgeRDD = ru ArrayToEdges(sc, collectionEdge) 363 | 364 | stockGraph = Graph(VerticesRDD, EdgeRDD) 365 | stockGraph.unpersist() 366 | stockGraph.persist(StorageLevel.MEMORY_AND_DISK) 367 | } 368 | 369 | /** 370 | * Ajout des nouveaux Edges et Vertices dans le graph principal 371 | */ 372 | 373 | time { 374 | stockGraph = Graph(stockGraph.vertices.union(sc.parallelize(collectionVertices)), stockGraph.edges.union(sc.parallelize(collectionEdge))) 375 | } 376 | 377 | collectionVertices = new ArrayBuffer[(Long, String)]() 378 | collectionEdge = new ArrayBuffer[Edge[String]]() 379 | 380 | 381 | 
/** 382 | * Split main Graph in multiples communities 383 | */ 384 | 385 | if (counter % CLEAN_GRAPH_MOD == 0) { 386 | println("################################################") 387 | println("Clean stockgraph") 388 | println("Before cleaning (edges): " + stockGraph.edges.count()) 389 | 390 | stockGraph = time { 391 | splitCommunity(stockGraph, stockGraph.vertices, CLEAN_GRAPH_NBKCORE, displayResult = false) 392 | } 393 | println("After cleaning (edges): " + stockGraph.edges.count()) 394 | println("################################################") 395 | } 396 | 397 | val communityGraph = time { 398 | splitCommunity(stockGraph, stockGraph.vertices, NBKCORE, displayResult = false) 399 | } 400 | 401 | communityGraph.cache() 402 | 403 | var (subgraphs, commIDs) = time { 404 | subgraphCommunities(communityGraph, stockGraph.vertices, displayResult = false) 405 | } 406 | 407 | communityGraph.unpersist() 408 | 409 | /** 410 | * LDA 411 | */ 412 | 413 | // We only care about subgraph bigger than MIN_VERTICES_PER_COMMUNITIES 414 | subgraphs = time { 415 | subgraphs.filter(_.vertices.count() >= MIN_VERTICES_PER_COMMUNITIES) 416 | } 417 | 418 | 419 | currentTweets = "" 420 | for (i <- subgraphs.indices) { 421 | 422 | // Messages will be stored in an array 423 | val result = subgraphs(i).edges.collect().map(message => textBuffer.getOrElse(message.attr.toLong, "").replaceAll("[!?.,:;<>)(]", " ")) 424 | 425 | result.foreach(x => { 426 | 427 | val preText = patternCommonWords.replaceAllIn(x.toLowerCase, "") 428 | 429 | val tweet = preText 430 | .toLowerCase.split("\\s") 431 | .filter(_.length > MIN_WORD_LENGTH) 432 | .filter(_.forall(java.lang.Character.isAlphabetic(_))) 433 | 434 | if (tweet.nonEmpty) { 435 | for (t <- tweet) { 436 | dictionnary += t 437 | } 438 | } 439 | }) 440 | } 441 | 442 | 443 | // Create document 444 | println("Create document") 445 | val dictRDD = sc.parallelize(dictionnary).persist(StorageLevel.MEMORY_AND_DISK) 446 | 447 | val (res1: RDD[(Long, Vector)], vocab: Map[String, Int]) = time { 448 | createdoc(dictRDD) 449 | } 450 | 451 | 452 | // Start LDA 453 | println("LDA Started") 454 | ldaModel = lda.run(res1.persist(StorageLevel.MEMORY_AND_DISK_SER)) 455 | 456 | res1.unpersist() 457 | var seqC: Seq[(String, String, String, String)] = time { 458 | findTopics(ldaModel, dictionnary.toArray, counter.toString, 0, numWordsByTopics, displayResult = true) 459 | } 460 | 461 | seqC = seqC.map(a => (counter.toString, a._2, a._3, a._4)) 462 | 463 | 464 | //Save to cassandra 465 | sc.parallelize(seqC).saveToCassandra( 466 | "twitter", 467 | "lda", 468 | SomeColumns("t", 469 | "sg", 470 | "n_topic", 471 | "words" 472 | )) 473 | 474 | println("LDA Finished") 475 | 476 | 477 | var cpt = 0 478 | 479 | for (i <- subgraphs.indices) { 480 | 481 | println("\n\n:::::::::::::::::::::::::::::::::::") 482 | println("::::: Community N°" + i + " T: " + counter + " SG: " + cpt) 483 | println(":::::::::::::::::::::::::::::::::::") 484 | 485 | // Timer 486 | val t0 = System.nanoTime() 487 | 488 | // Current subgraph 489 | val sub = subgraphs(i).cache() 490 | 491 | val verticesCount = sub.vertices.count() 492 | 493 | println("Number of users in community : " + verticesCount) 494 | 495 | // Messages will be stored in an array 496 | val result = sub.edges.collect().map(message => textBuffer.getOrElse(message.attr.toLong, "").replaceAll("[!?.,:;<>)(]", " ")) 497 | 498 | /** 499 | * If there's a new tweet in a community -> LDA 500 | */ 501 | 502 | 503 | if (result.nonEmpty) { 504 | 505 | println("Words in 
current tweet: " + result.length) 506 | 507 | currentTweets = "" 508 | result.foreach(x => { 509 | 510 | val preText = patternCommonWords.replaceAllIn(x.toLowerCase, "") 511 | 512 | val tweet = preText 513 | .toLowerCase.split("\\s") 514 | .filter(_.length > MIN_WORD_LENGTH) 515 | .filter(_.forall(java.lang.Character.isAlphabetic(_))) 516 | 517 | currentTweets = currentTweets.concat(tweet.mkString(" ")) 518 | 519 | }) 520 | 521 | 522 | println("Call cosineSimilarity") 523 | val tabcosine: ArrayBuffer[Double] = cosineSimilarity(vocab, dictionnary.toArray.distinct, currentTweets.split(" ")) 524 | println("outside cosineSimilarity") 525 | 526 | // Pour chaques edges . On crée un Seq qui contient le futur record pour cassandra 527 | var seqcommunities = sub.edges.map(message => (counter.toString, verticesCount.toString, cpt.toString, commIDs(cpt).toString, message.srcId.toString, message.dstId.toString, message.attr, tabcosine.mkString(";"))).collect() 528 | 529 | // Petit problème avec le counter qui ne se met pas a jour dans la method au dessus 530 | seqcommunities = seqcommunities.map(a => (counter.toString, a._2, a._3, a._4, a._5, a._6, a._7, a._8)) 531 | 532 | // Save to cassandra 533 | sc.parallelize(seqcommunities.toSeq).saveToCassandra( 534 | "twitter", 535 | "communities", 536 | SomeColumns("t", 537 | "nbv", 538 | "sg", 539 | "com_id", 540 | "src_id", 541 | "dst_id", 542 | "attr", 543 | "lda" 544 | )) 545 | } else { 546 | println("LDA wont process current document because it does not contains any words") 547 | } 548 | 549 | cpt += 1 550 | 551 | 552 | val t1 = System.nanoTime() 553 | println("SubGraph N°: " + cpt + " processed in " + (t1 - t0) / 1000000000.0 + " seconds") 554 | } 555 | 556 | counter += 1 557 | 558 | val t11 = System.nanoTime() 559 | println("------------------------------------------------------------") 560 | println("BATCH FINISHED") 561 | println("Processed in " + (t11 - t00) / 1000000000.0 + " seconds") 562 | println("------------------------------------------------------------") 563 | }) 564 | 565 | ssc.start() 566 | ssc.awaitTermination() 567 | } 568 | 569 | /** 570 | * @constructor murmurHash64A 571 | * 572 | * Murmur is a family of good general purpose hashing functions, suitable for non-cryptographic usage. As stated by Austin Appleby, MurmurHash provides the following benefits: 573 | * - good distribution (passing chi-squared tests for practically all keysets & bucket sizes. 574 | * - good avalanche behavior (max bias of 0.5%). 575 | * - good collision resistance (passes Bob Jenkin's frog.c torture-test. No collisions possible for 4-byte keys, no small (1- to 7-bit) differentials). 576 | * - great performance on Intel/AMD hardware, good tradeoff between hash quality and CPU consumption. 
577 | * 578 | * Source : http://stackoverflow.com/questions/11899616/murmurhash-what-is-it 579 | * 580 | * @param Seq[Byte] - $data 581 | * @param Long - $seed 582 | * @return Long - Return hash 583 | * 584 | */ 585 | def murmurHash64A(data: Seq[Byte], seed: Long = defaultSeed): Long = { 586 | val m = 0xc6a4a7935bd1e995L 587 | val r = 47 588 | 589 | val f: Long => Long = m.* 590 | val g: Long => Long = x => x ^ (x >>> r) 591 | 592 | val h = data.grouped(8).foldLeft(seed ^ f(data.length)) { case (y, xs) => 593 | val k = xs.foldRight(0L)((b, x) => (x << 8) + (b & 0xff)) 594 | val j: Long => Long = if (xs.length == 8) f compose g compose f else identity 595 | f(y ^ j(k)) 596 | } 597 | (g compose f compose g)(h) 598 | } 599 | 600 | /** 601 | * @constructor time 602 | * 603 | * timer for profiling block 604 | * 605 | * @param R $block - Block executed 606 | * @return Unit 607 | */ 608 | def time[R](block: => R): R = { 609 | val t0 = System.nanoTime() 610 | val result = block // call-by-name 611 | val t1 = System.nanoTime() 612 | println("Elapsed time: " + (t1 - t0) / 1000000000.0 + " seconds") 613 | result 614 | } 615 | 616 | /** 617 | * This method takes 2 equal length arrays of integers 618 | * It returns a double representing similarity of the 2 arrays 619 | * 0.9925 would be 99.25% similar 620 | * (x dot y) / ||X|| ||Y|| 621 | * 622 | * @param x 623 | * @param y 624 | * @return cosine similarity 625 | */ 626 | def cosineSimilarity(x: ArrayBuffer[Double], y: ArrayBuffer[Double]): Double = { 627 | require(x.length == y.length) 628 | 629 | if (magnitude(x) == 0.0 || magnitude(y) == 0.0) 630 | return 0.0 631 | 632 | dotProduct(x, y) / (magnitude(x) * magnitude(y)) 633 | } 634 | 635 | /** 636 | * Return the dot product of the 2 arrays 637 | * e.g. (a[0]*b[0])+(a[1]*a[2]) 638 | * 639 | * @param x 640 | * @param y 641 | * @return 642 | */ 643 | def dotProduct(x: ArrayBuffer[Double], y: ArrayBuffer[Double]): Double = { 644 | (for ((a, b) <- x zip y) yield a * b) sum 645 | } 646 | 647 | /** 648 | * We multiply each element, sum it, then square root the result. 649 | * 650 | * @param x 651 | * @return the magnitude of an array 652 | */ 653 | def magnitude(x: ArrayBuffer[Double]): Double = { 654 | math.sqrt(x map (i => i * i) sum) 655 | } 656 | 657 | 658 | def splitCommunity(graph: Graph[String, String], users: RDD[(VertexId, (String))], NBKCORE: Int, displayResult: Boolean): Graph[String, String] = { 659 | 660 | println(color("\nCall SplitCommunity", RED)) 661 | 662 | getKCoreGraph(graph, users, NBKCORE, displayResult).cache() 663 | } 664 | 665 | /** 666 | * Compute the k-core decomposition of the graph for all k <= kmax. This 667 | * uses the iterative pruning algorithm discussed by Alvarez-Hamelin et al. 668 | * in K-Core Decomposition: a Tool For the Visualization of Large Scale Networks 669 | * (see http://arxiv.org/abs/cs/0504107). 670 | * 671 | * @tparam VD the vertex attribute type (discarded in the computation) 672 | * @tparam ED the edge attribute type (preserved in the computation) 673 | * 674 | * @param graph the graph for which to compute the connected components 675 | * @param kmax the maximum value of k to decompose the graph 676 | * 677 | * @return a graph where the vertex attribute is the minimum of 678 | * kmax or the highest value k for which that vertex was a member of 679 | * the k-core. 680 | * 681 | * @note This method has the advantage of returning not just a single kcore of the 682 | * graph but will yield all the cores for k > kmin. 
683 | */ 684 | def getKCoreGraph[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], 685 | users: RDD[(VertexId, (String))], 686 | kmin: Int, 687 | displayResult: Boolean): Graph[String, ED] = { 688 | 689 | // Graph[(Int, Boolean), ED] - boolean indicates whether it is active or not 690 | var g = graph.cache().outerJoinVertices(graph.degrees)((vid, oldData, newData) => newData.getOrElse(0)).cache() 691 | 692 | println(color("\nCall KCoreDecomposition", RED)) 693 | 694 | g = computeCurrentKCore(g, kmin).cache() 695 | 696 | val v = g.vertices.filter { case (vid, vd) => vd >= kmin }.cache() 697 | 698 | // Create new RDD users 699 | val newUser = users.join(v).map { 700 | case (id, (username, rank)) => (id, username) 701 | } 702 | 703 | // Create a new graph 704 | val gra = Graph(newUser, g.edges) 705 | 706 | // Remove missing vertices as well as the edges to connected to them 707 | gra.subgraph(vpred = (id, username) => username != null).cache() 708 | } 709 | 710 | def computeCurrentKCore[ED: ClassTag](graph: Graph[Int, ED], k: Int) = { 711 | println("Computing kcore for k=" + k) 712 | def sendMsg(et: EdgeTriplet[Int, ED]): Iterator[(VertexId, Int)] = { 713 | if (et.srcAttr < 0 || et.dstAttr < 0) { 714 | // if either vertex has already been turned off we do nothing 715 | Iterator.empty 716 | } else if (et.srcAttr < k && et.dstAttr < k) { 717 | // tell both vertices to turn off but don't need change count value 718 | Iterator((et.srcId, -1), (et.dstId, -1)) 719 | 720 | } else if (et.srcAttr < k) { 721 | // if src is being pruned, tell dst to subtract from vertex count 722 | Iterator((et.srcId, -1), (et.dstId, 1)) 723 | 724 | } else if (et.dstAttr < k) { 725 | // if dst is being pruned, tell src to subtract from vertex count 726 | Iterator((et.dstId, -1), (et.srcId, 1)) 727 | 728 | } else { 729 | Iterator.empty 730 | } 731 | } 732 | 733 | // subtracts removed neighbors from neighbor count and tells vertex whether it was turned off or not 734 | def mergeMsg(m1: Int, m2: Int): Int = { 735 | if (m1 < 0 || m2 < 0) { 736 | -1 737 | } else { 738 | m1 + m2 739 | } 740 | } 741 | 742 | def vProg(vid: VertexId, data: Int, update: Int): Int = { 743 | if (update < 0) { 744 | // if the vertex has turned off, keep it turned off 745 | -1 746 | } else { 747 | // subtract the number of neighbors that have turned off this round from 748 | // the count of active vertices 749 | // TODO(crankshaw) can we ever have the case data < update? 750 | max(data - update, 0) 751 | } 752 | } 753 | 754 | // Note that initial message should have no effect 755 | Pregel(graph, 0)(vProg, sendMsg, mergeMsg) 756 | } 757 | 758 | 759 | /** 760 | * SubGraphCommunities is used to find communities in a graph 761 | * 762 | * Steps : 763 | * 1. Connected Compoenents 764 | * 2. Collect subgraphs id's 765 | * 3. Add subgraph to array 766 | * 4. 
Return array of communities 767 | * 768 | * @param graph the graph for which to compute the connected components 769 | * @param users RDD containing users - used to associate edges and vertices 770 | * @param boolean displayResult 771 | * 772 | * @return an Array of graph (which contains subgraph) and communities ids 773 | */ 774 | def subgraphCommunities(graph: Graph[String, String], users: RDD[(VertexId, (String))], displayResult: Boolean): (Array[Graph[String, String]], Array[Long]) = { 775 | 776 | println(color("\nCall subgraphCommunities", RED)) 777 | 778 | // Find the connected components 779 | val cc = time { 780 | graph.connectedComponents().vertices.cache() 781 | } 782 | 783 | // Join the connected components with the usernames and id 784 | // The result is an RDD not a Graph 785 | val ccByUsername = users.join(cc).map { 786 | case (id, (username, cci)) => (id, username, cci) 787 | }.cache() 788 | 789 | val lowerIDPerCommunity = ccByUsername.map { case (id, username, cci) => cci }.distinct().cache() 790 | 791 | // Result will be stored in an array 792 | println("--------------------------") 793 | println("Total community found: " + lowerIDPerCommunity.count()) 794 | println("--------------------------") 795 | 796 | 797 | val collectIDsCommunity = lowerIDPerCommunity.collect() 798 | 799 | val result = collectIDsCommunity.map(colID => Graph(ccByUsername.filter { 800 | _._3 == colID 801 | }.map { case (id, username, cc) => (id, username) }, graph.edges).subgraph(vpred = (id, username) => username != null).cache()) 802 | 803 | // Display communities 804 | if (displayResult) { 805 | println("\nCommunities found " + result.length) 806 | for (community <- result) { 807 | println("-----------------------") 808 | community.edges.collect().foreach(println(_)) 809 | community.vertices.collect().foreach(println(_)) 810 | } 811 | } 812 | 813 | cc.unpersist() 814 | lowerIDPerCommunity.unpersist() 815 | 816 | (result, collectIDsCommunity) 817 | } 818 | 819 | /** 820 | * CreateDoc generate document for LDA 821 | * 822 | * Steps : 823 | * 1. Get tweets 824 | * 2. Split into sequences 825 | * 3. Counts terms occurency 826 | * 4. Create vocab array with unique words 827 | * 5. Create documents (RDD) containing vector and word id 828 | * 829 | * @param RDD tweets 830 | * 831 | * @return documents (RDD) ready to use 832 | * array of tweets 833 | */ 834 | def createdoc(tokenizedCorpus: RDD[String]): ((RDD[(Long, Vector)], Map[String, Int])) = { 835 | 836 | println(color("\nCall createdoc", RED)) 837 | 838 | // Split each document into a sequence of terms (words) 839 | val tokenized: RDD[Seq[String]] = 840 | tokenizedCorpus.map(_.toLowerCase.split("\\s")).map(_.filter(_.length > 3)) 841 | 842 | // Choose the vocabulary. 
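    // (Descriptive note: every distinct term is kept and indexed here; frequent English
    //  words are assumed to have been stripped upstream by patternCommonWords, so no
    //  additional stop-word trimming is applied at this stage.)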
843 | // termCounts: Sorted list of (term, termCount) pairs 844 | val termCounts: RDD[(String, Long)] = 845 | tokenized.flatMap(_.map(_ -> 1L)).reduceByKey(_ + _).sortBy(_._2) //.collect().sortBy(-_._2) 846 | 847 | // vocabArray: Chosen vocab (removing common terms) 848 | val vocabArray: Array[String] = termCounts.map(a => a._1).collect() 849 | 850 | // vocab: Map term -> term index 851 | val vocab: Map[String, Int] = vocabArray.zipWithIndex.toMap 852 | 853 | // Convert documents into term count vectors 854 | val documents: RDD[(Long, Vector)] = 855 | tokenized.zipWithIndex.map { case (tokens, id) => 856 | val counts = new mutable.HashMap[Int, Double]() 857 | tokens.foreach { term => 858 | if (vocab.contains(term)) { 859 | val idx = vocab(term) 860 | counts(idx) = counts.getOrElse(idx, 0.0) + 1.0 861 | } 862 | } 863 | (id, Vectors.sparse(vocab.size, counts.toSeq)) 864 | } 865 | 866 | (documents, vocab) 867 | } 868 | 869 | 870 | def cosineSimilarity(vocab: Map[String, Int], vocabArray: Array[String], tokenizedTweet: Array[String]): ArrayBuffer[Double] = { 871 | 872 | println(color("\nCall cosineSimilarity", RED)) 873 | 874 | var tab1 = new ArrayBuffer[Double]() 875 | var tab2 = new ArrayBuffer[Double]() 876 | var tabcosine = new ArrayBuffer[Double]() 877 | 878 | ldaModel.describeTopics().foreach { case (terms, termWeights) => 879 | terms.zip(termWeights).foreach { case (term, weight) => 880 | 881 | tab1 += tokenizedTweet.count(_ == vocabArray(term.toInt)) 882 | tab2 += weight.toDouble 883 | } 884 | 885 | // Store every cosine similarity 886 | tabcosine += cosineSimilarity(tab1, tab2) 887 | } 888 | tabcosine 889 | } 890 | 891 | /** 892 | * @constructor findTopics 893 | * 894 | * Set currentTweet attribut and add the new tweet to the dictionnary 895 | * 896 | * @param LDAModel $ldaModel - LDA Model (LocalModel) 897 | * @param Array[String] $vocabArray - Contains all distinct words set to LDA 898 | * @param Int $numWordsByTopics - 899 | * @param Boolean $displayResult - Display result in console 900 | * 901 | * @return Seq 902 | */ 903 | def findTopics(ldaModel: LDAModel, vocabArray: Array[String], T: String, SG: Int, numWordsByTopics: Int, displayResult: Boolean): Seq[(String, String, String, String)] = { 904 | 905 | println(color("\nCall findTopics", RED)) 906 | 907 | println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize + " words):") 908 | 909 | val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = numWordsByTopics) 910 | 911 | var it = 0 912 | var seqC = List[(String, String, String, String)]() 913 | 914 | // Print topics, showing top-weighted x terms for each topic. 
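    // describeTopics returns one (termIndices, termWeights) pair per topic: the indices
    // point back into vocabArray and the weights are the terms' probabilities in that
    // topic's word distribution. Example with hypothetical values: terms = Array(12, 4, 87),
    // termWeights = Array(0.041, 0.036, 0.029); vocabArray(12) recovers the word itself.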
915 | topicIndices.foreach { case (terms, termWeights) => 916 | 917 | if (displayResult) 918 | println("TOPICS:") 919 | 920 | val tabTopics = terms.zip(termWeights).map(vector => vocabArray(vector._1.toInt).toString).mkString(";") 921 | 922 | if (displayResult) { 923 | terms.zip(termWeights).foreach { case (term, weight) => 924 | println(s"${vocabArray(term.toInt)}\t\t$weight") 925 | } 926 | } 927 | 928 | seqC = seqC :+(T, SG.toString, it.toString, tabTopics) 929 | 930 | println("T: " + T + " SG: " + SG + "TopicN: " + it + " c: " + tabTopics) 931 | it += 1 932 | 933 | if (displayResult) 934 | println() 935 | 936 | } 937 | seqC.toSeq 938 | } 939 | } -------------------------------------------------------------------------------- /scala/FindCommunities/src/main/scala/utils/CassandraUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import scala.collection.mutable.ArrayBuffer 4 | 5 | // Enable Cassandra-specific functions on the StreamingContext, DStream and RDD: 6 | 7 | import com.datastax.spark.connector._ 8 | 9 | // To make some of the examples work we will also need RDD 10 | 11 | import org.apache.spark.SparkContext 12 | import org.apache.spark.graphx._ 13 | import org.apache.spark.rdd.RDD 14 | import org.apache.spark.sql.cassandra.CassandraSQLContext 15 | 16 | //@SerialVersionUID(100L) 17 | class CassandraUtils /*extends Serializable*/ { 18 | 19 | val RED = "\033[1;30m" 20 | val ENDC = "\033[0m" 21 | 22 | /** 23 | * @constructor getTweetContentFromID 24 | * 25 | * Return tweet content 26 | * 27 | * @param SparkContext sc - SparkContext 28 | * @param String $id - tweet id 29 | * @return Unit 30 | */ 31 | def getTweetContentFromID(sc: SparkContext, id: String): String = { 32 | 33 | println(color("\nCall getTweetContentFromID", RED)) 34 | 35 | val query = sc.cassandraTable("twitter", "tweet_filtered").select("tweet_text").where("tweet_id = ?", id) 36 | 37 | if (query.collect().length != 0) { 38 | query.first().getString("tweet_text") 39 | } 40 | else 41 | "Tweet not found" 42 | } 43 | 44 | /** 45 | * @constructor getTweetsIDFromUser 46 | * 47 | * Return tweet id 48 | * 49 | * @param SparkContext sc - SparkContext 50 | * @param String $id - user (sender) id 51 | * @return Unit 52 | */ 53 | def getTweetsIDFromUser(sc: SparkContext, id: String): ArrayBuffer[String] = { 54 | 55 | println(color("\nCall getTweetsIDFromUser", RED)) 56 | println("Tweets found:") 57 | 58 | val query = sc.cassandraTable("twitter", "users_communicate").select("tweet_id").where("user_send_local_id = ?", id) 59 | 60 | // Result will be stored in an array 61 | var result = ArrayBuffer[String]() 62 | 63 | if (query.collect().length != 0) { 64 | result += query.first().getString("tweet_id") 65 | } 66 | 67 | // Display result 68 | result.foreach(println(_)) 69 | 70 | // Return 71 | result 72 | } 73 | 74 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 75 | 76 | /** 77 | * @constructor getTweetsContentFromEdge 78 | * 79 | * Return an array of tweets content for a given Graph 80 | * 81 | * @param SparkContext sc - SparkContext 82 | * @param RDD[Edge[String]] $edge - graph's edge 83 | * @return Unit 84 | */ 85 | def getTweetsContentFromEdge(sc: SparkContext, edge: RDD[Edge[String]], displayResult: Boolean): RDD[String] = { 86 | 87 | println(color("\nCall getTweetsContentFromEdge", RED)) 88 | 89 | // Get the tweets ID for every communication 90 | val tweetsID = edge.flatMap({ 91 | case Edge(idSend, idExp, idTweet) => Seq(idTweet) 
92 | }) 93 | 94 | // Result will be stored in an array 95 | var result = ArrayBuffer[String]() 96 | 97 | // Queries 98 | for (tweet <- tweetsID.collect()) { 99 | val query = sc.cassandraTable("twitter", "tweet_filtered").select("tweet_text").where("tweet_id = ?", tweet) 100 | 101 | if (query.collect().length != 0) { 102 | result += query.first().getString("tweet_text") 103 | } 104 | } 105 | 106 | // Display results 107 | if (displayResult) { 108 | result.foreach(println(_)) 109 | } 110 | 111 | 112 | // return 113 | sc.parallelize(result) 114 | } 115 | 116 | /*def getAllTweetsText(sc: SparkContext): ArrayBuffer[String] = { 117 | val rdd = sc.cassandraTable("twitter", "tweet_filtered2").select("tweet_text").cache() 118 | 119 | var dictionnary = new ArrayBuffer[String] 120 | 121 | println("Tweets by tweets -> Create documents and vocabulary") 122 | rdd.select("tweet_text").as((i: String) => i).foreach(x => { 123 | 124 | val tweet = x 125 | .toLowerCase.split("\\s") 126 | .filter(_.length > 3) 127 | .filter(_.forall(java.lang.Character.isLetter)).mkString(" ") 128 | 129 | if (tweet.length > 1) 130 | dictionnary += tweet 131 | }) 132 | }*/ 133 | 134 | // (RDD[(VertexId, (String))], RDD[Edge[String]]) 135 | def getAllCommunicationsToGraph(sc: SparkContext): Graph[String, String] = { 136 | println(color("\nCall getAllCommunications", RED)) 137 | 138 | 139 | /* val users: RDD[(VertexId, (String))] = 140 | sc.parallelize(List( 141 | (2732329846L, "Michael"), 142 | (132988448L, "David"), 143 | (473822999L, "Sarah"), 144 | (2932436311L, "Jean"), 145 | (2249679902L, "Raphael"), 146 | (601389784L, "Lucie"), 147 | (2941487254L, "Harold"), 148 | (1192483885L, "Pierre"), 149 | (465776805L, "Christophe"), 150 | (838147628L, "Zoe"), 151 | (2564641105L, "Fabien"), 152 | (1518391292L, "Nicolas") 153 | ))*/ 154 | 155 | 156 | // Collection of vertices (contains users) 157 | // val collectionVertices = ListBuffer[(Long, String)]() 158 | 159 | 160 | // val users: RDD[(VertexId, (String))] = sc.parallelize(collectionVertices) 161 | 162 | 163 | //val con = sc.cassandraTable("twitter", "user_filtered") 164 | //con.toArray.foreach(println) 165 | /*println("Test -1") 166 | 167 | var t0 = System.nanoTime() 168 | for (row <- query) { 169 | 170 | } 171 | 172 | var t1 = System.nanoTime() 173 | println("Elapsed time: " + (t1 - t0) + "ns")*/ 174 | 175 | // val query = sc.cassandraTable("twitter", "user_filtered").select("user_local_id", "user_screen_name") 176 | 177 | 178 | /*val con = query.map{ 179 | case result => (result._1, result._2) 180 | }*/ 181 | val cc = new CassandraSQLContext(sc) 182 | 183 | println("Test 0") 184 | var t0 = System.nanoTime() 185 | val rdd0 = cc.sql("SELECT user_local_id, user_screen_name from twitter.user_filtered") 186 | 187 | val pelo = rdd0.map(p => (p(0).toString.toLong, p(1).toString)).cache() 188 | 189 | val rdd1 = cc.sql("SELECT tweet_id, user_send_local_id, user_dest_id from twitter.users_communicate") 190 | 191 | val pelo2 = rdd1.map(p => Edge(p(1).toString.toLong, p(2).toString.toLong, p(0).toString)).cache() 192 | 193 | Graph(pelo, pelo2) 194 | 195 | /*println("okkk") 196 | 197 | graphh.vertices.foreach(println(_)) 198 | 199 | 200 | //pelo.foreach(println(_)) 201 | 202 | println("After collecting") 203 | 204 | rdd0.show() 205 | 206 | for (row <- rdd0) { 207 | //println(row(0)) 208 | 209 | collectionVertices += ((row(0).toString.toLong, row(1).toString)) 210 | //collectionVertices.append((row(0).toString.toLong, row(1).toString)) 211 | } 212 | var t1 = System.nanoTime() 213 | 
println("Elapsed time: " + (t1 - t0) + "ns") 214 | 215 | 216 | println("Test 1") 217 | t0 = System.nanoTime() 218 | 219 | val rdd = cc.sql("SELECT user_local_id, user_screen_name from twitter.user_filtered LIMIT 100").persist() 220 | for (row <- rdd) { 221 | collectionVertices += ((row(0).toString.toLong, row(1).toString)) 222 | } 223 | rdd.unpersist() 224 | t1 = System.nanoTime() 225 | println("Elapsed time: " + (t1 - t0) + "ns") 226 | 227 | 228 | 229 | println("Test 2") 230 | t0 = System.nanoTime() 231 | val rdd2 = cc.sql("SELECT user_local_id, user_screen_name from twitter.user_filtered limit 10000").cache() 232 | for (row <- rdd2) { 233 | collectionVertices += ((row(0).toString.toLong, row(1).toString)) 234 | } 235 | t1 = System.nanoTime() 236 | println("Elapsed time: " + (t1 - t0) + "ns") 237 | 238 | println("Test 3") 239 | t0 = System.nanoTime() 240 | 241 | for (row <- cc.sql("SELECT user_local_id, user_screen_name from twitter.user_filtered limit 10000")) { 242 | collectionVertices += ((row(0).toString.toLong, row(1).toString)) 243 | } 244 | t1 = System.nanoTime() 245 | println("Elapsed time: " + (t1 - t0) + "ns") 246 | 247 | 248 | 249 | 250 | 251 | 252 | println("f") 253 | // println(rdd.take(1)) 254 | println("f2") 255 | * 256 | /* 257 | println("Query 1 ok") 258 | */ 259 | // Save result to ArrayBuffer 260 | //if (query.collect().length != 0) { 261 | //collectionVertices += ((query.first().getString("user_local_id").toLong, query.first().getString("user_local_id").toString)) 262 | println(query.first().getString("user_local_id")) 263 | // } 264 | 265 | //collectionVertices.foreach(println(_)) 266 | 267 | println("Query 1 Collect ok") 268 | 269 | 270 | 271 | // Collection of edges (contains communications between users) 272 | val collectionEdge = ArrayBuffer[Edge[String]]() 273 | 274 | 275 | //query = sc.cassandraTable("twitter", "users_communicate").select("user_send_local_id", "user_dest_id", "tweet_id").toArray() 276 | 277 | println("Query 2 ok") 278 | // Save result to ArrayBuffer 279 | /*if (query.collect().length != 0) { 280 | collectionEdge += Edge(query.first().getString("user_send_local_id").toLong, query.first().getString("user_dest_id").toLong, query.first().getString("tweet_id").toString) 281 | }*/ 282 | 283 | //collectionEdge.foreach(println(_)) 284 | 285 | println("Query 2 Collect ok") 286 | 287 | // Convert vertices to RDD 288 | val VerticesRDD = sc.parallelize(collectionVertices) 289 | 290 | // Convert it to RDD 291 | val EdgeRDD = sc.parallelize(collectionEdge) 292 | 293 | println("Total vertices: " + collectionVertices.length) 294 | println("Total edges: " + collectionEdge.length) 295 | 296 | (VerticesRDD, EdgeRDD)*/ 297 | } 298 | } -------------------------------------------------------------------------------- /scala/FindCommunities/src/main/scala/utils/CommunityUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.graphx._ 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.math._ 8 | import scala.reflect.ClassTag 9 | 10 | class CommunityUtils extends Logging { 11 | 12 | val RED = "\033[1;30m" 13 | val ENDC = "\033[0m" 14 | 15 | /** 16 | * splitCommunity 17 | * 18 | * Find and split communities in graph 19 | * 20 | * @param Graph[String,String] $graph - Graph element 21 | * @param RDD[(VertexId, (String))] $users - Vertices 22 | * @param Boolean $displayResult - if true, display println 23 | * @return ArrayBuffer[Graph[String,String]] - 
Contains one graph per community 24 | * 25 | */ 26 | def splitCommunity(graph: Graph[String, String], users: RDD[(VertexId, (String))], NBKCORE: Int, displayResult: Boolean): Graph[String, String] = { 27 | 28 | println(color("\nCall SplitCommunity", RED)) 29 | 30 | getKCoreGraph(graph, users, NBKCORE, displayResult).cache() 31 | } 32 | 33 | /** 34 | * Compute the k-core decomposition of the graph for all k <= kmax. This 35 | * uses the iterative pruning algorithm discussed by Alvarez-Hamelin et al. 36 | * in K-Core Decomposition: a Tool For the Visualization of Large Scale Networks 37 | * (see http://arxiv.org/abs/cs/0504107). 38 | * 39 | * @tparam VD the vertex attribute type (discarded in the computation) 40 | * @tparam ED the edge attribute type (preserved in the computation) 41 | * 42 | * @param graph the graph for which to compute the connected components 43 | * @param kmax the maximum value of k to decompose the graph 44 | * 45 | * @return a graph where the vertex attribute is the minimum of 46 | * kmax or the highest value k for which that vertex was a member of 47 | * the k-core. 48 | * 49 | * @note This method has the advantage of returning not just a single kcore of the 50 | * graph but will yield all the cores for k > kmin. 51 | */ 52 | def getKCoreGraph[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], 53 | users: RDD[(VertexId, (String))], 54 | kmin: Int, 55 | displayResult: Boolean): Graph[String, ED] = { 56 | 57 | // Graph[(Int, Boolean), ED] - boolean indicates whether it is active or not 58 | var g = graph.cache().outerJoinVertices(graph.degrees)((vid, oldData, newData) => newData.getOrElse(0)).cache() 59 | 60 | println(color("\nCall KCoreDecomposition", RED)) 61 | 62 | g = computeCurrentKCore(g, kmin).cache() 63 | 64 | val v = g.vertices.filter { case (vid, vd) => vd >= kmin }.cache() 65 | 66 | // Display informations 67 | if (displayResult) { 68 | val degrees = graph.degrees 69 | val numVertices = degrees.count() 70 | val testK = kmin 71 | val vCount = g.vertices.filter { case (vid, vd) => vd >= kmin }.count() 72 | val eCount = g.triplets.map { t => t.srcAttr >= testK && t.dstAttr >= testK }.count() 73 | 74 | logWarning(s"Number of vertices: $numVertices") 75 | logWarning(s"Degree sample: ${degrees.take(10).mkString(", ")}") 76 | logWarning(s"Degree distribution: " + degrees.map { case (vid, data) => (data, 1) }.reduceByKey(_ + _).collect().mkString(", ")) 77 | logWarning(s"Degree distribution: " + degrees.map { case (vid, data) => (data, 1) }.reduceByKey(_ + _).take(10).mkString(", ")) 78 | logWarning(s"K=$kmin, V=$vCount, E=$eCount") 79 | } 80 | 81 | // Create new RDD users 82 | val newUser = users.join(v).map { 83 | case (id, (username, rank)) => (id, username) 84 | } 85 | 86 | // Create a new graph 87 | val gra = Graph(newUser, g.edges) 88 | 89 | // Remove missing vertices as well as the edges to connected to them 90 | gra.subgraph(vpred = (id, username) => username != null).cache() 91 | } 92 | 93 | def computeCurrentKCore[ED: ClassTag](graph: Graph[Int, ED], k: Int) = { 94 | println("Computing kcore for k=" + k) 95 | def sendMsg(et: EdgeTriplet[Int, ED]): Iterator[(VertexId, Int)] = { 96 | if (et.srcAttr < 0 || et.dstAttr < 0) { 97 | // if either vertex has already been turned off we do nothing 98 | Iterator.empty 99 | } else if (et.srcAttr < k && et.dstAttr < k) { 100 | // tell both vertices to turn off but don't need change count value 101 | Iterator((et.srcId, -1), (et.dstId, -1)) 102 | 103 | } else if (et.srcAttr < k) { 104 | // if src is being pruned, tell dst 
to subtract from vertex count 105 | Iterator((et.srcId, -1), (et.dstId, 1)) 106 | 107 | } else if (et.dstAttr < k) { 108 | // if dst is being pruned, tell src to subtract from vertex count 109 | Iterator((et.dstId, -1), (et.srcId, 1)) 110 | 111 | } else { 112 | Iterator.empty 113 | } 114 | } 115 | 116 | // subtracts removed neighbors from neighbor count and tells vertex whether it was turned off or not 117 | def mergeMsg(m1: Int, m2: Int): Int = { 118 | if (m1 < 0 || m2 < 0) { 119 | -1 120 | } else { 121 | m1 + m2 122 | } 123 | } 124 | 125 | def vProg(vid: VertexId, data: Int, update: Int): Int = { 126 | if (update < 0) { 127 | // if the vertex has turned off, keep it turned off 128 | -1 129 | } else { 130 | // subtract the number of neighbors that have turned off this round from 131 | // the count of active vertices 132 | // TODO(crankshaw) can we ever have the case data < update? 133 | max(data - update, 0) 134 | } 135 | } 136 | 137 | // Note that initial message should have no effect 138 | Pregel(graph, 0)(vProg, sendMsg, mergeMsg) 139 | } 140 | 141 | 142 | /** 143 | * @constructor time 144 | * 145 | * timer for profiling block 146 | * 147 | * @param R $block - Block executed 148 | * @return Unit 149 | */ 150 | def time[R](block: => R): R = { 151 | val t0 = System.nanoTime() 152 | val result = block // call-by-name 153 | val t1 = System.nanoTime() 154 | println("Elapsed time: " + (t1 - t0) / 1000000000.0 + " seconds") 155 | result 156 | } 157 | 158 | def subgraphCommunities(graph: Graph[String, String], users: RDD[(VertexId, (String))], displayResult: Boolean): (Array[Graph[String, String]], Array[Long]) = { 159 | 160 | println(color("\nCall subgraphCommunities", RED)) 161 | 162 | // Find the connected components 163 | val cc = time { 164 | graph.connectedComponents().vertices.cache() 165 | } 166 | 167 | // Join the connected components with the usernames and id 168 | // The result is an RDD not a Graph 169 | val ccByUsername = users.join(cc).map { 170 | case (id, (username, cci)) => (id, username, cci) 171 | }.cache() 172 | 173 | // Print the result 174 | val lowerIDPerCommunity = ccByUsername.map { case (id, username, cci) => cci }.distinct().cache() 175 | 176 | // Result will be stored in an array 177 | //var result = new ArrayBuffer[Graph[String, String]]() 178 | println("--------------------------") 179 | println("Total community found: " + lowerIDPerCommunity.count()) 180 | println("--------------------------") 181 | 182 | 183 | val collectIDsCommunity = lowerIDPerCommunity.collect() 184 | 185 | val result = collectIDsCommunity.map(colID => Graph(ccByUsername.filter { 186 | _._3 == colID 187 | }.map { case (id, username, cc) => (id, username) }, graph.edges).subgraph(vpred = (id, username) => username != null).cache()) 188 | 189 | // Display communities 190 | if (displayResult) { 191 | println("\nCommunities found " + result.length) 192 | for (community <- result) { 193 | println("-----------------------") 194 | community.edges.collect().foreach(println(_)) 195 | community.vertices.collect().foreach(println(_)) 196 | } 197 | } 198 | 199 | cc.unpersist() 200 | lowerIDPerCommunity.unpersist() 201 | 202 | (result, collectIDsCommunity) 203 | } 204 | 205 | /** 206 | * getTriangleCount 207 | * 208 | * Compute the number of triangles passing through each vertex. 
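   * A triangle is a set of three mutually connected vertices: for example, the edges
   * A-B, B-C and C-A give each of A, B and C a triangle count of 1.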
209 | * 210 | * @param Graph[String,String] $graph - Graph element 211 | * @param RDD[(VertexId, (String))] $users - Vertices 212 | * @return Unit 213 | * 214 | * @see [[org.apache.spark.graphx.lib.TriangleCount$#run]] 215 | */ 216 | def getTriangleCount(graph: Graph[String, String], users: RDD[(VertexId, (String))]): Unit = { 217 | 218 | println(color("\nCall getTriangleCount", RED)) 219 | 220 | // Sort edges ID srcID < dstID 221 | val edges = graph.edges.map { e => 222 | if (e.srcId < e.dstId) { 223 | Edge(e.srcId, e.dstId, e.attr) 224 | } 225 | else { 226 | Edge(e.dstId, e.srcId, e.attr) 227 | } 228 | } 229 | 230 | // Temporary graph 231 | val newGraph = Graph(users, edges, "").cache() 232 | 233 | // Find the triangle count for each vertex 234 | // TriangleCount requires the graph to be partitioned 235 | val triCounts = newGraph.partitionBy(PartitionStrategy.RandomVertexCut).cache().triangleCount().vertices 236 | 237 | val triCountByUsername = users.join(triCounts).map { 238 | case (id, (username, rank)) => (id, username, rank) 239 | } 240 | 241 | println("Display triangle's sum for each user") 242 | triCountByUsername.foreach(println) 243 | 244 | println("\nTotal: " + triCountByUsername.map { case (id, username, rank) => rank }.distinct().count() + "\n") 245 | } 246 | 247 | /** 248 | * @constructor ConnectedComponents 249 | * 250 | * Compute the connected component membership of each vertex and return a graph with the vertex 251 | * value containing the lowest vertex id in the connected component containing that vertex. 252 | * 253 | * @param Graph[String,String] $graph - Graph element 254 | * @param RDD[(VertexId, (String))] $users - Vertices 255 | * @return Unit 256 | * 257 | * @see [[org.apache.spark.graphx.lib.ConnectedComponents$#run]] 258 | */ 259 | def cc(graph: Graph[String, String], users: RDD[(VertexId, (String))]): Unit = { 260 | println(color("\nCall ConnectedComponents", RED)) 261 | 262 | // Find the connected components 263 | val cc = graph.connectedComponents().vertices 264 | 265 | // Join the connected components with the usernames and id 266 | val ccByUsername = users.join(cc).map { 267 | case (id, (username, cc)) => (id, username, cc) 268 | } 269 | // Print the result 270 | println(ccByUsername.collect().sortBy(_._3).mkString("\n")) 271 | 272 | println("\nTotal groups: " + ccByUsername.map { case (id, username, cc) => cc }.distinct().count() + "\n") 273 | } 274 | 275 | /** 276 | * @constructor StronglyConnectedComponents 277 | * 278 | * Compute the strongly connected component (SCC) of each vertex and return a graph with the 279 | * vertex value containing the lowest vertex id in the SCC containing that vertex. 
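   * (In a strongly connected component every vertex can reach every other vertex by
   * following directed edges. Note: the implementation below passes a fixed value (5)
   * to stronglyConnectedComponents rather than the $iteration parameter.)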
280 | * 281 | * Display edges's membership and total groups 282 | * 283 | * @param Graph[String,String] $graph - Graph element 284 | * @param Int $iteration - Number of iteration 285 | * @return Unit 286 | */ 287 | def scc(graph: Graph[String, String], iteration: Int): Unit = { 288 | 289 | println(color("\nCall StronglyConnectedComponents : iteration : " + iteration, RED)) 290 | val sccGraph = graph.stronglyConnectedComponents(5) 291 | 292 | val connectedGraph = sccGraph.vertices.map { 293 | case (member, leaderGroup) => s"$member is in the group of $leaderGroup's edge" 294 | } 295 | 296 | val totalGroups = sccGraph.vertices.map { 297 | case (member, leaderGroup) => leaderGroup 298 | } 299 | 300 | connectedGraph.collect().foreach(println) 301 | 302 | println("\nTotal groups: " + totalGroups.distinct().count() + "\n") 303 | } 304 | 305 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 306 | } -------------------------------------------------------------------------------- /scala/FindCommunities/src/main/scala/utils/GraphUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | // To make some of the examples work we will also need RDD 4 | 5 | import org.apache.spark.graphx._ 6 | import org.apache.spark.rdd.RDD 7 | 8 | 9 | class GraphUtils extends serializable { 10 | 11 | val RED = "\033[1;30m" 12 | val ENDC = "\033[0m" 13 | private val defaultSeed = 0xadc83b19L 14 | 15 | /** 16 | * @constructor murmurHash64A 17 | * 18 | * @param 19 | * @param 20 | * @return Long 21 | * 22 | */ 23 | def murmurHash64A(data: Seq[Byte], seed: Long = defaultSeed): Long = { 24 | val m = 0xc6a4a7935bd1e995L 25 | val r = 47 26 | 27 | val f: Long => Long = m.* 28 | val g: Long => Long = x => x ^ (x >>> r) 29 | 30 | val h = data.grouped(8).foldLeft(seed ^ f(data.length)) { case (y, xs) => 31 | val k = xs.foldRight(0L)((b, x) => (x << 8) + (b & 0xff)) 32 | val j: Long => Long = if (xs.length == 8) f compose g compose f else identity 33 | f(y ^ j(k)) 34 | } 35 | (g compose f compose g)(h) 36 | } 37 | 38 | /** 39 | * @constructor getPageRank 40 | * 41 | * Run PageRank for a fixed number of iterations returning a graph with vertex attributes 42 | * containing the PageRank and edge attributes the normalized edge weight. 
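   * Note: the call below uses graph.pageRank(0.00001), the tolerance-based variant that
   * iterates until the ranks converge, rather than a fixed number of iterations.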
43 | * 44 | * @param Graph[String,String] $graph - Graph element 45 | * @param RDD[(VertexId, (String))] $users - Vertices 46 | * @return Unit 47 | * 48 | * @see [[org.apache.spark.graphx.lib.PageRank$#run]] 49 | */ 50 | def getPageRank(graph: Graph[String, String], users: RDD[(VertexId, (String))]): Unit = { 51 | 52 | println(color("\nCall getPageRank", RED)) 53 | 54 | val ranks = graph.pageRank(0.00001).vertices 55 | 56 | val ranksByUsername = users.join(ranks).map { 57 | case (id, (username, rank)) => (id, username, rank) 58 | } 59 | 60 | // Print the result descending 61 | println(ranksByUsername.collect().sortBy(_._3).reverse.mkString("\n")) 62 | } 63 | 64 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 65 | 66 | /** 67 | * @constructor inAndOutDegrees 68 | * 69 | * @param Graph[String,String] $graph - Graph element 70 | * @return Unit 71 | * 72 | */ 73 | def inAndOutDegrees(graph: Graph[String, String]): Unit = { 74 | 75 | println(color("\nCall inAndOutDegrees", RED)) 76 | 77 | // Create User class 78 | case class User(name: String, // Username 79 | inDeg: Int, // Received tweets 80 | outDeg: Int) // Sent tweets 81 | 82 | // Create user Graph 83 | // def mapVertices[VD2](map: (VertexID, VD) => VD2): Graph[VD2, ED] 84 | val initialUserGraph: Graph[User, String] = graph.mapVertices { 85 | case (id, (name)) => User(name, 0, 0) 86 | } 87 | 88 | //initialUserGraph.edges.collect.foreach(println(_)) 89 | 90 | 91 | // Fill in the degree informations (out and in degrees) 92 | val userGraph = initialUserGraph.outerJoinVertices(initialUserGraph.inDegrees) { 93 | case (id, u, inDegOpt) => User(u.name, inDegOpt.getOrElse(0), u.outDeg) 94 | }.outerJoinVertices(initialUserGraph.outDegrees) { 95 | case (id, u, outDegOpt) => User(u.name, u.inDeg, outDegOpt.getOrElse(0)) 96 | } 97 | 98 | // Display the userGraph 99 | userGraph.vertices.foreach { 100 | case (id, u) => println(s"User $id is called ${u.name} and received ${u.inDeg} tweets and send ${u.outDeg}.") 101 | } 102 | } 103 | } -------------------------------------------------------------------------------- /scala/FindCommunities/src/main/scala/utils/MllibUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import org.apache.spark.mllib.clustering._ 4 | import org.apache.spark.mllib.linalg.{Vector, Vectors} 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.collection.mutable 8 | 9 | /** 10 | * Topic models automatically infer the topics discussed in a collection of documents. These topics can be used 11 | * to summarize and organize documents, or used for featurization and dimensionality reduction in later stages 12 | * of a Machine Learning (ML) pipeline. 13 | * 14 | * LDA is not given topics, so it must infer them from raw text. LDA defines a topic as a distribution over words. 15 | */ 16 | class MllibUtils { 17 | 18 | // Terminal Color 19 | val RED = "\033[1;30m" 20 | val ENDC = "\033[0m" 21 | 22 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 23 | 24 | def createdoc(tokenizedCorpus: RDD[String]): ((Seq[(Long, Vector)], Array[String], Map[String, Int], Array[String])) = { 25 | 26 | println(color("\nCall createdoc", RED)) 27 | 28 | // Choose the vocabulary. 
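    // (Descriptive note: terms are sorted by descending frequency below and the
    //  numStopwords most frequent terms are dropped, as a rough stop-word filter.)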
29 | // termCounts: Sorted list of (term, termCount) pairs 30 | val termCounts: Array[(String, Long)] = 31 | tokenizedCorpus.map(_ -> 1L).reduceByKey(_ + _).collect().sortBy(-_._2) 32 | 33 | // vocabArray: Chosen vocab (removing common terms) 34 | val numStopwords = 20 35 | val vocabArray: Array[String] = 36 | termCounts.takeRight(termCounts.length - numStopwords).map(_._1) 37 | 38 | // vocab: Map term -> term index 39 | val vocab: Map[String, Int] = vocabArray.zipWithIndex.toMap 40 | 41 | val tokenCollected = tokenizedCorpus.collect() 42 | 43 | 44 | // MAP : [ Word ID , VECTOR [vocab.size, WordFrequency]] 45 | val documents: Map[Long, Vector] = vocab.map { case (tokens, id) => 46 | 47 | val counts = new mutable.HashMap[Int, Double]() 48 | 49 | // Word ID 50 | val idx = vocab(tokens) 51 | 52 | // Count word occurancy 53 | counts(idx) = counts.getOrElse(idx, 0.0) + tokenCollected.count(_ == tokens) 54 | 55 | // Return word ID and Vector 56 | (id.toLong, Vectors.sparse(vocab.size, counts.toSeq)) 57 | } 58 | 59 | (documents.toSeq, tokenizedCorpus.collect(), vocab, tokenizedCorpus.collect()) 60 | } 61 | 62 | 63 | def cosineSimilarity(tokenizedCorpus: RDD[String], vocab: Map[String, Int], tokenizedTweet: Array[String]): (Seq[(Long, Vector)]) = { 64 | 65 | println(color("\nCall cosineSimilarity", RED)) 66 | 67 | val document: Map[Long, Vector] = vocab.map { case (tokens, id) => 68 | 69 | val counts2 = new mutable.HashMap[Int, Double]() 70 | 71 | // Word ID 72 | val idx = vocab(tokens) 73 | 74 | // Count word occurancy 75 | counts2(idx) = counts2.getOrElse(idx, 0.0) + tokenizedTweet.count(_ == tokens).toDouble 76 | 77 | // Return word ID and Vector 78 | (id.toLong, Vectors.sparse(vocab.size, counts2.toSeq)) 79 | } 80 | 81 | document.toSeq 82 | } 83 | 84 | /** 85 | * @constructor findTopics 86 | * 87 | * Set currentTweet attribut and add the new tweet to the dictionnary 88 | * 89 | * @param LDAModel $ldaModel - LDA Model (LocalModel) 90 | * @param Array[String] $vocabArray - Contains all distinct words set to LDA 91 | * @param Int $numWordsByTopics - 92 | * @param Boolean $displayResult - Display result in console 93 | * 94 | * @return LDAModel 95 | */ 96 | def findTopics(ldaModel: LDAModel, vocabArray: Array[String], T: String, SG: Int, numWordsByTopics: Int, displayResult: Boolean): Seq[(String, String, String, String)] = { 97 | 98 | println(color("\nCall findTopics", RED)) 99 | 100 | println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize + " words):") 101 | 102 | val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = numWordsByTopics) 103 | 104 | var it = 0 105 | var seqC = List[(String, String, String, String)]() 106 | 107 | // Print topics, showing top-weighted x terms for each topic. 
108 | topicIndices.foreach { case (terms, termWeights) => 109 | 110 | if (displayResult) 111 | println("TOPICS:") 112 | 113 | val tabTopics = terms.zip(termWeights).map(vector => vocabArray(vector._1.toInt).toString).mkString(";") 114 | 115 | if (displayResult) { 116 | terms.zip(termWeights).foreach { case (term, weight) => 117 | println(s"${vocabArray(term.toInt)}\t\t$weight") 118 | } 119 | } 120 | 121 | seqC = seqC :+(T, SG.toString, it.toString, tabTopics) 122 | 123 | println("T: " + T + " SG: " + SG + "TopicN: " + it + " c: " + tabTopics) 124 | it += 1 125 | 126 | if (displayResult) 127 | println() 128 | 129 | } 130 | seqC.toSeq 131 | } 132 | } -------------------------------------------------------------------------------- /scala/FindCommunities/src/main/scala/utils/RDDUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | import scala.collection.mutable.ArrayBuffer 6 | 7 | // To make some of the examples work we will also need RDD 8 | 9 | import org.apache.spark.graphx._ 10 | import org.apache.spark.rdd.RDD 11 | 12 | 13 | class RDDUtils { 14 | 15 | val RED = "\033[1;30m" 16 | val ENDC = "\033[0m" 17 | 18 | /** 19 | * @constructor ArrayToVertices 20 | * 21 | * Convert ArrayBuffer to RDD containing Vertices 22 | * 23 | * @param SparkContext - $sc - SparkContext 24 | * @param ArrayBuffer[(Long, (String))] - $collection - Contains vertices 25 | * 26 | * @return RDD[Edge[String]] - RDD of vertices 27 | */ 28 | def ArrayToVertices(sc: SparkContext, collection: ArrayBuffer[(Long, (String))]): RDD[(VertexId, (String))] = { 29 | sc.parallelize(collection) 30 | } 31 | 32 | /** 33 | * @constructor ArrayToEdges 34 | * 35 | * Convert ArrayBuffer to RDD containing Edges 36 | * 37 | * @param SparkContext - $sc - SparkContext 38 | * @param ArrayBuffer[Edge[String]] - $collection - Contains edges 39 | * 40 | * @return RDD[Edge[String]] - RDD of edges 41 | */ 42 | def ArrayToEdges(sc: SparkContext, collection: ArrayBuffer[Edge[String]]): RDD[Edge[String]] = { 43 | sc.parallelize(collection) 44 | } 45 | 46 | /** 47 | * @constructor findUserByIDInGraph 48 | * 49 | * find user ID with username 50 | * 51 | * @param Graph[String,String] $graph - Graph element 52 | * @param Int $userID - User id 53 | * @return String - if success : username | failure : "user not found" 54 | */ 55 | def findUserNameByIDInGraph(graph: Graph[String, String], userID: Int): String = { 56 | println(color("\nCall : findUserNameWithID", RED)) 57 | 58 | graph.vertices.filter { case (id, name) => id.toString equals userID.toString }.collect().foreach { 59 | (e: (org.apache.spark.graphx.VertexId, String)) => return e._2 60 | } 61 | "user not found" 62 | } 63 | 64 | /** 65 | * @constructor findUserIDByNameInGraph 66 | * 67 | * find username with id 68 | * 69 | * @param Graph[String,String] $graph - Graph element 70 | * @param String $userName - Username 71 | * @return String - if success : id found | failure : "0" 72 | */ 73 | def findUserIDByNameInGraph(graph: Graph[String, String], userName: String): String = { 74 | println(color("\nCall : findUserIDWithName", RED)) 75 | 76 | graph.vertices.filter(_._2 == userName).collect().foreach { 77 | (e: (org.apache.spark.graphx.VertexId, String)) => return e._1.toString 78 | } 79 | "0" 80 | } 81 | 82 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 83 | 84 | /** 85 | * @constructor displayAllCommunications 86 | * 87 | * display all communications between 
users 88 | * 89 | * @param Graph[String,String] $graph - Graph element 90 | * @return Unit 91 | */ 92 | def displayAllCommunications(graph: Graph[String, String]): Unit = { 93 | 94 | println(color("\nCall : displayAllCommunications", RED)) 95 | println("Users communications: ") 96 | 97 | val facts: RDD[String] = graph.triplets.map(triplet => triplet.srcAttr + " communicate with " + 98 | triplet.dstAttr + " with tweet id " + triplet.attr) 99 | 100 | facts.collect().foreach(println(_)) 101 | } 102 | } -------------------------------------------------------------------------------- /scala/GraphxTesting/build.sbt: -------------------------------------------------------------------------------- 1 | name := "GraphxTesting" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.5" 6 | 7 | libraryDependencies ++= Seq( 8 | "org.apache.spark" %% "spark-core" % "1.4.0" % "provided", 9 | "org.apache.spark" %% "spark-graphx" % "1.4.0" % "provided", 10 | "org.apache.spark" %% "spark-mllib" % "1.4.0" % "provided") 11 | 12 | libraryDependencies += "com.datastax.spark" %% "spark-cassandra-connector" % "1.4.0-M1" 13 | 14 | libraryDependencies += "com.github.fommil.netlib" % "all" % "1.1.2" pomOnly() -------------------------------------------------------------------------------- /scala/GraphxTesting/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") -------------------------------------------------------------------------------- /scala/GraphxTesting/src/main/scala/GraphxTesting.scala: -------------------------------------------------------------------------------- 1 | import org.apache.log4j.{Level, Logger} 2 | import org.apache.spark.graphx._ 3 | import org.apache.spark.mllib.clustering.LDA 4 | import org.apache.spark.mllib.linalg.Vector 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | import utils._ 7 | 8 | import scala.collection.mutable.ArrayBuffer 9 | import scala.math._ 10 | 11 | // To make some of the examples work we will also need RDD 12 | 13 | import org.apache.spark.rdd.RDD 14 | 15 | // Useful links 16 | // http://ampcamp.berkeley.edu/big-data-mini-course/graph-analytics-with-graphx.html 17 | // https://spark.apache.org/docs/latest/graphx-programming-guide.html 18 | 19 | object GraphxTesting { 20 | 21 | val RED = "\033[1;30m" 22 | val ENDC = "\033[0m" 23 | 24 | def main(args: Array[String]) { 25 | 26 | println("\n\n**************************************************************") 27 | println("****************** GraphxTesting ******************") 28 | println("**************************************************************\n") 29 | 30 | val cu = new CassandraUtils 31 | val comUtils = new CommunityUtils 32 | val gu = new GraphUtils 33 | val ru = new RDDUtils 34 | 35 | // Display only warning and infos messages 36 | Logger.getLogger("org").setLevel(Level.ERROR) 37 | Logger.getLogger("akka").setLevel(Level.ERROR) 38 | 39 | // Not displaying infos messages 40 | //Logger.getLogger("org").setLevel(Level.OFF) 41 | //Logger.getLogger("akka").setLevel(Level.OFF) 42 | 43 | // Spark configuration 44 | val sparkConf = new SparkConf(true) 45 | .setMaster("local[2]") 46 | .setAppName("GraphxTesting") 47 | .set("spark.cassandra.connection.host", "127.0.0.1") // Link to Cassandra 48 | 49 | // Init SparkContext 50 | val sc = new SparkContext(sparkConf) 51 | 52 | // Create Vertices and Edges 53 | val (users, relationships, defaultUser) = initGraph(sc) 54 | 55 | // Build the initial Graph 56 | val 
graph = Graph(users, relationships, defaultUser).cache() 57 | 58 | /* 59 | 60 | println("\n**************************************************************") 61 | println(" TEST METHODS ") 62 | println("**************************************************************") 63 | 64 | println("\n--------------------------------------------------------------") 65 | println("Operations on tweets") 66 | println("--------------------------------------------------------------\n") 67 | 68 | // See who communicates with who 69 | time { ru displayAllCommunications(graph) } 70 | 71 | // Let's find user id 72 | val id = time { ru findUserIDByNameInGraph(graph, "Michael") } 73 | println("ID for user Michael is : " + id.toString) 74 | 75 | // Find username with user ID 76 | val name = time { ru findUserNameByIDInGraph(graph, 1) } 77 | println("Name for id 1 is : " + name.toString) 78 | 79 | // get tweet content with tweet ID 80 | var resultGetTweetContentFromID = time { cu getTweetContentFromID(sc,"606461329357045760") } 81 | println(resultGetTweetContentFromID) 82 | 83 | // this one does not exist 84 | resultGetTweetContentFromID = time { cu getTweetContentFromID(sc,"604230254979346433") } 85 | println(resultGetTweetContentFromID) 86 | 87 | // Get tweets from user 88 | val resultGetTweetsIDFromUser = time { cu getTweetsIDFromUser(sc,"209144549") } 89 | resultGetTweetsIDFromUser.foreach(println(_)) 90 | 91 | // Count in and out degrees 92 | //time { gu inAndOutDegrees(graph) } 93 | 94 | 95 | println("\n--------------------------------------------------------------") 96 | println("Community detection") 97 | println("--------------------------------------------------------------\n") 98 | 99 | // Call ConnectedComponents 100 | time { comUtils cc(graph, users) } 101 | 102 | // Call StronglyConnectedComponents 103 | time { comUtils scc(graph, 1) } 104 | 105 | // Get triangle Count 106 | time { comUtils getTriangleCount(graph, users) } 107 | 108 | // Get PageRank 109 | time { gu getPageRank(graph, users) } 110 | 111 | // K-Core decomposition 112 | time { comUtils getKCoreGraph(graph, users, 4, true) } 113 | 114 | // LabelPropagation 115 | val graphLabelPropagation = time { LabelPropagation.run(graph, 4).cache() } 116 | 117 | println("VERTICES") 118 | graphLabelPropagation.vertices.collect.foreach(println(_)) 119 | 120 | val labelVertices = graphLabelPropagation.vertices 121 | 122 | val displayVertices = users.join(labelVertices).map { 123 | case (id, (username, rank)) => (id, username, rank) 124 | } 125 | println("VERTICES NAMED") 126 | 127 | // Print the result descending 128 | println(displayVertices.collect().sortBy(_._3).reverse.mkString("\n")) 129 | println("EDGES") 130 | 131 | graphLabelPropagation.edges.collect.foreach(println(_)) 132 | 133 | 134 | println("\n**************************************************************") 135 | println(" FIRST EXAMPLE ") 136 | println("**************************************************************") 137 | 138 | 139 | println("\n--------------------------------------------------------------") 140 | println("First Step - K-Core Decomposition algorithm") 141 | println("--------------------------------------------------------------") 142 | 143 | // K-Core decomposition 144 | val graph_2 = time { comUtils getKCoreGraph(graph, users, 5, false) }.cache() 145 | 146 | graph_2.edges.collect.foreach(println(_)) 147 | graph_2.vertices.collect.foreach(println(_)) 148 | 149 | println("\n--------------------------------------------------------------") 150 | println("Second Step - Connected 
Components algorithm") 151 | println("--------------------------------------------------------------") 152 | 153 | // Call ConnectedComponents 154 | time { comUtils cc(graph_2, graph_2.vertices) } 155 | 156 | println("\n--------------------------------------------------------------") 157 | println("Third Step - Get Tweets from Edges") 158 | println("--------------------------------------------------------------") 159 | 160 | val corpusWords = time { cu getTweetsContentFromEdge(sc, graph_2.edges, true) } 161 | corpusWords.foreach(println(_)) 162 | 163 | /*println("\n--------------------------------------------------------------") 164 | println("Fourth Step - LDA Algorithm") 165 | println("--------------------------------------------------------------") 166 | 167 | val nTopics = 10 168 | val nIterations = 10 169 | val nWordsByTopics = 10 170 | val nStopwords = 20 171 | time { mu getLDA(sc, corpusWords, nTopics, nIterations, nWordsByTopics, nStopwords, true) }*/ 172 | 173 | */ 174 | 175 | 176 | println("\n**************************************************************") 177 | println(" SECOND EXAMPLE ") 178 | println("**************************************************************") 179 | 180 | println("\n--------------------------------------------------------------") 181 | println("First Step - Split community : \n" + 182 | "\t Connected Components algorithm to find different\n" + 183 | "\t communities") 184 | println("--------------------------------------------------------------") 185 | 186 | //time { comUtils cc(graph, graph.vertices) } 187 | 188 | val subGraphes = time { 189 | comUtils splitCommunity(graph, users, false) 190 | } 191 | 192 | println("\n--------------------------------------------------------------") 193 | println("Second Step - Calculate LDA for every communities\n" + 194 | "\t 1. Get Tweets from Edges\n" + 195 | "\t 2. 
LDA Algorithm") 196 | println("--------------------------------------------------------------") 197 | var iComm = 1 198 | //for (community <- subGraphes){ 199 | println("--------------------------") 200 | println("Community : " + iComm) 201 | println("--------------------------") 202 | //community.edges.collect().foreach(println(_)) 203 | //community.vertices.collect().foreach(println(_)) 204 | 205 | println("--------------------------") 206 | println("Get Tweets from Edges") 207 | println("--------------------------") 208 | //val corpus = time { cu getTweetsContentFromEdge(sc, community.edges, false) } 209 | 210 | println("--------------------------") 211 | println("LDA Algorithm") 212 | println("--------------------------") 213 | val numTopics = 5 214 | val numIterations = 10 215 | val numWordsByTopics = 5 216 | val numStopwords = 0 217 | 218 | // Initialize LDA 219 | println(color("\nCall InitLDA", RED)) 220 | 221 | val topicSmoothing = 1.2 222 | val termSmoothing = 1.2 223 | 224 | // Set LDA parameters 225 | val lda = new LDA() 226 | .setOptimizer("online") 227 | .setK(numTopics) 228 | .setDocConcentration(topicSmoothing) 229 | .setTopicConcentration(termSmoothing) 230 | .setMaxIterations(numIterations) 231 | 232 | // Create documents 233 | var firstDoc = ArrayBuffer[String]() 234 | firstDoc += "Concentration parameter commonly named for the prior placed" 235 | 236 | // Init LDA 237 | val mu = new MllibUtils(lda, sc, firstDoc, firstDoc) 238 | 239 | // First tweet 240 | mu newTweet ("Concentration distributions topics Concentration") 241 | 242 | // Get documents and word's array 243 | val (newdoc: RDD[(Long, Vector)], newvocabArray) = time { 244 | mu createDocuments(sc, 0) 245 | } 246 | 247 | var ldaModel = lda.run(newdoc) 248 | 249 | // Find topics 250 | ldaModel = time { 251 | mu findTopics(ldaModel, newvocabArray, numWordsByTopics, true) 252 | } 253 | 254 | // Second tweet 255 | mu newTweet ("October arrived, spreading a damp chill") 256 | 257 | val (newdoc2: RDD[(Long, Vector)], newvocabArray2) = time { 258 | mu createDocuments(sc, 0) 259 | } 260 | 261 | ldaModel = lda.run(newdoc2) 262 | 263 | // Find 264 | ldaModel = time { 265 | mu findTopics(ldaModel, newvocabArray2, numWordsByTopics, true) 266 | } 267 | 268 | 269 | iComm += 1 270 | //} 271 | 272 | // Generate Vertices 273 | val collectionVertices = ArrayBuffer[(Long, String)]() 274 | collectionVertices += ((2732329846L, "Michael")) 275 | collectionVertices += ((132988448L, "Jean")) 276 | 277 | // Convert it to RDD 278 | val VerticesRDD = ru ArrayToVertices(sc, collectionVertices) 279 | 280 | // Generate Hash 281 | val random = abs(gu murmurHash64A ("MichaelCaraccio".getBytes)) 282 | 283 | // Add edges 284 | val collectionEdge = ArrayBuffer[Edge[String]]() 285 | collectionEdge += Edge(random, 132988448L, "606460188367974400") 286 | collectionEdge += Edge(2732329846L, 2941487254L, "606461336986386435") 287 | collectionEdge += Edge(2732329846L, 601389784L, "606461384767897600") 288 | 289 | // Convert it to RDD 290 | val EdgeRDD = ru ArrayToEdges(sc, collectionEdge) 291 | 292 | // Create Graph 293 | val testGraph = Graph(VerticesRDD, EdgeRDD) 294 | 295 | testGraph.vertices.collect.foreach(println(_)) 296 | testGraph.edges.collect.foreach(println(_)) 297 | } 298 | 299 | /** 300 | * @constructor time 301 | * 302 | * timer for profiling block 303 | * 304 | * @param R $block - Block executed 305 | * @return Unit 306 | */ 307 | def time[R](block: => R): R = { 308 | val t0 = System.nanoTime() 309 | val result = block // call-by-name 310 | 
val t1 = System.nanoTime() 311 | println("Elapsed time: " + (t1 - t0) / 1000000000.0 + " seconds") 312 | result 313 | } 314 | 315 | /** 316 | * @constructor initGraph 317 | * 318 | * init data - construct graph and populate it 319 | * 320 | * @param SparkContext $sc - Sparkcontext 321 | * @return RDD[(VertexId, (String))] - users (Vertices) 322 | * RDD[Edge[String]] - relationship (Edges) 323 | * String - default user 324 | */ 325 | def initGraph(sc: SparkContext): (RDD[(VertexId, (String))], RDD[Edge[String]], String) = { 326 | println(color("\nCall : initGraph", RED)) 327 | 328 | // Create an RDD for the vertices 329 | val users: RDD[(VertexId, (String))] = 330 | sc.parallelize(Array( 331 | (2732329846L, "Michael"), 332 | (132988448L, "David"), 333 | (473822999L, "Sarah"), 334 | (2932436311L, "Jean"), 335 | (2249679902L, "Raphael"), 336 | (601389784L, "Lucie"), 337 | (2941487254L, "Harold"), 338 | (1192483885L, "Pierre"), 339 | (465776805L, "Christophe"), 340 | (838147628L, "Zoe"), 341 | (2564641105L, "Fabien"), 342 | (1518391292L, "Nicolas") 343 | )) 344 | 345 | // Create an RDD for edges 346 | val relationships: RDD[Edge[String]] = 347 | sc.parallelize(Array( 348 | Edge(2732329846L, 132988448L, "608919340121870338"), 349 | Edge(2732329846L, 2941487254L, "608919742347264000"), 350 | Edge(2732329846L, 601389784L, "608918664549687299"), 351 | Edge(601389784L, 2732329846L, "608918165117104129"), 352 | Edge(2941487254L, 1192483885L, "608921008020566016"), 353 | Edge(2941487254L, 132988448L, "608920341084258304"), 354 | Edge(132988448L, 838147628L, "608919327694270464"), 355 | Edge(838147628L, 132988448L, "608919807887552513"), 356 | Edge(838147628L, 473822999L, "608919870277869568"), 357 | Edge(465776805L, 2941487254L, "608920678117597184"), 358 | Edge(465776805L, 601389784L, "608917990365499392"), 359 | Edge(465776805L, 2249679902L, "608918336643039232"), 360 | Edge(2249679902L, 465776805L, "608919570796163072"), 361 | Edge(2932436311L, 465776805L, "608921304377475073"), 362 | Edge(1192483885L, 2941487254L, "608921260387610624"), 363 | Edge(465776805L, 2941487254L, "608918707797110784"), 364 | Edge(601389784L, 2732329846L, "608919779542339584"), 365 | Edge(2932436311L, 465776805L, "608917272883789824"), 366 | Edge(2941487254L, 465776805L, "608920374680506368"), 367 | Edge(2941487254L, 1192483885L, "608920849664450560"), 368 | Edge(2941487254L, 1192483885L, "608917634822733824"), 369 | Edge(1192483885L, 2941487254L, "608920742990868480"), 370 | Edge(1192483885L, 2941487254L, "608921092334354432"), 371 | Edge(2732329846L, 132988448L, "608917366538424320"), 372 | Edge(2941487254L, 132988448L, "608920981650976769"), 373 | Edge(132988448L, 2941487254L, "608920887639855104"), 374 | Edge(132988448L, 2941487254L, "608916751988867072"), 375 | Edge(132988448L, 2941487254L, "608919716137033730"), 376 | Edge(601389784L, 2732329846L, "608921306705354752"), 377 | Edge(601389784L, 2732329846L, "608918359913164801"), 378 | Edge(2732329846L, 2941487254L, "608920468985266176"), 379 | Edge(2732329846L, 2941487254L, "608918157806432257"), 380 | Edge(2564641105L, 1518391292L, "608918942086799360"), 381 | Edge(1518391292L, 2564641105L, "608921314104094720") 382 | )) 383 | 384 | // Define a default user in case there are relationship with missing user 385 | val defaultUser = "John Doe" 386 | 387 | (users, relationships, defaultUser) 388 | } 389 | 390 | /** 391 | * @constructor 392 | * 393 | * 394 | * 395 | * @param 396 | * @return 397 | */ 398 | /*def isVerticeInGraph(): Unit ={ 399 | 400 | }*/ 401 | 402 | def 
color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 403 | } -------------------------------------------------------------------------------- /scala/GraphxTesting/src/main/scala/utils/CassandraUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | import scala.collection.mutable.ArrayBuffer 6 | 7 | // Enable Cassandra-specific functions on the StreamingContext, DStream and RDD: 8 | 9 | import com.datastax.spark.connector._ 10 | 11 | // To make some of the examples work we will also need RDD 12 | 13 | import org.apache.spark.graphx._ 14 | import org.apache.spark.rdd.RDD 15 | 16 | 17 | class CassandraUtils { 18 | 19 | val RED = "\033[1;30m" 20 | val ENDC = "\033[0m" 21 | 22 | /** 23 | * @constructor getTweetContentFromID 24 | * 25 | * Return tweet content 26 | * 27 | * @param SparkContext sc - SparkContext 28 | * @param String $id - tweet id 29 | * @return Unit 30 | */ 31 | def getTweetContentFromID(sc: SparkContext, id: String): String = { 32 | 33 | println(color("\nCall getTweetContentFromID", RED)) 34 | 35 | val query = sc.cassandraTable("twitter", "tweet_filtered").select("tweet_text").where("tweet_id = ?", id) 36 | 37 | if (query.collect().length != 0) { 38 | query.first().getString("tweet_text") 39 | } 40 | else 41 | "Tweet not found" 42 | } 43 | 44 | /** 45 | * @constructor getTweetsIDFromUser 46 | * 47 | * Return tweet id 48 | * 49 | * @param SparkContext sc - SparkContext 50 | * @param String $id - user (sender) id 51 | * @return Unit 52 | */ 53 | def getTweetsIDFromUser(sc: SparkContext, id: String): ArrayBuffer[String] = { 54 | 55 | println(color("\nCall getTweetsIDFromUser", RED)) 56 | println("Tweets found:") 57 | 58 | val query = sc.cassandraTable("twitter", "users_communicate").select("tweet_id").where("user_send_local_id = ?", id) 59 | 60 | // Result will be stored in an array 61 | var result = ArrayBuffer[String]() 62 | 63 | if (query.collect().length != 0) { 64 | result += query.first().getString("tweet_id") 65 | } 66 | 67 | // Display result 68 | result.foreach(println(_)) 69 | 70 | // Return 71 | result 72 | } 73 | 74 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 75 | 76 | /** 77 | * @constructor getTweetsContentFromEdge 78 | * 79 | * Return an array of tweets content for a given Graph 80 | * 81 | * @param SparkContext sc - SparkContext 82 | * @param RDD[Edge[String]] $edge - graph's edge 83 | * @return Unit 84 | */ 85 | def getTweetsContentFromEdge(sc: SparkContext, edge: RDD[Edge[String]], displayResult: Boolean): RDD[String] = { 86 | 87 | println(color("\nCall getTweetsContentFromEdge", RED)) 88 | 89 | // Get the tweets ID for every communication 90 | val tweetsID = edge.flatMap({ 91 | case Edge(idSend, idExp, idTweet) => Seq(idTweet) 92 | }) 93 | 94 | // Result will be stored in an array 95 | var result = ArrayBuffer[String]() 96 | 97 | // Queries 98 | for (tweet <- tweetsID.collect()) { 99 | val query = sc.cassandraTable("twitter", "tweet_filtered").select("tweet_text").where("tweet_id = ?", tweet) 100 | 101 | if (query.collect().length != 0) { 102 | result += query.first().getString("tweet_text") 103 | } 104 | } 105 | 106 | // Display results 107 | if (displayResult) { 108 | result.foreach(println(_)) 109 | } 110 | 111 | // return 112 | sc.parallelize(result) 113 | } 114 | } -------------------------------------------------------------------------------- 
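A minimal, hypothetical sketch of how CassandraUtils is meant to be driven (the object name, tweet id and user id are placeholders; it assumes a local Cassandra node exposing the same twitter keyspace used throughout this repository, mirroring the setup in GraphxTesting.scala):

import org.apache.spark.{SparkConf, SparkContext}
import utils.CassandraUtils

object CassandraUtilsSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical driver: same local Spark / Cassandra settings as GraphxTesting.scala
    val sc = new SparkContext(new SparkConf(true)
      .setMaster("local[2]")
      .setAppName("CassandraUtilsSketch")
      .set("spark.cassandra.connection.host", "127.0.0.1"))

    val cu = new CassandraUtils

    // Look up one tweet's text by id, then every tweet id sent by a given user
    println(cu.getTweetContentFromID(sc, "606461329357045760"))
    cu.getTweetsIDFromUser(sc, "209144549").foreach(println)

    sc.stop()
  }
}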
/scala/GraphxTesting/src/main/scala/utils/CommunityUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.graphx._ 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.collection.mutable.ArrayBuffer 8 | import scala.math._ 9 | import scala.reflect.ClassTag 10 | 11 | class CommunityUtils extends Logging with Serializable { 12 | 13 | val RED = "\033[1;30m" 14 | val ENDC = "\033[0m" 15 | 16 | /** 17 | * splitCommunity 18 | * 19 | * Find and split communities in graph 20 | * 21 | * @param Graph[String,String] $graph - Graph element 22 | * @param RDD[(VertexId, (String))] $users - Vertices 23 | * @param Boolean $displayResult - if true, display println 24 | * @return ArrayBuffer[Graph[String,String]] - Contains one graph per community 25 | * 26 | */ 27 | def splitCommunity(graph: Graph[String, String], users: RDD[(VertexId, (String))], displayResult: Boolean): ArrayBuffer[Graph[String, String]] = { 28 | 29 | println(color("\nCall SplitCommunity", RED)) 30 | 31 | val graph_2 = getKCoreGraph(graph, users, 2, false).cache() 32 | 33 | // Find the connected components 34 | val cc = graph_2.connectedComponents().vertices 35 | 36 | // Join the connected components with the usernames and id 37 | // The result is an RDD not a Graph 38 | val ccByUsername = users.join(cc).map { 39 | case (id, (username, cc)) => (id, username, cc) 40 | } 41 | 42 | // Print the result 43 | val lowerIDPerCommunity = ccByUsername.map { case (id, username, cc) => cc }.distinct() 44 | 45 | // Result will be stored in an array 46 | var result = ArrayBuffer[Graph[String, String]]() 47 | println("--------------------------") 48 | println("Total community found: " + lowerIDPerCommunity.toArray.size) 49 | println("--------------------------") 50 | for (id <- lowerIDPerCommunity.toArray) { 51 | 52 | println("\nCommunity ID : " + id) 53 | 54 | val subGraphVertices = ccByUsername.filter { 55 | _._3 == id 56 | }.map { case (id, username, cc) => (id, username) } 57 | 58 | //subGraphVertices.foreach(println(_)) 59 | 60 | // Create a new graph 61 | // And remove missing vertices as well as the edges to connected to them 62 | var tempGraph = Graph(subGraphVertices, graph_2.edges).subgraph(vpred = (id, username) => username != null) 63 | 64 | result += tempGraph 65 | } 66 | 67 | // Display communities 68 | if (displayResult) { 69 | println("\nCommunities found " + result.size) 70 | for (community <- result) { 71 | println("-----------------------") 72 | //community.edges.collect().foreach(println(_)) 73 | community.vertices.collect().foreach(println(_)) 74 | } 75 | } 76 | 77 | result 78 | } 79 | 80 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 81 | 82 | /** 83 | * Compute the k-core decomposition of the graph for all k <= kmax. This 84 | * uses the iterative pruning algorithm discussed by Alvarez-Hamelin et al. 85 | * in K-Core Decomposition: a Tool For the Visualization of Large Scale Networks 86 | * (see http://arxiv.org/abs/cs/0504107). 
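 * Informally, the k-core is the maximal subgraph in which every remaining vertex has degree >= k;
 * the implementation below obtains it by iteratively pruning vertices whose degree drops below k
 * (the Pregel loop in computeCurrentKCore) until no further vertices can be removed.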
87 | * 88 | * @tparam VD the vertex attribute type (discarded in the computation) 89 | * @tparam ED the edge attribute type (preserved in the computation) 90 | * 91 | * @param graph the graph for which to compute the connected components 92 | * @param kmax the maximum value of k to decompose the graph 93 | * 94 | * @return a graph where the vertex attribute is the minimum of 95 | * kmax or the highest value k for which that vertex was a member of 96 | * the k-core. 97 | * 98 | * @note This method has the advantage of returning not just a single kcore of the 99 | * graph but will yield all the cores for k > kmin. 100 | */ 101 | def getKCoreGraph[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], 102 | users: RDD[(VertexId, (String))], 103 | kmin: Int, 104 | displayResult: Boolean): Graph[String, ED] = { 105 | 106 | // Graph[(Int, Boolean), ED] - boolean indicates whether it is active or not 107 | var g = graph.outerJoinVertices(graph.degrees)((vid, oldData, newData) => newData.getOrElse(0)).cache 108 | val degrees = graph.degrees 109 | 110 | println(color("\nCall KCoreDecomposition", RED)) 111 | 112 | g = computeCurrentKCore(g, kmin).cache 113 | val testK = kmin 114 | val vCount = g.vertices.filter { case (vid, vd) => vd >= kmin }.count() 115 | val eCount = g.triplets.map { t => t.srcAttr >= testK && t.dstAttr >= testK }.count() 116 | 117 | val v = g.vertices.filter { case (vid, vd) => vd >= kmin } 118 | 119 | // Display informations 120 | if (displayResult) { 121 | 122 | val numVertices = degrees.count 123 | 124 | logWarning(s"Number of vertices: $numVertices") 125 | logWarning(s"Degree sample: ${degrees.take(10).mkString(", ")}") 126 | logWarning(s"Degree distribution: " + degrees.map { case (vid, data) => (data, 1) }.reduceByKey((_ + _)).collect().mkString(", ")) 127 | logWarning(s"Degree distribution: " + degrees.map { case (vid, data) => (data, 1) }.reduceByKey((_ + _)).take(10).mkString(", ")) 128 | logWarning(s"K=$kmin, V=$vCount, E=$eCount") 129 | } 130 | 131 | // Create new RDD users 132 | val newUser = users.join(v).map { 133 | case (id, (username, rank)) => (id, username) 134 | } 135 | 136 | // Create a new graph 137 | val gra = Graph(newUser, g.edges) 138 | 139 | // Remove missing vertices as well as the edges to connected to them 140 | gra.subgraph(vpred = (id, username) => username != null) 141 | } 142 | 143 | def computeCurrentKCore[ED: ClassTag](graph: Graph[Int, ED], k: Int) = { 144 | //logWarning(s"Computing kcore for k=$k") 145 | def sendMsg(et: EdgeTriplet[Int, ED]): Iterator[(VertexId, Int)] = { 146 | if (et.srcAttr < 0 || et.dstAttr < 0) { 147 | // if either vertex has already been turned off we do nothing 148 | Iterator.empty 149 | } else if (et.srcAttr < k && et.dstAttr < k) { 150 | // tell both vertices to turn off but don't need change count value 151 | Iterator((et.srcId, -1), (et.dstId, -1)) 152 | 153 | } else if (et.srcAttr < k) { 154 | // if src is being pruned, tell dst to subtract from vertex count 155 | Iterator((et.srcId, -1), (et.dstId, 1)) 156 | 157 | } else if (et.dstAttr < k) { 158 | // if dst is being pruned, tell src to subtract from vertex count 159 | Iterator((et.dstId, -1), (et.srcId, 1)) 160 | 161 | } else { 162 | Iterator.empty 163 | } 164 | } 165 | 166 | // subtracts removed neighbors from neighbor count and tells vertex whether it was turned off or not 167 | def mergeMsg(m1: Int, m2: Int): Int = { 168 | if (m1 < 0 || m2 < 0) { 169 | -1 170 | } else { 171 | m1 + m2 172 | } 173 | } 174 | 175 | def vProg(vid: VertexId, data: Int, update: Int): Int 
= { 176 | if (update < 0) { 177 | // if the vertex has turned off, keep it turned off 178 | -1 179 | } else { 180 | // subtract the number of neighbors that have turned off this round from 181 | // the count of active vertices 182 | // TODO(crankshaw) can we ever have the case data < update? 183 | max(data - update, 0) 184 | } 185 | } 186 | 187 | // Note that initial message should have no effect 188 | Pregel(graph, 0)(vProg, sendMsg, mergeMsg) 189 | } 190 | 191 | /** 192 | * getTriangleCount 193 | * 194 | * Compute the number of triangles passing through each vertex. 195 | * 196 | * @param Graph[String,String] $graph - Graph element 197 | * @param RDD[(VertexId, (String))] $users - Vertices 198 | * @return Unit 199 | * 200 | * @see [[org.apache.spark.graphx.lib.TriangleCount$#run]] 201 | */ 202 | def getTriangleCount(graph: Graph[String, String], users: RDD[(VertexId, (String))]): Unit = { 203 | 204 | println(color("\nCall getTriangleCount", RED)) 205 | 206 | // Sort edges ID srcID < dstID 207 | val edges = graph.edges.map { e => 208 | if (e.srcId < e.dstId) { 209 | Edge(e.srcId, e.dstId, e.attr) 210 | } 211 | else { 212 | Edge(e.dstId, e.srcId, e.attr) 213 | } 214 | } 215 | 216 | // Temporary graph 217 | val newGraph = Graph(users, edges, "").cache() 218 | 219 | // Find the triangle count for each vertex 220 | // TriangleCount requires the graph to be partitioned 221 | val triCounts = newGraph.partitionBy(PartitionStrategy.RandomVertexCut).cache().triangleCount().vertices 222 | 223 | val triCountByUsername = users.join(triCounts).map { 224 | case (id, (username, rank)) => (id, username, rank) 225 | } 226 | 227 | println("Display triangle's sum for each user") 228 | triCountByUsername.foreach(println) 229 | 230 | println("\nTotal: " + triCountByUsername.map { case (id, username, rank) => rank }.distinct().count() + "\n") 231 | } 232 | 233 | /** 234 | * @constructor ConnectedComponents 235 | * 236 | * Compute the connected component membership of each vertex and return a graph with the vertex 237 | * value containing the lowest vertex id in the connected component containing that vertex. 238 | * 239 | * @param Graph[String,String] $graph - Graph element 240 | * @param RDD[(VertexId, (String))] $users - Vertices 241 | * @return Unit 242 | * 243 | * @see [[org.apache.spark.graphx.lib.ConnectedComponents$#run]] 244 | */ 245 | def cc(graph: Graph[String, String], users: RDD[(VertexId, (String))]): Unit = { 246 | println(color("\nCall ConnectedComponents", RED)) 247 | 248 | // Find the connected components 249 | val cc = graph.connectedComponents().vertices 250 | 251 | // Join the connected components with the usernames and id 252 | val ccByUsername = users.join(cc).map { 253 | case (id, (username, cc)) => (id, username, cc) 254 | } 255 | // Print the result 256 | println(ccByUsername.collect().sortBy(_._3).mkString("\n")) 257 | 258 | println("\nTotal groups: " + ccByUsername.map { case (id, username, cc) => cc }.distinct().count() + "\n") 259 | } 260 | 261 | /** 262 | * @constructor StronglyConnectedComponents 263 | * 264 | * Compute the strongly connected component (SCC) of each vertex and return a graph with the 265 | * vertex value containing the lowest vertex id in the SCC containing that vertex. 
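 * Note: although the method takes an $iteration parameter, the current body only uses it in the
 * log message and hard-codes graph.stronglyConnectedComponents(5).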
266 | * 267 | * Display edges's membership and total groups 268 | * 269 | * @param Graph[String,String] $graph - Graph element 270 | * @param Int $iteration - Number of iteration 271 | * @return Unit 272 | */ 273 | def scc(graph: Graph[String, String], iteration: Int): Unit = { 274 | 275 | println(color("\nCall StronglyConnectedComponents : iteration : " + iteration, RED)) 276 | val sccGraph = graph.stronglyConnectedComponents(5) 277 | 278 | val connectedGraph = sccGraph.vertices.map { 279 | case (member, leaderGroup) => s"$member is in the group of $leaderGroup's edge" 280 | } 281 | 282 | val totalGroups = sccGraph.vertices.map { 283 | case (member, leaderGroup) => leaderGroup 284 | } 285 | 286 | connectedGraph.collect.foreach(println) 287 | 288 | println("\nTotal groups: " + totalGroups.distinct().count() + "\n") 289 | } 290 | } -------------------------------------------------------------------------------- /scala/GraphxTesting/src/main/scala/utils/GraphUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | // To make some of the examples work we will also need RDD 4 | 5 | import org.apache.spark.graphx._ 6 | import org.apache.spark.rdd.RDD 7 | 8 | 9 | class GraphUtils { 10 | 11 | val RED = "\033[1;30m" 12 | val ENDC = "\033[0m" 13 | private val defaultSeed = 0xadc83b19L 14 | 15 | /** 16 | * @constructor murmurHash64A 17 | * 18 | * 19 | * @param 20 | * @param 21 | * @return Long 22 | * 23 | */ 24 | def murmurHash64A(data: Seq[Byte], seed: Long = defaultSeed): Long = { 25 | val m = 0xc6a4a7935bd1e995L 26 | val r = 47 27 | 28 | val f: Long => Long = m.* 29 | val g: Long => Long = x => x ^ (x >>> r) 30 | 31 | val h = data.grouped(8).foldLeft(seed ^ f(data.length)) { case (y, xs) => 32 | val k = xs.foldRight(0L)((b, x) => (x << 8) + (b & 0xff)) 33 | val j: Long => Long = if (xs.length == 8) f compose g compose f else identity 34 | f(y ^ j(k)) 35 | } 36 | (g compose f compose g)(h) 37 | } 38 | 39 | /** 40 | * @constructor getPageRank 41 | * 42 | * Run PageRank for a fixed number of iterations returning a graph with vertex attributes 43 | * containing the PageRank and edge attributes the normalized edge weight. 
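 * Note: the body below actually calls graph.pageRank(0.00001), GraphX's tolerance-based variant,
 * which iterates until the ranks change by less than the given tolerance rather than for a fixed
 * number of iterations.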
44 | * 45 | * @param Graph[String,String] $graph - Graph element 46 | * @param RDD[(VertexId, (String))] $users - Vertices 47 | * @return Unit 48 | * 49 | * @see [[org.apache.spark.graphx.lib.PageRank$#run]] 50 | */ 51 | def getPageRank(graph: Graph[String, String], users: RDD[(VertexId, (String))]): Unit = { 52 | 53 | println(color("\nCall getPageRank", RED)) 54 | 55 | val ranks = graph.pageRank(0.00001).vertices 56 | 57 | val ranksByUsername = users.join(ranks).map { 58 | case (id, (username, rank)) => (id, username, rank) 59 | } 60 | 61 | // Print the result descending 62 | println(ranksByUsername.collect().sortBy(_._3).reverse.mkString("\n")) 63 | } 64 | 65 | /** 66 | * @constructor inAndOutDegrees 67 | * 68 | * @param Graph[String,String] $graph - Graph element 69 | * @return Unit 70 | * 71 | */ 72 | def inAndOutDegrees(graph: Graph[String, String]): Unit = { 73 | 74 | println(color("\nCall inAndOutDegrees", RED)) 75 | 76 | // Create User class 77 | case class User(name: String, // Username 78 | inDeg: Int, // Received tweets 79 | outDeg: Int) // Sent tweets 80 | 81 | // Create user Graph 82 | // def mapVertices[VD2](map: (VertexID, VD) => VD2): Graph[VD2, ED] 83 | val initialUserGraph: Graph[User, String] = graph.mapVertices { 84 | case (id, (name)) => User(name, 0, 0) 85 | } 86 | 87 | //initialUserGraph.edges.collect.foreach(println(_)) 88 | 89 | 90 | // Fill in the degree informations (out and in degrees) 91 | val userGraph = initialUserGraph.outerJoinVertices(initialUserGraph.inDegrees) { 92 | case (id, u, inDegOpt) => User(u.name, inDegOpt.getOrElse(0), u.outDeg) 93 | }.outerJoinVertices(initialUserGraph.outDegrees) { 94 | case (id, u, outDegOpt) => User(u.name, u.inDeg, outDegOpt.getOrElse(0)) 95 | } 96 | 97 | // Display the userGraph 98 | userGraph.vertices.foreach { 99 | case (id, u) => println(s"User $id is called ${u.name} and received ${u.inDeg} tweets and send ${u.outDeg}.") 100 | } 101 | } 102 | 103 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 104 | } -------------------------------------------------------------------------------- /scala/GraphxTesting/src/main/scala/utils/MllibUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.mllib.clustering.{LDA, _} 5 | import org.apache.spark.mllib.linalg.{Vector, Vectors} 6 | import org.apache.spark.rdd.RDD 7 | 8 | import scala.collection.mutable 9 | import scala.collection.mutable.ArrayBuffer 10 | 11 | /** 12 | * Topic models automatically infer the topics discussed in a collection of documents. These topics can be used 13 | * to summarize and organize documents, or used for featurization and dimensionality reduction in later stages 14 | * of a Machine Learning (ML) pipeline. 15 | * 16 | * LDA is not given topics, so it must infer them from raw text. LDA defines a topic as a distribution over words. 
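 * In this helper the corpus is the growing dictionnary of tweets: newTweet() appends a tweet to it,
 * createDocuments() rebuilds the vocabulary from that corpus and counts the current tweet's terms
 * against it, and findTopics() prints the top-weighted terms of each topic learned by the LDA model.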
17 | */ 18 | class MllibUtils(_lda: LDA, _sc: SparkContext, _dictionnary: ArrayBuffer[String], _currentTweet: ArrayBuffer[String]) { 19 | 20 | // Text Color 21 | val RED = "\033[1;30m" 22 | val ENDC = "\033[0m" 23 | 24 | // LDA attributs 25 | var lda: LDA = _lda 26 | var dictionnary: ArrayBuffer[String] = _dictionnary 27 | var currentTweet: ArrayBuffer[String] = _currentTweet 28 | var currentTweetRDD: RDD[String] = _sc.parallelize(_dictionnary) 29 | var sc: SparkContext = _sc 30 | 31 | /** 32 | * @constructor newTweet 33 | * 34 | * Set currentTweet attribut and add the new tweet to the dictionnary 35 | * 36 | * @param String $newTweet - tweet content 37 | * 38 | * @return Unit 39 | */ 40 | def newTweet(newTweet: String): Unit = { 41 | 42 | // Delete old currentTweet 43 | currentTweet = new ArrayBuffer[String]() 44 | 45 | // Set new value 46 | currentTweet += newTweet 47 | 48 | // Convert it to RDD 49 | currentTweetRDD = sc.parallelize(currentTweet) 50 | 51 | // Add tweet to dictionnary 52 | addToDictionnary(newTweet) 53 | 54 | currentTweetRDD.collect.foreach(println(_)) 55 | } 56 | 57 | /** 58 | * @constructor addToDictionnary 59 | * 60 | * Add tweet content to the dictionnary. A dictionnary contains every words set to the LDA 61 | * 62 | * @param String $newTweet - tweet content 63 | * 64 | * @return Unit 65 | */ 66 | def addToDictionnary(newTweet: String): Unit = { 67 | dictionnary += newTweet 68 | } 69 | 70 | /** 71 | * @constructor findTopics 72 | * 73 | * Set currentTweet attribut and add the new tweet to the dictionnary 74 | * 75 | * @param LDAModel $ldaModel - LDA Model (LocalModel) 76 | * @param Array[String] $vocabArray - Contains all distinct words set to LDA 77 | * @param Int $numWordsByTopics - 78 | * @param Boolean $displayResult - Display result in console 79 | * 80 | * @return LDAModel 81 | */ 82 | def findTopics(ldaModel: LDAModel, vocabArray: Array[String], numWordsByTopics: Int, displayResult: Boolean): LDAModel = { 83 | 84 | println(color("\nCall findTopics", RED)) 85 | 86 | println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize + " words):") 87 | 88 | // Print topics, showing top-weighted x terms for each topic. 
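    // (a run would print, per topic, lines of the form "<term>\t\t<weight>",
    //  i.e. the term followed by the weight the LDA model assigned to it in that topic)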
89 | if (displayResult) { 90 | val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = numWordsByTopics) 91 | topicIndices.foreach { case (terms, termWeights) => 92 | println("TOPICS:") 93 | terms.zip(termWeights).foreach { case (term, weight) => 94 | println(s"${vocabArray(term.toInt)}\t\t$weight") 95 | } 96 | println() 97 | } 98 | } 99 | ldaModel 100 | } 101 | 102 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 103 | 104 | /** 105 | * @constructor createDocuments 106 | * 107 | * Set currentTweet attribut and add the new tweet to the dictionnary 108 | * 109 | * @param SparkContext $sc - LDA Model (LocalModel) 110 | * @param Int $numStopwords - Contains all distinct words set to LDA 111 | * 112 | * @return RDD[(Long, Vector)] and Array[String] : documentsRDD and array of vocabulary 113 | */ 114 | def createDocuments(sc: SparkContext, numStopwords: Int): (RDD[(Long, Vector)], Array[String]) = { 115 | 116 | println(color("\nCall createDocuments", RED)) 117 | 118 | val corpus: RDD[String] = sc.parallelize(dictionnary) 119 | 120 | // Split every tweets's text into terms (words) and then remove : 121 | // -> (a) non-alphabetic terms 122 | // -> (b) short terms with < 4 characters 123 | // -> (c) to lower 124 | val tokenizedCorpus: RDD[Seq[String]] = 125 | corpus.map(_.toLowerCase.split("\\s")).map(_.filter(_.length > 3).filter(_.forall(java.lang.Character.isLetter))) 126 | 127 | // Split tweet's text into terms (words) and then remove : 128 | // -> (a) non-alphabetic terms 129 | // -> (b) short terms with < 4 characters 130 | // -> (c) to lower 131 | val tokenizedTweet: RDD[Seq[String]] = 132 | currentTweetRDD.map(_.toLowerCase.split("\\s")).map(_.filter(_.length > 3).filter(_.forall(java.lang.Character.isLetter))) 133 | 134 | 135 | // Choose the vocabulary 136 | // termCounts: Sorted list of (term, termCount) pairs 137 | val termCounts: Array[(String, Long)] = tokenizedCorpus.flatMap(_.map(_ -> 1L)).reduceByKey(_ + _).collect().sortBy(-_._2) 138 | 139 | // vocabArray contains all distinct words 140 | val vocabArray: Array[String] = termCounts.takeRight(termCounts.size - numStopwords).map(_._1) 141 | 142 | 143 | // Map[String, Int] of words and theirs places in tweet 144 | val vocab: Map[String, Int] = vocabArray.zipWithIndex.toMap 145 | //vocab.foreach(println(_)) 146 | 147 | // MAP : [ Word ID , VECTOR [vocab.size, WordFrequency]] 148 | val documents: Map[Long, Vector] = 149 | vocab.map { case (tokens, id) => 150 | val counts = new mutable.HashMap[Int, Double]() 151 | 152 | // Word ID 153 | val idx = vocab(tokens) 154 | 155 | // Count word occurancy 156 | counts(idx) = counts.getOrElse(idx, 0.0) + tokenizedTweet.collect.flatten.count(_ == tokens) 157 | 158 | // Return word ID and Vector 159 | (id.toLong, Vectors.sparse(vocab.size, counts.toSeq)) 160 | } 161 | 162 | // Transform it to RDD 163 | val documentsRDD = sc.parallelize(documents.toSeq) 164 | 165 | // Display RDD 166 | documentsRDD.collect.foreach(println(_)) 167 | 168 | // Return 169 | (documentsRDD, vocabArray) 170 | } 171 | } -------------------------------------------------------------------------------- /scala/GraphxTesting/src/main/scala/utils/RDDUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | import scala.collection.mutable.ArrayBuffer 6 | 7 | // To make some of the examples work we will also need RDD 8 | import org.apache.spark.graphx._ 9 | import org.apache.spark.rdd.RDD 10 
| 11 | 12 | class RDDUtils { 13 | 14 | val RED = "\033[1;30m" 15 | val ENDC = "\033[0m" 16 | 17 | /** 18 | * @constructor ArrayToVertices 19 | * 20 | * Convert ArrayBuffer to RDD containing Vertices 21 | * 22 | * @param SparkContext - $sc - SparkContext 23 | * @param ArrayBuffer[(Long, (String))] - $collection - Contains vertices 24 | * 25 | * @return RDD[Edge[String]] - RDD of vertices 26 | */ 27 | def ArrayToVertices(sc: SparkContext, collection: ArrayBuffer[(Long, (String))]): RDD[(VertexId, (String))] = { 28 | sc.parallelize(collection) 29 | } 30 | 31 | /** 32 | * @constructor ArrayToEdges 33 | * 34 | * Convert ArrayBuffer to RDD containing Edges 35 | * 36 | * @param SparkContext - $sc - SparkContext 37 | * @param ArrayBuffer[Edge[String]] - $collection - Contains edges 38 | * 39 | * @return RDD[Edge[String]] - RDD of edges 40 | */ 41 | def ArrayToEdges(sc: SparkContext, collection: ArrayBuffer[Edge[String]]): RDD[Edge[String]] = { 42 | sc.parallelize(collection) 43 | } 44 | 45 | /** 46 | * @constructor findUserByIDInGraph 47 | * 48 | * find user ID with username 49 | * 50 | * @param Graph[String,String] $graph - Graph element 51 | * @param Int $userID - User id 52 | * @return String - if success : username | failure : "user not found" 53 | */ 54 | def findUserNameByIDInGraph(graph: Graph[String, String], userID: Int): String = { 55 | println(color("\nCall : findUserNameWithID", RED)) 56 | 57 | graph.vertices.filter { case (id, name) => id == userID }.collect.foreach { 58 | (e: (org.apache.spark.graphx.VertexId, String)) => return e._2 59 | } 60 | "user not found" 61 | } 62 | 63 | /** 64 | * @constructor findUserIDByNameInGraph 65 | * 66 | * find username with id 67 | * 68 | * @param Graph[String,String] $graph - Graph element 69 | * @param String $userName - Username 70 | * @return String - if success : id found | failure : "0" 71 | */ 72 | def findUserIDByNameInGraph(graph: Graph[String, String], userName: String): String = { 73 | println(color("\nCall : findUserIDWithName", RED)) 74 | 75 | graph.vertices.filter(_._2 == userName).collect.foreach { 76 | (e: (org.apache.spark.graphx.VertexId, String)) => return e._1.toString 77 | } 78 | "0" 79 | } 80 | 81 | def color(str: String, col: String): String = "%s%s%s".format(col, str, ENDC) 82 | 83 | /** 84 | * @constructor displayAllCommunications 85 | * 86 | * display all communications between users 87 | * 88 | * @param Graph[String,String] $graph - Graph element 89 | * @return Unit 90 | */ 91 | def displayAllCommunications(graph: Graph[String, String]): Unit = { 92 | 93 | println(color("\nCall : displayAllCommunications", RED)) 94 | println("Users communications: ") 95 | 96 | val facts: RDD[String] = graph.triplets.map(triplet => triplet.srcAttr + " communicate with " + 97 | triplet.dstAttr + " with tweet id " + triplet.attr) 98 | 99 | facts.collect.foreach(println(_)) 100 | } 101 | } -------------------------------------------------------------------------------- /scala/RDDFromCassandra/build.sbt: -------------------------------------------------------------------------------- 1 | name := "RDDFromCassandra" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies ++= Seq( 8 | "org.apache.spark" %% "spark-core" % "1.2.1" % "provided", 9 | "org.apache.spark" %% "spark-streaming" % "1.2.1" % "provided", 10 | "org.apache.spark" %% "spark-streaming-twitter" % "1.2.1") 11 | 12 | libraryDependencies += "org.twitter4j" % "twitter4j-stream" % "3.0.6" 13 | 14 | libraryDependencies += "org.twitter4j" % "twitter4j-core" 
% "3.0.6" 15 | 16 | libraryDependencies += "com.datastax.spark" %% "spark-cassandra-connector" % "1.2.1" -------------------------------------------------------------------------------- /scala/RDDFromCassandra/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") -------------------------------------------------------------------------------- /scala/RDDFromCassandra/src/main/scala/RDDFromCassandra.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.streaming.{Seconds, StreamingContext} 2 | import StreamingContext._ 3 | 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.SparkContext._ 6 | 7 | //import org.apache.spark.streaming.twitter 8 | //import org.apache.spark.streaming.twitter._ 9 | //import org.apache.spark.streaming.twitter.TwitterUtils 10 | 11 | import org.apache.spark.SparkConf 12 | 13 | //import org.apache.spark.streaming.dstream.DStream 14 | //import org.apache.spark.streaming.Seconds 15 | //import org.apache.spark.streaming.StreamingContext 16 | //import org.apache.spark.streaming.StreamingContext._ 17 | 18 | import collection.JavaConversions._ 19 | 20 | import org.apache.log4j.Logger 21 | import org.apache.log4j.Level 22 | 23 | // Enable Cassandra-specific functions on the StreamingContext, DStream and RDD: 24 | import com.datastax.spark.connector._ 25 | import com.datastax.spark.connector.streaming._ 26 | 27 | import scala.util.matching.Regex 28 | import org.apache.spark.rdd.RDD 29 | 30 | 31 | // Useful links 32 | // https://github.com/datastax/spark-cassandra-connector/blob/master/doc/0_quick_start.md 33 | // http://planetcassandra.org/getting-started-with-apache-spark-and-cassandra/ 34 | // https://bcomposes.wordpress.com/2013/02/09/using-twitter4j-with-scala-to-access-streaming-tweets/ 35 | // https://github.com/datastax/spark-cassandra-connector/blob/master/doc/5_saving.md 36 | 37 | object RDDFromCassandra { 38 | def main(args: Array[String]) { 39 | 40 | // Display only warning messages 41 | Logger.getLogger("org").setLevel(Level.ERROR) 42 | Logger.getLogger("akka").setLevel(Level.ERROR) 43 | 44 | val filters = args 45 | 46 | // Spark configuration 47 | val sparkConf = new SparkConf(true) 48 | .setMaster("local[4]") 49 | .setAppName("RDDFromCassandra") 50 | .set("spark.cassandra.connection.host", "127.0.0.1") // Add this line to link to Cassandra 51 | 52 | val sc = new SparkContext(sparkConf) 53 | 54 | val rdd = sc.cassandraTable("twitter", "users_communicate") 55 | 56 | rdd.toArray.foreach(println) 57 | 58 | } 59 | } -------------------------------------------------------------------------------- /scala/SaveCommunicationToCassandra/build.sbt: -------------------------------------------------------------------------------- 1 | name := "SaveCommunicationToCassandra" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.5" 6 | 7 | //resolvers += "Job Server Bintray" at "https://dl.bintray.com/spark-jobserver/maven" 8 | 9 | libraryDependencies ++= Seq( 10 | "org.apache.spark" %% "spark-core" % "1.4.0" % "provided", 11 | "org.apache.spark" %% "spark-streaming" % "1.4.0" % "provided", 12 | "org.apache.spark" %% "spark-streaming-twitter" % "1.2.1") 13 | 14 | //libraryDependencies += "org.twitter4j" % "twitter4j-stream" % "3.0.3" 15 | 16 | //libraryDependencies += "org.twitter4j" % "twitter4j-core" % "3.0.3" 17 | 18 | libraryDependencies += "com.datastax.spark" %% 
"spark-cassandra-connector" % "1.4.0-M1" 19 | 20 | //libraryDependencies += "spark.jobserver" %% "job-server-api" % "0.5.1" 21 | 22 | resolvers += "Akka Repository" at "http://repo.akka.io/releases/" -------------------------------------------------------------------------------- /scala/SaveCommunicationToCassandra/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") -------------------------------------------------------------------------------- /scala/SaveCommunicationToCassandra/src/main/scala/SaveCommunicationToCassandra.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.SparkContext 2 | import org.apache.spark.SparkContext._ 3 | 4 | import org.apache.spark._ 5 | import org.apache.spark.streaming._ 6 | 7 | 8 | import org.apache.spark.SparkContext._ 9 | import org.apache.spark.streaming.twitter._ 10 | import org.apache.spark.streaming.twitter 11 | import org.apache.spark.streaming.twitter.TwitterUtils 12 | import org.apache.spark.streaming.twitter.TwitterUtils._ 13 | import org.apache.spark.SparkConf 14 | 15 | import collection.JavaConversions._ 16 | 17 | import org.apache.log4j.Logger 18 | import org.apache.log4j.Level 19 | 20 | import scala.math._ 21 | 22 | // Enable Cassandra-specific functions on the StreamingContext, DStream and RDD: 23 | import com.datastax.spark.connector._ 24 | import com.datastax.spark.connector.streaming._ 25 | 26 | import scala.util.matching.Regex 27 | import org.apache.spark.rdd.RDD 28 | 29 | // Useful links 30 | // https://github.com/datastax/spark-cassandra-connector/blob/master/doc/0_quick_start.md 31 | // http://planetcassandra.org/getting-started-with-apache-spark-and-cassandra/ 32 | // https://bcomposes.wordpress.com/2013/02/09/using-twitter4j-with-scala-to-access-streaming-tweets/ 33 | // https://github.com/datastax/spark-cassandra-connector/blob/master/doc/5_saving.md 34 | 35 | object SaveCommunicationToCassandra{ 36 | 37 | private val defaultSeed = 0xadc83b19L 38 | 39 | /** 40 | * @constructor murmurHash64A 41 | * 42 | * 43 | * @param 44 | * @param 45 | * @return Long 46 | * 47 | */ 48 | def murmurHash64A(data: Seq[Byte], seed: Long = defaultSeed): Long = { 49 | val m = 0xc6a4a7935bd1e995L 50 | val r = 47 51 | 52 | val f: Long => Long = m.* 53 | val g: Long => Long = x => x ^ (x >>> r) 54 | 55 | val h = data.grouped(8).foldLeft(seed ^ f(data.length)) { case (y, xs) => 56 | val k = xs.foldRight(0L)((b, x) => (x << 8) + (b & 0xff)) 57 | val j: Long => Long = if (xs.length == 8) f compose g compose f else identity 58 | f(y ^ j(k)) 59 | } 60 | (g compose f compose g)(h) 61 | } 62 | 63 | def main(args: Array[String]) { 64 | 65 | // Display only warning and infos messages 66 | //Logger.getLogger("org").setLevel(Level.ERROR) 67 | //Logger.getLogger("akka").setLevel(Level.ERROR) 68 | 69 | // Not displaying infos messages 70 | Logger.getLogger("org").setLevel(Level.OFF) 71 | Logger.getLogger("akka").setLevel(Level.OFF) 72 | 73 | // Spark configuration 74 | val sparkConf = new SparkConf() 75 | .setMaster("local[2]") 76 | .setAppName("SaveCommunicationToCassandra") 77 | .set("spark.cassandra.connection.host", "127.0.0.1") // Link to Cassandra 78 | 79 | // Filters by words that contains @ 80 | val words = Array(" @") 81 | 82 | // Pattern used to find users 83 | val pattern = new Regex("\\@\\w{3,}") 84 | val patternURL = new Regex("(http|ftp|https)://[A-Za-z0-9-_]+.[A-Za-z0-9-_:%&?/.=]+") 
85 | val patternSmiley = new Regex("((?::|;|=)(?:-)?(?:\\)|D|P|3|O))") 86 | 87 | // First twitter instance : Used for stream 88 | /*val twitterstream = new TwitterFactory().getInstance() 89 | twitterstream.setOAuthConsumer("MCrQfOAttGZnIIkrqZ4lQA9gr", "5NnYhhGdfyqOE4pIXXdYkploCybQMzFJiQejZssK4a3mNdkCoa") 90 | twitterstream.setOAuthAccessToken(new AccessToken("237197078-6zwzHsuB3VY3psD5873hhU3KQ1lSVQlOXyBhDqpG", "UIMZ1aD06DObpKI741zC8wHZF8jkj1bh02Lqfl5cQ76Pl")) 91 | */ 92 | System.setProperty("twitter4j.http.retryCount", "3") 93 | System.setProperty("twitter4j.http.retryIntervalSecs", "10") 94 | System.setProperty("twitter4j.async.numThreads", "10") 95 | 96 | // Set the system properties so that Twitter4j library used by twitter stream 97 | // can use them to generat OAuth credentials 98 | System.setProperty("twitter4j.oauth.consumerKey", "MCrQfOAttGZnIIkrqZ4lQA9gr") 99 | System.setProperty("twitter4j.oauth.consumerSecret", "5NnYhhGdfyqOE4pIXXdYkploCybQMzFJiQejZssK4a3mNdkCoa") 100 | System.setProperty("twitter4j.oauth.accessToken", "237197078-6zwzHsuB3VY3psD5873hhU3KQ1lSVQlOXyBhDqpG") 101 | System.setProperty("twitter4j.oauth.accessTokenSecret", "UIMZ1aD06DObpKI741zC8wHZF8jkj1bh02Lqfl5cQ76Pl") 102 | 103 | val ssc = new StreamingContext(sparkConf, Seconds(2)) 104 | val stream = TwitterUtils.createStream(ssc, None) 105 | 106 | 107 | // Stream about users 108 | val usersStream = stream.map{status => ( 109 | status.getUser.getId.toString, 110 | abs(murmurHash64A(status.getUser.getScreenName.getBytes)), 111 | status.getUser.getName.toString, 112 | status.getUser.getLang, 113 | status.getUser.getFollowersCount.toString, 114 | status.getUser.getFriendsCount.toString, 115 | status.getUser.getScreenName, 116 | status.getUser.getStatusesCount.toString)} 117 | 118 | 119 | // Stream about communication between two users 120 | val commStream = stream.map{status => ( 121 | status.getId.toString, //tweet_id 122 | status.getUser.getId.toString, // user_send_twitter_ID 123 | abs(murmurHash64A(status.getUser.getScreenName.getBytes)), // user_send_local_ID 124 | if(pattern.findFirstIn(status.getText).isEmpty) 125 | { 126 | "" 127 | } 128 | else 129 | { 130 | pattern.findFirstIn(status.getText).getOrElse("@MichaelCaraccio").tail 131 | }, 132 | status.getText, 133 | status.getUser.getLang 134 | )} 135 | 136 | 137 | 138 | // Stream about tweets 139 | val tweetsStream = stream.map{status => ( 140 | status.getId.toString, 141 | status.getUser.getId.toString, 142 | abs(murmurHash64A(status.getUser.getScreenName.getBytes)), 143 | new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss").format(status.getCreatedAt), 144 | status.getRetweetCount.toString, 145 | status.getText 146 | )} 147 | 148 | 149 | // ************************************************************ 150 | // Save user's informations in Cassandra 151 | // ************************************************************ 152 | usersStream.foreachRDD(rdd => { 153 | rdd.saveToCassandra("twitter", "user_filtered", SomeColumns("user_twitter_id", "user_local_id", "user_name", "user_lang", "user_follow_count", "user_friends_count", "user_screen_name", "user_status_count")) 154 | 155 | println("Users saved : " + rdd.count()) 156 | }) 157 | 158 | // ************************************************************ 159 | // Save communication's informations in Cassandra 160 | // ************************************************************ 161 | commStream.foreachRDD(rdd => { 162 | // Getting current context 163 | val currentContext = rdd.context 164 | 165 | // RDD -> Array() 
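    // Note: collect() pulls the whole micro-batch back to the driver, and the loop below issues one
    // small Cassandra write per mention found; acceptable for this prototype, and the TODO further
    // down already points at batching those writes as the obvious optimisation.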
166 | val tabValues = rdd.collect() 167 | 168 | // For each tweets in RDD 169 | for(item <- tabValues.toArray) { 170 | 171 | // Avoid single @ in message 172 | if(item._4 != "" && (item._6 == "en" || item._6 == "en-gb")){ 173 | 174 | // Find multiple dest 175 | val matches = pattern.findAllIn(item._5).toArray 176 | 177 | // For each receiver in tweet 178 | matches.foreach{destName => { 179 | var user_dest_name = destName.drop(1) 180 | 181 | // TODO : Optimize save to cassandra with concatenate seq and save it when the loop is over 182 | val collection = currentContext.parallelize(Seq((item._1, item._2,item._3, abs(murmurHash64A(user_dest_name.getBytes))))) 183 | 184 | collection.saveToCassandra( 185 | "twitter", 186 | "users_communicate", 187 | SomeColumns( 188 | "tweet_id", 189 | "user_send_twitter_id", 190 | "user_send_local_id", 191 | "user_dest_id")) 192 | }} 193 | } 194 | } 195 | 196 | println("Comm saved : " + rdd.count()) 197 | }) 198 | 199 | 200 | // ************************************************************ 201 | // Save tweet's informations in Cassandra 202 | // ************************************************************ 203 | tweetsStream.foreachRDD(rdd => { 204 | 205 | // Getting current context 206 | val currentContext = rdd.context 207 | 208 | // RDD -> Array() 209 | val tabValues = rdd.collect() 210 | 211 | /*var test = rdd.map{status => (status._1, 212 | status._2, 213 | patternURL.replaceAllIn(status._3, ""), 214 | status._4, 215 | status._5, 216 | status._6, 217 | status._7)}*/ 218 | 219 | // For each tweets in RDD 220 | for(item <- tabValues.toArray) { 221 | 222 | // New tweet value 223 | var newTweet = patternURL.replaceAllIn(item._6, "") 224 | newTweet = patternSmiley.replaceAllIn(newTweet, "") 225 | 226 | val collection = currentContext.parallelize(Seq((item._1, item._2, item._3, item._4, item._5, newTweet))) 227 | 228 | collection.saveToCassandra( 229 | "twitter", 230 | "tweet_filtered", 231 | SomeColumns("tweet_id", 232 | "user_twitter_id", 233 | "user_local_id", 234 | "tweet_create_at", 235 | "tweet_retweet", 236 | "tweet_text" 237 | )) 238 | } 239 | 240 | println("Tweets saved : " + rdd.count()) 241 | }) 242 | 243 | ssc.start() 244 | ssc.awaitTermination() 245 | } 246 | } -------------------------------------------------------------------------------- /scala/ScalaTwitterStreaming/build.sbt: -------------------------------------------------------------------------------- 1 | name := "ScalaTwitterStreaming" 2 | 3 | version := "1.1" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies ++= Seq( 8 | "org.apache.spark" %% "spark-core" % "1.3.0" % "provided", 9 | "org.apache.spark" %% "spark-streaming" % "1.3.0" % "provided", 10 | "org.apache.spark" %% "spark-streaming-twitter" % "1.2.0") 11 | 12 | libraryDependencies += "org.twitter4j" % "twitter4j-stream" % "3.0.3" 13 | libraryDependencies += "org.twitter4j" % "twitter4j-core" % "3.0.3" -------------------------------------------------------------------------------- /scala/ScalaTwitterStreaming/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") 2 | -------------------------------------------------------------------------------- /scala/ScalaTwitterStreaming/src/main/scala/ScalaTwitterStreaming.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.streaming.{Seconds, StreamingContext} 2 | import StreamingContext._ 3 | import 
org.apache.spark.SparkContext._ 4 | import org.apache.spark.streaming.twitter._ 5 | import org.apache.spark.streaming.twitter 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.streaming.StreamingContext._ 8 | 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | import org.apache.spark.streaming.twitter.TwitterUtils 12 | 13 | import twitter4j.TwitterFactory 14 | import twitter4j.auth.AccessToken 15 | 16 | /** 17 | * Calculates popular hashtags (topics) over sliding 10 and 60 second windows from a Twitter 18 | * stream. The stream is instantiated with credentials and optionally filters supplied by the 19 | * command line arguments. 20 | * 21 | * Run this on your local machine with spark-submit; the master is set to local[2] in the code. 22 | * 23 | */ 24 | object ScalaTwitterStreaming { 25 | def main(args: Array[String]) { 26 | 27 | val filters = args // note: not passed to TwitterUtils.createStream below 28 | // Set the system properties so that the Twitter4j library used by the Twitter stream 29 | // can use them to generate OAuth credentials 30 | System.setProperty("twitter4j.oauth.consumerKey", "MCrQfOAttGZnIIkrqZ4lQA9gr") 31 | System.setProperty("twitter4j.oauth.consumerSecret", "5NnYhhGdfyqOE4pIXXdYkploCybQMzFJiQejZssK4a3mNdkCoa") 32 | System.setProperty("twitter4j.oauth.accessToken", "237197078-6zwzHsuB3VY3psD5873hhU3KQ1lSVQlOXyBhDqpG") 33 | System.setProperty("twitter4j.oauth.accessTokenSecret", "UIMZ1aD06DObpKI741zC8wHZF8jkj1bh02Lqfl5cQ76Pl") 34 | 35 | val sparkConf = new SparkConf().setMaster("local[2]").setAppName("ScalaTwitterStreaming") 36 | val ssc = new StreamingContext(sparkConf, Seconds(10)) 37 | val stream = TwitterUtils.createStream(ssc, None) 38 | 39 | val hashTags = stream.flatMap(status => status.getText.split(" ").filter(_.startsWith("#"))) 40 | 41 | val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60)) 42 | .map{case (topic, count) => (count, topic)} 43 | .transform(_.sortByKey(false)) 44 | 45 | val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10)) 46 | .map{case (topic, count) => (count, topic)} 47 | .transform(_.sortByKey(false)) 48 | 49 | 50 | // Print popular hashtags 51 | topCounts60.foreachRDD(rdd => { 52 | val topList = rdd.take(10) 53 | println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count())) 54 | topList.foreach{case (count, tag) => println("%s (%s tweets)".format(tag, count))} 55 | }) 56 | 57 | topCounts10.foreachRDD(rdd => { 58 | val topList = rdd.take(10) 59 | println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count())) 60 | topList.foreach{case (count, tag) => println("%s (%s tweets)".format(tag, count))} 61 | }) 62 | 63 | ssc.start() 64 | ssc.awaitTermination() 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /scala/SimpleAppUsingSBT/build.sbt: -------------------------------------------------------------------------------- 1 | name := "SimpleAppUsingSBT" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.2.1" 8 | 9 | libraryDependencies += "org.apache.spark" % "spark-streaming_2.10" % "1.2.1" 10 | 11 | libraryDependencies += "org.apache.spark" % "spark-streaming-twitter_2.10" % "1.2.1" -------------------------------------------------------------------------------- /scala/SimpleAppUsingSBT/src/main/scala/SimpleAppUsingSBT.scala: -------------------------------------------------------------------------------- 1 | /* SimpleAppUsingSBT.scala */ 2 | import org.apache.spark.SparkContext 3 |
import org.apache.spark.SparkContext._ 4 | import org.apache.spark.SparkConf 5 | 6 | object SimpleAppUsingSBT { 7 | def main(args: Array[String]) { 8 | val logFile = "/home/mcaraccio/spark-1.2.1-bin-hadoop2.4/README.md" // Should be some file on your system 9 | val conf = new SparkConf().setAppName("Simple Application Using SBT") 10 | val sc = new SparkContext(conf) 11 | val logData = sc.textFile(logFile, 2).cache() 12 | val numAs = logData.filter(line => line.contains("a")).count() 13 | val numBs = logData.filter(line => line.contains("b")).count() 14 | println("Lines with a: %s, Lines with b: %s".format(numAs, numBs)) 15 | } 16 | } -------------------------------------------------------------------------------- /visualization/d3.slider.css: -------------------------------------------------------------------------------- 1 | .d3-slider { 2 | position: relative; 3 | font-family: Verdana,Arial,sans-serif; 4 | font-size: 1.1em; 5 | border: 1px solid #aaaaaa; 6 | z-index: 2; 7 | } 8 | 9 | .d3-slider-horizontal { 10 | height: .8em; 11 | } 12 | 13 | .d3-slider-range { 14 | background:#2980b9; 15 | left:0px; 16 | right:0px; 17 | height: 0.8em; 18 | position: absolute; 19 | } 20 | 21 | .d3-slider-range-vertical { 22 | background:#2980b9; 23 | left:0px; 24 | right:0px; 25 | position: absolute; 26 | top:0; 27 | } 28 | 29 | .d3-slider-vertical { 30 | width: .8em; 31 | height: 100px; 32 | } 33 | 34 | .d3-slider-handle { 35 | position: absolute; 36 | width: 1.2em; 37 | height: 1.2em; 38 | border: 1px solid #d3d3d3; 39 | border-radius: 4px; 40 | background: #eee; 41 | background: linear-gradient(to bottom, #eee 0%, #ddd 100%); 42 | z-index: 3; 43 | } 44 | 45 | .d3-slider-handle:hover { 46 | border: 1px solid #999999; 47 | } 48 | 49 | .d3-slider-horizontal .d3-slider-handle { 50 | top: -.3em; 51 | margin-left: -.6em; 52 | } 53 | 54 | .d3-slider-axis { 55 | position: relative; 56 | z-index: 1; 57 | } 58 | 59 | .d3-slider-axis-bottom { 60 | top: .8em; 61 | } 62 | 63 | .d3-slider-axis-right { 64 | left: .8em; 65 | } 66 | 67 | .d3-slider-axis path { 68 | stroke-width: 0; 69 | fill: none; 70 | } 71 | 72 | .d3-slider-axis line { 73 | fill: none; 74 | stroke: #aaa; 75 | shape-rendering: crispEdges; 76 | } 77 | 78 | .d3-slider-axis text { 79 | font-size: 11px; 80 | } 81 | 82 | .d3-slider-vertical .d3-slider-handle { 83 | left: -.25em; 84 | margin-left: 0; 85 | margin-bottom: -.6em; 86 | } -------------------------------------------------------------------------------- /visualization/d3.slider.js: -------------------------------------------------------------------------------- 1 | /* 2 | D3.js Slider 3 | Inspired by jQuery UI Slider 4 | Copyright (c) 2013, Bjorn Sandvik - http://blog.thematicmapping.org 5 | BSD license: http://opensource.org/licenses/BSD-3-Clause 6 | */ 7 | (function (root, factory) { 8 | if (typeof define === 'function' && define.amd) { 9 | // AMD. Register as an anonymous module. 10 | define(['d3'], factory); 11 | } else if (typeof exports === 'object') { 12 | if (process.browser) { 13 | // Browserify. Import css too using cssify. 14 | require('./d3.slider.css'); 15 | } 16 | // Node. Does not work with strict CommonJS, but 17 | // only CommonJS-like environments that support module.exports, 18 | // like Node. 
19 | module.exports = factory(require('d3')); 20 | } else { 21 | // Browser globals (root is window) 22 | root.d3.slider = factory(root.d3); 23 | } 24 | }(this, function (d3) { 25 | return function module() { 26 | "use strict"; 27 | 28 | // Public variables width default settings 29 | var min = 0, 30 | max = 100, 31 | step = 0.01, 32 | animate = true, 33 | orientation = "horizontal", 34 | axis = false, 35 | margin = 50, 36 | value, 37 | active = 1, 38 | snap = false, 39 | scale; 40 | 41 | // Private variables 42 | var axisScale, 43 | dispatch = d3.dispatch("slide", "slideend"), 44 | formatPercent = d3.format(".2%"), 45 | tickFormat = d3.format(".0"), 46 | handle1, 47 | handle2 = null, 48 | divRange, 49 | sliderLength; 50 | 51 | function slider(selection) { 52 | selection.each(function() { 53 | 54 | // Create scale if not defined by user 55 | if (!scale) { 56 | scale = d3.scale.linear().domain([min, max]); 57 | } 58 | 59 | // Start value 60 | value = value || scale.domain()[0]; 61 | 62 | // DIV container 63 | var div = d3.select(this).classed("d3-slider d3-slider-" + orientation, true); 64 | 65 | var drag = d3.behavior.drag(); 66 | drag.on('dragend', function () { 67 | dispatch.slideend(d3.event, value); 68 | }) 69 | 70 | // Slider handle 71 | //if range slider, create two 72 | // var divRange; 73 | 74 | if (toType(value) == "array" && value.length == 2) { 75 | handle1 = div.append("a") 76 | .classed("d3-slider-handle", true) 77 | .attr("xlink:href", "#") 78 | .attr('id', "handle-one") 79 | .on("click", stopPropagation) 80 | .call(drag); 81 | handle2 = div.append("a") 82 | .classed("d3-slider-handle", true) 83 | .attr('id', "handle-two") 84 | .attr("xlink:href", "#") 85 | .on("click", stopPropagation) 86 | .call(drag); 87 | } else { 88 | handle1 = div.append("a") 89 | .classed("d3-slider-handle", true) 90 | .attr("xlink:href", "#") 91 | .attr('id', "handle-one") 92 | .on("click", stopPropagation) 93 | .call(drag); 94 | } 95 | 96 | // Horizontal slider 97 | if (orientation === "horizontal") { 98 | 99 | div.on("click", onClickHorizontal); 100 | 101 | if (toType(value) == "array" && value.length == 2) { 102 | divRange = d3.select(this).append('div').classed("d3-slider-range", true); 103 | 104 | handle1.style("left", formatPercent(scale(value[ 0 ]))); 105 | divRange.style("left", formatPercent(scale(value[ 0 ]))); 106 | drag.on("drag", onDragHorizontal); 107 | 108 | var width = 100 - parseFloat(formatPercent(scale(value[ 1 ]))); 109 | handle2.style("left", formatPercent(scale(value[ 1 ]))); 110 | divRange.style("right", width+"%"); 111 | drag.on("drag", onDragHorizontal); 112 | 113 | } else { 114 | handle1.style("left", formatPercent(scale(value))); 115 | drag.on("drag", onDragHorizontal); 116 | } 117 | 118 | sliderLength = parseInt(div.style("width"), 10); 119 | 120 | } else { // Vertical 121 | 122 | div.on("click", onClickVertical); 123 | drag.on("drag", onDragVertical); 124 | if (toType(value) == "array" && value.length == 2) { 125 | divRange = d3.select(this).append('div').classed("d3-slider-range-vertical", true); 126 | 127 | handle1.style("bottom", formatPercent(scale(value[ 0 ]))); 128 | divRange.style("bottom", formatPercent(scale(value[ 0 ]))); 129 | drag.on("drag", onDragVertical); 130 | 131 | var top = 100 - parseFloat(formatPercent(scale(value[ 1 ]))); 132 | handle2.style("bottom", formatPercent(scale(value[ 1 ]))); 133 | divRange.style("top", top+"%"); 134 | drag.on("drag", onDragVertical); 135 | 136 | } else { 137 | handle1.style("bottom", formatPercent(scale(value))); 138 | 
drag.on("drag", onDragVertical); 139 | } 140 | 141 | sliderLength = parseInt(div.style("height"), 10); 142 | 143 | } 144 | 145 | if (axis) { 146 | createAxis(div); 147 | } 148 | 149 | 150 | function createAxis(dom) { 151 | 152 | // Create axis if not defined by user 153 | if (typeof axis === "boolean") { 154 | 155 | axis = d3.svg.axis() 156 | .ticks(Math.round(sliderLength / 100)) 157 | .tickFormat(tickFormat) 158 | .orient((orientation === "horizontal") ? "bottom" : "right"); 159 | 160 | } 161 | 162 | // Copy slider scale to move from percentages to pixels 163 | axisScale = scale.ticks ? scale.copy().range([0, sliderLength]) : scale.copy().rangePoints([0, sliderLength], 0.5); 164 | axis.scale(axisScale); 165 | 166 | // Create SVG axis container 167 | var svg = dom.append("svg") 168 | .classed("d3-slider-axis d3-slider-axis-" + axis.orient(), true) 169 | .on("click", stopPropagation); 170 | 171 | var g = svg.append("g"); 172 | 173 | // Horizontal axis 174 | if (orientation === "horizontal") { 175 | 176 | svg.style("margin-left", -margin + "px"); 177 | 178 | svg.attr({ 179 | width: sliderLength + margin * 2, 180 | height: margin 181 | }); 182 | 183 | if (axis.orient() === "top") { 184 | svg.style("top", -margin + "px"); 185 | g.attr("transform", "translate(" + margin + "," + margin + ")"); 186 | } else { // bottom 187 | g.attr("transform", "translate(" + margin + ",0)"); 188 | } 189 | 190 | } else { // Vertical 191 | 192 | svg.style("top", -margin + "px"); 193 | 194 | svg.attr({ 195 | width: margin, 196 | height: sliderLength + margin * 2 197 | }); 198 | 199 | if (axis.orient() === "left") { 200 | svg.style("left", -margin + "px"); 201 | g.attr("transform", "translate(" + margin + "," + margin + ")"); 202 | } else { // right 203 | g.attr("transform", "translate(" + 0 + "," + margin + ")"); 204 | } 205 | 206 | } 207 | 208 | g.call(axis); 209 | 210 | } 211 | 212 | function onClickHorizontal() { 213 | if (toType(value) != "array") { 214 | var pos = Math.max(0, Math.min(sliderLength, d3.event.offsetX || d3.event.layerX)); 215 | moveHandle(scale.invert ? 216 | stepValue(scale.invert(pos / sliderLength)) 217 | : nearestTick(pos / sliderLength)); 218 | } 219 | } 220 | 221 | function onClickVertical() { 222 | if (toType(value) != "array") { 223 | var pos = sliderLength - Math.max(0, Math.min(sliderLength, d3.event.offsetY || d3.event.layerY)); 224 | moveHandle(scale.invert ? 225 | stepValue(scale.invert(pos / sliderLength)) 226 | : nearestTick(pos / sliderLength)); 227 | } 228 | } 229 | 230 | function onDragHorizontal() { 231 | if ( d3.event.sourceEvent.target.id === "handle-one") { 232 | active = 1; 233 | } else if ( d3.event.sourceEvent.target.id == "handle-two" ) { 234 | active = 2; 235 | } 236 | var pos = Math.max(0, Math.min(sliderLength, d3.event.x)); 237 | moveHandle(scale.invert ? 238 | stepValue(scale.invert(pos / sliderLength)) 239 | : nearestTick(pos / sliderLength)); 240 | } 241 | 242 | function onDragVertical() { 243 | if ( d3.event.sourceEvent.target.id === "handle-one") { 244 | active = 1; 245 | } else if ( d3.event.sourceEvent.target.id == "handle-two" ) { 246 | active = 2; 247 | } 248 | var pos = sliderLength - Math.max(0, Math.min(sliderLength, d3.event.y)) 249 | moveHandle(scale.invert ? 
250 | stepValue(scale.invert(pos / sliderLength)) 251 | : nearestTick(pos / sliderLength)); 252 | } 253 | 254 | function stopPropagation() { 255 | d3.event.stopPropagation(); 256 | } 257 | 258 | }); 259 | 260 | } 261 | 262 | // Move slider handle on click/drag 263 | function moveHandle(newValue) { 264 | var currentValue = toType(value) == "array" && value.length == 2 ? value[active - 1]: value, 265 | oldPos = formatPercent(scale(stepValue(currentValue))), 266 | newPos = formatPercent(scale(stepValue(newValue))), 267 | position = (orientation === "horizontal") ? "left" : "bottom"; 268 | if (oldPos !== newPos) { 269 | 270 | if (toType(value) == "array" && value.length == 2) { 271 | value[ active - 1 ] = newValue; 272 | if (d3.event) { 273 | dispatch.slide(d3.event, value ); 274 | }; 275 | } else { 276 | if (d3.event) { 277 | dispatch.slide(d3.event.sourceEvent || d3.event, value = newValue); 278 | }; 279 | } 280 | 281 | if ( value[ 0 ] >= value[ 1 ] ) return; 282 | if ( active === 1 ) { 283 | if (toType(value) == "array" && value.length == 2) { 284 | (position === "left") ? divRange.style("left", newPos) : divRange.style("bottom", newPos); 285 | } 286 | 287 | if (animate) { 288 | handle1.transition() 289 | .styleTween(position, function() { return d3.interpolate(oldPos, newPos); }) 290 | .duration((typeof animate === "number") ? animate : 250); 291 | } else { 292 | handle1.style(position, newPos); 293 | } 294 | } else { 295 | 296 | var width = 100 - parseFloat(newPos); 297 | var top = 100 - parseFloat(newPos); 298 | 299 | (position === "left") ? divRange.style("right", width + "%") : divRange.style("top", top + "%"); 300 | 301 | if (animate) { 302 | handle2.transition() 303 | .styleTween(position, function() { return d3.interpolate(oldPos, newPos); }) 304 | .duration((typeof animate === "number") ? animate : 250); 305 | } else { 306 | handle2.style(position, newPos); 307 | } 308 | } 309 | } 310 | } 311 | 312 | // Calculate nearest step value 313 | function stepValue(val) { 314 | 315 | if (val === scale.domain()[0] || val === scale.domain()[1]) { 316 | return val; 317 | } 318 | 319 | var alignValue = val; 320 | if (snap) { 321 | alignValue = nearestTick(scale(val)); 322 | } else{ 323 | var valModStep = (val - scale.domain()[0]) % step; 324 | alignValue = val - valModStep; 325 | 326 | if (Math.abs(valModStep) * 2 >= step) { 327 | alignValue += (valModStep > 0) ? step : -step; 328 | } 329 | }; 330 | 331 | return alignValue; 332 | 333 | } 334 | 335 | // Find the nearest tick 336 | function nearestTick(pos) { 337 | var ticks = scale.ticks ? scale.ticks() : scale.domain(); 338 | var dist = ticks.map(function(d) {return pos - scale(d);}); 339 | var i = -1, 340 | index = 0, 341 | r = scale.ticks ? 
scale.range()[1] : scale.rangeExtent()[1]; 342 | do { 343 | i++; 344 | if (Math.abs(dist[i]) < r) { 345 | r = Math.abs(dist[i]); 346 | index = i; 347 | }; 348 | } while (dist[i] > 0 && i < dist.length - 1); 349 | 350 | return ticks[index]; 351 | }; 352 | 353 | // Return the type of an object 354 | function toType(v) { 355 | return ({}).toString.call(v).match(/\s([a-zA-Z]+)/)[1].toLowerCase(); 356 | }; 357 | 358 | // Getter/setter functions 359 | slider.min = function(_) { 360 | if (!arguments.length) return min; 361 | min = _; 362 | return slider; 363 | }; 364 | 365 | slider.max = function(_) { 366 | if (!arguments.length) return max; 367 | max = _; 368 | return slider; 369 | }; 370 | 371 | slider.step = function(_) { 372 | if (!arguments.length) return step; 373 | step = _; 374 | return slider; 375 | }; 376 | 377 | slider.animate = function(_) { 378 | if (!arguments.length) return animate; 379 | animate = _; 380 | return slider; 381 | }; 382 | 383 | slider.orientation = function(_) { 384 | if (!arguments.length) return orientation; 385 | orientation = _; 386 | return slider; 387 | }; 388 | 389 | slider.axis = function(_) { 390 | if (!arguments.length) return axis; 391 | axis = _; 392 | return slider; 393 | }; 394 | 395 | slider.margin = function(_) { 396 | if (!arguments.length) return margin; 397 | margin = _; 398 | return slider; 399 | }; 400 | 401 | slider.value = function(_) { 402 | if (!arguments.length) return value; 403 | if (value) { 404 | moveHandle(stepValue(_)); 405 | }; 406 | value = _; 407 | return slider; 408 | }; 409 | 410 | slider.snap = function(_) { 411 | if (!arguments.length) return snap; 412 | snap = _; 413 | return slider; 414 | }; 415 | 416 | slider.scale = function(_) { 417 | if (!arguments.length) return scale; 418 | scale = _; 419 | return slider; 420 | }; 421 | 422 | d3.rebind(slider, dispatch, "on"); 423 | 424 | return slider; 425 | 426 | } 427 | })); 428 | -------------------------------------------------------------------------------- /visualization/data.php: -------------------------------------------------------------------------------- 1 | <?php $cluster = Cassandra::cluster() ->withContactPoints('157.26.83.16') 11 | ->withPort(9042) 12 | ->build(); 13 | 14 | $keyspace = 'twitter'; 15 | $session = $cluster->connect($keyspace); // create session, optionally scoped to a keyspace 16 | 17 | // Default values 18 | $tValue = 1; 19 | $minVertices = 0; 20 | 21 | //*************************************/ 22 | // Change T (the analysed time period) 23 | //*************************************/ 24 | if (isset($_GET["value"])){ 25 | $tValue = $_GET["value"]; 26 | } 27 | 28 | //****************************************/ 29 | // Change minimum vertices in communities 30 | //****************************************/ 31 | if (isset($_GET["minVertices"])){ 32 | $minVertices = $_GET["minVertices"]; 33 | } 34 | 35 | $whereStatementForVertice = null; 36 | if($minVertices > 0){ 37 | $whereStatementForVertice = "and nbv >= $minVertices"; 38 | } 39 | 40 | //*************************************/ 41 | // Init 42 | //*************************************/ 43 | $data = new stdClass(); 44 | 45 | $data->nodes = array(); 46 | $data->links = array(); 47 | $data->lda = array(); 48 | $data->cosine = array(); 49 | 50 | $indices = array(); 51 | $groups = array(); 52 | $sg = array(); 53 | 54 | function myfunction($num) 55 | { 56 | return($num); 57 | } 58 | 59 | //*************************************/ 60 | // Get Source and Com ID -> Array 61 | //*************************************/ 62 | $statement = new Cassandra\SimpleStatement("SELECT src_id, com_id, sg
FROM twitter.communities where t = $tValue $whereStatementForVertice"); 63 | $result = $session->execute($statement); 64 | 65 | // Put all the indices into the arrays 66 | foreach ($result as $row) { 67 | $indices[] = (int) $row['src_id']; 68 | $groups[] = (int) $row['com_id']; 69 | $sg[] = (int) $row['sg']; 70 | } 71 | 72 | 73 | //*************************************/ 74 | // Get Destination and Com ID -> Array 75 | //*************************************/ 76 | $statement = new Cassandra\SimpleStatement("SELECT dst_id, com_id, sg FROM twitter.communities where t = $tValue $whereStatementForVertice"); 77 | $result = $session->execute($statement); 78 | 79 | foreach ($result as $row) { 80 | $indices[] = (int) $row['dst_id']; 81 | $groups[] = (int) $row['com_id']; 82 | $sg[] = (int) $row['sg']; 83 | } 84 | 85 | // Combine node and group 86 | $c = array_combine($indices, $groups); 87 | $csg = array_combine($indices, $sg); 88 | 89 | // Get unique list of nodes 90 | $map = array_map("myfunction", $indices); // identity map: effectively a copy of $indices 91 | $indices_unique = array_unique($map); 92 | $nodes_index = array(); 93 | 94 | 95 | //*************************************/ 96 | // Create nodes 97 | //*************************************/ 98 | foreach($indices_unique as $node){ 99 | $nodes_index[] = $node; 100 | $data->nodes[] = array("name" => (int) $node, "group" => (int) $c[$node], "sg" => (int) $csg[$node]); 101 | } 102 | 103 | 104 | //*************************************/ 105 | // Create links 106 | //*************************************/ 107 | $statement = new Cassandra\SimpleStatement("SELECT * FROM twitter.communities where t = $tValue $whereStatementForVertice"); 108 | $result = $session->execute($statement); 109 | 110 | foreach ($result as $row) { 111 | $data->links[] = array("source" => (int) array_search($row['src_id'],$nodes_index), "s" => (int) $row['src_id'], "d" => (int) $row['dst_id'], "target" => (int) array_search($row['dst_id'],$nodes_index), "value" => ((int) 1)); 112 | } 113 | 114 | //*************************************/ 115 | // Get LDA 116 | //*************************************/ 117 | $statement = new Cassandra\SimpleStatement("SELECT * FROM twitter.lda where t = $tValue"); 118 | $result = $session->execute($statement); 119 | 120 | foreach ($result as $row) { 121 | $data->lda[] = array("t" => (int) $row['t'], "sg" => (int) $row['sg'], "n_topic" => (int) $row['n_topic'], "words" => (string) $row['words']); 122 | } 123 | 124 | //*************************************/ 125 | // Get Cosine similarity 126 | //*************************************/ 127 | $statementComm = new Cassandra\SimpleStatement("SELECT * FROM twitter.communities where t = $tValue $whereStatementForVertice"); 128 | $resultComm = $session->execute($statementComm); 129 | 130 | foreach ($resultComm as $row) { 131 | $data->cosine[] = array("t" => (int) $row['t'], "sg" => (int) $row['sg'], "cosines" => (string) $row['lda']); 132 | } 133 | 134 | echo json_encode($data); 135 | ?> -------------------------------------------------------------------------------- /visualization/graph.html: --------------------------------------------------------------------------------
1-29 | [page head; the markup did not survive text extraction — lines 4-23 were presumably a style or script block] 30-43 | [page body; only the visible text survived: a "Period" slider (initial value 1) and a "Minimum vertices per communities" slider (initial value 0, with the hint "0 = no restriction")] 44-269 | [inline script; its content did not survive text extraction — presumably the slider wiring, the request to data.php and the D3 rendering of the community graph] 270 | --------------------------------------------------------------------------------