├── .gitignore ├── LICENSE ├── README.md ├── build.sbt ├── project └── build.properties └── src └── main ├── resources ├── application.conf └── logback.xml └── scala ├── Main.scala ├── models ├── Dependency.scala ├── GitHubRepository.scala ├── ModelWithId.scala ├── PageInfo.scala └── RelationProp.scala ├── modules ├── AkkaModule.scala ├── ConfigModule.scala ├── DBModule.scala └── GitHubModule.scala ├── repositories ├── MongoRepository.scala ├── Neo4jRepository.scala ├── ReactiveMongo.scala └── github │ └── GitHubProjectRepository.scala ├── services ├── github │ ├── client │ │ ├── GitHubProjectService.scala │ │ └── GitHubRequestComposer.scala │ └── spark │ │ └── GitHubGraphXService.scala ├── kafka │ └── KafkaService.scala └── spark │ ├── SparkContextConf.scala │ └── SparkMongoService.scala └── utils └── Logger.scala /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea 2 | target/ 3 | *.log -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 SysGears 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Akka Spark Pipeline 2 | 3 | ## Description 4 | 5 | Akka Spark Pipeline is an example project that lets you find out how frequently a specific technology is used with different technology stacks. 6 | 7 | Akka Spark Pipeline uses Akka, Spark GraphX, MongoDB, and Neo4j to handle and analyze thousands of projects published on GitHub (read: big data) to build a graph with relations between various technologies. Each relation shows the number of projects where two related technologies are used. 8 | 9 | It's possible to use the graph for further analysis and to obtain statistical data. 10 | 11 | ### How it works 12 | 13 | This example project uses the GitHub client to grab the data about repositories, in particular, project metadata and the list of project dependencies. This list of dependencies is then stored in MongoDB. 14 | 15 | Once the projects' data is downloaded and stored in the database, Spark gets it and builds a graph that reflects the relationships between technologies. 16 | 17 | The created graph is then stored in the Neo4j graph database. 
Using an HTTP server, you can query the database with a specific technology to see the list of technologies it's predominantly used with. 18 | 19 | ## Technologies 20 | 21 | | Technology | Description | Project use | 22 | | -------------- | --------------------------------- | ------------------------------------------ | 23 | | [Akka Streams] | Compose data transformation flows | Retrieve repository metadata from GitHub | 24 | | [Spark GraphX] | Spark component for graphs and graph-parallel computations | Build a graph from project dependencies | 25 | | [MongoDB] | A document-oriented database | Used to store raw data | 26 | | [Neo4j] | A graph database | Used to store the built graphs | 27 | 28 | ## Branches 29 | 30 | | Branch | Description | 31 | | -------------- | ---------------------------------------------------------------- | 32 | | [master] | The version with the latest features. May not work consistently | 33 | | [spark-graphx] | Version with the Spark GraphX functionality. Not fully completed | 34 | 35 | ## Project structure 36 | 37 | ``` 38 | akka-spark-kafka-pipeline 39 | ├── models # Contains models that define the GitHub project entity 40 | ├── modules # Contains Guice bindings 41 | ├── repositories # Contains classes to work with the database layer 42 | │ └── github # Contains the repository for the GitHub project entity 43 | ├── services # Services to work with different technologies such as Spark or Kafka 44 | │ ├── github 45 | │ │ ├── client # Contains GitHub client functionality 46 | │ │ └── spark 47 | │ │ └── GitHubGraphXService.scala # The service to create a graph from project dependencies using Spark GraphX 48 | │ ├── kafka 49 | │ │ └── KafkaService.scala # The service to interact with Kafka 50 | │ └── spark 51 | │ └── SparkMongoService.scala # Contains a connector between Spark and MongoDB 52 | └── utils # Contains application utils such as a logger 53 | ``` 54 | 55 | ## How to start 56 | 57 | Before starting the application, make sure MongoDB is running on your machine. You must also provide a 58 | personal GitHub token, either in the `GitHubOAuthToken` environment variable (recommended) or as the default value of `private val token = sys.env.getOrElse("GitHubOAuthToken", "")` in `services/github/client/GitHubRequestComposer.scala`. A minimal example startup sequence is shown at the end of this README. 59 | 60 | Run the application: 61 | 62 | ```bash 63 | sbt run 64 | ``` 65 | 66 | ## Contributors 67 | 68 | Suggestions and contributions are welcome; feel free to open an issue or submit a pull request. 69 | 70 | ## License 71 | 72 | Copyright © 2019 [SysGears INC]. This source code is licensed under the [MIT] license.
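For reference, a minimal local startup sequence is shown below. It is a sketch that matches the defaults in `src/main/resources/application.conf`; the token value and the MongoDB data path are placeholders, and Neo4j only needs to be reachable once the graph-building step runs.

```bash
# placeholder: a personal access token generated in your GitHub account settings
export GitHubOAuthToken=<your-token>

# MongoDB must be listening on localhost:27017 (the data path below is an example)
mongod --dbpath ./data/db &

# Neo4j is expected at bolt://127.0.0.1:7687 with the credentials from application.conf

# run the pipeline from the project root
sbt run
```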
73 | 74 | [akka streams]: https://doc.akka.io/docs/akka/2.5/stream/ 75 | [spark graphx]: https://spark.apache.org/graphx/ 76 | [mongodb]: https://www.mongodb.com/ 77 | [neo4j]: https://neo4j.com/ 78 | [master]: https://github.com/sysgears/akka-spark-pipeline/tree/master 79 | [spark-graphx]: https://github.com/sysgears/akka-spark-pipeline/tree/spark-graphx -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "akka-spark-kafka-pipeline" 2 | 3 | version := "0.1" 4 | 5 | scalaVersion := "2.11.12" 6 | 7 | resolvers += "Spark Packages" at "https://dl.bintray.com/spark-packages/maven/" 8 | 9 | libraryDependencies ++= Seq( 10 | "org.apache.spark" %% "spark-core" % "2.4.0", 11 | "org.apache.spark" %% "spark-sql" % "2.4.0", 12 | "org.apache.spark" %% "spark-sql-kafka-0-10" % "2.4.0" % "provided", 13 | "org.apache.kafka" %% "kafka" % "2.1.0", 14 | "org.apache.kafka" % "kafka-clients" % "2.1.1", 15 | 16 | "com.typesafe.akka" %% "akka-http" % "10.1.8", 17 | "com.typesafe.akka" %% "akka-stream" % "2.5.22", 18 | "io.spray" %% "spray-json" % "1.3.5", 19 | 20 | "com.google.inject" % "guice" % "4.2.2", 21 | "net.codingwell" %% "scala-guice" % "4.2.3", 22 | 23 | "org.reactivemongo" %% "reactivemongo" % "0.16.4", 24 | "org.mongodb.spark" %% "mongo-spark-connector" % "2.4.0", 25 | "org.apache.spark" %% "spark-graphx" % "2.4.0", 26 | "graphframes" % "graphframes" % "0.7.0-spark2.4-s_2.11", 27 | 28 | "neo4j-contrib" % "neo4j-spark-connector" % "2.1.0-M4", 29 | 30 | "commons-logging" % "commons-logging" % "1.2" 31 | 32 | ).map(_.exclude("org.slf4j", "*")) 33 | 34 | libraryDependencies += "ch.qos.logback" % "logback-classic" % "1.2.3" 35 | 36 | dependencyOverrides += "com.fasterxml.jackson.core" % "jackson-databind" % "2.6.7" -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 1.2.8 -------------------------------------------------------------------------------- /src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | mongodb { 2 | uri = "mongodb://localhost:27017/" 3 | name = "default" 4 | } 5 | 6 | mongodb-dispatcher { 7 | type = Dispatcher 8 | executor = "thread-pool-executor" 9 | fork-join-executor { 10 | parallelism-min = 2 11 | parallelism-factor = 2.0 12 | parallelism-max = 10 13 | task-peeking-mode = "FIFO" 14 | } 15 | throughput = 1 16 | } 17 | 18 | github-dispatcher { 19 | type = Dispatcher 20 | executor = "thread-pool-executor" 21 | fork-join-executor { 22 | parallelism-min = 2 23 | parallelism-factor = 2.0 24 | parallelism-max = 10 25 | task-peeking-mode = "FIFO" 26 | } 27 | throughput = 1 28 | } 29 | 30 | kafka-services { 31 | bootstrap-servers-config = "localhost:9092" 32 | key-serializer-class-config = "org.apache.kafka.common.serialization.StringSerializer" 33 | value-serializer-class-config = "org.apache.kafka.common.serialization.StringSerializer" 34 | producer-data { 35 | client-id-config = "KafkaProducer" 36 | retries-config = 1 37 | } 38 | consumer-data { 39 | group-id-config = "something" 40 | enable-auto-commit-config = "true" 41 | auto-commit-interval-ms-config = "1000" 42 | } 43 | } 44 | 45 | spark { 46 | default { 47 | mongodb.input.uri = "mongodb://localhost:27017/default" 48 | mongodb.input.readPreference.name = "secondaryPreferred" 49 | 
mongodb.output.uri = "mongodb://localhost:27017/default" 50 | neo4j.bolt.url = "bolt://127.0.0.1:7687" 51 | neo4j.bolt.user = "neo4j" 52 | neo4j.bolt.password = "niger182" 53 | } 54 | } -------------------------------------------------------------------------------- /src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | [%level] [%date{MM/dd/yyyy HH:mm:ss.SSS}] [%thread] [%class] %msg%n 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /src/main/scala/Main.scala: -------------------------------------------------------------------------------- 1 | import com.google.inject.Guice 2 | import modules.{AkkaModule, ConfigModule, DBModule} 3 | import org.apache.spark.graphx.Graph 4 | import org.graphframes.GraphFrame 5 | import repositories.Neo4jRepository 6 | import services.github.client.GitHubProjectService 7 | import services.github.spark.GitHubGraphXService 8 | import services.spark.{SparkContextConf, SparkMongoService} 9 | 10 | import scala.concurrent.ExecutionContext 11 | 12 | object Main extends App { 13 | 14 | //todo: connect drunk library to work with Sangria graphql 15 | private val body = "{ \"query\": \"query { search(query: Java, type: REPOSITORY, first: 1) { pageInfo { hasNextPage startCursor endCursor } edges { node { ... on Repository { id name description createdAt stargazers { totalCount } forkCount updatedAt dependencyGraphManifests { totalCount nodes { dependencies { edges { node { packageName requirements } } } } } } } } } } \"}" 16 | private val injector = Guice.createInjector(new ConfigModule, new AkkaModule, new DBModule) 17 | 18 | implicit val ec = injector.getInstance(classOf[ExecutionContext]) 19 | val gitHubRepositoryService: GitHubProjectService = injector.getInstance(classOf[GitHubProjectService]) 20 | gitHubRepositoryService.fetchRepositoriesWithGraphQL(body, 10, 5).onComplete { 21 | _ => 22 | //todo: change configuration when using Spark on a cluster 23 | val sparkMongoSession = injector.getInstance(classOf[SparkContextConf]).getSparkSession("local", "MongoSession") 24 | val sparkNeoSession = injector.getInstance(classOf[SparkContextConf]).getSparkSession("local", "NeoSession") 25 | val dataFrame = injector.getInstance(classOf[SparkMongoService]).loadData(sparkMongoSession) 26 | val graphFrame: GraphFrame = injector.getInstance(classOf[GitHubGraphXService]).createGraphFrame(dataFrame) 27 | val saveGraph: Unit = injector.getInstance(classOf[Neo4jRepository]).saveGraph(graphFrame, sparkNeoSession) 28 | val loadGraph: Graph[Long, String] = injector.getInstance(classOf[Neo4jRepository]).loadGraph(sparkNeoSession) 29 | 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/models/Dependency.scala: -------------------------------------------------------------------------------- 1 | package models 2 | 3 | import spray.json.{DefaultJsonProtocol, RootJsonFormat} 4 | 5 | case class Dependency(packageName: String, requirements: String) 6 | 7 | object DependencyProtocol extends DefaultJsonProtocol{ 8 | implicit val dependencyFormat: RootJsonFormat[Dependency] = jsonFormat2(Dependency) 9 | } -------------------------------------------------------------------------------- /src/main/scala/models/GitHubRepository.scala: -------------------------------------------------------------------------------- 1 | package models 2 | 3 | import models.DependencyProtocol._ 4 | import 
spray.json.{DefaultJsonProtocol, JsArray, JsValue, RootJsonReader} 5 | 6 | case class GitHubRepository(_id: String, 7 | name: String, 8 | description: Option[String] = None, 9 | createdAt: String, 10 | starCount: Int, 11 | forkCount: Int, 12 | updatedAt: String, 13 | dependencies: List[Dependency] = Nil) extends ModelWithId 14 | 15 | object GitHubRepositoryProtocol extends DefaultJsonProtocol { 16 | 17 | implicit object GitHubRepositoryFormat extends RootJsonReader[Seq[GitHubRepository]] { 18 | override def read(json: JsValue): Seq[GitHubRepository] = { 19 | val repos = json.asInstanceOf[JsArray].elements.map(_.asJsObject.fields("node")) 20 | repos.map { 21 | repo => 22 | val fields = repo.asJsObject.fields 23 | val edges = fields("dependencyGraphManifests") 24 | .asJsObject.fields("nodes") 25 | .asInstanceOf[JsArray] 26 | .elements 27 | .flatMap(_.asJsObject.fields("dependencies").asJsObject.fields("edges").asInstanceOf[JsArray].elements) 28 | val dependencies = edges.map(_.asJsObject.fields("node").convertTo[Dependency]).toList 29 | 30 | GitHubRepository( 31 | _id = fields("id").convertTo[String], 32 | name = fields("name").convertTo[String], 33 | description = fields("description").convertTo[Option[String]], 34 | createdAt = fields("createdAt").convertTo[String], 35 | starCount = fields("stargazers").asJsObject.fields("totalCount").convertTo[Int], 36 | forkCount = fields("forkCount").convertTo[Int], 37 | updatedAt = fields("updatedAt").convertTo[String], 38 | dependencies = dependencies 39 | ) 40 | } 41 | } 42 | } 43 | 44 | } -------------------------------------------------------------------------------- /src/main/scala/models/ModelWithId.scala: -------------------------------------------------------------------------------- 1 | package models 2 | 3 | trait ModelWithId { 4 | 5 | def _id: String 6 | 7 | } 8 | -------------------------------------------------------------------------------- /src/main/scala/models/PageInfo.scala: -------------------------------------------------------------------------------- 1 | package models 2 | 3 | import spray.json.{DefaultJsonProtocol, RootJsonFormat} 4 | 5 | case class PageInfo(startCursor: String, endCursor: String, hasNextPage: Boolean) 6 | 7 | object PageInfoProtocol extends DefaultJsonProtocol { 8 | implicit val pageInfoFormat: RootJsonFormat[PageInfo] = jsonFormat3(PageInfo) 9 | } -------------------------------------------------------------------------------- /src/main/scala/models/RelationProp.scala: -------------------------------------------------------------------------------- 1 | package models 2 | 3 | import spray.json.{DefaultJsonProtocol, RootJsonFormat} 4 | 5 | case class RelationProp(weight: Int) 6 | 7 | object RelationPropProtocol extends DefaultJsonProtocol{ 8 | implicit val relationPropFormat: RootJsonFormat[RelationProp] = jsonFormat1(RelationProp) 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/modules/AkkaModule.scala: -------------------------------------------------------------------------------- 1 | package modules 2 | 3 | import akka.actor.ActorSystem 4 | import akka.stream.ActorMaterializer 5 | import net.codingwell.scalaguice.ScalaModule 6 | 7 | import scala.concurrent.ExecutionContext 8 | 9 | class AkkaModule extends ScalaModule { 10 | 11 | override def configure(): Unit = { 12 | implicit val as: ActorSystem = ActorSystem("global-actor-system") 13 | bind[ActorSystem].toInstance(as) 14 | bind[ActorMaterializer].toInstance(ActorMaterializer()) 15 | 
bind[ExecutionContext].toInstance(as.dispatcher) 16 | } 17 | } -------------------------------------------------------------------------------- /src/main/scala/modules/ConfigModule.scala: -------------------------------------------------------------------------------- 1 | package modules 2 | 3 | import java.io.File 4 | 5 | import com.google.inject.{Provides, Singleton} 6 | import com.typesafe.config.{Config, ConfigFactory} 7 | import net.codingwell.scalaguice.ScalaModule 8 | 9 | class ConfigModule extends ScalaModule { 10 | 11 | @Provides 12 | @Singleton 13 | def config: Config = { 14 | val conf = ConfigFactory.parseFile(new File("src/main/resources/application.conf")) 15 | ConfigFactory.load(conf) 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/scala/modules/DBModule.scala: -------------------------------------------------------------------------------- 1 | package modules 2 | 3 | import akka.actor.ActorSystem 4 | import com.google.inject.Provides 5 | import com.typesafe.config.Config 6 | import javax.inject.Singleton 7 | import net.codingwell.scalaguice.ScalaModule 8 | import repositories.ReactiveMongo 9 | 10 | import scala.concurrent.ExecutionContext 11 | 12 | class DBModule extends ScalaModule { 13 | 14 | @Provides 15 | @Singleton 16 | def reactiveMongoApi(config: Config): ReactiveMongo = { 17 | 18 | implicit val as: ActorSystem = ActorSystem("mongodb-ActorSystem") 19 | implicit val ec: ExecutionContext = as.dispatchers.lookup("mongodb-dispatcher") 20 | 21 | val uri = config.getString("mongodb.uri") 22 | val name = config.getString("mongodb.name") 23 | new ReactiveMongo(uri, name) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/modules/GitHubModule.scala: -------------------------------------------------------------------------------- 1 | package modules 2 | 3 | import com.google.inject.{Provides, Singleton} 4 | import net.codingwell.scalaguice.ScalaModule 5 | import repositories.ReactiveMongo 6 | import repositories.github.GitHubProjectRepository 7 | 8 | import scala.concurrent.ExecutionContext 9 | 10 | class GitHubModule extends ScalaModule { 11 | 12 | @Provides 13 | @Singleton 14 | def gitHubRepository(rm: ReactiveMongo) 15 | (implicit ec: ExecutionContext): GitHubProjectRepository = { 16 | new GitHubProjectRepository(rm) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/repositories/MongoRepository.scala: -------------------------------------------------------------------------------- 1 | package repositories 2 | 3 | import models.ModelWithId 4 | import reactivemongo.api.BSONSerializationPack.Writer 5 | import reactivemongo.api.collections.bson.BSONCollection 6 | import reactivemongo.api.commands.MultiBulkWriteResult 7 | import reactivemongo.bson.BSONDocument 8 | 9 | import scala.concurrent.{ExecutionContext, Future} 10 | 11 | 12 | abstract class MongoRepository[T <: ModelWithId](reactiveMongo: ReactiveMongo) 13 | (implicit executionContext: ExecutionContext) { 14 | 15 | def collectionName: String 16 | 17 | protected def insertMany(documents: Seq[T])(implicit writer: Writer[T]): Future[MultiBulkWriteResult] = { 18 | val collection: Future[BSONCollection] = reactiveMongo.db.map(_.collection(collectionName)) 19 | 20 | for { 21 | resolvedCollection <- collection 22 | updateBuilder <- Future(resolvedCollection.update(ordered = true)) 23 | updates <- Future.sequence(documents.map(doc => { 24 | 
updateBuilder.element( 25 | q = BSONDocument("_id" -> doc._id), 26 | u = doc, 27 | upsert = true, 28 | multi = false 29 | ) 30 | })) 31 | result <- updateBuilder.many(updates) 32 | } yield result 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /src/main/scala/repositories/Neo4jRepository.scala: -------------------------------------------------------------------------------- 1 | package repositories 2 | 3 | import models.RelationProp 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.graphx.{Edge, Graph, VertexId} 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.spark.sql.{Encoder, Encoders, SparkSession} 8 | import org.graphframes._ 9 | import org.neo4j.spark._ 10 | import utils.Logger 11 | 12 | class Neo4jRepository extends Logger { 13 | 14 | implicit val relationEncoder: Encoder[RelationProp] = Encoders.product[RelationProp] 15 | implicit val tuple2Encoders: Encoder[(VertexId, String)] = Encoders.product[(VertexId, String)] 16 | implicit val long2Encoders: Encoder[(Long, Long, RelationProp)] = Encoders.product[(Long, Long, RelationProp)] 17 | implicit val edgeEncoders: Encoder[Edge[String]] = Encoders.product[Edge[String]] 18 | 19 | def saveGraph(graphFrame: GraphFrame, sparkNeoSession: SparkSession): Unit = { 20 | val sc: SparkContext = sparkNeoSession.sparkContext 21 | 22 | val neo: Neo4j = Neo4j(sc) 23 | 24 | val pattern = neo.pattern(("package", "id"), ("rel", "id"), ("package", "id")) 25 | 26 | val verticeRdd: RDD[(VertexId, String)] = { 27 | val array = graphFrame.vertices.map(ver => { 28 | val id = ver.getAs[VertexId]("id") 29 | val packageName = ver.getAs[String]("package") 30 | (id, packageName) 31 | }).collect() 32 | sc.parallelize(array) 33 | } 34 | 35 | val edgesRdd: RDD[Edge[String]] = { 36 | val array = graphFrame.edges.map(edg => { 37 | val src = edg.getAs[Long]("src") 38 | val dst = edg.getAs[Long]("dst") 39 | val prop = "relationship" 40 | Edge[String](src, dst, prop) 41 | }).collect() 42 | sc.parallelize(array) 43 | } 44 | 45 | val graph1 = Graph(verticeRdd, edgesRdd) 46 | 47 | neo.saveGraph(graph1, "name", neo.pattern, true) 48 | 49 | } 50 | 51 | def loadGraph(sparkNeoSession: SparkSession): Graph[Long, String] = { 52 | val sc: SparkContext = sparkNeoSession.sparkContext 53 | val neo: Neo4j = Neo4j(sc) 54 | val pattern = neo.pattern(("package", "id"), ("rel", "id"), ("package", "id")) 55 | neo.loadGraph 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/repositories/ReactiveMongo.scala: -------------------------------------------------------------------------------- 1 | package repositories 2 | 3 | import reactivemongo.api.{DefaultDB, MongoConnection, MongoDriver} 4 | 5 | import scala.concurrent.{ExecutionContext, Future} 6 | import scala.util.Try 7 | 8 | class ReactiveMongo(databaseUri: String, databaseName: String) 9 | (implicit ec: ExecutionContext) { 10 | 11 | private val driver: MongoDriver = MongoDriver() 12 | 13 | def connection: Future[MongoConnection] = { 14 | val parsedUri: Try[MongoConnection.ParsedURI] = MongoConnection.parseURI(databaseUri) 15 | val connection = parsedUri.map(driver.connection) 16 | Future.fromTry(connection) 17 | } 18 | 19 | def db: Future[DefaultDB] = connection.flatMap(_.database(databaseName)) 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/repositories/github/GitHubProjectRepository.scala: 
-------------------------------------------------------------------------------- 1 | package repositories.github 2 | 3 | import com.google.inject.Inject 4 | import models.{Dependency, GitHubRepository} 5 | import reactivemongo.api.commands.MultiBulkWriteResult 6 | import reactivemongo.bson.{BSONDocumentWriter, Macros} 7 | import repositories.{MongoRepository, ReactiveMongo} 8 | 9 | import scala.concurrent.{ExecutionContext, Future} 10 | 11 | class GitHubProjectRepository @Inject()(rm: ReactiveMongo) 12 | (implicit ec: ExecutionContext) extends MongoRepository[GitHubRepository](rm) { 13 | 14 | override val collectionName: String = "repositories" 15 | 16 | private implicit def repoWriter: BSONDocumentWriter[GitHubRepository] = Macros.writer[GitHubRepository] 17 | 18 | private implicit def dependencyWriter: BSONDocumentWriter[Dependency] = Macros.writer[Dependency] 19 | 20 | def insertMany(documents: Seq[GitHubRepository]): Future[MultiBulkWriteResult] = super.insertMany(documents) 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/services/github/client/GitHubProjectService.scala: -------------------------------------------------------------------------------- 1 | package services.github.client 2 | 3 | import java.util.concurrent.TimeUnit 4 | import java.util.concurrent.TimeUnit.SECONDS 5 | 6 | import akka.actor.{ActorSystem, Terminated} 7 | import akka.http.scaladsl.Http 8 | import akka.http.scaladsl.model._ 9 | import akka.http.scaladsl.unmarshalling.Unmarshal 10 | import akka.pattern._ 11 | import akka.stream._ 12 | import akka.stream.scaladsl.{Broadcast, Flow, GraphDSL, Keep, Merge, Partition, RestartSource, Sink, Source} 13 | import akka.util.Timeout 14 | import com.google.inject.Inject 15 | import models.GitHubRepositoryProtocol._ 16 | import models.PageInfoProtocol._ 17 | import models.{GitHubRepository, PageInfo} 18 | import repositories.github.GitHubProjectRepository 19 | import services.github.client.GitHubRequestComposer.GraphQLQuery 20 | import spray.json._ 21 | import utils.Logger 22 | 23 | import scala.concurrent.duration.FiniteDuration 24 | import scala.concurrent.{ExecutionContext, Future} 25 | import scala.util.{Failure, Success} 26 | 27 | class GitHubProjectService @Inject()(gitHubProjectRepository: GitHubProjectRepository) extends Logger { 28 | 29 | def fetchRepositoriesWithGraphQL(body: String, totalCount: Int, elementsPerPage: Int) = { 30 | 31 | //todo: fix bug: After fetching some count of repositories from GitHub 32 | // throw exception related to connections created with ReactiveMongo 33 | implicit val as: ActorSystem = ActorSystem("GitHub-ActorSystem") 34 | implicit val mat: ActorMaterializer = ActorMaterializer() 35 | implicit val ec: ExecutionContext = as.dispatchers.lookup("github-dispatcher") 36 | 37 | val aggregate = RestartSource.withBackoff( 38 | minBackoff = new FiniteDuration(3, TimeUnit.SECONDS), 39 | maxBackoff = new FiniteDuration(5, TimeUnit.SECONDS), 40 | randomFactor = 0.2 41 | ){ () => 42 | 43 | implicit val timeout: Timeout = Timeout(new FiniteDuration(10, TimeUnit.SECONDS)) 44 | 45 | val source = Source.single[String](body) 46 | 47 | // val sharedKillSwitch: SharedKillSwitch = KillSwitches.shared("GitHub-Repositories-Fetcher") 48 | 49 | val gitHubRequestComposer = as.actorOf(GitHubRequestComposer.props(totalCount, elementsPerPage)) 50 | 51 | def checkErrorMess(mess: JsObject): Boolean = { 52 | mess.getFields("errors").nonEmpty && 53 | mess.getFields("errors").toString.indexOf("timedout") > -1 
54 | } 55 | 56 | val graph = GraphDSL.create() { implicit builder => 57 | 58 | import GraphDSL.Implicits._ 59 | 60 | val composeHttpRequest = builder.add { 61 | Flow[GraphQLQuery].mapAsync(1) { 62 | query => 63 | gitHubRequestComposer.ask(query).mapTo[Option[HttpRequest]] 64 | }.map { 65 | r => log.info(s"HTTP request have been composed"); r 66 | } 67 | } 68 | val sendRequest: FlowShape[Option[HttpRequest], HttpResponse] = builder.add { 69 | Flow[Option[HttpRequest]] 70 | .collect { 71 | case Some(httpRequest) => httpRequest 72 | } 73 | .map { 74 | r => log.info(s"Send HTTP request to GitHub. URI: ${r.getUri}"); r 75 | } 76 | .mapAsync(1)(request => Http().singleRequest(request)) 77 | .throttle(1, FiniteDuration(1, SECONDS)) //needs in order to not exceed rate limit on GitHub 78 | } 79 | val httpResponseChecker = builder.add { 80 | Flow[HttpResponse] 81 | .map(res => { 82 | log.error(s"response ===> $res") 83 | if (res.status.intValue == 502) 84 | throw new InternalError(s"Internal server error, ${res.httpMessage}") 85 | else res 86 | }) 87 | } 88 | val deserialization = builder.add { 89 | Flow[HttpResponse] 90 | .mapAsync(1)(res => Unmarshal(res.entity).to[String]) 91 | .map { 92 | entity => 93 | 94 | if (checkErrorMess(entity.parseJson.asJsObject)) { 95 | throw new RuntimeException("Timed out waiting for a response from the data source") 96 | } 97 | 98 | val fields = entity.parseJson 99 | .asJsObject.fields("data") 100 | .asJsObject.fields("search") 101 | .asJsObject.fields 102 | val repo = fields("edges").convertTo[Seq[GitHubRepository]] 103 | val pageInfo = fields("pageInfo").convertTo[PageInfo] 104 | (pageInfo, repo) 105 | }.map { 106 | t => log.info(s"Response from GitHub has been converted. PageInfo: ${t._1}"); t 107 | } 108 | } 109 | // val httpRequestPartitioner = builder.add { 110 | // Partition[Option[HttpRequest]]( 111 | // outputPorts = 2, 112 | // partitioner = { 113 | // case Some(_) => 1 114 | // case None => 0 115 | // } 116 | // ) 117 | // } 118 | val httpResponsePartitioner = builder.add { 119 | Partition[HttpResponse]( 120 | outputPorts = 2, 121 | partitioner = p => if (p.status.isSuccess) 1 else 0 122 | ) 123 | } 124 | val M = builder.add { 125 | Merge[GraphQLQuery](2) 126 | } 127 | val B = builder.add { 128 | Broadcast[(PageInfo, Seq[GitHubRepository])](2) 129 | } 130 | // val cancelledSink = { 131 | // Flow[Option[HttpRequest]].map(_ => sharedKillSwitch.shutdown).to(Sink.ignore) 132 | // } 133 | 134 | /* 135 | httpRequestPartitioner ~> cancelledSink 136 | M ~> composeHttpRequest ~> httpRequestPartitioner ~> sendRequest ~> httpResponsePartitioner ~> Sink.ignore 137 | httpResponsePartitioner ~> deserialization ~> B 138 | M <~ B 139 | */ 140 | // M ~> composeHttpRequest ~> httpRequestPartitioner 141 | // httpRequestPartitioner.out(0) ~> cancelledSink 142 | // httpRequestPartitioner.out(1) ~> sendRequest ~> httpResponsePartitioner 143 | // httpResponsePartitioner.out(0) ~> httpResponseChecker ~> Sink.ignore 144 | // httpResponsePartitioner.out(1) ~> deserialization ~> B 145 | 146 | M ~> composeHttpRequest ~> sendRequest ~> httpResponsePartitioner 147 | httpResponsePartitioner.out(0) ~> httpResponseChecker ~> Sink.ignore 148 | httpResponsePartitioner.out(1) ~> deserialization ~> B 149 | 150 | B.out(0).map(res => GraphQLQuery(body, Some(res._1.endCursor))) ~> M.in(0) 151 | 152 | FlowShape(M.in(1), B.out(1)) 153 | } 154 | 155 | source 156 | .map(_ => GraphQLQuery(body)) 157 | .via(graph) 158 | // .via(sharedKillSwitch.flow) 159 | .map(_._2) 160 | 
.mapAsync(1)(gitHubProjectRepository.insertMany) 161 | 162 | // as.whenTerminated 163 | 164 | } 165 | 166 | val killSwitch = aggregate 167 | .viaMat(KillSwitches.single)(Keep.right) 168 | .toMat(Sink.foreach(event => println(s"Got event: $event")))(Keep.left) 169 | .run() 170 | 171 | aggregate.runWith { 172 | Sink.onComplete { 173 | case Success(value) => { 174 | log.info("Terminate the stream.") 175 | // as.terminate 176 | // killSwitch.shutdown() 177 | } 178 | case Failure(exception) => log.info("Failure - restart") 179 | } 180 | } 181 | 182 | as.whenTerminated 183 | 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /src/main/scala/services/github/client/GitHubRequestComposer.scala: -------------------------------------------------------------------------------- 1 | package services.github.client 2 | 3 | import akka.actor.{Actor, ActorLogging, Props} 4 | import akka.http.scaladsl.model._ 5 | import akka.http.scaladsl.model.headers.RawHeader 6 | import services.github.client.GitHubRequestComposer._ 7 | 8 | object GitHubRequestComposer { 9 | 10 | //todo: set GitHub personal token to perform requests on GitHub 11 | private val token = sys.env.getOrElse("GitHubOAuthToken", "") 12 | 13 | private val headers = RawHeader("Authorization", "Bearer " + token) :: 14 | RawHeader("Accept", "application/vnd.github.hawkgirl-preview") :: Nil 15 | private val uri: Uri = Uri("https://api.github.com/graphql") 16 | private val httpRequest = HttpRequest(uri = uri, method = HttpMethods.POST).withHeaders(headers) 17 | 18 | def props(totalCount: Int, elementsPerPage: Int = 10): Props = { 19 | Props(new GitHubRequestComposer(totalCount, elementsPerPage)) 20 | } 21 | 22 | //todo: add checking for hasNext page 23 | case class GraphQLQuery(body: String, cursor: Option[String] = None) 24 | 25 | //todo: for further development 26 | case class ReduceElementsPerPage(httpRequest: HttpRequest) 27 | 28 | } 29 | 30 | class GitHubRequestComposer(totalCount: Int, elementsPerPage: Int) extends Actor with ActorLogging { 31 | 32 | override def receive: Receive = handleIncomingMessages(totalCount, elementsPerPage) 33 | 34 | private def handleIncomingMessages(totalCount: Int, elementsPerPage: Int): Receive = { 35 | 36 | case query: GraphQLQuery => { 37 | if (totalCount == 0) { 38 | log.info(s"All elements were fetched. Total count = $totalCount") 39 | sender ! None 40 | } 41 | else { 42 | val perPage = if (elementsPerPage > totalCount) totalCount else elementsPerPage 43 | 44 | val replacement = query.cursor match { 45 | case Some(value) => s"first: $perPage , after: " + """\\"""" + value + """\\"""" 46 | case _ => s"first: $perPage" 47 | } 48 | val body = query.body.replaceFirst("(first)(.*?)[0-9]+", replacement) 49 | log.debug(s"GraphQL body = $body") 50 | context.become(handleIncomingMessages(totalCount - perPage, perPage)) 51 | sender ! 
Some(httpRequest.withEntity(HttpEntity(ContentTypes.`application/json`, body))) 52 | } 53 | } 54 | 55 | case reduceElementsPerPage: ReduceElementsPerPage => { 56 | //todo: if we catch timeout exception reduce elements per page 57 | //todo: stop stream if elementsPerPage == 1 if we do not want to get looping 58 | } 59 | 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/scala/services/github/spark/GitHubGraphXService.scala: -------------------------------------------------------------------------------- 1 | package services.github.spark 2 | 3 | import scala.language._ 4 | import com.google.inject.Inject 5 | import models.{Dependency, GitHubRepository, RelationProp} 6 | import org.apache.spark.sql._ 7 | import org.graphframes.GraphFrame 8 | import services.spark.SparkMongoService 9 | 10 | import scala.collection.mutable 11 | import scala.util.Random 12 | 13 | class GitHubGraphXService @Inject()(sparkMongoService: SparkMongoService) { 14 | 15 | /** 16 | * Convert DataFrame with GitHubRepository data into GraphFrame 17 | * 18 | * @param dataFrame an instance of DataFrame 19 | * @return an instance of GraphFrame 20 | */ 21 | def createGraphFrame(dataFrame: DataFrame): GraphFrame = { 22 | //todo: move encoders in an external class 23 | implicit val gitHubRepositoryEncoder: Encoder[GitHubRepository] = Encoders.product[GitHubRepository] 24 | implicit val dependencyEncoder: Encoder[Dependency] = Encoders.product[Dependency] 25 | implicit val relationEncoder: Encoder[RelationProp] = Encoders.product[RelationProp] 26 | implicit val stringEncoder: Encoder[String] = Encoders.STRING 27 | implicit val tuple2Encoders: Encoder[(Long, String)] = Encoders.product[(Long, String)] 28 | implicit val longEncoders: Encoder[Long] = Encoders.scalaLong 29 | implicit val long2Encoders: Encoder[(Long, Long, RelationProp)] = Encoders.product[(Long, Long, RelationProp)] 30 | implicit val tuple3Encoders: Encoder[(String, String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING, Encoders.STRING) 31 | 32 | val centerVertexID = Math.abs(Random.nextLong()) 33 | 34 | val projects = dataFrame 35 | .select("_id", "dependencies") 36 | .filter(""" size(dependencies) != 0 """) 37 | .toDF() 38 | 39 | val centralVertex = dataFrame 40 | .select("name") 41 | .filter(""" name == "Java" """) 42 | .distinct() 43 | .map(name => (centerVertexID, name.getAs[String]("name"))).toDF("id", "package") 44 | 45 | val vertices: DataFrame = { 46 | val vertexList = projects 47 | .flatMap(_.getAs[mutable.WrappedArray[Row]]("dependencies").map(dependency => { 48 | val packageName = Some(dependency.getAs[String]("packageName")).getOrElse("emptyName") 49 | (Math.abs(Random.nextLong()), packageName) 50 | })) 51 | .distinct 52 | .toDF("id", "package") 53 | vertexList.union(centralVertex) 54 | } 55 | 56 | val edges: DataFrame = { 57 | val dst = vertices.filter(""" package == "Java" """).distinct().map(_.getAs[Long]("id")).first() 58 | vertices.map { 59 | ver => { 60 | val src = ver.getAs[Long]("id") 61 | (src, dst, RelationProp(1)) 62 | } 63 | }.toDF("src", "dst", "relationship") 64 | } 65 | 66 | GraphFrame(vertices, edges) 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/main/scala/services/kafka/KafkaService.scala: -------------------------------------------------------------------------------- 1 | package services.kafka 2 | 3 | import java.time.Duration 4 | import java.util 5 | import java.util.{Collections, Properties} 6 | 7 | 
import com.google.inject.Inject 8 | import com.typesafe.config.Config 9 | import org.apache.kafka.clients.admin._ 10 | import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecords, KafkaConsumer} 11 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord} 12 | 13 | //todo: make methods asynchronous 14 | class KafkaService @Inject()(config: Config) { 15 | 16 | def send(topicName: String, key: String, value: String): Unit = { 17 | 18 | if (!topics.containsKey(topicName)) { 19 | createTopic(topicName, numPartitions = 1, replicationFactor = 1) 20 | } 21 | 22 | val producer = createProducer 23 | val record = new ProducerRecord[String, String](topicName, key, value) 24 | producer.send(record) 25 | producer.close() 26 | } 27 | 28 | def consume(topicName: String): ConsumerRecords[String, String] = { 29 | val consumer = createConsumer(topicName) 30 | consumer.subscribe(Collections.singletonList(topicName)) 31 | consumer.poll(Duration.ofSeconds(10)) 32 | } 33 | 34 | def createProducer: KafkaProducer[String, String] = { 35 | val props = new Properties() 36 | props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, config.getString("kafka-services.bootstrap-servers-config")) 37 | props.put(ProducerConfig.CLIENT_ID_CONFIG, config.getString("kafka-services.producer-data.client-id-config")) 38 | props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, config.getString("kafka-services.key-serializer-class-config")) 39 | props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, config.getString("kafka-services.value-serializer-class-config")) 40 | props.put(ProducerConfig.RETRIES_CONFIG, config.getInt("kafka-services.producer-data.retries-config").asInstanceOf[Integer]) 41 | 42 | new KafkaProducer[String, String](props) 43 | } 44 | 45 | def createConsumer(topicName: String): KafkaConsumer[String, String] = { 46 | val props = new Properties() 47 | props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, config.getString("kafka-services.bootstrap-servers-config")) 48 | /* the consumer needs deserializer classes; application.conf only defines serializer classes, so the standard String deserializer is set here */ props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer") 49 | props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer") 50 | props.put(ConsumerConfig.GROUP_ID_CONFIG, config.getString("kafka-services.consumer-data.group-id-config")) 51 | props.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, config.getString("kafka-services.consumer-data.enable-auto-commit-config")) 52 | props.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, config.getString("kafka-services.consumer-data.auto-commit-interval-ms-config")) 53 | 54 | new KafkaConsumer[String, String](props) 55 | } 56 | 57 | def createTopic(topicName: String, numPartitions: Int, replicationFactor: Short): CreateTopicsResult = { 58 | val props = new Properties() 59 | props.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, config.getString("kafka-services.bootstrap-servers-config")) 60 | val adminClient = AdminClient.create(props) 61 | val createTopicResult = adminClient.createTopics(util.Arrays.asList(new NewTopic(topicName, numPartitions, replicationFactor))) 62 | adminClient.close() 63 | 64 | createTopicResult 65 | } 66 | 67 | def removeTopic(topicName: String): DeleteTopicsResult = { 68 | val props = new Properties() 69 | props.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, config.getString("kafka-services.bootstrap-servers-config")) 70 | val adminClient = AdminClient.create(props) 71 | val deleteTopicResult =
adminClient.deleteTopics(util.Arrays.asList(topicName)) 72 | adminClient.close() 73 | deleteTopicResult 74 | } 75 | 76 | def topics: util.Map[String, TopicListing] = { 77 | val props = new Properties 78 | props.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, config.getString("kafka-services.bootstrap-servers-config")) 79 | val adminClient = AdminClient.create(props) 80 | adminClient.listTopics.namesToListings.get 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/main/scala/services/spark/SparkContextConf.scala: -------------------------------------------------------------------------------- 1 | package services.spark 2 | 3 | import com.google.inject.Inject 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.sql.SparkSession 6 | import com.typesafe.config.Config 7 | import scala.collection.JavaConversions._ 8 | 9 | class SparkContextConf @Inject()(config: Config) { 10 | 11 | val configs = asScalaSet(config.getConfig("spark.default").entrySet()) 12 | .map(entry => entry.getKey -> entry.getValue.unwrapped().toString).toMap 13 | 14 | val sparkConfig: SparkConf = configs.foldRight(new SparkConf())((values, sparkConf) => sparkConf.set(values._1, values._2)) 15 | 16 | def getSparkSession(master: String, appName: String): SparkSession = { 17 | val sparkSession = SparkSession.builder() 18 | .master(master) 19 | .appName(appName) 20 | .config(sparkConfig) 21 | .getOrCreate() 22 | .newSession() 23 | 24 | sparkSession 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/services/spark/SparkMongoService.scala: -------------------------------------------------------------------------------- 1 | package services.spark 2 | 3 | import com.mongodb.spark.MongoSpark 4 | import com.mongodb.spark.config.ReadConfig 5 | import org.apache.spark.sql.{DataFrame, SparkSession} 6 | 7 | class SparkMongoService { 8 | 9 | /** 10 | * Connects to MongoDB using the Spark connector and loads data from the database. 11 | * 12 | * @return a DataFrame with the data loaded from the repositories collection 13 | */ 14 | def loadData(sparkSession: SparkSession): DataFrame = { 15 | //todo: move configuration into application.conf 16 | val readConfig = ReadConfig(Map("uri" -> "mongodb://localhost:27017/default.repositories")) 17 | 18 | MongoSpark.load(sparkSession, readConfig) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/utils/Logger.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import org.slf4j 4 | import org.slf4j.LoggerFactory 5 | 6 | trait Logger { 7 | val log: slf4j.Logger = LoggerFactory.getLogger(this.getClass.getName) 8 | } --------------------------------------------------------------------------------