├── .github └── workflows │ ├── ISSUE_TEMPLATE.md │ ├── PULL_REQUEST_TEMPLATE.md │ └── maven.yml ├── .gitignore ├── .scalafmt.conf ├── .travis.yml ├── LICENSES ├── Apache-2.0.txt └── CC-1.0.txt ├── README.md ├── example ├── .gitignore ├── pom.xml └── src │ └── main │ ├── resources │ ├── data.csv │ ├── edge │ ├── log4j.properties │ └── vertex │ └── scala │ └── com │ └── vesoft │ └── nebula │ └── examples │ └── connector │ ├── NebulaSparkReaderExample.scala │ └── NebulaSparkWriterExample.scala ├── nebula-algorithm ├── README-CN.md ├── README.md ├── pom.xml └── src │ ├── main │ ├── resources │ │ ├── application.conf │ │ └── edge │ └── scala │ │ └── com │ │ └── vesoft │ │ └── nebula │ │ └── algorithm │ │ ├── Main.scala │ │ ├── config │ │ ├── AlgoConfig.scala │ │ ├── Configs.scala │ │ ├── NebulaConfig.scala │ │ └── SparkConfig.scala │ │ ├── lib │ │ ├── BetweennessCentralityAlgo.scala │ │ ├── ConnectedComponentsAlgo.scala │ │ ├── DegreeStaticAlgo.scala │ │ ├── GraphTriangleCountAlgo.scala │ │ ├── KCoreAlgo.scala │ │ ├── LabelPropagationAlgo.scala │ │ ├── LouvainAlgo.scala │ │ ├── PageRankAlgo.scala │ │ ├── ShortestPathAlgo.scala │ │ ├── StronglyConnectedComponentsAlgo.scala │ │ └── TriangleCountAlgo.scala │ │ ├── reader │ │ └── DataReader.scala │ │ ├── utils │ │ └── NebulaUtil.scala │ │ └── writer │ │ └── AlgoWriter.scala │ └── test │ ├── resources │ ├── application.conf │ ├── edge.csv │ └── edge_noWeight.csv │ └── scala │ └── com │ └── vesoft │ └── nebula │ └── algorithm │ ├── config │ └── ConfigSuite.scala │ ├── data │ └── MockNebulaData.scala │ ├── lib │ ├── BetweennessAlgoSuite.scala │ ├── CcAlgoSuite.scala │ ├── DegreeStaticAlgoSuite.scala │ ├── KCoreAlgoSuite.scala │ ├── LabelPropagationAlgoSuite.scala │ ├── LouvainAlgoSuite.scala │ ├── PageRankAlgoSuite.scala │ ├── SCCAlgoSuite.scala │ ├── ShortestPathAlgoSuite.scala │ └── TrangleCountSuite.scala │ └── utils │ └── NebulaUtilSuite.scala ├── nebula-exchange ├── .gitignore ├── README-CN.md ├── README.md ├── pom.xml └── src │ ├── main │ ├── resources │ │ └── application.conf │ └── scala │ │ └── com │ │ └── vesoft │ │ └── nebula │ │ └── exchange │ │ ├── CheckPointHandler.scala │ │ ├── ErrorHandler.scala │ │ ├── Exchange.scala │ │ ├── GraphProvider.scala │ │ ├── MetaProvider.scala │ │ ├── config │ │ ├── Configs.scala │ │ ├── SchemaConfigs.scala │ │ ├── SinkConfigs.scala │ │ └── SourceConfigs.scala │ │ ├── package.scala │ │ ├── processor │ │ ├── EdgeProcessor.scala │ │ ├── Processor.scala │ │ ├── ReloadProcessor.scala │ │ └── VerticesProcessor.scala │ │ ├── reader │ │ ├── FileBaseReader.scala │ │ ├── Reader.scala │ │ ├── ServerBaseReader.scala │ │ └── StreamingBaseReader.scala │ │ ├── utils │ │ ├── HDFSUtils.scala │ │ ├── KafkaUtils.scala │ │ ├── NebulaUtils.scala │ │ └── Neo4jUtils.scala │ │ └── writer │ │ ├── FileBaseWriter.scala │ │ ├── ServerBaseWriter.scala │ │ └── Writer.scala │ └── test │ ├── resources │ ├── application.conf │ └── docker-compose.yaml │ └── scala │ └── com │ └── vesoft │ └── nebula │ └── exchange │ ├── NebulaGraphMock.scala │ ├── config │ └── ConfigsSuite.scala │ ├── processor │ └── ProcessorSuite.scala │ └── utils │ └── NebulaUtilsSuite.scala ├── nebula-spark-connector ├── .gitignore ├── README.md ├── README_CN.md ├── pom.xml └── src │ ├── main │ └── scala │ │ └── com │ │ └── vesoft │ │ └── nebula │ │ └── connector │ │ ├── NebulaConfig.scala │ │ ├── NebulaDataSource.scala │ │ ├── NebulaEnum.scala │ │ ├── NebulaOptions.scala │ │ ├── NebulaUtils.scala │ │ ├── PartitionUtils.scala │ │ ├── Template.scala │ │ ├── exception │ │ 
└── Exception.scala │ │ ├── nebula │ │ ├── GraphProvider.scala │ │ └── MetaProvider.scala │ │ ├── package.scala │ │ ├── reader │ │ ├── NebulaEdgePartitionReader.scala │ │ ├── NebulaPartition.scala │ │ ├── NebulaPartitionReader.scala │ │ ├── NebulaSourceReader.scala │ │ └── NebulaVertexPartitionReader.scala │ │ └── writer │ │ ├── NebulaCommitMessage.scala │ │ ├── NebulaEdgeWriter.scala │ │ ├── NebulaExecutor.scala │ │ ├── NebulaSourceWriter.scala │ │ ├── NebulaVertexWriter.scala │ │ └── NebulaWriter.scala │ └── test │ └── scala │ └── com │ └── vesoft │ └── nebula │ └── connector │ ├── NebulaConfigSuite.scala │ └── writer │ └── NebulaExecutorSuite.scala └── pom.xml /.github/workflows/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | #### Expected behavior 2 | 3 | #### Actual behavior 4 | 5 | #### Steps to reproduce 6 | 7 | #### JVM version (e.g. `java -version`) 8 | 9 | #### Scala version (e.g. `scala -version`) 10 | 11 | #### OS version (e.g. `uname -a`) 12 | -------------------------------------------------------------------------------- /.github/workflows/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Motivation: 2 | 3 | Why you're making that change and what is the problem you're trying to solve. 4 | 5 | Modification: 6 | 7 | Describe the modifications you've done. 8 | 9 | Result: 10 | 11 | Fixes #. 12 | -------------------------------------------------------------------------------- /.github/workflows/maven.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build a Java project with Maven 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-maven 3 | 4 | name: Java CI with Maven 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: 11 | - master 12 | - 'v[0-9]+.*' 13 | 14 | jobs: 15 | build: 16 | 17 | runs-on: ubuntu-latest 18 | 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Set up JDK 1.8 22 | uses: actions/setup-java@v1 23 | with: 24 | java-version: 1.8 25 | 26 | - name: Cache the Maven packages to speed up build 27 | uses: actions/cache@v2 28 | with: 29 | path: ~/.m2/repository 30 | key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} 31 | restore-keys: ${{ runner.os }}-maven- 32 | 33 | - name: download neo4j-contrib & graphframes & pulsar-spark-connector dependency 34 | run: | 35 | wget https://oss-cdn.nebula-graph.com.cn/jar-packages/neo4j-contrib.zip 36 | wget https://oss-cdn.nebula-graph.com.cn/jar-packages/graphframes.zip 37 | wget https://oss-cdn.nebula-graph.com.cn/jar-packages/streamnative.zip 38 | unzip -o -d ~/.m2/repository/ neo4j-contrib.zip 39 | unzip -o -d ~/.m2/repository/ graphframes.zip 40 | rm -rf ~/.m2/repository/io/streamnative 41 | unzip -o -d ~/.m2/repository/io/ streamnative.zip 42 | 43 | - name: Install nebula-graph 44 | run: | 45 | mkdir tmp 46 | pushd tmp 47 | git clone https://github.com/vesoft-inc/nebula-docker-compose.git 48 | pushd nebula-docker-compose/ 49 | cp ../../nebula-exchange/src/test/resources/docker-compose.yaml . 
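# Start the Nebula services defined by the copied compose file in the background,
# then pause briefly so they are ready before the Maven build runs the tests.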
50 | docker-compose up -d 51 | sleep 10 52 | popd 53 | popd 54 | 55 | - name: Build with Maven 56 | run: mvn -B package 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 23 | hs_err_pid* 24 | 25 | # build target 26 | target/ 27 | 28 | # IDE 29 | .idea/ 30 | .eclipse/ 31 | *.iml 32 | 33 | spark-importer.ipr 34 | spark-importer.iws 35 | 36 | # mac 37 | .DS_Store 38 | -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | align = more 2 | maxColumn = 100 3 | docstrings = ScalaDoc 4 | assumeStandardLibraryStripMargin = true -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 vesoft inc. All rights reserved. 2 | # 3 | # This source code is licensed under Apache 2.0 License, 4 | # attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | 6 | language: java 7 | 8 | jdk: 9 | - oraclejdk11 10 | - openjdk8 11 | - openjdk11 12 | 13 | install: mvn clean compile package install -Dgpg.skip -Dmaven.javadoc.skip=true 14 | -------------------------------------------------------------------------------- /LICENSES/CC-1.0.txt: -------------------------------------------------------------------------------- 1 | "Commons Clause" License Condition v1.0 2 | 3 | The Software is provided to you by the Licensor under the License, as defined below, subject to the following condition. 4 | 5 | Without limiting other conditions in the License, the grant of rights under the License will not include, and the License does not grant to you, the right to Sell the Software. 6 | 7 | For purposes of the foregoing, "Sell" means practicing any or all of the rights granted to you under the License to provide to third parties, for a fee or other considerationon (including without limitation fees for hosting or consulting/support services related to the Software), a product or service whose value derives, entirely or substantially, from the functionality of the Software. Any license notice or attribution required by the License must also include this Commons Clause License Condition notice. 8 | 9 | Software: Nebula Graph [Software in this repository] 10 | 11 | License: Apache 2.0 [https://www.apache.org/licenses/LICENSE-2.0.html] 12 | 13 | Licensor: vesoft inc. 
14 | 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nebula-spark-utils 2 | 3 | > **说明**: 4 | 5 | nebula-spark-utils 仓库包含 Nebula 的三个产品:Nebula Exchange、Nebula Spark Connector 和 Nebula Algorithm。 6 | 7 | 为了更好地进行产品的版本发布和管理,我们决定将 nebula-spark-utils 拆分成三个仓库,如果您要使用以上三个产品,请转移到产品对应的新仓库: 8 | 9 | * Nebula Exchange: https://github.com/vesoft-inc/nebula-exchange 10 | * Nebula Spark Connector: https://github.com/vesoft-inc/nebula-spark-connector 11 | * Nebula Algorithm: https://github.com/vesoft-inc/nebula-algorithm 12 | 13 | ------ 14 | 15 | > **Note**: 16 | 17 | nebula-spark-utils repository contains code of three nebula products: Nebula Exchange, Nebula Spark Connector, Nebula Algorithm. 18 | 19 | In order for better version release and management, the team decided to split the current repository into three independent repositories. 20 | 21 | Please choose the correct repository for use: 22 | 23 | * Nebula Exchange: https://github.com/vesoft-inc/nebula-exchange 24 | * Nebula Spark Connector: https://github.com/vesoft-inc/nebula-spark-connector 25 | * Nebula Algorithm: https://github.com/vesoft-inc/nebula-algorithm 26 | 27 | If you want to use Spark utilities for [Nebula Graph v1.x](https://github.com/vesoft-inc/nebula), visit [the v1.0 branch of nebula-java](https://github.com/vesoft-inc/nebula-java/tree/v1.0). 28 | -------------------------------------------------------------------------------- /example/.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 23 | hs_err_pid* 24 | 25 | # build target 26 | target/ 27 | 28 | # IDE 29 | .idea/ 30 | .eclipse/ 31 | *.iml 32 | 33 | spark-importer.ipr 34 | spark-importer.iws 35 | 36 | .DS_Store 37 | -------------------------------------------------------------------------------- /example/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | nebula-spark 7 | com.vesoft 8 | 2.5-SNAPSHOT 9 | ../pom.xml 10 | 11 | 4.0.0 12 | 13 | example 14 | 15 | 16 | 17 | 18 | 19 | org.apache.maven.plugins 20 | maven-deploy-plugin 21 | 22 | true 23 | 24 | 25 | 26 | 27 | org.apache.maven.plugins 28 | maven-compiler-plugin 29 | 3.8.1 30 | 31 | 1.8 32 | 1.8 33 | 34 | 35 | 36 | 37 | org.apache.maven.plugins 38 | maven-jar-plugin 39 | 3.2.0 40 | 41 | 42 | 43 | test-jar 44 | 45 | 46 | 47 | 48 | 49 | 50 | org.apache.maven.plugins 51 | maven-shade-plugin 52 | 3.2.1 53 | 54 | 55 | package 56 | 57 | shade 58 | 59 | 60 | false 61 | 62 | 63 | org.apache.spark:* 64 | org.apache.hadoop:* 65 | org.apache.hive:* 66 | log4j:log4j 67 | org.apache.orc:* 68 | xml-apis:xml-apis 69 | javax.inject:javax.inject 70 | org.spark-project.hive:hive-exec 71 | stax:stax-api 72 | org.glassfish.hk2.external:aopalliance-repackaged 73 | 74 | 75 | 76 | 77 | *:* 78 | 79 | com/vesoft/tools/** 80 | META-INF/*.SF 81 | META-INF/*.DSA 82 | META-INF/*.RSA 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | org.scala-tools 93 | maven-scala-plugin 94 | 2.15.2 95 | 96 | 2.11.12 97 | 98 | -target:jvm-1.8 99 | 100 | 101 | -Xss4096K 102 | 103 | 104 
| 105 | 106 | scala-compile 107 | 108 | compile 109 | 110 | 111 | 112 | com/vesoft/tools/** 113 | META-INF/*.SF 114 | META-INF/*.DSA 115 | META-INF/*.RSA 116 | 117 | 118 | 119 | 120 | scala-test-compile 121 | 122 | testCompile 123 | 124 | 125 | 126 | com/vesoft/tools/** 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | org.slf4j 138 | slf4j-log4j12 139 | 1.7.25 140 | 141 | 142 | org.slf4j 143 | slf4j-api 144 | 1.7.25 145 | 146 | 147 | 148 | org.apache.spark 149 | spark-core_2.11 150 | 2.4.4 151 | 152 | 153 | org.apache.spark 154 | spark-sql_2.11 155 | 2.4.4 156 | 157 | 158 | 159 | com.vesoft 160 | nebula-spark-connector 161 | 2.5-SNAPSHOT 162 | 163 | 164 | 165 | -------------------------------------------------------------------------------- /example/src/main/resources/data.csv: -------------------------------------------------------------------------------- 1 | id,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13 2 | 1,Tom,tom,10,20,30,40,2021-01-27,2021-01-01T12:10:10,43535232,true,1.0,2.0,10:10:10 3 | 2,Jina,Jina,11,21,31,41,2021-01-28,2021-01-02T12:10:10,43535232,false,1.1,2.1,11:10:10 4 | 3,Tim,Tim,12,22,32,42,2021-01-29,2021-01-03T12:10:10,43535232,false,1.2,2.2,12:10:10 5 | 4,张三,张三,13,23,33,43,2021-01-30,2021-01-04T12:10:10,43535232,true,1.3,2.3,13:10:10 6 | 5,李四,李四,14,24,34,44,2021-02-01,2021-01-05T12:10:10,43535232,false,1.4,2.4,14:10:10 7 | 6,王五,王五,15,25,35,45,2021-02-02,2021-01-06T12:10:10,0,false,1.5,2.5,15:10:10 8 | 7,Jina,Jina,16,26,36,46,2021-02-03,2021-01-07T12:10:10,43535232,true,1.6,2.6,16:10:10 9 | 8,Jina,Jina,17,27,37,47,2021-02-04,2021-01-08T12:10:10,43535232,false,1.7,2.7,17:10:10 10 | 9,Jina,Jina,18,28,38,48,2021-02-05,2021-01-09T12:10:10,43535232,true,1.8,2.8,18:10:10 11 | 10,Jina,Jina,19,29,39,49,2021-02-06,2021-01-10T12:10:10,43535232,false,1.9,2.9,19:10:10 12 | -1,Jina,Jina,20,30,40,50,2021-02-07,2021-02-11T12:10:10,43535232,false,2.0,3.0,20:10:10 13 | -2,Jina,Jina,21,31,41,51,2021-02-08,2021-03-12T12:10:10,43535232,false,2.1,3.1,21:10:10 14 | -3,Jina,Jina,22,32,42,52,2021-02-09,2021-04-13T12:10:10,43535232,false,2.2,3.2,22:10:10 15 | -------------------------------------------------------------------------------- /example/src/main/resources/edge: -------------------------------------------------------------------------------- 1 | {"src":12345,"dst":23456,"degree":34, "descr": "aaa","timep": "2020-01-01"} 2 | {"src":11111,"dst":22222,"degree":33, "descr": "aaa","timep": "2020-01-01"} 3 | {"src":11111,"dst":33333,"degree":32, "descr": "a\baa","timep": "2020-01-01"} 4 | {"src":11111,"dst":44444,"degree":31, "descr": "aaa","timep": "2020-01-01"} 5 | {"src":22222,"dst":55555,"degree":30, "descr": "a\naa","timep": "2020-01-01"} 6 | {"src":33333,"dst":44444,"degree":29, "descr": "aaa","timep": "2020-01-01"} 7 | {"src":33333,"dst":55555,"degree":28, "descr": "aa\ta","timep": "2020-01-01"} 8 | {"src":44444,"dst":22222,"degree":27, "descr": "aaa","timep": "2020-01-01"} 9 | {"src":44444,"dst":55555,"degree":26, "descr": "aaa","timep": "2020-01-01"} 10 | {"src":22222,"dst":66666,"degree":25, "descr": "aaa","timep": "2020-01-01"} -------------------------------------------------------------------------------- /example/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Global logging configuration 2 | log4j.rootLogger=INFO, stdout 3 | # Console output... 
4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.stdout.layout.ConversionPattern=%5p [%t] - %m%n 7 | -------------------------------------------------------------------------------- /example/src/main/resources/vertex: -------------------------------------------------------------------------------- 1 | {"id":12,"name":"Tom","age":20,"born": "2000-01-01"} 2 | {"id":13,"name":"Bob","age":21,"born": "1999-01-02"} 3 | {"id":14,"name":"Jane","age":22,"born": "1998-01-03"} 4 | {"id":15,"name":"Jena","age":23,"born": "1997-01-04"} 5 | {"id":16,"name":"Nic","age":24,"born": "1996-01-05"} 6 | {"id":17,"name":"Mei","age":25,"born": "1995-01-06"} 7 | {"id":18,"name":"HH","age":26,"born": "1994-01-07"} 8 | {"id":19,"name":"Tyler","age":27,"born": "1993-01-08"} 9 | {"id":20,"name":"Ber","age":28,"born": "1992-01-09"} 10 | {"id":21,"name":"Mercy","age":29,"born": "1991-01-10"} -------------------------------------------------------------------------------- /example/src/main/scala/com/vesoft/nebula/examples/connector/NebulaSparkReaderExample.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.examples.connector 8 | 9 | import com.facebook.thrift.protocol.TCompactProtocol 10 | import com.vesoft.nebula.connector.connector.NebulaDataFrameReader 11 | import com.vesoft.nebula.connector.{NebulaConnectionConfig, ReadNebulaConfig} 12 | import org.apache.spark.SparkConf 13 | import org.apache.spark.sql.SparkSession 14 | import org.slf4j.LoggerFactory 15 | 16 | object NebulaSparkReaderExample { 17 | 18 | private val LOG = LoggerFactory.getLogger(this.getClass) 19 | 20 | def main(args: Array[String]): Unit = { 21 | 22 | val sparkConf = new SparkConf 23 | sparkConf 24 | .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 25 | .registerKryoClasses(Array[Class[_]](classOf[TCompactProtocol])) 26 | val spark = SparkSession 27 | .builder() 28 | .master("local") 29 | .config(sparkConf) 30 | .getOrCreate() 31 | 32 | readVertex(spark) 33 | readEdges(spark) 34 | readVertexGraph(spark) 35 | readEdgeGraph(spark) 36 | 37 | spark.close() 38 | sys.exit() 39 | } 40 | 41 | def readVertex(spark: SparkSession): Unit = { 42 | LOG.info("start to read nebula vertices") 43 | val config = 44 | NebulaConnectionConfig 45 | .builder() 46 | .withMetaAddress("127.0.0.1:9559") 47 | .withConenctionRetry(2) 48 | .build() 49 | val nebulaReadVertexConfig: ReadNebulaConfig = ReadNebulaConfig 50 | .builder() 51 | .withSpace("test") 52 | .withLabel("person") 53 | .withNoColumn(false) 54 | .withReturnCols(List("birthday")) 55 | .withLimit(10) 56 | .withPartitionNum(10) 57 | .build() 58 | val vertex = spark.read.nebula(config, nebulaReadVertexConfig).loadVerticesToDF() 59 | vertex.printSchema() 60 | vertex.show(20) 61 | println("vertex count: " + vertex.count()) 62 | } 63 | 64 | def readEdges(spark: SparkSession): Unit = { 65 | LOG.info("start to read nebula edges") 66 | 67 | val config = 68 | NebulaConnectionConfig 69 | .builder() 70 | .withMetaAddress("127.0.0.1:9559") 71 | .withTimeout(6000) 72 | .withConenctionRetry(2) 73 | .build() 74 | val nebulaReadEdgeConfig: ReadNebulaConfig = ReadNebulaConfig 75 | .builder() 76 | .withSpace("test") 77 | 
.withLabel("knows") 78 | .withNoColumn(false) 79 | .withReturnCols(List("degree")) 80 | .withLimit(10) 81 | .withPartitionNum(10) 82 | .build() 83 | val edge = spark.read.nebula(config, nebulaReadEdgeConfig).loadEdgesToDF() 84 | edge.printSchema() 85 | edge.show(20) 86 | println("edge count: " + edge.count()) 87 | } 88 | 89 | def readVertexGraph(spark: SparkSession): Unit = { 90 | LOG.info("start to read graphx vertex") 91 | val config = 92 | NebulaConnectionConfig 93 | .builder() 94 | .withMetaAddress("127.0.0.1:9559") 95 | .withTimeout(6000) 96 | .withConenctionRetry(2) 97 | .build() 98 | val nebulaReadVertexConfig: ReadNebulaConfig = ReadNebulaConfig 99 | .builder() 100 | .withSpace("test") 101 | .withLabel("person") 102 | .withNoColumn(false) 103 | .withReturnCols(List("birthday")) 104 | .withLimit(10) 105 | .withPartitionNum(10) 106 | .build() 107 | 108 | val vertexRDD = spark.read.nebula(config, nebulaReadVertexConfig).loadVerticesToGraphx() 109 | LOG.info("vertex rdd first record: " + vertexRDD.first()) 110 | LOG.info("vertex rdd count: {}", vertexRDD.count()) 111 | } 112 | 113 | def readEdgeGraph(spark: SparkSession): Unit = { 114 | LOG.info("start to read graphx edge") 115 | val config = 116 | NebulaConnectionConfig 117 | .builder() 118 | .withMetaAddress("127.0.0.1:9559") 119 | .withTimeout(6000) 120 | .withConenctionRetry(2) 121 | .build() 122 | val nebulaReadEdgeConfig: ReadNebulaConfig = ReadNebulaConfig 123 | .builder() 124 | .withSpace("test") 125 | .withLabel("knows") 126 | .withNoColumn(false) 127 | .withReturnCols(List("timep")) 128 | .withLimit(10) 129 | .withPartitionNum(10) 130 | .build() 131 | val edgeRDD = spark.read.nebula(config, nebulaReadEdgeConfig).loadEdgesToGraphx() 132 | LOG.info("edge rdd first record:" + edgeRDD.first()) 133 | LOG.info("edge rdd count: {}", edgeRDD.count()) 134 | } 135 | 136 | } 137 | -------------------------------------------------------------------------------- /nebula-algorithm/README-CN.md: -------------------------------------------------------------------------------- 1 | # 欢迎使用 Nebula Algorithm 2 | 3 | nebula-algorithm 是一款基于 [GraphX](https://spark.apache.org/graphx/) 的 Spark 应用程序,提供了以下图计算算法: 4 | 5 | 6 | | 算法名 |中文说明|应用场景| 7 | |:------------------------:|:-----------:|:----:| 8 | | PageRank | 页面排序 | 网页排序、重点节点挖掘| 9 | | Louvain | 社区发现 | 社团挖掘、层次化聚类| 10 | | KCore | K核 |社区发现、金融风控| 11 | | LabelPropagation | 标签传播 |资讯传播、广告推荐、社区发现| 12 | | ConnectedComponent | 联通分量 |社区发现、孤岛发现| 13 | |StronglyConnectedComponent| 强联通分量 |社区发现| 14 | | ShortestPath | 最短路径 |路径规划、网络规划| 15 | | TriangleCount | 三角形计数 |网络结构分析| 16 | | GraphTriangleCount |全图三角形计数|网络紧密性分析| 17 | | BetweennessCentrality | 介数中心性 |关键节点挖掘,节点影响力计算| 18 | | DegreeStatic | 度统计 |图结构分析| 19 | 20 | 使用 `nebula-algorithm`,可以通过提交 `Spark` 任务的形式使用完整的算法工具对 `Nebula Graph` 数据库中的数据执行图计算,也可以通过编程形式调用`lib`库下的算法针对DataFrame执行图计算。 21 | 22 | ## 如何获取 23 | 1. 编译打包 Nebula Algorithm 24 | ``` 25 | $ git clone https://github.com/vesoft-inc/nebula-spark-utils.git 26 | $ cd nebula-algorithm 27 | $ mvn clean package -Dgpg.skip -Dmaven.javadoc.skip=true -Dmaven.test.skip=true 28 | ``` 29 | 编译完成后,在 `nebula-algorithm/target` 目录下会生成 `nebula-algorithm-2.0.0.jar` 。 30 | 31 | 2. 
在 Maven 远程仓库下载 32 | https://repo1.maven.org/maven2/com/vesoft/nebula-algorithm/2.0.0/ 33 | 34 | # 使用 Nebula Algorithm 35 | 36 | 使用限制:Nebula Algorithm 未自动对字符串id进行编码,因此执行图算法时,边的源点和目标点必须是整数(Nebula Space 的 vid_type可以是String类型,但数据必须是整数)。 37 | 38 | * 使用方法1:直接提交 nebula-algorithm 算法包 39 | 40 | * 设置配置文件 41 | 42 | 关于配置项的具体说明参考[示例配置](https://github.com/vesoft-inc/nebula-spark-utils/blob/master/nebula-algorithm/src/main/resources/application.conf) 43 | 44 | * 提交算法任务 45 | 46 | ``` 47 | ${SPARK_HOME}/bin/spark-submit --master --class com.vesoft.nebula.algorithm.Main nebula-algorithm-2.0.0.jar -p application.conf 48 | ``` 49 | * 使用方法2:调用 nebula-algorithm 算法接口 50 | 51 | 在`nebula-algorithm`的`lib`库中提供了10中常用图计算算法,可通过编程调用的形式调用算法。 52 | * 在pom.xml中添加依赖 53 | ``` 54 | 55 | com.vesoft 56 | nebula-algorithm 57 | 2.0.0 58 | 59 | ``` 60 | * 定义算法参数调用算法(以`PageRank`为例) 61 | ``` 62 | val prConfig = new PRConfig(5, 1.0) 63 | val louvainResult = PageRankAlgo.apply(spark, data, prConfig, false) 64 | ``` 65 | 66 | 其他算法的调用方法见[测试示例](https://github.com/vesoft-inc/nebula-spark-utils/tree/master/nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/lib) 。 67 | 68 | > 注:执行算法的DataFrame默认第一列是源点,第二列是目标点,第三列是边权重。 69 | 70 | ## 贡献 71 | 72 | Nebula Algorithm 是一个完全开源的项目,欢迎开源爱好者通过以下方式参与: 73 | 74 | - 前往 [Nebula Graph 论坛](https://discuss.nebula-graph.com.cn/ "点击前往“Nebula Graph 论坛") 上参与 Issue 讨论,如答疑、提供想法或者报告无法解决的问题 75 | - 撰写或改进文档 76 | - 提交优化代码 77 | -------------------------------------------------------------------------------- /nebula-algorithm/README.md: -------------------------------------------------------------------------------- 1 | # Welcome to Nebula Algorithm 2 | 3 |

4 | English | 中文 5 |
6 | 7 | nebula-algorithm is a Spark Application based on [GraphX](https://spark.apache.org/graphx/) with the following Algorithm provided for now: 8 | 9 | 10 | | Name |Use Case| 11 | |:------------------------:|:---------------:| 12 | | PageRank | page ranking, important node digging| 13 | | Louvain | community digging, hierarchical clustering| 14 | | KCore | community detection, financial risk control| 15 | | LabelPropagation | community detection, consultation propagation, advertising recommendation| 16 | | ConnectedComponent | community detection, isolated island detection| 17 | |StronglyConnectedComponent| community detection| 18 | | ShortestPath | path plan, network plan| 19 | | TriangleCount | network structure analysis| 20 | | GraphTriangleCount | network structure and tightness analysis| 21 | | BetweennessCentrality | important node digging, node influence calculation| 22 | | DegreeStatic | graph structure analysis| 23 | 24 | 25 | You could submit the entire spark application or invoke algorithms in `lib` library to apply graph algorithms for DataFrame. 26 | 27 | ## Get Nebula Algorithm 28 | 1. Build Nebula Algorithm 29 | ``` 30 | $ git clone https://github.com/vesoft-inc/nebula-spark-utils.git 31 | $ cd nebula-algorithm 32 | $ mvn clean package -Dgpg.skip -Dmaven.javadoc.skip=true -Dmaven.test.skip=true 33 | ``` 34 | After the above buiding process, the target file `nebula-algorithm-2.0.0.jar` will be placed under `nebula-algorithm/target`. 35 | 36 | 2. Download from Maven repo 37 | 38 | Alternatively, it could be downloaded from the following Maven repo: 39 | 40 | https://repo1.maven.org/maven2/com/vesoft/nebula-algorithm/2.0.0/ 41 | 42 | ## Use Nebula Algorithm 43 | 44 | Limitation: Due to Nebula Algorithm will not encode string id, thus during the algorithm execution, the source and target of edges must be in Type Int (The `vid_type` in Nebula Space could be String, while data must be in Type Int). 45 | 46 | * Option 1: Submit nebula-algorithm package 47 | 48 | * Configuration 49 | 50 | Refer to the [configuration example](https://github.com/vesoft-inc/nebula-spark-utils/blob/master/nebula-algorithm/src/main/resources/application.conf). 51 | 52 | * Submit Spark Application 53 | 54 | ``` 55 | ${SPARK_HOME}/bin/spark-submit --master --class com.vesoft.nebula.algorithm.Main nebula-algorithm-2.0.0.jar -p application.conf 56 | ``` 57 | 58 | * Option2: Call nebula-algorithm interface 59 | 60 | Now there are 10 algorithms provided in `lib` from `nebula-algorithm`, which could be invoked in a programming fashion as below: 61 | 62 | * Add dependencies in `pom.xml`. 63 | ``` 64 | 65 | com.vesoft 66 | nebula-algorithm 67 | 2.0.0 68 | 69 | ``` 70 | * Instantiate algorithm's config, below is an example for `PageRank`. 71 | ``` 72 | val prConfig = new PRConfig(5, 1.0) 73 | val louvainResult = PageRankAlgo.apply(spark, data, prConfig, false) 74 | ``` 75 | 76 | For other algorithms, please refer to [test cases](https://github.com/vesoft-inc/nebula-spark-utils/tree/master/nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/lib). 77 | 78 | > Note: The first column of DataFrame in the application represents the source vertices, the second represents the target vertices and the third represents edges' weight. 79 | 80 | ## Contribute 81 | 82 | Nebula Algorithm is open source, you are more than welcomed to contribute in the following ways: 83 | 84 | - Discuss in the community via [the forum](https://discuss.nebula-graph.io/) or raise issues here. 85 | - Compose or improve our documents. 
86 | - Pull Request to help improve the code itself here. 87 | -------------------------------------------------------------------------------- /nebula-algorithm/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | { 2 | # Spark relation config 3 | spark: { 4 | app: { 5 | name: LPA 6 | # spark.app.partitionNum 7 | partitionNum:100 8 | } 9 | master:local 10 | } 11 | 12 | data: { 13 | # data source. optional of nebula,csv,json 14 | source: csv 15 | # data sink, means the algorithm result will be write into this sink. optional of nebula,csv,text 16 | sink: csv 17 | # if your algorithm needs weight 18 | hasWeight: false 19 | } 20 | 21 | # Nebula Graph relation config 22 | nebula: { 23 | # algo's data source from Nebula. If data.source is nebula, then this nebula.read config can be valid. 24 | read: { 25 | # Nebula metad server address, multiple addresses are split by English comma 26 | metaAddress: "127.0.0.1:9559" 27 | # Nebula space 28 | space: nb 29 | # Nebula edge types, multiple labels means that data from multiple edges will union together 30 | labels: ["serve"] 31 | # Nebula edge property name for each edge type, this property will be as weight col for algorithm. 32 | # Make sure the weightCols are corresponding to labels. 33 | weightCols: ["start_year"] 34 | } 35 | 36 | # algo result sink into Nebula. If data.sink is nebula, then this nebula.write config can be valid. 37 | write:{ 38 | # Nebula graphd server address, multiple addresses are split by English comma 39 | graphAddress: "127.0.0.1:9669" 40 | # Nebula metad server address, multiple addresses are split by English comma 41 | metaAddress: "127.0.0.1:9559,127.0.0.1:9560" 42 | user:root 43 | pswd:nebula 44 | # Nebula space name 45 | space:nb 46 | # Nebula tag name, the algorithm result will be write into this tag 47 | tag:pagerank 48 | } 49 | } 50 | 51 | local: { 52 | # algo's data source from Nebula. If data.source is csv or json, then this local.read can be valid. 53 | read:{ 54 | filePath: "file:///tmp/algo_edge.csv" 55 | # srcId column 56 | srcId:"_c0" 57 | # dstId column 58 | dstId:"_c1" 59 | # weight column 60 | #weight: "col3" 61 | # if csv file has header 62 | header: false 63 | # csv file's delimiter 64 | delimiter:"," 65 | } 66 | 67 | # algo result sink into local file. If data.sink is csv or text, then this local.write can be valid. 68 | write:{ 69 | resultPath:/tmp/count 70 | } 71 | } 72 | 73 | 74 | algorithm: { 75 | # the algorithm that you are going to execute,pick one from [pagerank, louvain, connectedcomponent, 76 | # labelpropagation, shortestpaths, degreestatic, kcore, stronglyconnectedcomponent, trianglecount, 77 | # betweenness, graphtriangleCount] 78 | executeAlgo: graphtrianglecount 79 | 80 | # PageRank parameter 81 | pagerank: { 82 | maxIter: 10 83 | resetProb: 0.15 # default 0.15 84 | } 85 | 86 | # Louvain parameter 87 | louvain: { 88 | maxIter: 20 89 | internalIter: 10 90 | tol: 0.5 91 | } 92 | 93 | # connected component parameter. 94 | connectedcomponent: { 95 | maxIter: 20 96 | } 97 | 98 | # LabelPropagation parameter 99 | labelpropagation: { 100 | maxIter: 20 101 | } 102 | 103 | # ShortestPaths parameter 104 | shortestpaths: { 105 | # several vertices to compute the shortest path to all vertices. 
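# Several landmarks can be given in one comma-separated string, e.g. landmarks: "1,2,3"
# (illustrative ids); ShortestPathConfig splits this value on "," and parses each id as a Long vertex id.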
106 | landmarks: "1" 107 | } 108 | 109 | # Vertex degree statistics parameter 110 | degreestatic: {} 111 | 112 | # KCore parameter 113 | kcore:{ 114 | maxIter:10 115 | degree:1 116 | } 117 | 118 | # Trianglecount parameter 119 | trianglecount:{} 120 | 121 | # graphTriangleCount parameter 122 | graphtrianglecount:{} 123 | 124 | # Betweenness centrality parameter 125 | betweenness:{ 126 | maxIter:5 127 | } 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /nebula-algorithm/src/main/resources/edge: -------------------------------------------------------------------------------- 1 | {"src":12345,"dst":23456,"degree":34, "descr": "aaa","timep": "2020-01-01"} 2 | {"src":11111,"dst":22222,"degree":33, "descr": "aaa","timep": "2020-01-01"} 3 | {"src":11111,"dst":33333,"degree":32, "descr": "a\baa","timep": "2020-01-01"} 4 | {"src":11111,"dst":44444,"degree":31, "descr": "aaa","timep": "2020-01-01"} 5 | {"src":22222,"dst":55555,"degree":30, "descr": "a\naa","timep": "2020-01-01"} 6 | {"src":33333,"dst":44444,"degree":29, "descr": "aaa","timep": "2020-01-01"} 7 | {"src":33333,"dst":55555,"degree":28, "descr": "aa\ta","timep": "2020-01-01"} 8 | {"src":44444,"dst":22222,"degree":27, "descr": "aaa","timep": "2020-01-01"} 9 | {"src":44444,"dst":55555,"degree":26, "descr": "aaa","timep": "2020-01-01"} 10 | {"src":22222,"dst":66666,"degree":25, "descr": "aaa","timep": "2020-01-01"} -------------------------------------------------------------------------------- /nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/Main.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.algorithm 8 | 9 | import com.vesoft.nebula.algorithm.config.Configs.Argument 10 | import com.vesoft.nebula.algorithm.config.{ 11 | AlgoConfig, 12 | AlgoConstants, 13 | BetweennessConfig, 14 | CcConfig, 15 | Configs, 16 | KCoreConfig, 17 | LPAConfig, 18 | LouvainConfig, 19 | PRConfig, 20 | ShortestPathConfig, 21 | SparkConfig 22 | } 23 | import com.vesoft.nebula.algorithm.lib.{ 24 | BetweennessCentralityAlgo, 25 | ConnectedComponentsAlgo, 26 | DegreeStaticAlgo, 27 | GraphTriangleCountAlgo, 28 | KCoreAlgo, 29 | LabelPropagationAlgo, 30 | LouvainAlgo, 31 | PageRankAlgo, 32 | ShortestPathAlgo, 33 | StronglyConnectedComponentsAlgo, 34 | TriangleCountAlgo 35 | } 36 | import com.vesoft.nebula.algorithm.reader.{CsvReader, JsonReader, NebulaReader} 37 | import com.vesoft.nebula.algorithm.writer.{CsvWriter, NebulaWriter, TextWriter} 38 | import org.apache.commons.math3.ode.UnknownParameterException 39 | import org.apache.log4j.Logger 40 | import org.apache.spark.sql.types.{LongType, StructField, StructType} 41 | import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} 42 | 43 | /** 44 | * This object is the entry of all graph algorithms. 45 | * 46 | * How to use this tool to run algorithm: 47 | * 1. Configure application.conf file. 48 | * 2. Make sure your environment has installed spark and started spark service. 49 | * 3. 
Submit nebula algorithm application using this command: 50 | * spark-submit --class com.vesoft.nebula.tools.algorithm.Main /your-jar-path/nebula-algorithm-1.1.0.jar -p /your-application.conf-path/application.conf 51 | */ 52 | object Main { 53 | 54 | private val LOGGER = Logger.getLogger(this.getClass) 55 | 56 | def main(args: Array[String]): Unit = { 57 | val PROGRAM_NAME = "Nebula graphx" 58 | val options = Configs.parser(args, PROGRAM_NAME) 59 | val p: Argument = options match { 60 | case Some(config) => config 61 | case _ => 62 | LOGGER.error("Argument parse failed") 63 | sys.exit(-1) 64 | } 65 | val configs = Configs.parse(p.config) 66 | LOGGER.info(s"configs = ${configs}") 67 | 68 | val algoName: String = AlgoConfig.getAlgoName(configs) 69 | LOGGER.info(s"algoName= ${algoName}") 70 | 71 | val sparkConfig = SparkConfig.getSpark(configs) 72 | val partitionNum = sparkConfig.partitionNum 73 | 74 | // reader 75 | val dataSet = createDataSource(sparkConfig.spark, configs, partitionNum) 76 | 77 | // algorithm 78 | val algoResult = executeAlgorithm(sparkConfig.spark, algoName, configs, dataSet) 79 | // writer 80 | saveAlgoResult(algoResult, configs) 81 | 82 | sys.exit(0) 83 | } 84 | 85 | /** 86 | * create data from datasource 87 | * 88 | * @param spark 89 | * @param configs 90 | * @return DataFrame 91 | */ 92 | private[this] def createDataSource(spark: SparkSession, 93 | configs: Configs, 94 | partitionNum: String): DataFrame = { 95 | val dataSource = configs.dataSourceSinkEntry.source 96 | val dataSet: Dataset[Row] = dataSource.toLowerCase match { 97 | case "nebula" => { 98 | val reader = new NebulaReader(spark, configs, partitionNum) 99 | reader.read() 100 | } 101 | case "csv" => { 102 | val reader = new CsvReader(spark, configs, partitionNum) 103 | reader.read() 104 | } 105 | case "json" => { 106 | val reader = new JsonReader(spark, configs, partitionNum) 107 | reader.read() 108 | } 109 | } 110 | dataSet 111 | } 112 | 113 | /** 114 | * execute algorithms 115 | * @param spark 116 | * @param algoName 117 | * @param configs 118 | * @param dataSet 119 | * @return DataFrame 120 | */ 121 | private[this] def executeAlgorithm(spark: SparkSession, 122 | algoName: String, 123 | configs: Configs, 124 | dataSet: DataFrame): DataFrame = { 125 | val hasWeight = configs.dataSourceSinkEntry.hasWeight 126 | val algoResult = { 127 | algoName.toLowerCase match { 128 | case "pagerank" => { 129 | val pageRankConfig = PRConfig.getPRConfig(configs) 130 | PageRankAlgo(spark, dataSet, pageRankConfig, hasWeight) 131 | } 132 | case "louvain" => { 133 | val louvainConfig = LouvainConfig.getLouvainConfig(configs) 134 | LouvainAlgo(spark, dataSet, louvainConfig, hasWeight) 135 | } 136 | case "connectedcomponent" => { 137 | val ccConfig = CcConfig.getCcConfig(configs) 138 | ConnectedComponentsAlgo(spark, dataSet, ccConfig, hasWeight) 139 | } 140 | case "labelpropagation" => { 141 | val lpaConfig = LPAConfig.getLPAConfig(configs) 142 | LabelPropagationAlgo(spark, dataSet, lpaConfig, hasWeight) 143 | } 144 | case "shortestpaths" => { 145 | val spConfig = ShortestPathConfig.getShortestPathConfig(configs) 146 | ShortestPathAlgo(spark, dataSet, spConfig, hasWeight) 147 | } 148 | case "degreestatic" => { 149 | DegreeStaticAlgo(spark, dataSet) 150 | } 151 | case "kcore" => { 152 | val kCoreConfig = KCoreConfig.getKCoreConfig(configs) 153 | KCoreAlgo(spark, dataSet, kCoreConfig) 154 | } 155 | case "stronglyconnectedcomponent" => { 156 | val ccConfig = CcConfig.getCcConfig(configs) 157 | StronglyConnectedComponentsAlgo(spark, 
dataSet, ccConfig, hasWeight) 158 | } 159 | case "betweenness" => { 160 | val betweennessConfig = BetweennessConfig.getBetweennessConfig(configs) 161 | BetweennessCentralityAlgo(spark, dataSet, betweennessConfig, hasWeight) 162 | } 163 | case "trianglecount" => { 164 | TriangleCountAlgo(spark, dataSet) 165 | } 166 | case "graphtrianglecount" => { 167 | GraphTriangleCountAlgo(spark, dataSet) 168 | } 169 | case _ => throw new UnknownParameterException("unknown executeAlgo name.") 170 | } 171 | } 172 | algoResult 173 | } 174 | 175 | private[this] def saveAlgoResult(algoResult: DataFrame, configs: Configs): Unit = { 176 | val dataSink = configs.dataSourceSinkEntry.sink 177 | dataSink.toLowerCase match { 178 | case "nebula" => { 179 | val writer = new NebulaWriter(algoResult, configs) 180 | writer.write() 181 | } 182 | case "csv" => { 183 | val writer = new CsvWriter(algoResult, configs) 184 | writer.write() 185 | } 186 | case "text" => { 187 | val writer = new TextWriter(algoResult, configs) 188 | writer.write() 189 | } 190 | case _ => throw new UnsupportedOperationException("unsupported data sink") 191 | } 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/config/AlgoConfig.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.config 8 | 9 | import org.apache.spark.graphx.VertexId 10 | 11 | case class PRConfig(maxIter: Int, resetProb: Double) 12 | 13 | /** 14 | * pagerank algorithm configuration 15 | */ 16 | object PRConfig { 17 | var maxIter: Int = _ 18 | var resetProb: Double = _ 19 | 20 | def getPRConfig(configs: Configs): PRConfig = { 21 | val prConfig = configs.algorithmConfig.map 22 | 23 | maxIter = prConfig("algorithm.pagerank.maxIter").toInt 24 | resetProb = 25 | if (prConfig.contains("algorithm.pagerank.resetProb")) 26 | prConfig("algorithm.pagerank.resetProb").toDouble 27 | else 0.15 28 | 29 | PRConfig(maxIter, resetProb) 30 | } 31 | } 32 | 33 | case class LPAConfig(maxIter: Int) 34 | 35 | /** 36 | * labelPropagation algorithm configuration 37 | */ 38 | object LPAConfig { 39 | var maxIter: Int = _ 40 | 41 | def getLPAConfig(configs: Configs): LPAConfig = { 42 | val lpaConfig = configs.algorithmConfig.map 43 | 44 | maxIter = lpaConfig("algorithm.labelpropagation.maxIter").toInt 45 | LPAConfig(maxIter) 46 | } 47 | } 48 | 49 | case class CcConfig(maxIter: Int) 50 | 51 | /** 52 | * ConnectedComponect algorithm configuration 53 | */ 54 | object CcConfig { 55 | var maxIter: Int = _ 56 | 57 | def getCcConfig(configs: Configs): CcConfig = { 58 | val ccConfig = configs.algorithmConfig.map 59 | 60 | maxIter = ccConfig("algorithm.connectedcomponent.maxIter").toInt 61 | CcConfig(maxIter) 62 | } 63 | } 64 | 65 | case class ShortestPathConfig(landmarks: Seq[VertexId]) 66 | 67 | /** 68 | * ConnectedComponect algorithm configuration 69 | */ 70 | object ShortestPathConfig { 71 | var landmarks: Seq[Long] = _ 72 | 73 | def getShortestPathConfig(configs: Configs): ShortestPathConfig = { 74 | val spConfig = configs.algorithmConfig.map 75 | 76 | landmarks = spConfig("algorithm.shortestpaths.landmarks").split(",").toSeq.map(_.toLong) 77 | ShortestPathConfig(landmarks) 78 | } 79 | } 80 | 81 | case class 
LouvainConfig(maxIter: Int, internalIter: Int, tol: Double) 82 | 83 | /** 84 | * louvain algorithm configuration 85 | */ 86 | object LouvainConfig { 87 | var maxIter: Int = _ 88 | var internalIter: Int = _ 89 | var tol: Double = _ 90 | 91 | def getLouvainConfig(configs: Configs): LouvainConfig = { 92 | val louvainConfig = configs.algorithmConfig.map 93 | 94 | maxIter = louvainConfig("algorithm.louvain.maxIter").toInt 95 | internalIter = louvainConfig("algorithm.louvain.internalIter").toInt 96 | tol = louvainConfig("algorithm.louvain.tol").toDouble 97 | 98 | LouvainConfig(maxIter, internalIter, tol) 99 | } 100 | } 101 | 102 | /** 103 | * degree static 104 | */ 105 | case class DegreeStaticConfig(degree: Boolean, inDegree: Boolean, outDegree: Boolean) 106 | 107 | object DegreeStaticConfig { 108 | var degree: Boolean = false 109 | var inDegree: Boolean = false 110 | var outDegree: Boolean = false 111 | 112 | def getDegreeStaticConfig(configs: Configs): DegreeStaticConfig = { 113 | val degreeConfig = configs.algorithmConfig.map 114 | degree = ConfigUtil.getOrElseBoolean(degreeConfig, "algorithm.degreestatic.degree", false) 115 | inDegree = ConfigUtil.getOrElseBoolean(degreeConfig, "algorithm.degreestatic.indegree", false) 116 | outDegree = ConfigUtil.getOrElseBoolean(degreeConfig, "algorithm.degreestatic.outdegree", false) 117 | DegreeStaticConfig(degree, inDegree, outDegree) 118 | } 119 | } 120 | 121 | /** 122 | * k-core 123 | */ 124 | case class KCoreConfig(maxIter: Int, degree: Int) 125 | 126 | object KCoreConfig { 127 | var maxIter: Int = _ 128 | var degree: Int = _ 129 | 130 | def getKCoreConfig(configs: Configs): KCoreConfig = { 131 | val kCoreConfig = configs.algorithmConfig.map 132 | maxIter = kCoreConfig("algorithm.kcore.maxIter").toInt 133 | degree = kCoreConfig("algorithm.kcore.degree").toInt 134 | KCoreConfig(maxIter, degree) 135 | } 136 | } 137 | 138 | /** 139 | * Betweenness 140 | */ 141 | case class BetweennessConfig(maxIter: Int) 142 | 143 | object BetweennessConfig { 144 | var maxIter: Int = _ 145 | 146 | def getBetweennessConfig(configs: Configs): BetweennessConfig = { 147 | val betweennessConfig = configs.algorithmConfig.map 148 | maxIter = betweennessConfig("algorithm.betweenness.maxIter").toInt 149 | BetweennessConfig(maxIter) 150 | } 151 | } 152 | 153 | case class AlgoConfig(configs: Configs) 154 | 155 | object AlgoConfig { 156 | def getAlgoName(configs: Configs): String = { 157 | val algoConfig = configs.algorithmConfig.map 158 | algoConfig("algorithm.executeAlgo") 159 | } 160 | } 161 | 162 | object ConfigUtil { 163 | def getOrElseBoolean(config: Map[String, String], key: String, defaultValue: Boolean): Boolean = { 164 | if (config.contains(key)) { 165 | config(key).toBoolean 166 | } else { 167 | defaultValue 168 | } 169 | } 170 | 171 | } 172 | -------------------------------------------------------------------------------- /nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/config/NebulaConfig.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 
5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.config 8 | 9 | object NebulaConfig { 10 | 11 | def getReadNebula(configs: Configs): NebulaReadConfigEntry = { 12 | val nebulaConfigs = configs.nebulaConfig 13 | nebulaConfigs.readConfigEntry 14 | } 15 | 16 | def getWriteNebula(configs: Configs): NebulaWriteConfigEntry = { 17 | val nebulaConfigs = configs.nebulaConfig 18 | nebulaConfigs.writeConfigEntry 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/config/SparkConfig.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.config 8 | 9 | import org.apache.spark.sql.SparkSession 10 | 11 | case class SparkConfig(spark: SparkSession, partitionNum: String) 12 | 13 | object SparkConfig { 14 | 15 | var spark: SparkSession = _ 16 | 17 | var partitionNum: String = _ 18 | 19 | def getSpark(configs: Configs, defaultAppName: String = "algorithm"): SparkConfig = { 20 | val sparkConfigs = configs.sparkConfig.map 21 | val session = SparkSession.builder 22 | .appName(defaultAppName) 23 | .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 24 | 25 | for (key <- sparkConfigs.keySet) { 26 | session.config(key, sparkConfigs(key)) 27 | } 28 | partitionNum = sparkConfigs.getOrElse("spark.app.partitionNum", "0") 29 | SparkConfig(session.getOrCreate(), partitionNum) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/lib/ConnectedComponentsAlgo.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 
5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.lib 8 | 9 | import com.vesoft.nebula.algorithm.utils.NebulaUtil 10 | import com.vesoft.nebula.algorithm.config.{ 11 | AlgoConstants, 12 | CcConfig, 13 | Configs, 14 | LPAConfig, 15 | NebulaConfig, 16 | PRConfig, 17 | SparkConfig 18 | } 19 | import org.apache.log4j.Logger 20 | import org.apache.spark.graphx.{Graph, VertexId, VertexRDD} 21 | import org.apache.spark.rdd.RDD 22 | import com.vesoft.nebula.algorithm.utils.NebulaUtil 23 | import org.apache.spark.graphx.lib.ConnectedComponents 24 | import org.apache.spark.sql.types.{DoubleType, LongType, StructField, StructType} 25 | import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} 26 | 27 | object ConnectedComponentsAlgo { 28 | private val LOGGER = Logger.getLogger(this.getClass) 29 | 30 | val ALGORITHM: String = "ConnectedComponents" 31 | 32 | /** 33 | * run the ConnectedComponents algorithm for nebula graph 34 | */ 35 | def apply(spark: SparkSession, 36 | dataset: Dataset[Row], 37 | ccConfig: CcConfig, 38 | hasWeight: Boolean): DataFrame = { 39 | 40 | val graph: Graph[None.type, Double] = NebulaUtil.loadInitGraph(dataset, hasWeight) 41 | 42 | val ccResultRDD = execute(graph, ccConfig.maxIter) 43 | 44 | val schema = StructType( 45 | List( 46 | StructField(AlgoConstants.ALGO_ID_COL, LongType, nullable = false), 47 | StructField(AlgoConstants.CC_RESULT_COL, LongType, nullable = true) 48 | )) 49 | val algoResult = spark.sqlContext 50 | .createDataFrame(ccResultRDD, schema) 51 | 52 | algoResult 53 | } 54 | 55 | def execute(graph: Graph[None.type, Double], maxIter: Int): RDD[Row] = { 56 | val ccResultRDD: VertexRDD[VertexId] = ConnectedComponents.run(graph, maxIter).vertices 57 | ccResultRDD.map(row => Row(row._1, row._2)) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/lib/DegreeStaticAlgo.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 
5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.lib 8 | 9 | import com.vesoft.nebula.algorithm.config.AlgoConstants 10 | import com.vesoft.nebula.algorithm.utils.NebulaUtil 11 | import org.apache.log4j.Logger 12 | import org.apache.spark.graphx.{Graph, VertexRDD} 13 | import org.apache.spark.rdd.RDD 14 | import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} 15 | import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType} 16 | 17 | object DegreeStaticAlgo { 18 | 19 | private val LOGGER = Logger.getLogger(this.getClass) 20 | 21 | val ALGORITHM: String = "DegreeStatic" 22 | 23 | /** 24 | * run the pagerank algorithm for nebula graph 25 | */ 26 | def apply(spark: SparkSession, dataset: Dataset[Row]): DataFrame = { 27 | 28 | val graph: Graph[None.type, Double] = NebulaUtil.loadInitGraph(dataset, false) 29 | 30 | val degreeResultRDD = execute(graph) 31 | 32 | val schema = StructType( 33 | List( 34 | StructField(AlgoConstants.ALGO_ID_COL, LongType, nullable = false), 35 | StructField(AlgoConstants.DEGREE_RESULT_COL, IntegerType, nullable = true), 36 | StructField(AlgoConstants.INDEGREE_RESULT_COL, IntegerType, nullable = true), 37 | StructField(AlgoConstants.OUTDEGREE_RESULT_COL, IntegerType, nullable = true) 38 | )) 39 | val algoResult = spark.sqlContext 40 | .createDataFrame(degreeResultRDD, schema) 41 | 42 | algoResult 43 | } 44 | 45 | def execute(graph: Graph[None.type, Double]): RDD[Row] = { 46 | val degreeRdd: VertexRDD[Int] = graph.degrees 47 | val inDegreeRdd: VertexRDD[Int] = graph.inDegrees 48 | val outDegreeRdd: VertexRDD[Int] = graph.outDegrees 49 | 50 | val degreeAndInDegree: VertexRDD[(Int, Int)] = 51 | degreeRdd.leftJoin(inDegreeRdd)((id, d, inD) => (d, inD.getOrElse(0))) 52 | 53 | val result = degreeAndInDegree.leftJoin(outDegreeRdd)((id, dAndInD, opt) => 54 | (dAndInD._1, dAndInD._2, opt.getOrElse(0))) 55 | result.map(vertex => Row(vertex._1, vertex._2._1, vertex._2._2, vertex._2._3)) 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/lib/GraphTriangleCountAlgo.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 
5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.lib 8 | 9 | import com.vesoft.nebula.algorithm.config.AlgoConstants 10 | import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} 11 | import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType} 12 | 13 | /** 14 | * compute all graph's triangle count 15 | */ 16 | object GraphTriangleCountAlgo { 17 | 18 | def apply(spark: SparkSession, dataset: Dataset[Row]): DataFrame = { 19 | 20 | val triangleCount = TriangleCountAlgo(spark, dataset) 21 | val count = triangleCount 22 | .select(AlgoConstants.TRIANGLECOUNT_RESULT_COL) 23 | .rdd 24 | .map(value => value.get(0).asInstanceOf[Int]) 25 | .reduce(_ + _) / 3 26 | val list = List(count) 27 | val rdd = spark.sparkContext.parallelize(list).map(row => Row(row)) 28 | 29 | val schema = StructType( 30 | List( 31 | StructField("count", IntegerType, nullable = false) 32 | )) 33 | val algoResult = spark.sqlContext 34 | .createDataFrame(rdd, schema) 35 | 36 | algoResult 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/lib/KCoreAlgo.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.lib 8 | 9 | import com.vesoft.nebula.algorithm.config.{AlgoConstants, KCoreConfig} 10 | import org.apache.log4j.Logger 11 | import org.apache.spark.graphx.Graph 12 | import com.vesoft.nebula.algorithm.utils.NebulaUtil 13 | import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType} 14 | import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} 15 | 16 | object KCoreAlgo { 17 | private val LOGGER = Logger.getLogger(this.getClass) 18 | 19 | val ALGORITHM: String = "LabelPropagation" 20 | 21 | /** 22 | * run the louvain algorithm for nebula graph 23 | */ 24 | def apply(spark: SparkSession, dataset: Dataset[Row], kCoreConfig: KCoreConfig): DataFrame = { 25 | 26 | val graph: Graph[None.type, Double] = NebulaUtil.loadInitGraph(dataset, false) 27 | val kCoreGraph = execute(graph, kCoreConfig.maxIter, kCoreConfig.degree) 28 | 29 | val schema = StructType( 30 | List( 31 | StructField(AlgoConstants.ALGO_ID_COL, LongType, nullable = false), 32 | StructField(AlgoConstants.KCORE_RESULT_COL, IntegerType, nullable = true) 33 | )) 34 | val resultRDD = kCoreGraph.vertices.map(vertex => Row(vertex._1, vertex._2)) 35 | val algoResult = spark.sqlContext.createDataFrame(resultRDD, schema) 36 | algoResult 37 | } 38 | 39 | /** 40 | * extract k-core sub-graph 41 | */ 42 | def execute(graph: Graph[None.type, Double], maxIter: Int, k: Int): Graph[Int, Double] = { 43 | var lastVertexNum: Long = graph.numVertices 44 | var currentVertexNum: Long = -1 45 | var isStable: Boolean = false 46 | var iterNum: Int = 1 47 | 48 | var degreeGraph = graph 49 | .outerJoinVertices(graph.degrees) { (vid, vd, degree) => 50 | degree.getOrElse(0) 51 | } 52 | .cache 53 | var subGraph: Graph[Int, Double] = null 54 | 55 | while (iterNum < maxIter) { 56 | subGraph = degreeGraph.subgraph(vpred = (vid, degree) => degree >= k) 57 | degreeGraph = subGraph 58 | .outerJoinVertices(subGraph.degrees) { (vid, vd, degree) => 59 | degree.getOrElse(0) 60 | } 61 | .cache 62 | 63 | currentVertexNum = degreeGraph.numVertices 64 | 
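// Convergence check: an unchanged vertex count means no vertices were pruned in this
// peeling round, so the remaining sub-graph already satisfies the degree >= k condition.
// isStable only records that fact; the loop itself still runs until maxIter is reached.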
if (currentVertexNum == lastVertexNum) { 65 | isStable = true; 66 | } else { 67 | lastVertexNum = currentVertexNum 68 | } 69 | 70 | iterNum += 1 71 | } 72 | subGraph 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/lib/LabelPropagationAlgo.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.lib 8 | 9 | import com.vesoft.nebula.algorithm.utils.NebulaUtil 10 | import com.vesoft.nebula.algorithm.config.{AlgoConstants, LPAConfig} 11 | import org.apache.log4j.Logger 12 | import org.apache.spark.graphx.{Graph, VertexId, VertexRDD} 13 | import org.apache.spark.rdd.RDD 14 | import com.vesoft.nebula.algorithm.utils.NebulaUtil 15 | import org.apache.spark.graphx.lib.LabelPropagation 16 | import org.apache.spark.sql.types.{LongType, StructField, StructType} 17 | import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} 18 | 19 | object LabelPropagationAlgo { 20 | private val LOGGER = Logger.getLogger(this.getClass) 21 | 22 | val ALGORITHM: String = "LabelPropagation" 23 | 24 | /** 25 | * run the LabelPropagation algorithm for nebula graph 26 | */ 27 | def apply(spark: SparkSession, 28 | dataset: Dataset[Row], 29 | lpaConfig: LPAConfig, 30 | hasWeight: Boolean): DataFrame = { 31 | val graph: Graph[None.type, Double] = NebulaUtil.loadInitGraph(dataset, hasWeight) 32 | 33 | val lpaResultRDD = execute(graph, lpaConfig.maxIter) 34 | 35 | val schema = StructType( 36 | List( 37 | StructField(AlgoConstants.ALGO_ID_COL, LongType, nullable = false), 38 | StructField(AlgoConstants.LPA_RESULT_COL, LongType, nullable = true) 39 | )) 40 | val algoResult = spark.sqlContext 41 | .createDataFrame(lpaResultRDD, schema) 42 | 43 | algoResult 44 | } 45 | 46 | def execute(graph: Graph[None.type, Double], maxIter: Int): RDD[Row] = { 47 | val lpaResultRDD: VertexRDD[VertexId] = LabelPropagation.run(graph, maxIter).vertices 48 | lpaResultRDD.map(row => Row(row._1, row._2)) 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/lib/PageRankAlgo.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 
5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.lib 8 | 9 | import com.vesoft.nebula.algorithm.config.{ 10 | AlgoConstants, 11 | Configs, 12 | NebulaConfig, 13 | PRConfig, 14 | SparkConfig 15 | } 16 | import org.apache.log4j.Logger 17 | import org.apache.spark.graphx.{Graph, VertexRDD} 18 | import org.apache.spark.rdd.RDD 19 | import com.vesoft.nebula.algorithm.utils.NebulaUtil 20 | import org.apache.spark.graphx.lib.PageRank 21 | import org.apache.spark.sql.types.{DoubleType, LongType, StructField, StructType} 22 | import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} 23 | 24 | object PageRankAlgo { 25 | private val LOGGER = Logger.getLogger(this.getClass) 26 | 27 | val ALGORITHM: String = "PageRank" 28 | 29 | /** 30 | * run the pagerank algorithm for nebula graph 31 | */ 32 | def apply(spark: SparkSession, 33 | dataset: Dataset[Row], 34 | pageRankConfig: PRConfig, 35 | hasWeight: Boolean): DataFrame = { 36 | 37 | val graph: Graph[None.type, Double] = NebulaUtil.loadInitGraph(dataset, hasWeight) 38 | 39 | val prResultRDD = execute(graph, pageRankConfig.maxIter, pageRankConfig.resetProb) 40 | 41 | val schema = StructType( 42 | List( 43 | StructField(AlgoConstants.ALGO_ID_COL, LongType, nullable = false), 44 | StructField(AlgoConstants.PAGERANK_RESULT_COL, DoubleType, nullable = true) 45 | )) 46 | val algoResult = spark.sqlContext 47 | .createDataFrame(prResultRDD, schema) 48 | 49 | algoResult 50 | } 51 | 52 | def execute(graph: Graph[None.type, Double], maxIter: Int, resetProb: Double): RDD[Row] = { 53 | val prResultRDD: VertexRDD[Double] = PageRank.run(graph, maxIter, resetProb).vertices 54 | prResultRDD.map(row => Row(row._1, row._2)) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/lib/ShortestPathAlgo.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 
5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.lib 8 | 9 | import com.vesoft.nebula.algorithm.utils.NebulaUtil 10 | import com.vesoft.nebula.algorithm.config.{ 11 | AlgoConstants, 12 | CcConfig, 13 | Configs, 14 | NebulaConfig, 15 | PRConfig, 16 | ShortestPathConfig, 17 | SparkConfig 18 | } 19 | import org.apache.log4j.Logger 20 | import org.apache.spark.graphx.{Graph, VertexId, VertexRDD} 21 | import org.apache.spark.rdd.RDD 22 | import com.vesoft.nebula.algorithm.utils.NebulaUtil 23 | import org.apache.spark.graphx.lib.ShortestPaths 24 | import org.apache.spark.graphx.lib.ShortestPaths.SPMap 25 | import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} 26 | import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} 27 | 28 | object ShortestPathAlgo { 29 | private val LOGGER = Logger.getLogger(this.getClass) 30 | 31 | val ALGORITHM: String = "ShortestPath" 32 | 33 | /** 34 | * run the ShortestPath algorithm for nebula graph 35 | */ 36 | def apply(spark: SparkSession, 37 | dataset: Dataset[Row], 38 | shortestPathConfig: ShortestPathConfig, 39 | hasWeight: Boolean): DataFrame = { 40 | 41 | val graph: Graph[None.type, Double] = NebulaUtil.loadInitGraph(dataset, hasWeight) 42 | 43 | val prResultRDD = execute(graph, shortestPathConfig.landmarks) 44 | 45 | val schema = StructType( 46 | List( 47 | StructField(AlgoConstants.ALGO_ID_COL, LongType, nullable = false), 48 | StructField(AlgoConstants.SHORTPATH_RESULT_COL, StringType, nullable = true) 49 | )) 50 | val algoResult = spark.sqlContext 51 | .createDataFrame(prResultRDD, schema) 52 | 53 | algoResult 54 | } 55 | 56 | def execute(graph: Graph[None.type, Double], landmarks: Seq[VertexId]): RDD[Row] = { 57 | val spResultRDD: VertexRDD[SPMap] = ShortestPaths.run(graph, landmarks).vertices 58 | spResultRDD.map(row => Row(row._1, row._2.toString())) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/lib/StronglyConnectedComponentsAlgo.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 
5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.lib 8 | 9 | import com.vesoft.nebula.algorithm.utils.NebulaUtil 10 | import com.vesoft.nebula.algorithm.config.{AlgoConstants, CcConfig} 11 | import com.vesoft.nebula.algorithm.utils.NebulaUtil 12 | import org.apache.spark.graphx.{Graph, VertexId, VertexRDD} 13 | import org.apache.spark.graphx.lib.{ConnectedComponents, StronglyConnectedComponents} 14 | import org.apache.spark.rdd.RDD 15 | import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} 16 | import org.apache.spark.sql.types.{LongType, StructField, StructType} 17 | 18 | object StronglyConnectedComponentsAlgo { 19 | 20 | val ALGORITHM: String = "StronglyConnectedComponents" 21 | 22 | /** 23 | * run the StronglyConnectedComponents algorithm for nebula graph 24 | */ 25 | def apply(spark: SparkSession, 26 | dataset: Dataset[Row], 27 | ccConfig: CcConfig, 28 | hasWeight: Boolean): DataFrame = { 29 | 30 | val graph: Graph[None.type, Double] = NebulaUtil.loadInitGraph(dataset, hasWeight) 31 | 32 | val ccResultRDD = execute(graph, ccConfig.maxIter) 33 | 34 | val schema = StructType( 35 | List( 36 | StructField(AlgoConstants.ALGO_ID_COL, LongType, nullable = false), 37 | StructField(AlgoConstants.SCC_RESULT_COL, LongType, nullable = true) 38 | )) 39 | val algoResult = spark.sqlContext 40 | .createDataFrame(ccResultRDD, schema) 41 | 42 | algoResult 43 | } 44 | 45 | def execute(graph: Graph[None.type, Double], maxIter: Int): RDD[Row] = { 46 | val ccResultRDD: VertexRDD[VertexId] = StronglyConnectedComponents.run(graph, maxIter).vertices 47 | ccResultRDD.map(row => Row(row._1, row._2)) 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/lib/TriangleCountAlgo.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 
5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.lib 8 | 9 | import com.vesoft.nebula.algorithm.config.AlgoConstants 10 | import com.vesoft.nebula.algorithm.utils.NebulaUtil 11 | import org.apache.log4j.Logger 12 | import org.apache.spark.graphx.{Graph, VertexRDD} 13 | import org.apache.spark.graphx.lib.TriangleCount 14 | import org.apache.spark.rdd.RDD 15 | import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} 16 | import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType} 17 | 18 | object TriangleCountAlgo { 19 | private val LOGGER = Logger.getLogger(this.getClass) 20 | 21 | val ALGORITHM: String = "TriangleCount" 22 | 23 | /** 24 | * run the TriangleCount algorithm for nebula graph 25 | * 26 | * compute each vertex's triangle count 27 | */ 28 | def apply(spark: SparkSession, dataset: Dataset[Row]): DataFrame = { 29 | 30 | val graph: Graph[None.type, Double] = NebulaUtil.loadInitGraph(dataset, false) 31 | 32 | val triangleResultRDD = execute(graph) 33 | 34 | val schema = StructType( 35 | List( 36 | StructField(AlgoConstants.ALGO_ID_COL, LongType, nullable = false), 37 | StructField(AlgoConstants.TRIANGLECOUNT_RESULT_COL, IntegerType, nullable = true) 38 | )) 39 | val algoResult = spark.sqlContext 40 | .createDataFrame(triangleResultRDD, schema) 41 | 42 | algoResult 43 | } 44 | 45 | def execute(graph: Graph[None.type, Double]): RDD[Row] = { 46 | val resultRDD: VertexRDD[Int] = TriangleCount.run(graph).vertices 47 | resultRDD.map(row => Row(row._1, row._2)) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/reader/DataReader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 
5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.reader 8 | 9 | import com.vesoft.nebula.connector.connector.NebulaDataFrameReader 10 | import com.vesoft.nebula.connector.{NebulaConnectionConfig, ReadNebulaConfig} 11 | import com.vesoft.nebula.algorithm.config.Configs 12 | import org.apache.spark.sql.{DataFrame, SparkSession} 13 | 14 | import scala.collection.mutable.ListBuffer 15 | 16 | abstract class DataReader(spark: SparkSession, configs: Configs) { 17 | def read(): DataFrame 18 | } 19 | 20 | class NebulaReader(spark: SparkSession, configs: Configs, partitionNum: String) 21 | extends DataReader(spark, configs) { 22 | override def read(): DataFrame = { 23 | val metaAddress = configs.nebulaConfig.readConfigEntry.address 24 | val space = configs.nebulaConfig.readConfigEntry.space 25 | val labels = configs.nebulaConfig.readConfigEntry.labels 26 | val weights = configs.nebulaConfig.readConfigEntry.weightCols 27 | val partition = partitionNum.toInt 28 | 29 | val config = 30 | NebulaConnectionConfig 31 | .builder() 32 | .withMetaAddress(metaAddress) 33 | .withConenctionRetry(2) 34 | .build() 35 | 36 | val noColumn = weights.isEmpty 37 | 38 | var dataset: DataFrame = null 39 | for (i <- labels.indices) { 40 | val returnCols: ListBuffer[String] = new ListBuffer[String] 41 | if (configs.dataSourceSinkEntry.hasWeight && weights.nonEmpty) { 42 | returnCols.append(weights(i)) 43 | } 44 | val nebulaReadEdgeConfig: ReadNebulaConfig = ReadNebulaConfig 45 | .builder() 46 | .withSpace(space) 47 | .withLabel(labels(i)) 48 | .withNoColumn(noColumn) 49 | .withReturnCols(returnCols.toList) 50 | .withPartitionNum(partition) 51 | .build() 52 | if (dataset == null) { 53 | dataset = spark.read.nebula(config, nebulaReadEdgeConfig).loadEdgesToDF() 54 | } else { 55 | dataset = dataset.union(spark.read.nebula(config, nebulaReadEdgeConfig).loadEdgesToDF()) 56 | } 57 | } 58 | dataset 59 | } 60 | } 61 | 62 | class CsvReader(spark: SparkSession, configs: Configs, partitionNum: String) 63 | extends DataReader(spark, configs) { 64 | override def read(): DataFrame = { 65 | val delimiter = configs.localConfigEntry.delimiter 66 | val header = configs.localConfigEntry.header 67 | val localPath = configs.localConfigEntry.filePath 68 | 69 | val partition = partitionNum.toInt 70 | 71 | val data = 72 | spark.read 73 | .option("header", header) 74 | .option("delimiter", delimiter) 75 | .csv(localPath) 76 | val weight = configs.localConfigEntry.weight 77 | val src = configs.localConfigEntry.srcId 78 | val dst = configs.localConfigEntry.dstId 79 | if (configs.dataSourceSinkEntry.hasWeight && weight != null && !weight.trim.isEmpty) { 80 | data.select(src, dst, weight) 81 | } else { 82 | data.select(src, dst) 83 | } 84 | if (partition != 0) { 85 | data.repartition(partition) 86 | } 87 | data 88 | } 89 | } 90 | 91 | class JsonReader(spark: SparkSession, configs: Configs, partitionNum: String) 92 | extends DataReader(spark, configs) { 93 | override def read(): DataFrame = { 94 | val localPath = configs.localConfigEntry.filePath 95 | val data = spark.read.json(localPath) 96 | val partition = partitionNum.toInt 97 | 98 | val weight = configs.localConfigEntry.weight 99 | val src = configs.localConfigEntry.srcId 100 | val dst = configs.localConfigEntry.dstId 101 | if (configs.dataSourceSinkEntry.hasWeight && weight != null && !weight.trim.isEmpty) { 102 | data.select(src, dst, weight) 103 | } else { 104 | data.select(src, dst) 105 | } 106 | if (partition != 0) { 107 | data.repartition(partition) 108 | } 109 | data 110 | } 111 | } 112 | 
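The three readers above are normally selected by the algorithm's entry point according to `data.source` in `application.conf`. Below is a minimal sketch of that wiring, for illustration only: the `ReaderExample` object name, the config path and the literal partition value are assumptions, while the parsing calls mirror the ones used in `ConfigSuite`.

```scala
import com.vesoft.nebula.algorithm.config.Configs
import com.vesoft.nebula.algorithm.reader.{CsvReader, DataReader, JsonReader, NebulaReader}
import org.apache.spark.sql.{DataFrame, SparkSession}

object ReaderExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").getOrCreate()
    // Parse the algorithm configuration the same way ConfigSuite does (path is illustrative).
    val argument = Configs.parser(Array("-p", "src/test/resources/application.conf"), "ReaderExample").get
    val configs  = Configs.parse(argument.config)
    val partitionNum = "100" // in the real entry point this comes from spark.app.partitionNum

    // Pick a reader based on the configured data source and load the edge DataFrame.
    val reader: DataReader = configs.dataSourceSinkEntry.source match {
      case "nebula" => new NebulaReader(spark, configs, partitionNum)
      case "json"   => new JsonReader(spark, configs, partitionNum)
      case _        => new CsvReader(spark, configs, partitionNum)
    }
    val edges: DataFrame = reader.read() // src and dst columns, plus an optional weight column
    edges.show(10)
  }
}
```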
-------------------------------------------------------------------------------- /nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/utils/NebulaUtil.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.utils 8 | 9 | import org.apache.spark.graphx.{Edge, Graph} 10 | import org.apache.spark.rdd.RDD 11 | import org.apache.spark.sql.{Dataset, Encoder, Row} 12 | import org.slf4j.LoggerFactory 13 | 14 | object NebulaUtil { 15 | private val LOG = LoggerFactory.getLogger(this.getClass) 16 | 17 | /** 18 | * construct original graph 19 | * 20 | * @param hasWeight if the graph has no weight, then edge's weight is default 1.0 21 | * @return Graph 22 | */ 23 | def loadInitGraph(dataSet: Dataset[Row], hasWeight: Boolean): Graph[None.type, Double] = { 24 | implicit val encoder: Encoder[Edge[Double]] = org.apache.spark.sql.Encoders.kryo[Edge[Double]] 25 | val edges: RDD[Edge[Double]] = dataSet 26 | .map(row => { 27 | if (hasWeight) { 28 | Edge(row.get(0).toString.toLong, row.get(1).toString.toLong, row.get(2).toString.toDouble) 29 | } else { 30 | Edge(row.get(0).toString.toLong, row.get(1).toString.toLong, 1.0) 31 | } 32 | })(encoder) 33 | .rdd 34 | 35 | Graph.fromEdges(edges, None) 36 | } 37 | 38 | /** 39 | * Assembly algorithm's result file path 40 | * 41 | * @param path algorithm configuration 42 | * @param algorithmName 43 | * 44 | * @return validate result path 45 | */ 46 | def getResultPath(path: String, algorithmName: String): String = { 47 | var resultFilePath = path 48 | if (!resultFilePath.endsWith("/")) { 49 | resultFilePath = resultFilePath + "/" 50 | } 51 | resultFilePath + algorithmName 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/writer/AlgoWriter.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 
5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.writer 8 | 9 | import com.vesoft.nebula.connector.connector.NebulaDataFrameWriter 10 | import com.vesoft.nebula.connector.{NebulaConnectionConfig, WriteNebulaVertexConfig} 11 | import com.vesoft.nebula.algorithm.config.{AlgoConstants, Configs} 12 | import org.apache.spark.sql.DataFrame 13 | 14 | abstract class AlgoWriter(data: DataFrame, configs: Configs) { 15 | def write(): Unit 16 | } 17 | 18 | class NebulaWriter(data: DataFrame, configs: Configs) extends AlgoWriter(data, configs) { 19 | override def write(): Unit = { 20 | val graphAddress = configs.nebulaConfig.writeConfigEntry.graphAddress 21 | val metaAddress = configs.nebulaConfig.writeConfigEntry.metaAddress 22 | val space = configs.nebulaConfig.writeConfigEntry.space 23 | val tag = configs.nebulaConfig.writeConfigEntry.tag 24 | val user = configs.nebulaConfig.writeConfigEntry.user 25 | val passwd = configs.nebulaConfig.writeConfigEntry.pswd 26 | 27 | val config = 28 | NebulaConnectionConfig 29 | .builder() 30 | .withMetaAddress(metaAddress) 31 | .withGraphAddress(graphAddress) 32 | .withConenctionRetry(2) 33 | .build() 34 | val nebulaWriteVertexConfig: WriteNebulaVertexConfig = WriteNebulaVertexConfig 35 | .builder() 36 | .withSpace(space) 37 | .withTag(tag) 38 | .withVidField(AlgoConstants.ALGO_ID_COL) 39 | .withVidAsProp(false) 40 | .withBatch(1000) 41 | .build() 42 | data.write.nebula(config, nebulaWriteVertexConfig).writeVertices() 43 | } 44 | } 45 | 46 | class CsvWriter(data: DataFrame, configs: Configs) extends AlgoWriter(data, configs) { 47 | override def write(): Unit = { 48 | val resultPath = configs.localConfigEntry.resultPath 49 | data.repartition(1).write.option("header", true).csv(resultPath) 50 | } 51 | } 52 | 53 | class TextWriter(data: DataFrame, configs: Configs) extends AlgoWriter(data, configs) { 54 | override def write(): Unit = { 55 | val resultPath = configs.localConfigEntry.resultPath 56 | data.repartition(1).write.option("header", true).text(resultPath) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /nebula-algorithm/src/test/resources/application.conf: -------------------------------------------------------------------------------- 1 | { 2 | # Spark relation config 3 | spark: { 4 | app: { 5 | name: LPA 6 | # spark.app.partitionNum 7 | partitionNum:100 8 | } 9 | master:local 10 | } 11 | 12 | data: { 13 | # data source. optional of nebula,csv,json,parquet 14 | source: csv 15 | # data sink, means the algorithm result will be write into this sink. optional of nebula,csv,txt 16 | sink: nebula 17 | # if your algorithm needs weight 18 | hasWeight: false 19 | } 20 | 21 | # Nebula Graph relation config 22 | nebula: { 23 | # algo's data source from Nebula 24 | read: { 25 | # Nebula metad server address, multiple addresses are split by English comma 26 | metaAddress: "127.0.0.1:9559" 27 | # Nebula space 28 | space: nb 29 | # Nebula edge types, multiple labels means that data from multiple edges will union together 30 | labels: ["serve"] 31 | # Nebula edge property name for each edge type, this property will be as weight col for algorithm. 32 | # Make sure the weightCols are corresponding to labels. 
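# For example, labels: ["serve", "follow"] would pair with weightCols: ["start_year", "degree"].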
33 | weightCols: ["start_year"] 34 | } 35 | 36 | # algo result sink into Nebula 37 | write:{ 38 | # Nebula graphd server address, multiple addresses are split by English comma 39 | graphAddress: "127.0.0.1:9669" 40 | # Nebula metad server address, multiple addresses are split by English comma 41 | metaAddress: "127.0.0.1:9559,127.0.0.1:9560" 42 | user:root 43 | pswd:nebula 44 | # Nebula space name 45 | space:nb 46 | # Nebula tag name, the algorithm result will be write into this tag 47 | tag:pagerank 48 | } 49 | } 50 | 51 | local: { 52 | # algo's data source from Nebula 53 | read:{ 54 | filePath: "hdfs://127.0.0.1:9000/edge/work_for.csv" 55 | # srcId column 56 | srcId:"_c0" 57 | # dstId column 58 | dstId:"_c1" 59 | # weight column 60 | #weight: "col3" 61 | # if csv file has header 62 | header: false 63 | # csv file's delimiter 64 | delimiter:"," 65 | } 66 | 67 | # algo result sink into local file 68 | write:{ 69 | resultPath:/tmp/ 70 | } 71 | } 72 | 73 | 74 | algorithm: { 75 | # the algorithm that you are going to execute,pick one from [pagerank, louvain, connectedcomponent, 76 | # labelpropagation, shortestpaths, degreestatic, kcore, stronglyconnectedcomponent, trianglecount, 77 | # betweenness] 78 | executeAlgo: pagerank 79 | 80 | # pagerank parameter 81 | pagerank: { 82 | maxIter: 10 83 | resetProb: 0.15 # default 0.15 84 | } 85 | 86 | # louvain parameter 87 | louvain: { 88 | maxIter: 20 89 | internalIter: 10 90 | tol: 0.5 91 | } 92 | 93 | # connected component parameter TODO not implemented yet. 94 | connectedcomponent: { 95 | maxIter: 20 96 | } 97 | 98 | # LabelPropagation 99 | labelpropagation: { 100 | maxIter: 20 101 | } 102 | 103 | # ShortestPaths 104 | shortestpaths: { 105 | # several vertices to compute the shortest path to all vertices. 106 | landmarks: "1" 107 | } 108 | 109 | # vertex degree static 110 | degreestatic: {} 111 | 112 | # kcore 113 | kcore:{ 114 | maxIter:10 115 | degree:1 116 | } 117 | 118 | # trianglecount 119 | trianglecount:{} 120 | 121 | # betweenness centrality 122 | betweenness:{ 123 | maxIter:5 124 | } 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /nebula-algorithm/src/test/resources/edge.csv: -------------------------------------------------------------------------------- 1 | src,dst,weight 2 | 1,1,5.0 3 | 1,2,1.0 4 | 1,3,5.0 5 | 1,4,1.0 6 | 2,1,5.0 7 | 2,2,1.0 8 | 2,3,5.0 9 | 2,4,1.0 10 | 3,1,1.0 11 | 3,2,5.0 12 | 3,3,1.0 13 | 3,4,5.0 14 | 4,1,1.0 15 | 4,2,5.0 16 | 4,3,1.0 17 | 4,4,5.0 -------------------------------------------------------------------------------- /nebula-algorithm/src/test/resources/edge_noWeight.csv: -------------------------------------------------------------------------------- 1 | 1 2 2 | 1 7 3 | 1 4 4 | 2 0 5 | 2 4 6 | 2 5 7 | 2 6 8 | 3 0 9 | 3 7 10 | 4 0 11 | 4 10 12 | 5 7 13 | 5 11 14 | 6 7 15 | 6 11 16 | 8 9 17 | 8 10 18 | 8 11 19 | 9 12 20 | -------------------------------------------------------------------------------- /nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/config/ConfigSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 
5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.config 8 | 9 | import com.vesoft.nebula.algorithm.config.Configs.Argument 10 | import org.junit.Test 11 | 12 | import scala.collection.mutable.ListBuffer 13 | 14 | class ConfigSuite { 15 | 16 | var configs: Configs = _ 17 | 18 | @Test 19 | def getConfigsSuite(): Unit = { 20 | val args: ListBuffer[String] = new ListBuffer[String] 21 | args.append("-p") 22 | args.append("src/test/resources/application.conf") 23 | try { 24 | val options = Configs.parser(args.toArray, "TestProgram") 25 | val p: Argument = options match { 26 | case Some(config) => config 27 | case _ => 28 | assert(false) 29 | sys.exit(-1) 30 | } 31 | configs = Configs.parse(p.config) 32 | } catch { 33 | case e: Exception => { 34 | e.printStackTrace() 35 | assert(false) 36 | } 37 | } 38 | 39 | } 40 | 41 | @Test 42 | def getSparkConfigSuite(): Unit = { 43 | if (configs == null) { 44 | getConfigsSuite() 45 | } 46 | val sparkConfig = configs.sparkConfig 47 | assert(sparkConfig.map.size == 3) 48 | 49 | val spark = SparkConfig.getSpark(configs) 50 | assert(spark.partitionNum.toInt == 100) 51 | } 52 | 53 | @Test 54 | def getSourceSinkConfigSuite(): Unit = { 55 | if (configs == null) { 56 | getConfigsSuite() 57 | } 58 | val dataSourceSinkEntry = configs.dataSourceSinkEntry 59 | assert(dataSourceSinkEntry.source.equals("csv")) 60 | assert(dataSourceSinkEntry.sink.equals("nebula")) 61 | assert(!dataSourceSinkEntry.hasWeight) 62 | } 63 | @Test 64 | def getNebulaConfigSuite(): Unit = { 65 | if (configs == null) { 66 | getConfigsSuite() 67 | } 68 | val nebulaConfigEntry = configs.nebulaConfig 69 | val writeConfig = nebulaConfigEntry.writeConfigEntry 70 | assert(writeConfig.graphAddress.equals("127.0.0.1:9669")) 71 | assert(writeConfig.metaAddress.equals("127.0.0.1:9559,127.0.0.1:9560")) 72 | assert(writeConfig.space.equals("nb")) 73 | assert(writeConfig.tag.equals("pagerank")) 74 | assert(writeConfig.user.equals("root")) 75 | assert(writeConfig.pswd.equals("nebula")) 76 | 77 | val readConfig = nebulaConfigEntry.readConfigEntry 78 | assert(readConfig.address.equals("127.0.0.1:9559")) 79 | assert(readConfig.space.equals("nb")) 80 | assert(readConfig.labels.size == 1) 81 | assert(readConfig.weightCols.size == 1) 82 | } 83 | 84 | @Test 85 | def getLocalConfigSuite(): Unit = { 86 | if (configs == null) { 87 | getConfigsSuite() 88 | } 89 | val localConfigEntry = configs.localConfigEntry 90 | assert(localConfigEntry.filePath.startsWith("hdfs://")) 91 | assert(localConfigEntry.srcId.equals("_c0")) 92 | assert(localConfigEntry.dstId.equals("_c1")) 93 | assert(localConfigEntry.weight == null) 94 | assert(!localConfigEntry.header) 95 | assert(localConfigEntry.delimiter.equals(",")) 96 | assert(localConfigEntry.resultPath.equals("/tmp/")) 97 | } 98 | 99 | @Test 100 | def getAlgoConfigSuite(): Unit = { 101 | if (configs == null) { 102 | getConfigsSuite() 103 | } 104 | val algoConfig = configs.algorithmConfig 105 | val algoName = AlgoConfig.getAlgoName(configs) 106 | assert(algoName.equals("pagerank")) 107 | 108 | val prConfig = PRConfig.getPRConfig(configs) 109 | assert(prConfig.maxIter == 10) 110 | assert(prConfig.resetProb < 0.150000001) 111 | 112 | val louvainConfig = LouvainConfig.getLouvainConfig(configs) 113 | assert(louvainConfig.maxIter == 20) 114 | assert(louvainConfig.internalIter == 10) 115 | assert(louvainConfig.tol < 0.5000001) 116 | 117 | val ccConfig = CcConfig.getCcConfig(configs) 118 | assert(ccConfig.maxIter == 20) 119 | 120 | val lpaConfig = LPAConfig.getLPAConfig(configs) 121 | 
assert(lpaConfig.maxIter == 20) 122 | 123 | val shortestPathConfig = ShortestPathConfig.getShortestPathConfig(configs) 124 | assert(shortestPathConfig.landmarks.size == 1) 125 | 126 | val kcoreConfig = KCoreConfig.getKCoreConfig(configs) 127 | assert(kcoreConfig.maxIter == 10) 128 | assert(kcoreConfig.degree == 1) 129 | 130 | val betweennessConfig = BetweennessConfig.getBetweennessConfig(configs) 131 | assert(betweennessConfig.maxIter == 5) 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/data/MockNebulaData.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.data 8 | 9 | object MockNebulaData {} 10 | -------------------------------------------------------------------------------- /nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/lib/BetweennessAlgoSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.lib 8 | 9 | import com.vesoft.nebula.algorithm.config.BetweennessConfig 10 | import org.apache.spark.sql.SparkSession 11 | import org.junit.Test 12 | 13 | class BetweennessAlgoSuite { 14 | @Test 15 | def betweennessAlgoSuite(): Unit = { 16 | val spark = SparkSession.builder().master("local").getOrCreate() 17 | val data = spark.read.option("header", true).csv("src/test/resources/edge.csv") 18 | val betweennessConfig = new BetweennessConfig(5) 19 | val result = BetweennessCentralityAlgo.apply(spark, data, betweennessConfig, false) 20 | assert(result.count() == 4) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/lib/CcAlgoSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.lib 8 | 9 | import com.vesoft.nebula.algorithm.config.CcConfig 10 | import org.apache.spark.sql.SparkSession 11 | import org.junit.Test 12 | 13 | class CcAlgoSuite { 14 | @Test 15 | def ccAlgoSuite(): Unit = { 16 | val spark = SparkSession.builder().master("local").getOrCreate() 17 | val data = spark.read.option("header", true).csv("src/test/resources/edge.csv") 18 | val ccAlgoConfig = new CcConfig(5) 19 | val result = ConnectedComponentsAlgo.apply(spark, data, ccAlgoConfig, false) 20 | assert(result.count() == 4) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/lib/DegreeStaticAlgoSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 
2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.lib 8 | 9 | import org.apache.spark.sql.SparkSession 10 | import org.junit.Test 11 | 12 | class DegreeStaticAlgoSuite { 13 | @Test 14 | def degreeStaticAlgoSuite(): Unit = { 15 | val spark = SparkSession.builder().master("local").getOrCreate() 16 | val data = spark.read.option("header", true).csv("src/test/resources/edge.csv") 17 | val result = DegreeStaticAlgo.apply(spark, data) 18 | assert(result.count() == 4) 19 | result.foreach(row => { 20 | assert(row.get(1).toString.toInt == 8) 21 | assert(row.get(2).toString.toInt == 4) 22 | assert(row.get(3).toString.toInt == 4) 23 | }) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/lib/KCoreAlgoSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.lib 8 | 9 | import com.vesoft.nebula.algorithm.config.KCoreConfig 10 | import org.apache.spark.sql.SparkSession 11 | import org.junit.Test 12 | 13 | class KCoreAlgoSuite { 14 | @Test 15 | def kcoreSuite(): Unit = { 16 | val spark = SparkSession.builder().master("local").getOrCreate() 17 | val data = spark.read.option("header", true).csv("src/test/resources/edge.csv") 18 | val kcoreConfig = new KCoreConfig(10, 3) 19 | val kcoreResult = KCoreAlgo.apply(spark, data, kcoreConfig) 20 | assert(kcoreResult.count() == 4) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/lib/LabelPropagationAlgoSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.lib 8 | 9 | import com.vesoft.nebula.algorithm.config.LPAConfig 10 | import org.apache.spark.sql.SparkSession 11 | import org.junit.Test 12 | 13 | class LabelPropagationAlgoSuite { 14 | @Test 15 | def lpaAlgoSuite(): Unit = { 16 | val spark = SparkSession.builder().master("local").getOrCreate() 17 | val data = spark.read.option("header", true).csv("src/test/resources/edge.csv") 18 | val lpaConfig = new LPAConfig(5) 19 | val result = LabelPropagationAlgo.apply(spark, data, lpaConfig, false) 20 | assert(result.count() == 4) 21 | result.foreach(row => { 22 | assert(row.get(1).toString.toInt == 1) 23 | }) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/lib/LouvainAlgoSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 
5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.lib 8 | 9 | import com.vesoft.nebula.algorithm.config.{ConfigSuite, Configs, LouvainConfig, SparkConfig} 10 | import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} 11 | import org.junit.Test 12 | 13 | class LouvainAlgoSuite { 14 | @Test 15 | def louvainSuite(): Unit = { 16 | val spark = SparkSession.builder().master("local").getOrCreate() 17 | val data = spark.read.option("header", true).csv("src/test/resources/edge.csv") 18 | val louvainConfig = new LouvainConfig(5, 2, 1.0) 19 | val louvainResult = LouvainAlgo.apply(spark, data, louvainConfig, false) 20 | assert(louvainResult.count() == 4) 21 | 22 | val dataWithoutWith = spark.read 23 | .option("header", false) 24 | .option("delimiter", " ") 25 | .csv("src/test/resources/edge_noWeight.csv") 26 | .select("_c0", "_c1") 27 | val louvainResult1 = LouvainAlgo.apply(spark, dataWithoutWith, louvainConfig, false) 28 | assert(louvainResult1.count() == 13) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/lib/PageRankAlgoSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.lib 8 | 9 | import com.vesoft.nebula.algorithm.config.{Configs, PRConfig, SparkConfig} 10 | import org.apache.spark.sql.{DataFrame, SparkSession} 11 | import org.junit.Test 12 | 13 | class PageRankAlgoSuite { 14 | @Test 15 | def pageRankSuite(): Unit = { 16 | val spark = SparkSession.builder().master("local").getOrCreate() 17 | val data = spark.read.option("header", true).csv("src/test/resources/edge.csv") 18 | val prConfig = new PRConfig(5, 1.0) 19 | val louvainResult = PageRankAlgo.apply(spark, data, prConfig, false) 20 | assert(louvainResult.count() == 4) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/lib/SCCAlgoSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.lib 8 | 9 | import com.vesoft.nebula.algorithm.config.CcConfig 10 | import org.apache.spark.sql.SparkSession 11 | import org.junit.Test 12 | 13 | class SCCAlgoSuite { 14 | @Test 15 | def sccAlgoSuite(): Unit = { 16 | val spark = SparkSession.builder().master("local").getOrCreate() 17 | val data = spark.read.option("header", true).csv("src/test/resources/edge.csv") 18 | val sccConfig = new CcConfig(5) 19 | val sccResult = StronglyConnectedComponentsAlgo.apply(spark, data, sccConfig, true) 20 | assert(sccResult.count() == 4) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/lib/ShortestPathAlgoSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 
2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.lib 8 | 9 | import com.vesoft.nebula.algorithm.config.ShortestPathConfig 10 | import org.apache.spark.sql.SparkSession 11 | import org.junit.Test 12 | 13 | class ShortestPathAlgoSuite { 14 | @Test 15 | def shortestPathAlgoSuite(): Unit = { 16 | val spark = SparkSession.builder().master("local").getOrCreate() 17 | val data = spark.read.option("header", true).csv("src/test/resources/edge.csv") 18 | val shortestPathConfig = new ShortestPathConfig(Seq(1, 2)) 19 | val result = ShortestPathAlgo.apply(spark, data, shortestPathConfig, false) 20 | assert(result.count() == 4) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/lib/TrangleCountSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.lib 8 | 9 | import org.apache.spark.sql.SparkSession 10 | import org.junit.Test 11 | 12 | class TrangleCountSuite { 13 | @Test 14 | def trangleCountSuite(): Unit = { 15 | val spark = SparkSession.builder().master("local").getOrCreate() 16 | val data = spark.read.option("header", true).csv("src/test/resources/edge.csv") 17 | val trangleCountResult = TriangleCountAlgo.apply(spark, data) 18 | assert(trangleCountResult.count() == 4) 19 | assert(trangleCountResult.first().get(1) == 3) 20 | trangleCountResult.foreach(row => { 21 | assert(row.get(1) == 3) 22 | }) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/utils/NebulaUtilSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 
5 | */ 6 | 7 | package com.vesoft.nebula.algorithm.utils 8 | 9 | import org.junit.Test 10 | 11 | class NebulaUtilSuite { 12 | 13 | @Test 14 | def validateWithWeight: Unit = { 15 | val hostPorts: String = "127.0.0.1:9559" 16 | val nameSpace: String = "nb" 17 | val labels: List[String] = List("serve", "follow") 18 | val hasWeight: Boolean = true 19 | val weightCols: List[String] = List("start_year", "degree") 20 | } 21 | 22 | @Test 23 | def validateWithoutWeight: Unit = { 24 | val hostPorts: String = "127.0.0.1:9559" 25 | val nameSpace: String = "nb" 26 | val labels: List[String] = List("serve") 27 | val hasWeight: Boolean = false 28 | val weightCols: List[String] = List() 29 | } 30 | 31 | @Test 32 | def getResultPathWithEnding: Unit = { 33 | val path: String = "/tmp/" 34 | val algorithmName: String = "aaa" 35 | assert(NebulaUtil.getResultPath(path, algorithmName).equals("/tmp/aaa")) 36 | } 37 | 38 | @Test 39 | def getResultPathWithoutEnding: Unit = { 40 | val path: String = "/tmp" 41 | val algorithmName: String = "aaa" 42 | assert(NebulaUtil.getResultPath(path, algorithmName).equals("/tmp/aaa")) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /nebula-exchange/.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 23 | hs_err_pid* 24 | 25 | # build target 26 | target/ 27 | 28 | # IDE 29 | .idea/ 30 | .eclipse/ 31 | *.iml 32 | 33 | spark-importer.ipr 34 | spark-importer.iws 35 | 36 | .DS_Store 37 | -------------------------------------------------------------------------------- /nebula-exchange/README-CN.md: -------------------------------------------------------------------------------- 1 | # 欢迎使用 Nebula Exchange 2.0 2 | [English](https://github.com/vesoft-inc/nebula-spark-utils/blob/master/nebula-exchange/README.md) 3 | 4 | Nebula Exchange 2.0(简称为 Exchange 2.0)是一款 Apache Spark™ 应用,用于在分布式环境中将集群中的数据批量迁移到 Nebula Graph 中,能支持多种不同格式的批式数据和流式数据的迁移。 5 | 6 | Exchange 2.0 仅支持 Nebula Graph 2.x。 7 | 8 | 如果您正在使用 Nebula Graph v1.x,请使用 [Nebula Exchange v1.0](https://github.com/vesoft-inc/nebula-java/tree/v1.0/tools/exchange) ,或参考 Exchange 1.0 的使用文档[《Nebula Exchange 用户手册》](https://docs.nebula-graph.com.cn/nebula-exchange/about-exchange/ex-ug-what-is-exchange/ "点击前往 Nebula Graph 网站")。 9 | 10 | ## 如何获取 11 | 12 | 1. 编译打包最新的 Exchange。 13 | 14 | ```bash 15 | $ git clone https://github.com/vesoft-inc/nebula-spark-utils.git 16 | $ cd nebula-spark-utils/nebula-exchange 17 | $ mvn clean package -Dmaven.test.skip=true -Dgpg.skip -Dmaven.javadoc.skip=true 18 | ``` 19 | 20 | 编译打包完成后,可以在 nebula-spark-utils/nebula-exchange/target/ 目录下看到 nebula-exchange-2.0-SNAPSHOT.jar 文件。 21 | 2. 在 Maven 远程仓库下载 22 | 23 | https://repo1.maven.org/maven2/com/vesoft/nebula-exchange/ 24 | ## 使用说明 25 | 26 | 特性 & 注意事项: 27 | 28 | *1. Nebula Graph 2.0 支持 String 类型和 Integer 类型的点 id 。* 29 | 30 | *2. Exchange 2.0 新增 null、Date、DateTime、Time 类型数据的导入( DateTime 是 UTC 时区,非 Local time)。* 31 | 32 | *3. 
Exchange 2.0 支持 Hive on Spark 以外的 Hive 数据源,需在配置文件中配置 Hive 源,具体配置示例参考 [application.conf](https://github.com/vesoft-inc/nebula-spark-utils/tree/master/nebula-exchange/src/main/resources/application.conf) 中 Hive 的配置。* 33 | 34 | *4. Exchange 2.0 将导入失败的 INSERT 语句进行落盘,存于配置文件的 error/output 路径中。* 35 | 36 | *5. 配置文件参考 [application.conf](https://github.com/vesoft-inc/nebula-spark-utils/tree/master/nebula-exchange/src/main/resources/application.conf )。* 37 | 38 | *6. Exchange 2.0 的导入命令:* 39 | ``` 40 | $SPARK_HOME/bin/spark-submit --class com.vesoft.nebula.exchange.Exchange --master local nebula-exchange-2.0.0.jar -c /path/to/application.conf 41 | ``` 42 | 如果数据源有HIVE,则导入命令最后还需要加 `-h` 表示启用HIVE数据源。 43 | 44 | 注:在Yarn-Cluster模式下提交 Exchange,请使用如下提交命令: 45 | ``` 46 | $SPARK_HOME/bin/spark-submit --class com.vesoft.nebula.exchange.Exchange \ 47 | --master yarn-cluster \ 48 | --files application.conf \ 49 | --conf spark.driver.extraClassPath=./ \ 50 | --conf spark.executor.extraClassPath=./ \ 51 | nebula-exchange-2.0.0.jar \ 52 | -c application.conf 53 | ``` 54 | 55 | 关于 Nebula Exchange 的更多说明,请参考 Exchange 2.0 的[使用手册](https://docs.nebula-graph.com.cn/2.0.1/nebula-exchange/about-exchange/ex-ug-what-is-exchange/) 。 56 | 57 | ## 贡献 58 | 59 | Nebula Exchange 2.0 是一个完全开源的项目,欢迎开源爱好者通过以下方式参与: 60 | 61 | - 前往 [Nebula Graph 论坛](https://discuss.nebula-graph.com.cn/ "点击前往“Nebula Graph 论坛") 上参与 Issue 讨论,如答疑、提供想法或者报告无法解决的问题 62 | - 撰写或改进文档 63 | - 提交优化代码 64 | -------------------------------------------------------------------------------- /nebula-exchange/README.md: -------------------------------------------------------------------------------- 1 | # Nebula Exchange 2.0 2 | [中文版](https://github.com/vesoft-inc/nebula-spark-utils/blob/master/nebula-exchange/README-CN.md) 3 | 4 | Nebula Exchange (Exchange for short) is an Apache Spark application. It is used to migrate cluster data in bulk from Spark to Nebula Graph in a distributed environment. It supports migration of batch data and streaming data in various formats. 5 | 6 | Exchange 2.0 only supports Nebula Graph 2.0 . If you want to import data for Nebula Graph v1.x,please use [Nebula Exchange v1.0](https://github.com/vesoft-inc/nebula-java/tree/v1.0/tools/exchange). 7 | 8 | ## How to get 9 | 10 | 1. Package latest Exchange。 11 | 12 | ```bash 13 | $ git clone https://github.com/vesoft-inc/nebula-spark-utils.git 14 | $ cd nebula-spark-utils/nebula-exchange 15 | $ mvn clean package -Dmaven.test.skip=true -Dgpg.skip -Dmaven.javadoc.skip=true 16 | ``` 17 | 18 | After the packaging, you can see the newly generated nebula-exchange-2.0-SNAPSHOT.jar under the nebula-spark-utils/nebula-exchange/target/ directory. 19 | 2. 
Download from Maven repository 20 | 21 | https://repo1.maven.org/maven2/com/vesoft/nebula-exchange/ 22 | ## How to use 23 | 24 | Import command: 25 | ``` 26 | $SPARK_HOME/bin/spark-submit --class com.vesoft.nebula.exchange.Exchange --master local nebula-exchange-2.0.0.jar -c /path/to/application.conf 27 | ``` 28 | If your data source is Hive, the import command is: 29 | ``` 30 | $SPARK_HOME/bin/spark-submit --class com.vesoft.nebula.exchange.Exchange --master local nebula-exchange-2.0.0.jar -c /path/to/application.conf -h 31 | ``` 32 | 33 | Note: To submit Exchange in Yarn-Cluster mode, please use the following command: 34 | ``` 35 | $SPARK_HOME/bin/spark-submit --class com.vesoft.nebula.exchange.Exchange \ 36 | --master yarn-cluster \ 37 | --files application.conf \ 38 | --conf spark.driver.extraClassPath=./ \ 39 | --conf spark.executor.extraClassPath=./ \ 40 | nebula-exchange-2.0.0.jar \ 41 | -c application.conf 42 | ``` 43 | 44 | For more details about Exchange, please refer to [Exchange 2.0](https://docs.nebula-graph.io/2.0.1/16.eco-tools/1.nebula-exchange/). 45 | 46 | 47 | ## New Features 48 | 49 | 1. Supports importing vertex data with String and Integer type IDs. 50 | 2. Supports importing data of the Null, Date, DateTime, and Time types (DateTime uses UTC, not local time). 51 | 3. Supports importing data from other Hive sources besides Hive on Spark. 52 | 4. Supports recording and retrying the INSERT statement after failures during data import. 53 | 54 | Refer to [application.conf](https://github.com/vesoft-inc/nebula-spark-utils/tree/master/nebula-exchange/src/main/resources/application.conf) as an example to edit the configuration file. -------------------------------------------------------------------------------- /nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/CheckPointHandler.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory.
5 | */ 6 | 7 | package com.vesoft.nebula.exchange 8 | 9 | import com.vesoft.nebula.exchange.config.{SourceCategory} 10 | import com.vesoft.nebula.exchange.utils.HDFSUtils 11 | import com.vesoft.nebula.exchange.config.SchemaConfigEntry 12 | import org.apache.spark.TaskContext 13 | 14 | /** 15 | * CheckPointHandler handle the checkpoint files for Neo4j and Janusgraph 16 | */ 17 | object CheckPointHandler { 18 | 19 | def checkSupportResume(value: SourceCategory.Value): Boolean = { 20 | value match { 21 | case SourceCategory.NEO4J => true 22 | case SourceCategory.JANUS_GRAPH => true 23 | case _ => false 24 | } 25 | } 26 | 27 | def getPathAndOffset(schemaConfig: SchemaConfigEntry, 28 | breakPointCount: Long): Option[(String, Long)] = { 29 | val partitionId = TaskContext.getPartitionId() 30 | if (checkSupportResume(schemaConfig.dataSourceConfigEntry.category) && schemaConfig.checkPointPath.isDefined) { 31 | val path = s"${schemaConfig.checkPointPath.get}/${schemaConfig.name}.${partitionId}" 32 | val offset = breakPointCount + fetchOffset(path) 33 | Some((path, offset)) 34 | } else { 35 | None 36 | } 37 | } 38 | 39 | def fetchOffset(path: String): Long = { 40 | HDFSUtils.getContent(path).toLong 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/ErrorHandler.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.exchange 8 | 9 | import org.apache.hadoop.conf.Configuration 10 | import org.apache.hadoop.fs.{FileSystem, Path} 11 | import org.apache.log4j.Logger 12 | 13 | import scala.collection.mutable.ArrayBuffer 14 | 15 | object ErrorHandler { 16 | @transient 17 | private[this] val LOG = Logger.getLogger(this.getClass) 18 | 19 | /** 20 | * clean all the failed data for error path before reload. 21 | * 22 | * @param path path to clean 23 | */ 24 | def clear(path: String): Unit = { 25 | try { 26 | val fileSystem = FileSystem.get(new Configuration()) 27 | val filesStatus = fileSystem.listStatus(new Path(path)) 28 | for (file <- filesStatus) { 29 | if (!file.getPath.getName.startsWith("reload.")) { 30 | fileSystem.delete(file.getPath, true) 31 | } 32 | } 33 | } catch { 34 | case e: Throwable => { 35 | LOG.error(s"$path cannot be clean, but this error does not affect the import result, " + 36 | s"you can only focus on the reload files.", 37 | e) 38 | } 39 | } 40 | } 41 | 42 | /** 43 | * save the failed execute statement. 
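* Each buffered statement is written to the target file on its own line.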
44 | * 45 | * @param buffer buffer saved failed ngql 46 | * @param path path to write these buffer ngql 47 | */ 48 | def save(buffer: ArrayBuffer[String], path: String): Unit = { 49 | LOG.info(s"create reload path $path") 50 | val fileSystem = FileSystem.get(new Configuration()) 51 | val errors = fileSystem.create(new Path(path)) 52 | 53 | try { 54 | for (error <- buffer) { 55 | errors.writeBytes(error) 56 | errors.writeBytes("\n") 57 | } 58 | } finally { 59 | errors.close() 60 | } 61 | } 62 | 63 | /** 64 | * check if path exists 65 | * 66 | * @param path error path 67 | *@return true if path exists 68 | */ 69 | def existError(path: String): Boolean = { 70 | val fileSystem = FileSystem.get(new Configuration()) 71 | fileSystem.exists(new Path(path)) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/GraphProvider.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.exchange 8 | 9 | import com.google.common.net.HostAndPort 10 | import com.vesoft.nebula.client.graph.NebulaPoolConfig 11 | import com.vesoft.nebula.client.graph.data.{HostAddress, ResultSet} 12 | import com.vesoft.nebula.client.graph.net.{NebulaPool, Session} 13 | import com.vesoft.nebula.exchange.config.UserConfigEntry 14 | import org.apache.log4j.Logger 15 | 16 | import scala.collection.JavaConverters._ 17 | import scala.collection.mutable.ListBuffer 18 | 19 | /** 20 | * GraphProvider for Nebula Graph Service 21 | */ 22 | class GraphProvider(addresses: List[HostAndPort], timeout: Int) 23 | extends AutoCloseable 24 | with Serializable { 25 | private[this] lazy val LOG = Logger.getLogger(this.getClass) 26 | 27 | @transient val nebulaPoolConfig = new NebulaPoolConfig 28 | @transient val pool: NebulaPool = new NebulaPool 29 | val address = new ListBuffer[HostAddress]() 30 | for (addr <- addresses) { 31 | address.append(new HostAddress(addr.getHostText, addr.getPort)) 32 | } 33 | val randAddr = scala.util.Random.shuffle(address) 34 | 35 | nebulaPoolConfig.setTimeout(timeout) 36 | pool.init(randAddr.asJava, nebulaPoolConfig) 37 | 38 | def getGraphClient(userConfigEntry: UserConfigEntry): Session = { 39 | pool.getSession(userConfigEntry.user, userConfigEntry.password, true); 40 | } 41 | 42 | def releaseGraphClient(session: Session): Unit = { 43 | session.release() 44 | } 45 | 46 | override def close(): Unit = { 47 | pool.close() 48 | } 49 | 50 | def switchSpace(session: Session, space: String): ResultSet = { 51 | val switchStatment = s"use $space" 52 | LOG.info(s"switch space $space") 53 | val result = submit(session, switchStatment) 54 | result 55 | } 56 | 57 | def submit(session: Session, statement: String): ResultSet = { 58 | session.execute(statement) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/MetaProvider.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 
5 | */ 6 | 7 | package com.vesoft.nebula.exchange 8 | 9 | import com.google.common.net.HostAndPort 10 | import com.vesoft.nebula.client.graph.data.HostAddress 11 | import com.vesoft.nebula.client.meta.MetaClient 12 | import com.vesoft.nebula.exchange.config.Type 13 | import com.vesoft.nebula.meta.{EdgeItem, PropertyType, TagItem} 14 | import org.apache.log4j.Logger 15 | 16 | import scala.collection.JavaConverters._ 17 | import scala.collection.mutable 18 | import scala.collection.mutable.ListBuffer 19 | 20 | /** 21 | * MetaProvider provide nebula graph meta query operations. 22 | */ 23 | class MetaProvider(addresses: List[HostAndPort], timeout: Int, retry: Int) 24 | extends AutoCloseable 25 | with Serializable { 26 | private[this] lazy val LOG = Logger.getLogger(this.getClass) 27 | 28 | val address: ListBuffer[HostAddress] = new ListBuffer[HostAddress] 29 | for (addr <- addresses) { 30 | address.append(new HostAddress(addr.getHostText, addr.getPort)) 31 | } 32 | 33 | private val metaClient = new MetaClient(address.asJava) 34 | metaClient.connect() 35 | 36 | def getPartNumber(space: String): Int = { 37 | metaClient.getPartsAlloc(space).size() 38 | } 39 | 40 | def getVidType(space: String): VidType.Value = { 41 | val vidType = metaClient.getSpace(space).getProperties.getVid_type.getType 42 | if (vidType == PropertyType.FIXED_STRING) { 43 | return VidType.STRING 44 | } 45 | VidType.INT 46 | } 47 | 48 | def getTagSchema(space: String, tag: String): Map[String, Integer] = { 49 | val tagSchema = metaClient.getTag(space, tag) 50 | val schema = new mutable.HashMap[String, Integer] 51 | 52 | val columns = tagSchema.getColumns 53 | for (colDef <- columns.asScala) { 54 | schema.put(new String(colDef.getName), colDef.getType.getType.getValue) 55 | } 56 | schema.toMap 57 | } 58 | 59 | def getEdgeSchema(space: String, edge: String): Map[String, Integer] = { 60 | val edgeSchema = metaClient.getEdge(space, edge) 61 | val schema = new mutable.HashMap[String, Integer] 62 | 63 | val columns = edgeSchema.getColumns 64 | for (colDef <- columns.asScala) { 65 | schema.put(new String(colDef.getName), colDef.getType.getType.getValue) 66 | } 67 | schema.toMap 68 | } 69 | 70 | def getLabelType(space: String, label: String): Type.Value = { 71 | val tags = metaClient.getTags(space) 72 | for (tag <- tags.asScala) { 73 | if (new String(tag.getTag_name).equals(label)) { 74 | return Type.VERTEX 75 | } 76 | } 77 | val edges = metaClient.getEdges(space) 78 | for (edge <- edges.asScala) { 79 | if (new String(edge.getEdge_name).equals(label)) { 80 | return Type.EDGE 81 | } 82 | } 83 | null 84 | } 85 | 86 | def getSpaceVidLen(space: String): Int = { 87 | val spaceItem = metaClient.getSpace(space); 88 | if (spaceItem == null) { 89 | throw new IllegalArgumentException(s"space $space does not exist.") 90 | } 91 | spaceItem.getProperties.getVid_type.getType_length 92 | } 93 | 94 | def getTagItem(space: String, tag: String): TagItem = { 95 | val tagItemList = metaClient.getTags(space).asScala 96 | for (tagItem: TagItem <- tagItemList) { 97 | if (new String(tagItem.tag_name).equals(tag)) { 98 | return tagItem 99 | } 100 | } 101 | throw new IllegalArgumentException(s"tag ${space}.${tag} does not exist.") 102 | } 103 | 104 | def getEdgeItem(space: String, edge: String): EdgeItem = { 105 | val edgeItemList = metaClient.getEdges(space).asScala 106 | for (edgeItem: EdgeItem <- edgeItemList) { 107 | if (new String(edgeItem.edge_name).equals(edge)) { 108 | return edgeItem 109 | } 110 | } 111 | throw new IllegalArgumentException(s"edge 
${space}.${edge} does not exist.") 112 | } 113 | 114 | override def close(): Unit = { 115 | metaClient.close() 116 | } 117 | 118 | } 119 | 120 | object VidType extends Enumeration { 121 | type Type = Value 122 | 123 | val STRING = Value("STRING") 124 | val INT = Value("INT") 125 | } 126 | -------------------------------------------------------------------------------- /nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/config/SchemaConfigs.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.exchange.config 8 | 9 | import com.vesoft.nebula.exchange.KeyPolicy 10 | 11 | /** 12 | * SchemaConfigEntry is tag/edge super class use to save some basic parameter for importer. 13 | */ 14 | sealed trait SchemaConfigEntry { 15 | 16 | /** nebula tag or edge name */ 17 | def name: String 18 | 19 | /** see{@link DataSourceConfigEntry}*/ 20 | def dataSourceConfigEntry: DataSourceConfigEntry 21 | 22 | /** see{@link DataSinkConfigEntry}*/ 23 | def dataSinkConfigEntry: DataSinkConfigEntry 24 | 25 | /** data source fields which are going to be import to nebula as properties */ 26 | def fields: List[String] 27 | 28 | /** nebula properties which are going to fill value with data source value*/ 29 | def nebulaFields: List[String] 30 | 31 | /** vertex or edge amount of one batch import */ 32 | def batch: Int 33 | 34 | /** spark partition */ 35 | def partition: Int 36 | 37 | /** check point path */ 38 | def checkPointPath: Option[String] 39 | } 40 | 41 | /** 42 | * 43 | * @param name 44 | * @param dataSourceConfigEntry 45 | * @param dataSinkConfigEntry 46 | * @param fields 47 | * @param nebulaFields 48 | * @param vertexField 49 | * @param vertexPolicy 50 | * @param batch 51 | * @param partition 52 | * @param checkPointPath 53 | */ 54 | case class TagConfigEntry(override val name: String, 55 | override val dataSourceConfigEntry: DataSourceConfigEntry, 56 | override val dataSinkConfigEntry: DataSinkConfigEntry, 57 | override val fields: List[String], 58 | override val nebulaFields: List[String], 59 | vertexField: String, 60 | vertexPolicy: Option[KeyPolicy.Value], 61 | override val batch: Int, 62 | override val partition: Int, 63 | override val checkPointPath: Option[String]) 64 | extends SchemaConfigEntry { 65 | require(name.trim.nonEmpty && vertexField.trim.nonEmpty && batch > 0) 66 | 67 | override def toString: String = { 68 | s"Tag name: $name, " + 69 | s"source: $dataSourceConfigEntry, " + 70 | s"sink: $dataSinkConfigEntry, " + 71 | s"vertex field: $vertexField, " + 72 | s"vertex policy: $vertexPolicy, " + 73 | s"batch: $batch, " + 74 | s"partition: $partition." 
75 | } 76 | } 77 | 78 | /** 79 | * 80 | * @param name 81 | * @param dataSourceConfigEntry 82 | * @param dataSinkConfigEntry 83 | * @param fields 84 | * @param nebulaFields 85 | * @param sourceField 86 | * @param sourcePolicy 87 | * @param rankingField 88 | * @param targetField 89 | * @param targetPolicy 90 | * @param isGeo 91 | * @param latitude 92 | * @param longitude 93 | * @param batch 94 | * @param partition 95 | * @param checkPointPath 96 | */ 97 | case class EdgeConfigEntry(override val name: String, 98 | override val dataSourceConfigEntry: DataSourceConfigEntry, 99 | override val dataSinkConfigEntry: DataSinkConfigEntry, 100 | override val fields: List[String], 101 | override val nebulaFields: List[String], 102 | sourceField: String, 103 | sourcePolicy: Option[KeyPolicy.Value], 104 | rankingField: Option[String], 105 | targetField: String, 106 | targetPolicy: Option[KeyPolicy.Value], 107 | isGeo: Boolean, 108 | latitude: Option[String], 109 | longitude: Option[String], 110 | override val batch: Int, 111 | override val partition: Int, 112 | override val checkPointPath: Option[String]) 113 | extends SchemaConfigEntry { 114 | require( 115 | name.trim.nonEmpty && sourceField.trim.nonEmpty && 116 | targetField.trim.nonEmpty && batch > 0) 117 | 118 | override def toString: String = { 119 | if (isGeo) { 120 | s"Edge name: $name, " + 121 | s"source: $dataSourceConfigEntry, " + 122 | s"sink: $dataSinkConfigEntry, " + 123 | s"latitude: $latitude, " + 124 | s"longitude: $longitude, " + 125 | s"source field: $sourceField, " + 126 | s"source policy: $sourcePolicy, " + 127 | s"ranking: $rankingField, " + 128 | s"target field: $targetField, " + 129 | s"target policy: $targetPolicy, " + 130 | s"batch: $batch, " + 131 | s"partition: $partition." 132 | } else { 133 | s"Edge name: $name, " + 134 | s"source: $dataSourceConfigEntry, " + 135 | s"sink: $dataSinkConfigEntry, " + 136 | s"source field: $sourceField, " + 137 | s"source policy: $sourcePolicy, " + 138 | s"ranking: $rankingField, " + 139 | s"target field: $targetField, " + 140 | s"target policy: $targetPolicy, " + 141 | s"batch: $batch, " + 142 | s"partition: $partition." 143 | } 144 | } 145 | } 146 | -------------------------------------------------------------------------------- /nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/config/SinkConfigs.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.exchange.config 8 | 9 | /** 10 | * SinkCategory is used to expression the writer's type. 11 | */ 12 | object SinkCategory extends Enumeration { 13 | type Type = Value 14 | 15 | val CLIENT = Value("CLIENT") 16 | val SST = Value("SST") 17 | } 18 | 19 | class SinkCategory 20 | 21 | /** 22 | * DataSinkConfigEntry 23 | */ 24 | sealed trait DataSinkConfigEntry { 25 | def category: SinkCategory.Value 26 | } 27 | 28 | /** 29 | * FileBaseSinkConfigEntry 30 | */ 31 | case class FileBaseSinkConfigEntry(override val category: SinkCategory.Value, 32 | localPath: String, 33 | remotePath: String, 34 | fsName: Option[String]) 35 | extends DataSinkConfigEntry { 36 | override def toString: String = { 37 | s"File sink: from ${localPath} to ${fsName.get}${remotePath}" 38 | } 39 | } 40 | 41 | /** 42 | * NebulaSinkConfigEntry use to specified the nebula service's address. 
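 * (These addresses are the graphd endpoints used by the CLIENT sink category;
 * the SST sink is configured through FileBaseSinkConfigEntry above.)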
43 | */ 44 | case class NebulaSinkConfigEntry(override val category: SinkCategory.Value, addresses: List[String]) 45 | extends DataSinkConfigEntry { 46 | override def toString: String = { 47 | s"Nebula sink addresses: ${addresses.mkString("[", ", ", "]")}" 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/package.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula 8 | 9 | import com.google.common.base.Optional 10 | import com.google.common.util.concurrent.ListenableFuture 11 | import com.vesoft.nebula.exchange.utils.NebulaUtils 12 | 13 | import scala.collection.mutable.ListBuffer 14 | 15 | package object exchange { 16 | 17 | type GraphSpaceID = Int 18 | type PartitionID = Int 19 | type TagID = Int 20 | type EdgeType = Int 21 | type SchemaID = (TagID, EdgeType) 22 | type TagVersion = Long 23 | type EdgeVersion = Long 24 | type SchemaVersion = (TagVersion, EdgeVersion) 25 | type VertexID = Long 26 | type VertexIDSlice = String 27 | type EdgeRank = Long 28 | type PropertyNames = List[String] 29 | type PropertyValues = List[Any] 30 | type ProcessResult = ListBuffer[WriterResult] 31 | type WriterResult = ListenableFuture[Optional[Integer]] 32 | 33 | case class Vertex(vertexID: VertexIDSlice, values: PropertyValues) { 34 | 35 | def propertyValues = values.mkString(", ") 36 | 37 | override def toString: String = { 38 | s"Vertex ID: ${vertexID}, " + 39 | s"Values: ${values.mkString(", ")}" 40 | } 41 | } 42 | 43 | case class Vertices(names: PropertyNames, 44 | values: List[Vertex], 45 | policy: Option[KeyPolicy.Value] = None) { 46 | 47 | def propertyNames: String = NebulaUtils.escapePropName(names).mkString(",") 48 | 49 | override def toString: String = { 50 | s"Vertices: " + 51 | s"Property Names: ${names.mkString(", ")}" + 52 | s"Vertex Values: ${values.mkString(", ")} " + 53 | s"with policy ${policy}" 54 | } 55 | } 56 | 57 | case class Edge(source: VertexIDSlice, 58 | destination: VertexIDSlice, 59 | ranking: Option[EdgeRank], 60 | values: PropertyValues) { 61 | 62 | def this(source: VertexIDSlice, destination: VertexIDSlice, values: PropertyValues) = { 63 | this(source, destination, None, values) 64 | } 65 | 66 | def propertyValues: String = values.mkString(", ") 67 | 68 | override def toString: String = { 69 | s"Edge: ${source}->${destination}@${ranking} values: ${propertyValues}" 70 | } 71 | } 72 | 73 | case class Edges(names: PropertyNames, 74 | values: List[Edge], 75 | sourcePolicy: Option[KeyPolicy.Value] = None, 76 | targetPolicy: Option[KeyPolicy.Value] = None) { 77 | def propertyNames: String = NebulaUtils.escapePropName(names).mkString(",") 78 | 79 | override def toString: String = { 80 | "Edges:" + 81 | s"Property Names: ${names.mkString(", ")}" + 82 | s"with source policy ${sourcePolicy}" + 83 | s"with target policy ${targetPolicy}" 84 | } 85 | } 86 | 87 | object KeyPolicy extends Enumeration { 88 | type POLICY = Value 89 | val HASH = Value("hash") 90 | val UUID = Value("uuid") 91 | } 92 | 93 | case class Offset(start: Long, size: Long) 94 | } 95 | -------------------------------------------------------------------------------- 
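The package object above defines the in-memory model that Exchange's processors hand to the writers. The short sketch below is not part of the repository; the names and values are illustrative and only show how these case classes compose:

```
// illustrative only: in the real pipeline these values are produced by
// VerticesProcessor / EdgeProcessor from DataFrame rows
import com.vesoft.nebula.exchange.{Edge, Edges, KeyPolicy, Vertex, Vertices}

object DomainModelSketch extends App {
  val vertices = Vertices(
    names  = List("name", "age"),
    values = List(Vertex("1", List("\"Tom\"", 26))),
    policy = None)                       // or Some(KeyPolicy.HASH) to hash string ids

  val edges = Edges(
    names  = List("degree"),
    values = List(Edge("1", "2", Some(0L), List(90))))

  println(vertices.propertyNames)        // `name`,`age`
  println(edges)
}
```

In the actual import flow the processors batch these values and pass them to the writers under nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/writer.
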
/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/processor/ReloadProcessor.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.exchange.processor 8 | 9 | import com.vesoft.nebula.exchange.{ErrorHandler, GraphProvider} 10 | import com.vesoft.nebula.exchange.config.Configs 11 | import com.vesoft.nebula.exchange.writer.NebulaGraphClientWriter 12 | import org.apache.log4j.Logger 13 | import org.apache.spark.TaskContext 14 | import org.apache.spark.sql.{DataFrame, Row} 15 | import org.apache.spark.util.LongAccumulator 16 | 17 | import scala.collection.mutable.ArrayBuffer 18 | 19 | class ReloadProcessor(data: DataFrame, 20 | config: Configs, 21 | batchSuccess: LongAccumulator, 22 | batchFailure: LongAccumulator) 23 | extends Processor { 24 | @transient 25 | private[this] lazy val LOG = Logger.getLogger(this.getClass) 26 | 27 | override def process(): Unit = { 28 | data.foreachPartition(processEachPartition(_)) 29 | } 30 | 31 | private def processEachPartition(iterator: Iterator[Row]): Unit = { 32 | val graphProvider = 33 | new GraphProvider(config.databaseConfig.getGraphAddress, config.connectionConfig.timeout) 34 | 35 | val writer = new NebulaGraphClientWriter(config.databaseConfig, 36 | config.userConfig, 37 | config.rateConfig, 38 | null, 39 | graphProvider) 40 | 41 | val errorBuffer = ArrayBuffer[String]() 42 | 43 | writer.prepare() 44 | // batch write 45 | val startTime = System.currentTimeMillis 46 | iterator.foreach { row => 47 | val failStatement = writer.writeNgql(row.getString(0)) 48 | if (failStatement == null) { 49 | batchSuccess.add(1) 50 | } else { 51 | errorBuffer.append(failStatement) 52 | batchFailure.add(1) 53 | } 54 | } 55 | if (errorBuffer.nonEmpty) { 56 | ErrorHandler.save(errorBuffer, 57 | s"${config.errorConfig.errorPath}/reload.${TaskContext.getPartitionId()}") 58 | errorBuffer.clear() 59 | } 60 | LOG.info(s"data reload in partition ${TaskContext 61 | .getPartitionId()} cost ${System.currentTimeMillis() - startTime}ms") 62 | writer.close() 63 | graphProvider.close() 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/reader/FileBaseReader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.exchange.reader 8 | 9 | import com.vesoft.nebula.exchange.config.FileBaseSourceConfigEntry 10 | import com.vesoft.nebula.exchange.utils.NebulaUtils.DEFAULT_EMPTY_VALUE 11 | import org.apache.spark.sql.catalyst.encoders.RowEncoder 12 | import org.apache.spark.sql.types.StructType 13 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 14 | 15 | /** 16 | * The FileBaseReader is the abstract class for HDFS file reader. 
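 * The concrete readers below (ParquetReader, ORCReader, JSONReader, CSVReader) differ only in the
 * format-specific options they pass to session.read. A minimal usage sketch, where csvConfig is an
 * illustrative FileBaseSourceConfigEntry:
 *   val df = new CSVReader(spark, csvConfig).read()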
17 | * 18 | * @param session 19 | * @param path 20 | */ 21 | abstract class FileBaseReader(val session: SparkSession, val path: String) extends Reader { 22 | 23 | require(path.trim.nonEmpty) 24 | 25 | override def close(): Unit = { 26 | session.close() 27 | } 28 | } 29 | 30 | /** 31 | * The ParquetReader extend the FileBaseReader and support read parquet file from HDFS. 32 | * 33 | * @param session 34 | * @param parquetConfig 35 | */ 36 | class ParquetReader(override val session: SparkSession, parquetConfig: FileBaseSourceConfigEntry) 37 | extends FileBaseReader(session, parquetConfig.path) { 38 | 39 | override def read(): DataFrame = { 40 | session.read.parquet(path) 41 | } 42 | } 43 | 44 | /** 45 | * The ORCReader extend the FileBaseReader and support read orc file from HDFS. 46 | * 47 | * @param session 48 | * @param orcConfig 49 | */ 50 | class ORCReader(override val session: SparkSession, orcConfig: FileBaseSourceConfigEntry) 51 | extends FileBaseReader(session, orcConfig.path) { 52 | 53 | override def read(): DataFrame = { 54 | session.read.orc(path) 55 | } 56 | } 57 | 58 | /** 59 | * The JSONReader extend the FileBaseReader and support read json file from HDFS. 60 | * 61 | * @param session 62 | * @param jsonConfig 63 | */ 64 | class JSONReader(override val session: SparkSession, jsonConfig: FileBaseSourceConfigEntry) 65 | extends FileBaseReader(session, jsonConfig.path) { 66 | 67 | override def read(): DataFrame = { 68 | session.read.json(path) 69 | } 70 | } 71 | 72 | /** 73 | * The CSVReader extend the FileBaseReader and support read csv file from HDFS. 74 | * All types of the structure are StringType. 75 | * 76 | * @param session 77 | * @param csvConfig 78 | */ 79 | class CSVReader(override val session: SparkSession, csvConfig: FileBaseSourceConfigEntry) 80 | extends FileBaseReader(session, csvConfig.path) { 81 | 82 | override def read(): DataFrame = { 83 | session.read 84 | .option("delimiter", csvConfig.separator.get) 85 | .option("header", csvConfig.header.get) 86 | .option("emptyValue", DEFAULT_EMPTY_VALUE) 87 | .csv(path) 88 | } 89 | } 90 | 91 | /** 92 | * The CustomReader extend the FileBaseReader and support read text file from HDFS. 93 | * Transformation is a function convert a line into Row. 94 | * The structure of the row should be specified. 95 | * 96 | * @param session 97 | * @param customConfig 98 | * @param transformation 99 | * @param structType 100 | */ 101 | abstract class CustomReader(override val session: SparkSession, 102 | customConfig: FileBaseSourceConfigEntry, 103 | transformation: String => Row, 104 | filter: Row => Boolean, 105 | structType: StructType) 106 | extends FileBaseReader(session, customConfig.path) { 107 | 108 | override def read(): DataFrame = { 109 | val encoder = RowEncoder.apply(structType) 110 | session.read 111 | .text(path) 112 | .filter(!_.getString(0).isEmpty) 113 | .map(row => transformation(row.getString(0)))(encoder) 114 | .filter(filter) 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/reader/Reader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 
5 | */ 6 | 7 | package com.vesoft.nebula.exchange.reader 8 | 9 | import com.vesoft.nebula.exchange.Offset 10 | import com.vesoft.nebula.exchange.utils.HDFSUtils 11 | import org.apache.spark.sql.{DataFrame, SparkSession} 12 | 13 | /** 14 | * The Reader is used to create a DataFrame from the source, such as Hive or HDFS. 15 | */ 16 | trait Reader extends Serializable { 17 | def session: SparkSession 18 | 19 | def read(): DataFrame 20 | 21 | def close(): Unit 22 | } 23 | 24 | trait CheckPointSupport extends Serializable { 25 | 26 | def getOffsets(totalCount: Long, 27 | parallel: Int, 28 | checkPointPath: Option[String], 29 | checkPointNamePrefix: String): List[Offset] = { 30 | if (totalCount <= 0) 31 | throw new RuntimeException(s"${checkPointNamePrefix}: return data count<=0") 32 | 33 | val batchSizes = List.fill((totalCount % parallel).toInt)(totalCount / parallel + 1) ::: List 34 | .fill((parallel - totalCount % parallel).toInt)(totalCount / parallel) 35 | 36 | val startOffsets = batchSizes.scanLeft(0L)(_ + _).init 37 | 38 | val checkPointOffsets = checkPointPath match { 39 | case Some(path) => 40 | val files = Range(0, parallel).map(i => s"${path}/${checkPointNamePrefix}.${i}").toList 41 | if (files.forall(HDFSUtils.exists)) 42 | files.map(HDFSUtils.getContent(_).trim.toLong).sorted 43 | else startOffsets 44 | case _ => startOffsets 45 | } 46 | 47 | if (checkPointOffsets.zip(startOffsets).exists(x => x._1 < x._2)) 48 | throw new RuntimeException( 49 | s"Check Point file maybe previous. Please delete ${checkPointPath}/${checkPointNamePrefix}.* file") 50 | 51 | val eachPartitionLimit = { 52 | batchSizes 53 | .zip(startOffsets.zip(checkPointOffsets)) 54 | .map(x => { 55 | x._1 - (x._2._2 - x._2._1) 56 | }) 57 | } 58 | val offsets = checkPointOffsets.zip(eachPartitionLimit).map(x => Offset(x._1, x._2)) 59 | if (offsets.exists(_.size < 0L)) 60 | throw new RuntimeException( 61 | s"Check point file maybe broken. Please delete ${checkPointPath}/${checkPointNamePrefix}.* file") 62 | offsets 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/reader/StreamingBaseReader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 
5 | */ 6 | 7 | package com.vesoft.nebula.exchange.reader 8 | 9 | import com.vesoft.nebula.exchange.config.{KafkaSourceConfigEntry, PulsarSourceConfigEntry} 10 | import org.apache.spark.sql.{DataFrame, SparkSession} 11 | 12 | /** 13 | * Spark Streaming 14 | * 15 | * @param session 16 | */ 17 | abstract class StreamingBaseReader(override val session: SparkSession) extends Reader { 18 | 19 | override def close(): Unit = { 20 | session.close() 21 | } 22 | } 23 | 24 | /** 25 | * 26 | * @param session 27 | * @param kafkaConfig 28 | */ 29 | class KafkaReader(override val session: SparkSession, kafkaConfig: KafkaSourceConfigEntry) 30 | extends StreamingBaseReader(session) { 31 | 32 | require(kafkaConfig.server.trim.nonEmpty && kafkaConfig.topic.trim.nonEmpty) 33 | 34 | override def read(): DataFrame = { 35 | session.readStream 36 | .format("kafka") 37 | .option("kafka.bootstrap.servers", kafkaConfig.server) 38 | .option("subscribe", kafkaConfig.topic) 39 | .load() 40 | } 41 | } 42 | 43 | /** 44 | * 45 | * @param session 46 | * @param pulsarConfig 47 | */ 48 | class PulsarReader(override val session: SparkSession, pulsarConfig: PulsarSourceConfigEntry) 49 | extends StreamingBaseReader(session) { 50 | 51 | override def read(): DataFrame = { 52 | session.readStream 53 | .format("pulsar") 54 | .option("service.url", pulsarConfig.serviceUrl) 55 | .option("admin.url", pulsarConfig.adminUrl) 56 | .options(pulsarConfig.options) 57 | .load() 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/utils/HDFSUtils.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 
5 | */ 6 | 7 | package com.vesoft.nebula.exchange.utils 8 | 9 | import java.io.File 10 | import java.nio.charset.Charset 11 | import org.apache.hadoop.conf.Configuration 12 | import org.apache.hadoop.fs.{FileSystem, Path} 13 | import org.apache.log4j.Logger 14 | import scala.io.Source 15 | 16 | object HDFSUtils { 17 | private[this] val LOG = Logger.getLogger(this.getClass) 18 | 19 | def getFileSystem(namenode: String = null): FileSystem = { 20 | val conf = new Configuration() 21 | if (namenode != null) { 22 | conf.set("fs.default.name", namenode) 23 | conf.set("fs.defaultFS", namenode) 24 | } 25 | FileSystem.get(conf) 26 | } 27 | 28 | def list(path: String): List[String] = { 29 | val system = getFileSystem() 30 | system.listStatus(new Path(path)).map(_.getPath.getName).toList 31 | } 32 | 33 | def exists(path: String): Boolean = { 34 | val system = getFileSystem() 35 | system.exists(new Path(path)) 36 | } 37 | 38 | def getContent(path: String): String = { 39 | val system = getFileSystem() 40 | val inputStream = system.open(new Path(path)) 41 | Source.fromInputStream(inputStream).mkString 42 | } 43 | 44 | def saveContent(path: String, 45 | content: String, 46 | charset: Charset = Charset.defaultCharset()): Unit = { 47 | val system = getFileSystem() 48 | val outputStream = system.create(new Path(path)) 49 | try { 50 | outputStream.write(content.getBytes(charset)) 51 | } finally { 52 | outputStream.close() 53 | } 54 | } 55 | 56 | def upload(localPath: String, remotePath: String, namenode: String = null): Unit = { 57 | try { 58 | val localFile = new File(localPath) 59 | if (!localFile.exists() || localFile.length() <= 0) { 60 | return 61 | } 62 | } catch { 63 | case e: Throwable => 64 | LOG.warn("check for empty local file error, but you can ignore this check error. " + 65 | "If there is empty sst file in your hdfs, please delete it manually", 66 | e) 67 | } 68 | val system = getFileSystem(namenode) 69 | system.copyFromLocalFile(new Path(localPath), new Path(remotePath)) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/utils/KafkaUtils.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.exchange.utils 8 | 9 | import com.vesoft.nebula.exchange.{Edge, Vertex} 10 | 11 | object KafkaUtils { 12 | 13 | def writeVertices(vertices: Vertex*): Unit = {} 14 | def writeEdge(edges: Edge*): Unit = {} 15 | } 16 | -------------------------------------------------------------------------------- /nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/utils/NebulaUtils.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 
5 | */ 6 | 7 | package com.vesoft.nebula.exchange.utils 8 | 9 | import com.google.common.primitives.UnsignedLong 10 | import com.vesoft.nebula.exchange.{MetaProvider, VidType} 11 | import com.vesoft.nebula.exchange.config.{SchemaConfigEntry, Type} 12 | import org.apache.commons.codec.digest.MurmurHash2 13 | import org.apache.log4j.Logger 14 | 15 | import scala.collection.JavaConversions.seqAsJavaList 16 | import scala.collection.mutable 17 | import scala.collection.mutable.ListBuffer 18 | 19 | object NebulaUtils { 20 | val DEFAULT_EMPTY_VALUE: String = "_NEBULA_EMPTY" 21 | 22 | private[this] val LOG = Logger.getLogger(this.getClass) 23 | 24 | def getDataSourceFieldType(sourceConfig: SchemaConfigEntry, 25 | space: String, 26 | metaProvider: MetaProvider): Map[String, Int] = { 27 | val nebulaFields = sourceConfig.nebulaFields 28 | val sourceFields = sourceConfig.fields 29 | val label = sourceConfig.name 30 | 31 | var nebulaSchemaMap: Map[String, Integer] = null 32 | val dataType: Type.Value = metaProvider.getLabelType(space, label) 33 | if (dataType == null) { 34 | throw new IllegalArgumentException(s"label $label does not exist.") 35 | } 36 | if (dataType == Type.VERTEX) { 37 | nebulaSchemaMap = metaProvider.getTagSchema(space, label) 38 | } else { 39 | nebulaSchemaMap = metaProvider.getEdgeSchema(space, label) 40 | } 41 | 42 | val sourceSchemaMap: mutable.Map[String, Int] = mutable.HashMap[String, Int]() 43 | for (i <- nebulaFields.indices) { 44 | sourceSchemaMap.put(sourceFields.get(i), nebulaSchemaMap(nebulaFields.get(i))) 45 | } 46 | sourceSchemaMap.toMap 47 | } 48 | 49 | def isNumic(str: String): Boolean = { 50 | val newStr: String = if (str.startsWith("-")) { 51 | str.substring(1) 52 | } else { str } 53 | 54 | for (char <- newStr.toCharArray) { 55 | if (!Character.isDigit(char)) return false 56 | } 57 | true 58 | } 59 | 60 | def escapeUtil(str: String): String = { 61 | var s = str 62 | if (s.contains("\\")) { 63 | s = s.replaceAll("\\\\", "\\\\\\\\") 64 | } 65 | if (s.contains("\t")) { 66 | s = s.replaceAll("\t", "\\\\t") 67 | } 68 | if (s.contains("\n")) { 69 | s = s.replaceAll("\n", "\\\\n") 70 | } 71 | if (s.contains("\"")) { 72 | s = s.replaceAll("\"", "\\\\\"") 73 | } 74 | if (s.contains("\'")) { 75 | s = s.replaceAll("\'", "\\\\'") 76 | } 77 | if (s.contains("\r")) { 78 | s = s.replaceAll("\r", "\\\\r") 79 | } 80 | if (s.contains("\b")) { 81 | s = s.replaceAll("\b", "\\\\b") 82 | } 83 | s 84 | } 85 | 86 | def getPartitionId(id: String, partitionSize: Int, vidType: VidType.Value): Int = { 87 | val hashValue: Long = if (vidType == VidType.STRING) { 88 | MurmurHash2.hash64(id.getBytes, id.length, 0xc70f6907) 89 | } else { 90 | id.toLong 91 | } 92 | val unsignedValue = UnsignedLong.fromLongBits(hashValue) 93 | val partSize = UnsignedLong.fromLongBits(partitionSize) 94 | unsignedValue.mod(partSize).intValue + 1 95 | } 96 | 97 | def escapePropName(nebulaFields: List[String]): List[String] = { 98 | val propNames: ListBuffer[String] = new ListBuffer[String] 99 | for (key <- nebulaFields) { 100 | val sb = new StringBuilder() 101 | sb.append("`") 102 | sb.append(key) 103 | sb.append("`") 104 | propNames.append(sb.toString()) 105 | } 106 | propNames.toList 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/utils/Neo4jUtils.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 
2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.exchange.utils 8 | 9 | import org.neo4j.driver.Value 10 | 11 | object Neo4jUtils { 12 | 13 | def convertNeo4jData(value: Value): String = { 14 | value.`type`().name() match { 15 | case "NULL" => { 16 | null 17 | } 18 | case "STRING" => { 19 | value.asString() 20 | } 21 | case "INTEGER" => { 22 | value.asInt().toString 23 | } 24 | case "FLOAT" | "DOUBLE" => { 25 | value.asDouble().toString 26 | } 27 | case "BOOLEAN" => { 28 | value.asBoolean().toString 29 | } 30 | case "DATE" | "LOCAL_DATE" => { 31 | value.asLocalDate().toString 32 | } 33 | case "DATE_TIME" | "LOCAL_DATE_TIME" => { 34 | value.asLocalDateTime().toString 35 | } 36 | case "TIME" | "LOCAL_TIME" => { 37 | value.asLocalTime().toString 38 | } 39 | case "BYTES" => { 40 | new String(value.asByteArray()) 41 | } 42 | case "LIST" => { 43 | value.asList().toString 44 | } 45 | case "MAP" => { 46 | value.asMap().toString 47 | } 48 | case _ => { 49 | value.toString 50 | } 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/writer/FileBaseWriter.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.exchange.writer 8 | 9 | import org.rocksdb.{EnvOptions, Options, RocksDB, SstFileWriter} 10 | import org.slf4j.LoggerFactory 11 | 12 | /** 13 | * NebulaSSTWriter 14 | * @param path 15 | */ 16 | class NebulaSSTWriter(path: String) extends Writer { 17 | require(path.trim.nonEmpty) 18 | var isOpen = false 19 | 20 | private val LOG = LoggerFactory.getLogger(getClass) 21 | 22 | try { 23 | RocksDB.loadLibrary() 24 | LOG.info("Loading RocksDB successfully") 25 | } catch { 26 | case _: Exception => 27 | LOG.error("Can't load RocksDB library!") 28 | } 29 | 30 | // TODO More Config ... 31 | val options = new Options() 32 | .setCreateIfMissing(true) 33 | 34 | val env = new EnvOptions() 35 | var writer: SstFileWriter = _ 36 | 37 | override def prepare(): Unit = { 38 | writer = new SstFileWriter(env, options) 39 | writer.open(path) 40 | isOpen = true 41 | } 42 | 43 | def write(key: Array[Byte], value: Array[Byte]): Unit = { 44 | writer.put(key, value) 45 | } 46 | 47 | override def close(): Unit = { 48 | if (isOpen) { 49 | writer.finish() 50 | writer.close() 51 | } 52 | options.close() 53 | env.close() 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/writer/Writer.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 
5 | */ 6 | 7 | package com.vesoft.nebula.exchange.writer 8 | 9 | /** 10 | * 11 | */ 12 | trait Writer extends Serializable { 13 | 14 | def prepare(): Unit 15 | 16 | def close() 17 | } 18 | -------------------------------------------------------------------------------- /nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/processor/ProcessorSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package scala.com.vesoft.nebula.exchange.processor 8 | 9 | import com.vesoft.nebula.exchange.processor.Processor 10 | import com.vesoft.nebula.{Date, DateTime, NullType, Time, Value} 11 | import com.vesoft.nebula.meta.PropertyType 12 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 13 | import org.apache.spark.sql.types.{ 14 | BooleanType, 15 | DoubleType, 16 | IntegerType, 17 | LongType, 18 | ShortType, 19 | StringType, 20 | StructField, 21 | StructType 22 | } 23 | import org.junit.Test 24 | 25 | class ProcessorSuite extends Processor { 26 | val values = List("Bob", 27 | "fixedBob", 28 | 12, 29 | 200, 30 | 1000, 31 | 100000, 32 | "2021-01-01", 33 | "2021-01-01T12:00:00", 34 | "12:00:00", 35 | "2021-01-01T12:00:00", 36 | true, 37 | 12.01, 38 | 22.12, 39 | null) 40 | val schema: StructType = StructType( 41 | List( 42 | StructField("col1", StringType, nullable = true), 43 | StructField("col2", StringType, nullable = true), 44 | StructField("col3", ShortType, nullable = true), 45 | StructField("col4", ShortType, nullable = true), 46 | StructField("col5", IntegerType, nullable = true), 47 | StructField("col6", LongType, nullable = true), 48 | StructField("col7", StringType, nullable = true), 49 | StructField("col8", StringType, nullable = true), 50 | StructField("col9", StringType, nullable = true), 51 | StructField("col10", StringType, nullable = true), 52 | StructField("col11", BooleanType, nullable = true), 53 | StructField("col12", DoubleType, nullable = true), 54 | StructField("col13", DoubleType, nullable = true), 55 | StructField("col14", StringType, nullable = true) 56 | )) 57 | val row = new GenericRowWithSchema(values.toArray, schema) 58 | val map = Map( 59 | "col1" -> PropertyType.STRING.getValue, 60 | "col2" -> PropertyType.FIXED_STRING.getValue, 61 | "col3" -> PropertyType.INT8.getValue, 62 | "col4" -> PropertyType.INT16.getValue, 63 | "col5" -> PropertyType.INT32.getValue, 64 | "col6" -> PropertyType.INT64.getValue, 65 | "col7" -> PropertyType.DATE.getValue, 66 | "col8" -> PropertyType.DATETIME.getValue, 67 | "col9" -> PropertyType.TIME.getValue, 68 | "col10" -> PropertyType.TIMESTAMP.getValue, 69 | "col11" -> PropertyType.BOOL.getValue, 70 | "col12" -> PropertyType.DOUBLE.getValue, 71 | "col13" -> PropertyType.FLOAT.getValue, 72 | "col14" -> PropertyType.STRING.getValue 73 | ) 74 | 75 | @Test 76 | def extraValueForClientSuite(): Unit = { 77 | assert(extraValueForClient(row, "col1", map).toString.equals("\"Bob\"")) 78 | assert(extraValueForClient(row, "col2", map).toString.equals("\"fixedBob\"")) 79 | assert(extraValueForClient(row, "col3", map).toString.toInt == 12) 80 | assert(extraValueForClient(row, "col4", map).toString.toInt == 200) 81 | assert(extraValueForClient(row, "col5", map).toString.toInt == 1000) 82 | assert(extraValueForClient(row, "col6", map).toString.toLong == 100000) 83 | 
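    // temporal columns are rendered as nGQL function literals: date(...), datetime(...), time(...), timestamp(...)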
assert(extraValueForClient(row, "col7", map).toString.equals("date(\"2021-01-01\")")) 84 | assert( 85 | extraValueForClient(row, "col8", map).toString.equals("datetime(\"2021-01-01T12:00:00\")")) 86 | assert(extraValueForClient(row, "col9", map).toString.equals("time(\"12:00:00\")")) 87 | assert( 88 | extraValueForClient(row, "col10", map).toString.equals("timestamp(\"2021-01-01T12:00:00\")")) 89 | assert(extraValueForClient(row, "col11", map).toString.toBoolean) 90 | assert(extraValueForClient(row, "col12", map).toString.toDouble > 12.00) 91 | assert(extraValueForClient(row, "col13", map).toString.toDouble > 22.10) 92 | assert(extraValueForClient(row, "col14", map) == null) 93 | } 94 | 95 | @Test 96 | def extraValueForSSTSuite(): Unit = { 97 | assert(extraValueForSST(row, "col1", map).toString.equals("Bob")) 98 | assert(extraValueForSST(row, "col2", map).toString.equals("fixedBob")) 99 | assert(extraValueForSST(row, "col3", map).toString.toInt == 12) 100 | assert(extraValueForSST(row, "col4", map).toString.toInt == 200) 101 | assert(extraValueForSST(row, "col5", map).toString.toInt == 1000) 102 | assert(extraValueForSST(row, "col6", map).toString.toLong == 100000) 103 | val date = new Date(2021, 1, 1) 104 | assert(extraValueForSST(row, "col7", map).equals(date)) 105 | val datetime = new DateTime(2021, 1, 1, 12, 0, 0, 0) 106 | assert(extraValueForSST(row, "col8", map).equals(datetime)) 107 | 108 | val time = new Time(12, 0, 0, 0) 109 | assert(extraValueForSST(row, "col9", map).equals(time)) 110 | 111 | try { 112 | extraValueForSST(row, "col10", map).toString 113 | } catch { 114 | case e: Exception => assert(true) 115 | } 116 | 117 | assert(extraValueForSST(row, "col11", map).toString.toBoolean) 118 | assert(extraValueForSST(row, "col12", map).toString.toDouble > 12.0) 119 | assert(extraValueForSST(row, "col13", map).toString.toFloat > 22.10) 120 | 121 | val nullValue = new Value() 122 | nullValue.setNVal(NullType.__NULL__) 123 | assert(extraValueForSST(row, "col14", map).equals(nullValue)) 124 | } 125 | 126 | /** 127 | * process dataframe to vertices or edges 128 | */ 129 | override def process(): Unit = ??? 130 | 131 | } 132 | -------------------------------------------------------------------------------- /nebula-spark-connector/.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 23 | hs_err_pid* 24 | 25 | # build target 26 | target/ 27 | 28 | # IDE 29 | .idea/ 30 | .eclipse/ 31 | *.iml 32 | 33 | spark-importer.ipr 34 | spark-importer.iws 35 | 36 | .DS_Store 37 | -------------------------------------------------------------------------------- /nebula-spark-connector/README.md: -------------------------------------------------------------------------------- 1 | # Nebula Spark Connector 2.0 2 | [中文版](https://github.com/vesoft-inc/nebula-spark-utils/blob/master/nebula-spark-connector/README_CN.md) 3 | 4 | ## Introduction 5 | 6 | Nebula Spark Connector 2.0 only supports Nebula Graph 2.x. If you are using Nebula Graph v1.x, please use [Nebula Spark Connector v1.0](https://github.com/vesoft-inc/nebula-java/tree/v1.0/tools/nebula-spark) . 7 | 8 | ## How to Compile 9 | 10 | 1. 
Package Nebula Spark Connector 2.0. 11 | 12 | ```bash 13 | $ git clone https://github.com/vesoft-inc/nebula-spark-utils.git 14 | $ cd nebula-spark-utils/nebula-spark-connector 15 | $ mvn clean package -Dmaven.test.skip=true -Dgpg.skip -Dmaven.javadoc.skip=true 16 | ``` 17 | 18 | After the packaging, you can see the newly generated nebula-spark-connector-2.0.0.jar under the nebula-spark-utils/nebula-spark-connector/target/ directory. 19 | 20 | ## New Features (Compared to Nebula Spark Connector 1.0) 21 | * Supports more connection configurations, such as timeout, connectionRetry, and executionRetry. 22 | * Supports more data configurations, such as whether vertexId can be written as vertex's property, whether srcId, dstId and rank can be written as edge's properties. 23 | * Spark Reader Supports non-property, all-property, and specific-properties read. 24 | * Spark Reader Supports reading data from Nebula Graph to Graphx as VertexRD and EdgeRDD, it also supports String type vertexId. 25 | * Nebula Spark Connector 2.0 uniformly uses SparkSQL's DataSourceV2 for data source expansion. 26 | * Nebula Spark Connector 2.1.0 support UPDATE write mode to NebulaGraph, see [Update Vertex](https://docs.nebula-graph.io/2.0.1/3.ngql-guide/12.vertex-statements/2.update-vertex/) . 27 | 28 | ## How to Use 29 | 30 | Write DataFrame `INSERT` into Nebula Graph as Vertices: 31 | ``` 32 | val config = NebulaConnectionConfig 33 | .builder() 34 | .withMetaAddress("127.0.0.1:9559") 35 | .withGraphAddress("127.0.0.1:9669") 36 | .build() 37 | val nebulaWriteVertexConfig: WriteNebulaVertexConfig = WriteNebulaVertexConfig 38 | .builder() 39 | .withSpace("test") 40 | .withTag("person") 41 | .withVidField("id") 42 | .withVidAsProp(true) 43 | .withBatch(1000) 44 | .build() 45 | df.write.nebula(config, nebulaWriteVertexConfig).writeVertices() 46 | ``` 47 | Write DataFrame `UPDATE` into Nebula Graph as Vertices: 48 | ``` 49 | val config = NebulaConnectionConfig 50 | .builder() 51 | .withMetaAddress("127.0.0.1:9559") 52 | .withGraphAddress("127.0.0.1:9669") 53 | .build() 54 | val nebulaWriteVertexConfig: WriteNebulaVertexConfig = WriteNebulaVertexConfig 55 | .builder() 56 | .withSpace("test") 57 | .withTag("person") 58 | .withVidField("id") 59 | .withVidAsProp(true) 60 | .withBatch(1000) 61 | .withWriteMode(WriteMode.UPDATE) 62 | .build() 63 | df.write.nebula(config, nebulaWriteVertexConfig).writeVertices() 64 | ``` 65 | Read vertices from Nebula Graph: 66 | ``` 67 | val config = NebulaConnectionConfig 68 | .builder() 69 | .withMetaAddress("127.0.0.1:9559") 70 | .withConenctionRetry(2) 71 | .build() 72 | val nebulaReadVertexConfig: ReadNebulaConfig = ReadNebulaConfig 73 | .builder() 74 | .withSpace("exchange") 75 | .withLabel("person") 76 | .withNoColumn(false) 77 | .withReturnCols(List("birthday")) 78 | .withLimit(10) 79 | .withPartitionNum(10) 80 | .build() 81 | val vertex = spark.read.nebula(config, nebulaReadVertexConfig).loadVerticesToDF() 82 | ``` 83 | 84 | Read vertices and edges from Nebula Graph to construct Graphx's graph: 85 | ``` 86 | val config = NebulaConnectionConfig 87 | .builder() 88 | .withMetaAddress("127.0.0.1:9559") 89 | .build() 90 | val nebulaReadVertexConfig = ReadNebulaConfig 91 | .builder() 92 | .withSpace("exchange") 93 | .withLabel("person") 94 | .withNoColumn(false) 95 | .withReturnCols(List("birthday")) 96 | .withLimit(10) 97 | .withPartitionNum(10) 98 | .build() 99 | val nebulaReadEdgeConfig = ReadNebulaConfig 100 | .builder() 101 | .withSpace("exchange") 102 | .withLabel("knows1") 103 | 
.withNoColumn(false) 104 | .withReturnCols(List("timep")) 105 | .withLimit(10) 106 | .withPartitionNum(10) 107 | .build() 108 | 109 | val vertex = spark.read.nebula(config, nebulaReadVertexConfig).loadVerticesToGraphx() 110 | val edgeRDD = spark.read.nebula(config, nebulaReadEdgeConfig).loadEdgesToGraphx() 111 | val graph = Graph(vertexRDD, edgeRDD) 112 | ``` 113 | After getting Graphx's Graph, you can develop graph algorithms in Graphx like [Nebula-Spark-Algorithm](https://github.com/vesoft-inc/nebula-java/tree/v1.0/tools/nebula-algorithm). 114 | 115 | For more information on usage, please refer to [Example](https://github.com/vesoft-inc/nebula-spark-utils/tree/master/example/src/main/scala/com/vesoft/nebula/examples/connector). 116 | 117 | ## How to Contribute 118 | 119 | Nebula Spark Connector 2.0 is a completely opensource project, opensource enthusiasts are welcome to participate in the following ways: 120 | 121 | - Go to [Nebula Graph Forum](https://discuss.nebula-graph.com.cn/ "go to“Nebula Graph Forum") to discuss with other users. You can raise your own questions, help others' problems, share your thoughts. 122 | - Write or improve documents. 123 | - Submit code to add new features or fix bugs. 124 | -------------------------------------------------------------------------------- /nebula-spark-connector/README_CN.md: -------------------------------------------------------------------------------- 1 | # 欢迎使用 Nebula Spark Connector 2.0 2 | [English](https://github.com/vesoft-inc/nebula-spark-utils/blob/master/nebula-spark-connector/README.md) 3 | ## 介绍 4 | 5 | Nebula Spark Connector 2.0 仅支持 Nebula Graph 2.x。如果您正在使用 Nebula Graph v1.x,请使用 [Nebula Spark Connector v1.0](https://github.com/vesoft-inc/nebula-java/tree/v1.0/tools)。 6 | 7 | ## 如何编译 8 | 9 | 1. 
编译打包 Nebula Spark Connector 2.0。 10 | 11 | ```bash 12 | $ git clone https://github.com/vesoft-inc/nebula-spark-utils.git 13 | $ cd nebula-spark-utils/nebula-spark-connector 14 | $ mvn clean package -Dmaven.test.skip=true -Dgpg.skip -Dmaven.javadoc.skip=true 15 | ``` 16 | 17 | 编译打包完成后,可以在 nebula-spark-utils/nebula-spark-connector/target/ 目录下看到 nebula-spark-connector-2.0.0.jar 文件。 18 | 19 | ## 特性 20 | 21 | * 提供了更多连接配置项,如超时时间、连接重试次数、执行重试次数 22 | * 提供了更多数据配置项,如写入数据时是否将 vertexId 同时作为属性写入、是否将 srcId、dstId、rank 等同时作为属性写入 23 | * Spark Reader 支持无属性读取,支持全属性读取 24 | * Spark Reader 支持将 Nebula Graph 数据读取成 Graphx 的 VertexRD 和 EdgeRDD,支持非 Long 型 vertexId 25 | * Nebula Spark Connector 2.0 统一了 SparkSQL 的扩展数据源,统一采用 DataSourceV2 进行 Nebula Graph 数据扩展 26 | * Nebula Spark Connector 2.1.0 增加了 UPDATE 写入模式,相关说明参考[Update Vertex](https://docs.nebula-graph.com.cn/2.0.1/3.ngql-guide/12.vertex-statements/2.update-vertex/) 。 27 | 28 | ## 使用说明 29 | 30 | 将 DataFrame 作为点 `INSERT` 写入 Nebula Graph : 31 | ``` 32 | val config = NebulaConnectionConfig 33 | .builder() 34 | .withMetaAddress("127.0.0.1:9559") 35 | .withGraphAddress("127.0.0.1:9669") 36 | .build() 37 | val nebulaWriteVertexConfig = WriteNebulaVertexConfig 38 | .builder() 39 | .withSpace("test") 40 | .withTag("person") 41 | .withVidField("id") 42 | .withVidAsProp(true) 43 | .withBatch(1000) 44 | .build() 45 | df.write.nebula(config, nebulaWriteVertexConfig).writeVertices() 46 | ``` 47 | 将 DataFrame 作为点 `UPDATE` 写入 Nebula Graph : 48 | ``` 49 | val config = NebulaConnectionConfig 50 | .builder() 51 | .withMetaAddress("127.0.0.1:9559") 52 | .withGraphAddress("127.0.0.1:9669") 53 | .build() 54 | val nebulaWriteVertexConfig = WriteNebulaVertexConfig 55 | .builder() 56 | .withSpace("test") 57 | .withTag("person") 58 | .withVidField("id") 59 | .withVidAsProp(true) 60 | .withBatch(1000) 61 | .withWriteMode(WriteMode.UPDATE) 62 | .build() 63 | df.write.nebula(config, nebulaWriteVertexConfig).writeVertices() 64 | ``` 65 | 66 | 读取 Nebula Graph 的点数据: 67 | ``` 68 | val config = NebulaConnectionConfig 69 | .builder() 70 | .withMetaAddress("127.0.0.1:9559") 71 | .withConenctionRetry(2) 72 | .build() 73 | val nebulaReadVertexConfig = ReadNebulaConfig 74 | .builder() 75 | .withSpace("exchange") 76 | .withLabel("person") 77 | .withNoColumn(false) 78 | .withReturnCols(List("birthday")) 79 | .withLimit(10) 80 | .withPartitionNum(10) 81 | .build() 82 | val vertex = spark.read.nebula(config, nebulaReadVertexConfig).loadVerticesToDF() 83 | ``` 84 | 85 | 读取 Nebula Graph 的点边数据构造 Graphx 的图: 86 | ``` 87 | val config = NebulaConnectionConfig 88 | .builder() 89 | .withMetaAddress("127.0.0.1:9559") 90 | .withConenctionRetry(2) 91 | .build() 92 | val nebulaReadVertexConfig = ReadNebulaConfig 93 | .builder() 94 | .withSpace("exchange") 95 | .withLabel("person") 96 | .withNoColumn(false) 97 | .withReturnCols(List("birthday")) 98 | .withLimit(10) 99 | .withPartitionNum(10) 100 | .build() 101 | 102 | val nebulaReadEdgeConfig = ReadNebulaConfig 103 | .builder() 104 | .withSpace("exchange") 105 | .withLabel("knows1") 106 | .withNoColumn(false) 107 | .withReturnCols(List("timep")) 108 | .withLimit(10) 109 | .withPartitionNum(10) 110 | .build() 111 | 112 | val vertex = spark.read.nebula(config, nebulaReadVertexConfig).loadVerticesToGraphx() 113 | val edgeRDD = spark.read.nebula(config, nebulaReadEdgeConfig).loadEdgesToGraphx() 114 | val graph = Graph(vertexRDD, edgeRDD) 115 | ``` 116 | 得到 Graphx 的 Graph 之后,可以根据 
[Nebula-Spark-Algorithm](https://github.com/vesoft-inc/nebula-java/tree/v1.0/tools/nebula-algorithm) 的示例在 Graphx 框架中进行算法开发。 117 | 118 | 更多使用示例请参考 [Example](https://github.com/vesoft-inc/nebula-spark-utils/tree/master/example/src/main/scala/com/vesoft/nebula/examples/connector) 。 119 | 120 | ## 贡献 121 | 122 | Nebula Spark Connector 2.0 是一个完全开源的项目,欢迎开源爱好者通过以下方式参与: 123 | 124 | - 前往 [Nebula Graph 论坛](https://discuss.nebula-graph.com.cn/ "点击前往“Nebula Graph 论坛") 上参与 Issue 讨论,如答疑、提供想法或者报告无法解决的问题 125 | - 撰写或改进文档 126 | - 提交优化代码 127 | -------------------------------------------------------------------------------- /nebula-spark-connector/src/main/scala/com/vesoft/nebula/connector/NebulaDataSource.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.connector 8 | 9 | import java.util.Map.Entry 10 | import java.util.Optional 11 | 12 | import com.vesoft.nebula.connector.exception.IllegalOptionException 13 | import com.vesoft.nebula.connector.reader.{NebulaDataSourceEdgeReader, NebulaDataSourceVertexReader} 14 | import com.vesoft.nebula.connector.writer.{NebulaDataSourceEdgeWriter, NebulaDataSourceVertexWriter} 15 | import org.apache.spark.sql.SaveMode 16 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap 17 | import org.apache.spark.sql.sources.DataSourceRegister 18 | import org.apache.spark.sql.sources.v2.reader.DataSourceReader 19 | import org.apache.spark.sql.sources.v2.writer.DataSourceWriter 20 | import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, ReadSupport, WriteSupport} 21 | import org.apache.spark.sql.types.StructType 22 | import org.slf4j.LoggerFactory 23 | 24 | import scala.collection.JavaConversions.iterableAsScalaIterable 25 | 26 | class NebulaDataSource 27 | extends DataSourceV2 28 | with ReadSupport 29 | with WriteSupport 30 | with DataSourceRegister { 31 | private val LOG = LoggerFactory.getLogger(this.getClass) 32 | 33 | /** 34 | * The string that represents the format that nebula data source provider uses. 35 | */ 36 | override def shortName(): String = "nebula" 37 | 38 | /** 39 | * Creates a {@link DataSourceReader} to scan the data from Nebula Graph. 40 | */ 41 | override def createReader(options: DataSourceOptions): DataSourceReader = { 42 | val nebulaOptions = getNebulaOptions(options, OperaType.READ) 43 | val dataType = nebulaOptions.dataType 44 | 45 | LOG.info("create reader") 46 | LOG.info(s"options ${options.asMap()}") 47 | 48 | if (DataTypeEnum.VERTEX == DataTypeEnum.withName(dataType)) { 49 | new NebulaDataSourceVertexReader(nebulaOptions) 50 | } else { 51 | new NebulaDataSourceEdgeReader(nebulaOptions) 52 | } 53 | } 54 | 55 | /** 56 | * Creates an optional {@link DataSourceWriter} to save the data to Nebula Graph. 
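   * Spark calls this when a DataFrame is saved through the "nebula" source (for example via the
   * writeVertices()/writeEdges() helpers shown in the README). SaveMode.Ignore and
   * SaveMode.ErrorIfExists are not supported and only trigger the warning below.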
57 | */ 58 | override def createWriter(writeUUID: String, 59 | schema: StructType, 60 | mode: SaveMode, 61 | options: DataSourceOptions): Optional[DataSourceWriter] = { 62 | 63 | val nebulaOptions = getNebulaOptions(options, OperaType.WRITE) 64 | val dataType = nebulaOptions.dataType 65 | if (mode == SaveMode.Ignore || mode == SaveMode.ErrorIfExists) { 66 | LOG.warn(s"Currently do not support mode") 67 | } 68 | 69 | LOG.info("create writer") 70 | LOG.info(s"options ${options.asMap()}") 71 | 72 | if (DataTypeEnum.VERTEX == DataTypeEnum.withName(dataType)) { 73 | val vertexFiled = nebulaOptions.vertexField 74 | val vertexIndex: Int = { 75 | var index: Int = -1 76 | for (i <- schema.fields.indices) { 77 | if (schema.fields(i).name.equals(vertexFiled)) { 78 | index = i 79 | } 80 | } 81 | if (index < 0) { 82 | throw new IllegalOptionException( 83 | s" vertex field ${vertexFiled} does not exist in dataframe") 84 | } 85 | index 86 | } 87 | Optional.of(new NebulaDataSourceVertexWriter(nebulaOptions, vertexIndex, schema)) 88 | } else { 89 | val srcVertexFiled = nebulaOptions.srcVertexField 90 | val dstVertexField = nebulaOptions.dstVertexField 91 | val rankExist = !nebulaOptions.rankField.isEmpty 92 | val edgeFieldsIndex = { 93 | var srcIndex: Int = -1 94 | var dstIndex: Int = -1 95 | var rankIndex: Int = -1 96 | for (i <- schema.fields.indices) { 97 | if (schema.fields(i).name.equals(srcVertexFiled)) { 98 | srcIndex = i 99 | } 100 | if (schema.fields(i).name.equals(dstVertexField)) { 101 | dstIndex = i 102 | } 103 | if (rankExist) { 104 | if (schema.fields(i).name.equals(nebulaOptions.rankField)) { 105 | rankIndex = i 106 | } 107 | } 108 | } 109 | // check src filed and dst field 110 | if (srcIndex < 0 || dstIndex < 0) { 111 | throw new IllegalOptionException( 112 | s" srcVertex field ${srcVertexFiled} or dstVertex field ${dstVertexField} do not exist in dataframe") 113 | } 114 | // check rank field 115 | if (rankExist && rankIndex < 0) { 116 | throw new IllegalOptionException(s"rank field does not exist in dataframe") 117 | } 118 | 119 | if (!rankExist) { 120 | (srcIndex, dstIndex, Option.empty) 121 | } else { 122 | (srcIndex, dstIndex, Option(rankIndex)) 123 | } 124 | 125 | } 126 | Optional.of( 127 | new NebulaDataSourceEdgeWriter(nebulaOptions, 128 | edgeFieldsIndex._1, 129 | edgeFieldsIndex._2, 130 | edgeFieldsIndex._3, 131 | schema)) 132 | } 133 | } 134 | 135 | /** 136 | * construct nebula options with DataSourceOptions 137 | */ 138 | def getNebulaOptions(options: DataSourceOptions, operateType: OperaType.Value): NebulaOptions = { 139 | var parameters: Map[String, String] = Map() 140 | for (entry: Entry[String, String] <- options.asMap().entrySet) { 141 | parameters += (entry.getKey -> entry.getValue) 142 | } 143 | val nebulaOptions = new NebulaOptions(CaseInsensitiveMap(parameters))(operateType) 144 | nebulaOptions 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /nebula-spark-connector/src/main/scala/com/vesoft/nebula/connector/NebulaEnum.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 
5 | */ 6 | 7 | package com.vesoft.nebula.connector 8 | 9 | object DataTypeEnum extends Enumeration { 10 | 11 | type DataType = Value 12 | val VERTEX = Value("vertex") 13 | val EDGE = Value("edge") 14 | 15 | def validDataType(dataType: String): Boolean = { 16 | dataType.equalsIgnoreCase(VERTEX.toString) || dataType.equalsIgnoreCase(EDGE.toString) 17 | } 18 | } 19 | 20 | object KeyPolicy extends Enumeration { 21 | 22 | type POLICY = Value 23 | val HASH = Value("hash") 24 | val UUID = Value("uuid") 25 | } 26 | 27 | object OperaType extends Enumeration { 28 | 29 | type Operation = Value 30 | val READ = Value("read") 31 | val WRITE = Value("write") 32 | } 33 | 34 | object WriteMode extends Enumeration { 35 | 36 | type Mode = Value 37 | val INSERT = Value("insert") 38 | val UPDATE = Value("update") 39 | val DELETE = Value("delete") 40 | } 41 | -------------------------------------------------------------------------------- /nebula-spark-connector/src/main/scala/com/vesoft/nebula/connector/NebulaUtils.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.connector 8 | 9 | import com.vesoft.nebula.client.graph.data.{DateTimeWrapper, DateWrapper, TimeWrapper} 10 | import com.vesoft.nebula.meta.{ColumnDef, ColumnTypeDef, PropertyType} 11 | import org.apache.spark.sql.Row 12 | import org.apache.spark.sql.catalyst.InternalRow 13 | import org.apache.spark.sql.types.{ 14 | BooleanType, 15 | DataType, 16 | DoubleType, 17 | FloatType, 18 | IntegerType, 19 | LongType, 20 | NullType, 21 | StringType, 22 | StructType, 23 | TimestampType 24 | } 25 | import org.apache.spark.unsafe.types.UTF8String 26 | import org.slf4j.LoggerFactory 27 | 28 | object NebulaUtils { 29 | private val LOG = LoggerFactory.getLogger(this.getClass) 30 | 31 | var nebulaOptions: NebulaOptions = _ 32 | var parameters: Map[String, String] = Map() 33 | 34 | /** 35 | * convert nebula data type to spark sql data type 36 | */ 37 | def convertDataType(columnTypeDef: ColumnTypeDef): DataType = { 38 | 39 | columnTypeDef.getType match { 40 | case PropertyType.VID | PropertyType.INT8 | PropertyType.INT16 | PropertyType.INT32 | 41 | PropertyType.INT64 => 42 | LongType 43 | case PropertyType.BOOL => BooleanType 44 | case PropertyType.FLOAT | PropertyType.DOUBLE => DoubleType 45 | case PropertyType.TIMESTAMP => LongType 46 | case PropertyType.FIXED_STRING | PropertyType.STRING | PropertyType.DATE | PropertyType.TIME | 47 | PropertyType.DATETIME => 48 | StringType 49 | case PropertyType.UNKNOWN => throw new IllegalArgumentException("unsupported data type") 50 | } 51 | } 52 | 53 | def getColDataType(columnDefs: List[ColumnDef], columnName: String): DataType = { 54 | for (columnDef <- columnDefs) { 55 | if (columnName.equals(new String(columnDef.getName))) { 56 | return convertDataType(columnDef.getType) 57 | } 58 | } 59 | throw new IllegalArgumentException(s"column $columnName does not exist in schema") 60 | } 61 | 62 | type NebulaValueGetter = (Any, InternalRow, Int) => Unit 63 | 64 | def makeGetters(schema: StructType): Array[NebulaValueGetter] = { 65 | schema.fields.map(field => makeGetter(field.dataType)) 66 | } 67 | 68 | private def makeGetter(dataType: DataType): NebulaValueGetter = { 69 | dataType match { 70 | case BooleanType => 71 | (prop: Any, row: InternalRow, 
pos: Int) => 72 | row.setBoolean(pos, prop.asInstanceOf[Boolean]) 73 | case TimestampType | LongType => 74 | (prop: Any, row: InternalRow, pos: Int) => 75 | row.setLong(pos, prop.asInstanceOf[Long]) 76 | case FloatType | DoubleType => 77 | (prop: Any, row: InternalRow, pos: Int) => 78 | row.setDouble(pos, prop.asInstanceOf[Double]) 79 | case IntegerType => 80 | (prop: Any, row: InternalRow, pos: Int) => 81 | row.setInt(pos, prop.asInstanceOf[Int]) 82 | case _ => 83 | (prop: Any, row: InternalRow, pos: Int) => 84 | if (prop.isInstanceOf[DateTimeWrapper]) { 85 | row.update(pos, 86 | UTF8String.fromString(prop.asInstanceOf[DateTimeWrapper].getUTCDateTimeStr)) 87 | } else if (prop.isInstanceOf[TimeWrapper]) { 88 | row.update(pos, UTF8String.fromString(prop.asInstanceOf[TimeWrapper].getUTCTimeStr)) 89 | } else { 90 | row.update(pos, UTF8String.fromString(String.valueOf(prop))) 91 | } 92 | } 93 | } 94 | 95 | def isNumic(str: String): Boolean = { 96 | val newStr: String = if (str.startsWith("-")) { 97 | str.substring(1) 98 | } else { str } 99 | 100 | for (char <- newStr.toCharArray) { 101 | if (!Character.isDigit(char)) return false 102 | } 103 | true 104 | } 105 | 106 | def escapeUtil(str: String): String = { 107 | var s = str 108 | if (s.contains("\\")) { 109 | s = s.replaceAll("\\\\", "\\\\\\\\") 110 | } 111 | if (s.contains("\t")) { 112 | s = s.replaceAll("\t", "\\\\t") 113 | } 114 | if (s.contains("\n")) { 115 | s = s.replaceAll("\n", "\\\\n") 116 | } 117 | if (s.contains("\"")) { 118 | s = s.replaceAll("\"", "\\\\\"") 119 | } 120 | if (s.contains("\'")) { 121 | s = s.replaceAll("\'", "\\\\'") 122 | } 123 | if (s.contains("\r")) { 124 | s = s.replaceAll("\r", "\\\\r") 125 | } 126 | if (s.contains("\b")) { 127 | s = s.replaceAll("\b", "\\\\b") 128 | } 129 | s 130 | } 131 | 132 | } 133 | -------------------------------------------------------------------------------- /nebula-spark-connector/src/main/scala/com/vesoft/nebula/connector/PartitionUtils.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.connector 8 | 9 | import scala.collection.mutable.ListBuffer 10 | 11 | object PartitionUtils { 12 | def getScanParts(index: Int, nebulaTotalPart: Int, sparkPartitionNum: Int): List[Integer] = { 13 | val scanParts = new ListBuffer[Integer] 14 | var currentPart = index 15 | while (currentPart <= nebulaTotalPart) { 16 | scanParts.append(currentPart) 17 | currentPart += sparkPartitionNum 18 | } 19 | scanParts.toList 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /nebula-spark-connector/src/main/scala/com/vesoft/nebula/connector/Template.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 
5 | */ 6 | 7 | package com.vesoft.nebula.connector 8 | 9 | object NebulaTemplate { 10 | 11 | private[connector] val BATCH_INSERT_TEMPLATE = "INSERT %s `%s`(%s) VALUES %s" 12 | private[connector] val VERTEX_VALUE_TEMPLATE = "%s: (%s)" 13 | private[connector] val VERTEX_VALUE_TEMPLATE_WITH_POLICY = "%s(\"%s\"): (%s)" 14 | private[connector] val ENDPOINT_TEMPLATE = "%s(\"%s\")" 15 | private[connector] val EDGE_VALUE_WITHOUT_RANKING_TEMPLATE = "%s->%s: (%s)" 16 | private[connector] val EDGE_VALUE_TEMPLATE = "%s->%s@%d: (%s)" 17 | private[connector] val USE_TEMPLATE = "USE %s" 18 | 19 | private[connector] val UPDATE_VERTEX_TEMPLATE = "UPDATE %s ON `%s` %s SET %s" 20 | private[connector] val UPDATE_EDGE_TEMPLATE = "UPDATE %s ON `%s` %s->%s@%d SET %s" 21 | private[connector] val UPDATE_VALUE_TEMPLATE = "`%s`=%s" 22 | 23 | private[connector] val DELETE_VERTEX_TEMPLATE = "DELETE VERTEX %s" 24 | private[connector] val DELETE_EDGE_TEMPLATE = "DELETE EDGE `%s` %s" 25 | private[connector] val EDGE_ENDPOINT_TEMPLATE = "%s->%s@%d" 26 | } 27 | -------------------------------------------------------------------------------- /nebula-spark-connector/src/main/scala/com/vesoft/nebula/connector/exception/Exception.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.connector.exception 8 | 9 | import com.facebook.thrift.TException 10 | 11 | /** 12 | * An exception thrown if the Nebula client connection fails. 13 | */ 14 | class GraphConnectException(message: String, cause: Throwable = null) 15 | extends TException(message, cause) 16 | 17 | /** 18 | * An exception thrown if a required option is missing from [[NebulaOptions]]. 19 | */ 20 | class IllegalOptionException(message: String, cause: Throwable = null) 21 | extends IllegalArgumentException(message, cause) 22 | 23 | /** 24 | * An exception thrown if a Nebula statement execution fails. 25 | */ 26 | class GraphExecuteException(message: String, cause: Throwable = null) 27 | extends TException(message, cause) 28 | 29 | /** 30 | * An exception thrown if a Nebula execution encounters an RPC exception. 31 | */ 32 | class NebulaRPCException(message: String, cause: Throwable = null) 33 | extends RuntimeException(message, cause) 34 | -------------------------------------------------------------------------------- /nebula-spark-connector/src/main/scala/com/vesoft/nebula/connector/nebula/GraphProvider.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory.
5 | */ 6 | 7 | package com.vesoft.nebula.connector.nebula 8 | 9 | import com.vesoft.nebula.client.graph.NebulaPoolConfig 10 | import com.vesoft.nebula.client.graph.data.{HostAddress, ResultSet} 11 | import com.vesoft.nebula.client.graph.net.{NebulaPool, Session} 12 | import com.vesoft.nebula.connector.connector.Address 13 | import com.vesoft.nebula.connector.exception.GraphConnectException 14 | import org.apache.log4j.Logger 15 | 16 | import scala.collection.JavaConverters._ 17 | import scala.collection.mutable.ListBuffer 18 | 19 | /** 20 | * GraphProvider for Nebula Graph Service 21 | */ 22 | class GraphProvider(addresses: List[Address]) extends AutoCloseable with Serializable { 23 | private[this] lazy val LOG = Logger.getLogger(this.getClass) 24 | 25 | @transient val nebulaPoolConfig = new NebulaPoolConfig 26 | 27 | @transient val pool: NebulaPool = new NebulaPool 28 | val address = new ListBuffer[HostAddress]() 29 | for (addr <- addresses) { 30 | address.append(new HostAddress(addr._1, addr._2)) 31 | } 32 | nebulaPoolConfig.setMaxConnSize(1) 33 | pool.init(address.asJava, nebulaPoolConfig) 34 | 35 | var session: Session = null 36 | 37 | def releaseGraphClient(session: Session): Unit = { 38 | session.release() 39 | } 40 | 41 | override def close(): Unit = { 42 | pool.close() 43 | } 44 | 45 | def switchSpace(user: String, password: String, space: String): Boolean = { 46 | if (session == null) { 47 | session = pool.getSession(user, password, true) 48 | } 49 | val switchStatement = s"USE $space" 50 | LOG.info(s"switch to space $space") 51 | val result = submit(switchStatement) 52 | result.isSucceeded 53 | } 54 | 55 | def submit(statement: String): ResultSet = { 56 | if (session == null) { 57 | LOG.error("graph session is null") 58 | throw new GraphConnectException("session is null") 59 | } 60 | session.execute(statement) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /nebula-spark-connector/src/main/scala/com/vesoft/nebula/connector/nebula/MetaProvider.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory.
5 | */ 6 | 7 | package com.vesoft.nebula.connector.nebula 8 | 9 | import com.vesoft.nebula.client.graph.data.HostAddress 10 | import com.vesoft.nebula.client.meta.MetaClient 11 | import com.vesoft.nebula.connector.connector.Address 12 | import com.vesoft.nebula.connector.DataTypeEnum 13 | import com.vesoft.nebula.meta.{PropertyType, Schema} 14 | 15 | import scala.collection.JavaConverters._ 16 | import scala.collection.mutable 17 | 18 | class MetaProvider(addresses: List[Address]) extends AutoCloseable { 19 | 20 | val metaAddress = addresses.map(address => new HostAddress(address._1, address._2)).asJava 21 | val client = new MetaClient(metaAddress) 22 | client.connect() 23 | 24 | def getPartitionNumber(space: String): Int = { 25 | client.getPartsAlloc(space).size() 26 | } 27 | 28 | def getVidType(space: String): VidType.Value = { 29 | val vidType = client.getSpace(space).getProperties.getVid_type.getType 30 | if (vidType == PropertyType.FIXED_STRING) { 31 | return VidType.STRING 32 | } 33 | VidType.INT 34 | } 35 | 36 | def getTag(space: String, tag: String): Schema = { 37 | client.getTag(space, tag) 38 | } 39 | 40 | def getEdge(space: String, edge: String): Schema = { 41 | client.getEdge(space, edge) 42 | } 43 | 44 | def getTagSchema(space: String, tag: String): Map[String, Integer] = { 45 | val tagSchema = client.getTag(space, tag) 46 | val schema = new mutable.HashMap[String, Integer] 47 | 48 | val columns = tagSchema.getColumns 49 | for (colDef <- columns.asScala) { 50 | schema.put(new String(colDef.getName), colDef.getType.getType.getValue) 51 | } 52 | schema.toMap 53 | } 54 | 55 | def getEdgeSchema(space: String, edge: String): Map[String, Integer] = { 56 | val edgeSchema = client.getEdge(space, edge) 57 | val schema = new mutable.HashMap[String, Integer] 58 | 59 | val columns = edgeSchema.getColumns 60 | for (colDef <- columns.asScala) { 61 | schema.put(new String(colDef.getName), colDef.getType.getType.getValue) 62 | } 63 | schema.toMap 64 | } 65 | 66 | def getLabelType(space: String, label: String): DataTypeEnum.Value = { 67 | val tags = client.getTags(space) 68 | for (tag <- tags.asScala) { 69 | if (new String(tag.getTag_name).equals(label)) { 70 | return DataTypeEnum.VERTEX 71 | } 72 | } 73 | val edges = client.getEdges(space) 74 | for (edge <- edges.asScala) { 75 | if (new String(edge.getEdge_name).equals(label)) { 76 | return DataTypeEnum.EDGE 77 | } 78 | } 79 | null 80 | } 81 | 82 | override def close(): Unit = { 83 | client.close() 84 | } 85 | 86 | } 87 | 88 | object VidType extends Enumeration { 89 | type Type = Value 90 | 91 | val STRING = Value("STRING") 92 | val INT = Value("INT") 93 | } 94 | -------------------------------------------------------------------------------- /nebula-spark-connector/src/main/scala/com/vesoft/nebula/connector/reader/NebulaEdgePartitionReader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 
5 | */ 6 | 7 | package com.vesoft.nebula.connector.reader 8 | 9 | import com.vesoft.nebula.client.storage.scan.{ScanEdgeResult, ScanEdgeResultIterator} 10 | import com.vesoft.nebula.connector.NebulaOptions 11 | import org.apache.spark.sql.types.StructType 12 | import org.slf4j.{Logger, LoggerFactory} 13 | import scala.collection.JavaConverters._ 14 | 15 | class NebulaEdgePartitionReader(index: Int, nebulaOptions: NebulaOptions, schema: StructType) 16 | extends NebulaPartitionReader(index, nebulaOptions, schema) { 17 | private val LOG: Logger = LoggerFactory.getLogger(this.getClass) 18 | 19 | private var responseIterator: ScanEdgeResultIterator = _ 20 | 21 | override def next(): Boolean = { 22 | if (dataIterator == null && responseIterator == null && !scanPartIterator.hasNext) 23 | return false 24 | 25 | var continue: Boolean = false 26 | var break: Boolean = false 27 | while ((dataIterator == null || !dataIterator.hasNext) && !break) { 28 | resultValues.clear() 29 | continue = false 30 | if (responseIterator == null || !responseIterator.hasNext) { 31 | if (scanPartIterator.hasNext) { 32 | try { 33 | if (nebulaOptions.noColumn) { 34 | responseIterator = storageClient.scanEdge(nebulaOptions.spaceName, 35 | scanPartIterator.next(), 36 | nebulaOptions.label, 37 | nebulaOptions.limit, 38 | 0L, 39 | Long.MaxValue, 40 | true, 41 | true) 42 | } else { 43 | responseIterator = storageClient.scanEdge(nebulaOptions.spaceName, 44 | scanPartIterator.next(), 45 | nebulaOptions.label, 46 | nebulaOptions.getReturnCols.asJava, 47 | nebulaOptions.limit, 48 | 0, 49 | Long.MaxValue, 50 | true, 51 | true) 52 | } 53 | } catch { 54 | case e: Exception => 55 | LOG.error(s"Exception scanning edge ${nebulaOptions.label}", e) 56 | storageClient.close() 57 | throw new Exception(e.getMessage, e) 58 | } 59 | // jump to the next loop 60 | continue = true 61 | } 62 | // break while loop 63 | break = !continue 64 | } else { 65 | val next: ScanEdgeResult = responseIterator.next 66 | if (!next.isEmpty) { 67 | dataIterator = next.getEdgeTableRows.iterator().asScala 68 | } 69 | } 70 | } 71 | 72 | if (dataIterator == null) { 73 | return false 74 | } 75 | dataIterator.hasNext 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /nebula-spark-connector/src/main/scala/com/vesoft/nebula/connector/reader/NebulaPartition.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory.
5 | */ 6 | 7 | package com.vesoft.nebula.connector.reader 8 | 9 | import com.vesoft.nebula.connector.NebulaOptions 10 | import org.apache.spark.sql.catalyst.InternalRow 11 | import org.apache.spark.sql.sources.v2.reader.{InputPartition, InputPartitionReader} 12 | import org.apache.spark.sql.types.StructType 13 | 14 | class NebulaVertexPartition(index: Int, nebulaOptions: NebulaOptions, schema: StructType) 15 | extends InputPartition[InternalRow] { 16 | override def createPartitionReader(): InputPartitionReader[InternalRow] = 17 | new NebulaVertexPartitionReader(index, nebulaOptions, schema) 18 | } 19 | 20 | class NebulaEdgePartition(index: Int, nebulaOptions: NebulaOptions, schema: StructType) 21 | extends InputPartition[InternalRow] { 22 | override def createPartitionReader(): InputPartitionReader[InternalRow] = 23 | new NebulaEdgePartitionReader(index, nebulaOptions, schema) 24 | } 25 | -------------------------------------------------------------------------------- /nebula-spark-connector/src/main/scala/com/vesoft/nebula/connector/reader/NebulaPartitionReader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.connector.reader 8 | 9 | import com.vesoft.nebula.client.graph.data.{HostAddress, ValueWrapper} 10 | import com.vesoft.nebula.client.storage.StorageClient 11 | import com.vesoft.nebula.client.storage.data.{BaseTableRow, VertexTableRow} 12 | import com.vesoft.nebula.connector.NebulaUtils.NebulaValueGetter 13 | import com.vesoft.nebula.connector.exception.GraphConnectException 14 | import com.vesoft.nebula.connector.{NebulaOptions, NebulaUtils, PartitionUtils} 15 | import com.vesoft.nebula.connector.nebula.MetaProvider 16 | import org.apache.spark.sql.catalyst.InternalRow 17 | import org.apache.spark.sql.catalyst.expressions.SpecificInternalRow 18 | import org.apache.spark.sql.sources.v2.reader.InputPartitionReader 19 | import org.apache.spark.sql.types.StructType 20 | import org.slf4j.{Logger, LoggerFactory} 21 | 22 | import scala.collection.JavaConverters._ 23 | import scala.collection.mutable 24 | import scala.collection.mutable.ListBuffer 25 | 26 | /** 27 | * Read nebula data for each spark partition 28 | */ 29 | abstract class NebulaPartitionReader extends InputPartitionReader[InternalRow] { 30 | private val LOG: Logger = LoggerFactory.getLogger(this.getClass) 31 | 32 | private var metaProvider: MetaProvider = _ 33 | private var schema: StructType = _ 34 | 35 | protected var dataIterator: Iterator[BaseTableRow] = _ 36 | protected var scanPartIterator: Iterator[Integer] = _ 37 | protected var resultValues: mutable.ListBuffer[List[Object]] = mutable.ListBuffer[List[Object]]() 38 | protected var storageClient: StorageClient = _ 39 | 40 | /** 41 | * @param index identifier for spark partition 42 | * @param nebulaOptions nebula Options 43 | * @param schema of data need to read 44 | */ 45 | def this(index: Int, nebulaOptions: NebulaOptions, schema: StructType) { 46 | this() 47 | this.schema = schema 48 | 49 | metaProvider = new MetaProvider(nebulaOptions.getMetaAddress) 50 | val address: ListBuffer[HostAddress] = new ListBuffer[HostAddress] 51 | 52 | for (addr <- nebulaOptions.getMetaAddress) { 53 | address.append(new HostAddress(addr._1, addr._2)) 54 | } 55 | 56 | this.storageClient = new 
StorageClient(address.asJava) 57 | if (!storageClient.connect()) { 58 | throw new GraphConnectException("storage connect failed.") 59 | } 60 | // allocate scanPart to this partition 61 | val totalPart = metaProvider.getPartitionNumber(nebulaOptions.spaceName) 62 | 63 | val scanParts = PartitionUtils.getScanParts(index, totalPart, nebulaOptions.partitionNums.toInt) 64 | LOG.info(s"partition index: ${index}, scanParts: ${scanParts.toString}") 65 | scanPartIterator = scanParts.iterator 66 | } 67 | 68 | override def get(): InternalRow = { 69 | val resultSet: Array[ValueWrapper] = 70 | dataIterator.next().getValues.toArray.map(v => v.asInstanceOf[ValueWrapper]) 71 | val getters: Array[NebulaValueGetter] = NebulaUtils.makeGetters(schema) 72 | val mutableRow = new SpecificInternalRow(schema.fields.map(x => x.dataType)) 73 | 74 | for (i <- getters.indices) { 75 | val value: ValueWrapper = resultSet(i) 76 | var resolved = false 77 | if (value.isNull) { 78 | mutableRow.setNullAt(i) 79 | resolved = true 80 | } 81 | if (value.isString) { 82 | getters(i).apply(value.asString(), mutableRow, i) 83 | resolved = true 84 | } 85 | if (value.isDate) { 86 | getters(i).apply(value.asDate(), mutableRow, i) 87 | resolved = true 88 | } 89 | if (value.isTime) { 90 | getters(i).apply(value.asTime(), mutableRow, i) 91 | resolved = true 92 | } 93 | if (value.isDateTime) { 94 | getters(i).apply(value.asDateTime(), mutableRow, i) 95 | resolved = true 96 | } 97 | if (value.isLong) { 98 | getters(i).apply(value.asLong(), mutableRow, i) 99 | } 100 | if (value.isBoolean) { 101 | getters(i).apply(value.asBoolean(), mutableRow, i) 102 | } 103 | if (value.isDouble) { 104 | getters(i).apply(value.asDouble(), mutableRow, i) 105 | } 106 | } 107 | mutableRow 108 | } 109 | 110 | override def close(): Unit = { 111 | metaProvider.close() 112 | storageClient.close() 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /nebula-spark-connector/src/main/scala/com/vesoft/nebula/connector/reader/NebulaSourceReader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.connector.reader 8 | 9 | import java.util 10 | 11 | import com.vesoft.nebula.connector.{DataTypeEnum, NebulaOptions, NebulaUtils} 12 | import com.vesoft.nebula.connector.nebula.MetaProvider 13 | import com.vesoft.nebula.meta.ColumnDef 14 | import org.apache.spark.sql.catalyst.InternalRow 15 | import org.apache.spark.sql.sources.v2.reader.{DataSourceReader, InputPartition} 16 | import org.apache.spark.sql.types.{DataTypes, StructField, StructType} 17 | import org.slf4j.LoggerFactory 18 | 19 | import scala.collection.mutable.ListBuffer 20 | import scala.collection.JavaConverters._ 21 | 22 | /** 23 | * Base class of Nebula Source Reader 24 | */ 25 | abstract class NebulaSourceReader(nebulaOptions: NebulaOptions) extends DataSourceReader { 26 | private val LOG = LoggerFactory.getLogger(this.getClass) 27 | 28 | private var datasetSchema: StructType = _ 29 | 30 | override def readSchema(): StructType = { 31 | datasetSchema = getSchema(nebulaOptions) 32 | LOG.info(s"dataset's schema: $datasetSchema") 33 | datasetSchema 34 | } 35 | 36 | protected def getSchema: StructType = getSchema(nebulaOptions) 37 | 38 | /** 39 | * return the dataset's schema. 
Schema includes configured cols in returnCols or includes all properties in nebula. 40 | */ 41 | def getSchema(nebulaOptions: NebulaOptions): StructType = { 42 | val returnCols = nebulaOptions.getReturnCols 43 | val noColumn = nebulaOptions.noColumn 44 | val fields: ListBuffer[StructField] = new ListBuffer[StructField] 45 | val metaProvider = new MetaProvider(nebulaOptions.getMetaAddress) 46 | 47 | import scala.collection.JavaConverters._ 48 | var schemaCols: Seq[ColumnDef] = Seq() 49 | val isVertex = DataTypeEnum.VERTEX.toString.equalsIgnoreCase(nebulaOptions.dataType) 50 | 51 | // construct vertex or edge default prop 52 | if (isVertex) { 53 | fields.append(DataTypes.createStructField("_vertexId", DataTypes.StringType, false)) 54 | } else { 55 | fields.append(DataTypes.createStructField("_srcId", DataTypes.StringType, false)) 56 | fields.append(DataTypes.createStructField("_dstId", DataTypes.StringType, false)) 57 | fields.append(DataTypes.createStructField("_rank", DataTypes.LongType, false)) 58 | } 59 | 60 | var dataSchema: StructType = null 61 | // read no column 62 | if (noColumn) { 63 | dataSchema = new StructType(fields.toArray) 64 | return dataSchema 65 | } 66 | // get tag schema or edge schema 67 | val schema = if (isVertex) { 68 | metaProvider.getTag(nebulaOptions.spaceName, nebulaOptions.label) 69 | } else { 70 | metaProvider.getEdge(nebulaOptions.spaceName, nebulaOptions.label) 71 | } 72 | 73 | schemaCols = schema.columns.asScala 74 | 75 | // read all columns 76 | if (returnCols.isEmpty) { 77 | schemaCols.foreach(columnDef => { 78 | LOG.info(s"prop name ${new String(columnDef.getName)}, type ${columnDef.getType.getType} ") 79 | fields.append( 80 | DataTypes.createStructField(new String(columnDef.getName), 81 | NebulaUtils.convertDataType(columnDef.getType), 82 | true)) 83 | }) 84 | } else { 85 | for (col: String <- returnCols) { 86 | fields.append( 87 | DataTypes 88 | .createStructField(col, NebulaUtils.getColDataType(schemaCols.toList, col), true)) 89 | } 90 | } 91 | dataSchema = new StructType(fields.toArray) 92 | dataSchema 93 | } 94 | } 95 | 96 | /** 97 | * DataSourceReader for Nebula Vertex 98 | */ 99 | class NebulaDataSourceVertexReader(nebulaOptions: NebulaOptions) 100 | extends NebulaSourceReader(nebulaOptions) { 101 | 102 | override def planInputPartitions(): util.List[InputPartition[InternalRow]] = { 103 | val partitionNum = nebulaOptions.partitionNums.toInt 104 | val partitions = for (index <- 1 to partitionNum) 105 | yield { 106 | new NebulaVertexPartition(index, nebulaOptions, getSchema) 107 | } 108 | partitions.map(_.asInstanceOf[InputPartition[InternalRow]]).asJava 109 | } 110 | } 111 | 112 | /** 113 | * DataSourceReader for Nebula Edge 114 | */ 115 | class NebulaDataSourceEdgeReader(nebulaOptions: NebulaOptions) 116 | extends NebulaSourceReader(nebulaOptions) { 117 | 118 | override def planInputPartitions(): util.List[InputPartition[InternalRow]] = { 119 | val partitionNum = nebulaOptions.partitionNums.toInt 120 | val partitions = for (index <- 1 to partitionNum) 121 | yield new NebulaEdgePartition(index, nebulaOptions, getSchema) 122 | 123 | partitions.map(_.asInstanceOf[InputPartition[InternalRow]]).asJava 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /nebula-spark-connector/src/main/scala/com/vesoft/nebula/connector/reader/NebulaVertexPartitionReader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 
2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.connector.reader 8 | 9 | import com.vesoft.nebula.client.storage.scan.{ScanVertexResult, ScanVertexResultIterator} 10 | import com.vesoft.nebula.connector.NebulaOptions 11 | import org.apache.spark.sql.types.StructType 12 | import org.slf4j.{Logger, LoggerFactory} 13 | 14 | import scala.collection.JavaConverters._ 15 | 16 | class NebulaVertexPartitionReader(index: Int, nebulaOptions: NebulaOptions, schema: StructType) 17 | extends NebulaPartitionReader(index, nebulaOptions, schema) { 18 | 19 | private val LOG: Logger = LoggerFactory.getLogger(this.getClass) 20 | 21 | private var responseIterator: ScanVertexResultIterator = _ 22 | 23 | override def next(): Boolean = { 24 | if (dataIterator == null && responseIterator == null && !scanPartIterator.hasNext) 25 | return false 26 | 27 | var continue: Boolean = false 28 | var break: Boolean = false 29 | while ((dataIterator == null || !dataIterator.hasNext) && !break) { 30 | resultValues.clear() 31 | continue = false 32 | if (responseIterator == null || !responseIterator.hasNext) { 33 | if (scanPartIterator.hasNext) { 34 | try { 35 | if (nebulaOptions.noColumn) { 36 | responseIterator = storageClient.scanVertex(nebulaOptions.spaceName, 37 | scanPartIterator.next(), 38 | nebulaOptions.label, 39 | nebulaOptions.limit, 40 | 0, 41 | Long.MaxValue, 42 | true, 43 | true) 44 | } else { 45 | responseIterator = storageClient.scanVertex(nebulaOptions.spaceName, 46 | scanPartIterator.next(), 47 | nebulaOptions.label, 48 | nebulaOptions.getReturnCols.asJava, 49 | nebulaOptions.limit, 50 | 0, 51 | Long.MaxValue, 52 | true, 53 | true) 54 | } 55 | } catch { 56 | case e: Exception => 57 | LOG.error(s"Exception scanning vertex ${nebulaOptions.label}", e) 58 | storageClient.close() 59 | throw new Exception(e.getMessage, e) 60 | } 61 | // jump to the next loop 62 | continue = true 63 | } 64 | // break while loop 65 | break = !continue 66 | } else { 67 | val next: ScanVertexResult = responseIterator.next 68 | if (!next.isEmpty) { 69 | dataIterator = next.getVertexTableRows.iterator().asScala 70 | } 71 | } 72 | } 73 | 74 | if (dataIterator == null) { 75 | return false 76 | } 77 | dataIterator.hasNext 78 | } 79 | 80 | } 81 | -------------------------------------------------------------------------------- /nebula-spark-connector/src/main/scala/com/vesoft/nebula/connector/writer/NebulaCommitMessage.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.connector.writer 8 | 9 | import org.apache.spark.sql.sources.v2.writer.WriterCommitMessage 10 | 11 | case class NebulaCommitMessage(executeStatements: List[String]) extends WriterCommitMessage 12 | -------------------------------------------------------------------------------- /nebula-spark-connector/src/main/scala/com/vesoft/nebula/connector/writer/NebulaEdgeWriter.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 
5 | */ 6 | 7 | package com.vesoft.nebula.connector.writer 8 | 9 | import com.vesoft.nebula.connector.connector.{NebulaEdge, NebulaEdges} 10 | import com.vesoft.nebula.connector.{KeyPolicy, NebulaOptions, WriteMode} 11 | import org.apache.spark.sql.catalyst.InternalRow 12 | import org.apache.spark.sql.sources.v2.writer.{DataWriter, WriterCommitMessage} 13 | import org.apache.spark.sql.types.StructType 14 | import org.slf4j.LoggerFactory 15 | 16 | import scala.collection.mutable.ListBuffer 17 | 18 | class NebulaEdgeWriter(nebulaOptions: NebulaOptions, 19 | srcIndex: Int, 20 | dstIndex: Int, 21 | rankIndex: Option[Int], 22 | schema: StructType) 23 | extends NebulaWriter(nebulaOptions) 24 | with DataWriter[InternalRow] { 25 | 26 | private val LOG = LoggerFactory.getLogger(this.getClass) 27 | 28 | val rankIdx = if (rankIndex.isDefined) rankIndex.get else -1 29 | val propNames = NebulaExecutor.assignEdgePropNames(schema, 30 | srcIndex, 31 | dstIndex, 32 | rankIdx, 33 | nebulaOptions.srcAsProp, 34 | nebulaOptions.dstAsProp, 35 | nebulaOptions.rankAsProp) 36 | val fieldTypMap: Map[String, Integer] = 37 | if (nebulaOptions.writeMode == WriteMode.DELETE) Map[String, Integer]() 38 | else metaProvider.getEdgeSchema(nebulaOptions.spaceName, nebulaOptions.label) 39 | 40 | val srcPolicy = 41 | if (nebulaOptions.srcPolicy.isEmpty) Option.empty 42 | else Option(KeyPolicy.withName(nebulaOptions.srcPolicy)) 43 | val dstPolicy = { 44 | if (nebulaOptions.dstPolicy.isEmpty) Option.empty 45 | else Option(KeyPolicy.withName(nebulaOptions.dstPolicy)) 46 | } 47 | 48 | /** buffer to save batch edges */ 49 | var edges: ListBuffer[NebulaEdge] = new ListBuffer() 50 | 51 | prepareSpace() 52 | 53 | /** 54 | * write one edge record to buffer 55 | */ 56 | override def write(row: InternalRow): Unit = { 57 | val srcId = NebulaExecutor.extraID(schema, row, srcIndex, srcPolicy, isVidStringType) 58 | val dstId = NebulaExecutor.extraID(schema, row, dstIndex, dstPolicy, isVidStringType) 59 | val rank = 60 | if (rankIndex.isEmpty) Option.empty 61 | else Option(NebulaExecutor.extraRank(schema, row, rankIndex.get)) 62 | val values = 63 | if (nebulaOptions.writeMode == WriteMode.DELETE) List() 64 | else 65 | NebulaExecutor.assignEdgeValues(schema, 66 | row, 67 | srcIndex, 68 | dstIndex, 69 | rankIdx, 70 | nebulaOptions.srcAsProp, 71 | nebulaOptions.dstAsProp, 72 | nebulaOptions.rankAsProp, 73 | fieldTypMap) 74 | val nebulaEdge = NebulaEdge(srcId, dstId, rank, values) 75 | edges.append(nebulaEdge) 76 | if (edges.size >= nebulaOptions.batch) { 77 | execute() 78 | } 79 | } 80 | 81 | def execute(): Unit = { 82 | val nebulaEdges = NebulaEdges(propNames, edges.toList, srcPolicy, dstPolicy) 83 | val exec = nebulaOptions.writeMode match { 84 | case WriteMode.INSERT => NebulaExecutor.toExecuteSentence(nebulaOptions.label, nebulaEdges) 85 | case WriteMode.UPDATE => 86 | NebulaExecutor.toUpdateExecuteStatement(nebulaOptions.label, nebulaEdges) 87 | case WriteMode.DELETE => 88 | NebulaExecutor.toDeleteExecuteStatement(nebulaOptions.label, nebulaEdges) 89 | case _ => 90 | throw new IllegalArgumentException(s"write mode ${nebulaOptions.writeMode} not supported.") 91 | } 92 | edges.clear() 93 | submit(exec) 94 | } 95 | 96 | override def commit(): WriterCommitMessage = { 97 | if (edges.nonEmpty) { 98 | execute() 99 | } 100 | graphProvider.close() 101 | NebulaCommitMessage.apply(failedExecs.toList) 102 | } 103 | 104 | override def abort(): Unit = { 105 | LOG.error("insert edge task abort.") 106 | graphProvider.close() 107 | } 108 | } 109 | 
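NebulaEdgeWriter buffers NebulaEdge values and, once nebulaOptions.batch rows are collected, execute() renders them into a single nGQL statement built from the templates in NebulaTemplate. With WriteMode.INSERT that is BATCH_INSERT_TEMPLATE combined with EDGE_VALUE_TEMPLATE, yielding roughly: INSERT EDGE `friend`(degree) VALUES "a"->"b"@0: (95), "c"->"d"@0: (80) for a string-vid space. Below is a minimal write-side sketch of driving this writer through the DataFrame API, assuming the WriteNebulaEdgeConfig builder and the implicit write.nebula(...).writeEdges() entry point exposed by the connector's package object (names taken from the connector's README and example module, not from the files shown here); addresses, space, edge type, and column names are placeholders.

import org.apache.spark.sql.DataFrame
import com.vesoft.nebula.connector.{NebulaConnectionConfig, WriteNebulaEdgeConfig}
import com.vesoft.nebula.connector.connector.NebulaDataFrameWriter

object EdgeWriteSketch {
  // Writes a DataFrame with columns (src, dst, degree) as "friend" edges.
  def writeFriendEdges(df: DataFrame): Unit = {
    val connectionConfig = NebulaConnectionConfig
      .builder()
      .withMetaAddress("127.0.0.1:9559")   // placeholder meta address
      .withGraphAddress("127.0.0.1:9669")  // placeholder graph address
      .build()

    val writeEdgeConfig = WriteNebulaEdgeConfig
      .builder()
      .withSpace("test")        // target graph space
      .withEdge("friend")       // edge type
      .withSrcIdField("src")    // column holding the source vertex id
      .withDstIdField("dst")    // column holding the destination vertex id
      .withRankField("degree")  // optional rank column
      .withBatch(512)           // rows buffered before NebulaEdgeWriter.execute() flushes
      .build()

    // The implicit NebulaDataFrameWriter adds nebula(...) to df.write.
    df.write.nebula(connectionConfig, writeEdgeConfig).writeEdges()
  }
}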
-------------------------------------------------------------------------------- /nebula-spark-connector/src/main/scala/com/vesoft/nebula/connector/writer/NebulaSourceWriter.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.connector.writer 8 | 9 | import com.vesoft.nebula.connector.NebulaOptions 10 | import org.apache.spark.sql.catalyst.InternalRow 11 | import org.apache.spark.sql.sources.v2.writer.{ 12 | DataSourceWriter, 13 | DataWriter, 14 | DataWriterFactory, 15 | WriterCommitMessage 16 | } 17 | import org.apache.spark.sql.types.StructType 18 | import org.slf4j.LoggerFactory 19 | 20 | /** 21 | * creating and initializing the actual Nebula vertex writer at executor side 22 | */ 23 | class NebulaVertexWriterFactory(nebulaOptions: NebulaOptions, vertexIndex: Int, schema: StructType) 24 | extends DataWriterFactory[InternalRow] { 25 | override def createDataWriter(partitionId: Int, 26 | taskId: Long, 27 | epochId: Long): DataWriter[InternalRow] = { 28 | new NebulaVertexWriter(nebulaOptions, vertexIndex, schema) 29 | } 30 | } 31 | 32 | /** 33 | * creating and initializing the actual Nebula edge writer at executor side 34 | */ 35 | class NebulaEdgeWriterFactory(nebulaOptions: NebulaOptions, 36 | srcIndex: Int, 37 | dstIndex: Int, 38 | rankIndex: Option[Int], 39 | schema: StructType) 40 | extends DataWriterFactory[InternalRow] { 41 | override def createDataWriter(partitionId: Int, 42 | taskId: Long, 43 | epochId: Long): DataWriter[InternalRow] = { 44 | new NebulaEdgeWriter(nebulaOptions, srcIndex, dstIndex, rankIndex, schema) 45 | } 46 | } 47 | 48 | /** 49 | * nebula vertex writer to create factory 50 | */ 51 | class NebulaDataSourceVertexWriter(nebulaOptions: NebulaOptions, 52 | vertexIndex: Int, 53 | schema: StructType) 54 | extends DataSourceWriter { 55 | private val LOG = LoggerFactory.getLogger(this.getClass) 56 | 57 | override def createWriterFactory(): DataWriterFactory[InternalRow] = { 58 | new NebulaVertexWriterFactory(nebulaOptions, vertexIndex, schema) 59 | } 60 | 61 | override def commit(messages: Array[WriterCommitMessage]): Unit = { 62 | LOG.debug(s"${messages.length}") 63 | for (msg <- messages) { 64 | val nebulaMsg = msg.asInstanceOf[NebulaCommitMessage] 65 | LOG.info(s"failed execs:\n ${nebulaMsg.executeStatements.toString()}") 66 | } 67 | } 68 | 69 | override def abort(messages: Array[WriterCommitMessage]): Unit = { 70 | LOG.error("NebulaDataSourceVertexWriter abort") 71 | } 72 | } 73 | 74 | /** 75 | * nebula edge writer to create factory 76 | */ 77 | class NebulaDataSourceEdgeWriter(nebulaOptions: NebulaOptions, 78 | srcIndex: Int, 79 | dstIndex: Int, 80 | rankIndex: Option[Int], 81 | schema: StructType) 82 | extends DataSourceWriter { 83 | private val LOG = LoggerFactory.getLogger(this.getClass) 84 | 85 | override def createWriterFactory(): DataWriterFactory[InternalRow] = { 86 | new NebulaEdgeWriterFactory(nebulaOptions, srcIndex, dstIndex, rankIndex, schema) 87 | } 88 | 89 | override def commit(messages: Array[WriterCommitMessage]): Unit = { 90 | LOG.debug(s"${messages.length}") 91 | for (msg <- messages) { 92 | val nebulaMsg = msg.asInstanceOf[NebulaCommitMessage] 93 | LOG.info(s"failed execs:\n ${nebulaMsg.executeStatements.toString()}") 94 | } 95 | 96 | } 97 | 98 | override def 
abort(messages: Array[WriterCommitMessage]): Unit = { 99 | LOG.error("NebulaDataSourceEdgeWriter abort") 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /nebula-spark-connector/src/main/scala/com/vesoft/nebula/connector/writer/NebulaVertexWriter.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.connector.writer 8 | 9 | import com.vesoft.nebula.connector.connector.{NebulaVertex, NebulaVertices} 10 | import com.vesoft.nebula.connector.{KeyPolicy, NebulaOptions, WriteMode} 11 | import org.apache.spark.sql.catalyst.InternalRow 12 | import org.apache.spark.sql.sources.v2.writer.{DataWriter, WriterCommitMessage} 13 | import org.apache.spark.sql.types.StructType 14 | import org.slf4j.LoggerFactory 15 | 16 | import scala.collection.mutable.ListBuffer 17 | 18 | class NebulaVertexWriter(nebulaOptions: NebulaOptions, vertexIndex: Int, schema: StructType) 19 | extends NebulaWriter(nebulaOptions) 20 | with DataWriter[InternalRow] { 21 | 22 | private val LOG = LoggerFactory.getLogger(this.getClass) 23 | 24 | val propNames = NebulaExecutor.assignVertexPropNames(schema, vertexIndex, nebulaOptions.vidAsProp) 25 | val fieldTypMap: Map[String, Integer] = 26 | if (nebulaOptions.writeMode == WriteMode.DELETE) Map[String, Integer]() 27 | else metaProvider.getTagSchema(nebulaOptions.spaceName, nebulaOptions.label) 28 | 29 | val policy = { 30 | if (nebulaOptions.vidPolicy.isEmpty) Option.empty 31 | else Option(KeyPolicy.withName(nebulaOptions.vidPolicy)) 32 | } 33 | 34 | /** buffer to save batch vertices */ 35 | var vertices: ListBuffer[NebulaVertex] = new ListBuffer() 36 | 37 | prepareSpace() 38 | 39 | /** 40 | * write one vertex row to buffer 41 | */ 42 | override def write(row: InternalRow): Unit = { 43 | val vertex = 44 | NebulaExecutor.extraID(schema, row, vertexIndex, policy, isVidStringType) 45 | val values = 46 | if (nebulaOptions.writeMode == WriteMode.DELETE) List() 47 | else 48 | NebulaExecutor.assignVertexPropValues(schema, 49 | row, 50 | vertexIndex, 51 | nebulaOptions.vidAsProp, 52 | fieldTypMap) 53 | val nebulaVertex = NebulaVertex(vertex, values) 54 | vertices.append(nebulaVertex) 55 | if (vertices.size >= nebulaOptions.batch) { 56 | execute() 57 | } 58 | } 59 | 60 | def execute(): Unit = { 61 | val nebulaVertices = NebulaVertices(propNames, vertices.toList, policy) 62 | val exec = nebulaOptions.writeMode match { 63 | case WriteMode.INSERT => NebulaExecutor.toExecuteSentence(nebulaOptions.label, nebulaVertices) 64 | case WriteMode.UPDATE => 65 | NebulaExecutor.toUpdateExecuteStatement(nebulaOptions.label, nebulaVertices) 66 | case WriteMode.DELETE => NebulaExecutor.toDeleteExecuteStatement(nebulaVertices) 67 | case _ => 68 | throw new IllegalArgumentException(s"write mode ${nebulaOptions.writeMode} not supported.") 69 | } 70 | vertices.clear() 71 | submit(exec) 72 | } 73 | 74 | override def commit(): WriterCommitMessage = { 75 | if (vertices.nonEmpty) { 76 | execute() 77 | } 78 | graphProvider.close() 79 | NebulaCommitMessage(failedExecs.toList) 80 | } 81 | 82 | override def abort(): Unit = { 83 | LOG.error("insert vertex task abort.") 84 | graphProvider.close() 85 | } 86 | } 87 | -------------------------------------------------------------------------------- 
/nebula-spark-connector/src/main/scala/com/vesoft/nebula/connector/writer/NebulaWriter.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory. 5 | */ 6 | 7 | package com.vesoft.nebula.connector.writer 8 | 9 | import java.util.concurrent.TimeUnit 10 | 11 | import com.google.common.util.concurrent.RateLimiter 12 | import com.vesoft.nebula.connector.NebulaOptions 13 | import com.vesoft.nebula.connector.nebula.{GraphProvider, MetaProvider, VidType} 14 | import org.slf4j.LoggerFactory 15 | 16 | import scala.collection.mutable.ListBuffer 17 | 18 | class NebulaWriter(nebulaOptions: NebulaOptions) extends Serializable { 19 | private val LOG = LoggerFactory.getLogger(this.getClass) 20 | 21 | val failedExecs: ListBuffer[String] = new ListBuffer[String] 22 | 23 | val graphProvider = new GraphProvider(nebulaOptions.getGraphAddress) 24 | val metaProvider = new MetaProvider(nebulaOptions.getMetaAddress) 25 | val isVidStringType = metaProvider.getVidType(nebulaOptions.spaceName) == VidType.STRING 26 | 27 | def prepareSpace(): Unit = { 28 | graphProvider.switchSpace(nebulaOptions.user, nebulaOptions.passwd, nebulaOptions.spaceName) 29 | } 30 | 31 | def submit(exec: String): Unit = { 32 | @transient val rateLimiter = RateLimiter.create(nebulaOptions.rateLimit) 33 | if (rateLimiter.tryAcquire(nebulaOptions.rateTimeOut, TimeUnit.MILLISECONDS)) { 34 | val result = graphProvider.submit(exec) 35 | if (!result.isSucceeded) { 36 | failedExecs.append(exec) 37 | LOG.error(s"failed to write ${exec} for " + result.getErrorMessage) 38 | } else { 39 | LOG.info(s"batch write succeeded") 40 | LOG.debug(s"batch write succeeded: ${exec}") 41 | } 42 | } else { 43 | failedExecs.append(exec) 44 | LOG.error(s"failed to acquire rateLimiter for statement ${exec}") 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /nebula-spark-connector/src/test/scala/com/vesoft/nebula/connector/NebulaConfigSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License, 4 | * attached with Common Clause Condition 1.0, found in the LICENSES directory.
5 | */ 6 | 7 | package com.vesoft.nebula.connector 8 | 9 | import org.scalatest.BeforeAndAfterAll 10 | import org.scalatest.funsuite.AnyFunSuite 11 | 12 | class NebulaConfigSuite extends AnyFunSuite with BeforeAndAfterAll { 13 | 14 | test("test NebulaConnectionConfig") { 15 | try { 16 | NebulaConnectionConfig.builder().withTimeout(1).build() 17 | } catch { 18 | case e: java.lang.AssertionError => assert(true) 19 | } 20 | 21 | try { 22 | NebulaConnectionConfig.builder().withTimeout(-1).build() 23 | } catch { 24 | case e: java.lang.AssertionError => assert(true) 25 | } 26 | 27 | try { 28 | NebulaConnectionConfig 29 | .builder() 30 | .withMetaAddress("127.0.0.1:9559") 31 | .withTimeout(1) 32 | .build() 33 | assert(true) 34 | } catch { 35 | case _: Throwable => assert(false) 36 | } 37 | } 38 | 39 | test("test WriteNebulaConfig") { 40 | var writeNebulaConfig: WriteNebulaVertexConfig = null 41 | try { 42 | writeNebulaConfig = WriteNebulaVertexConfig 43 | .builder() 44 | .withSpace("test") 45 | .withTag("tag") 46 | .withVidField("vid") 47 | .build() 48 | } catch { 49 | case e: Throwable => assert(false) 50 | } 51 | assert(true) 52 | assert(!writeNebulaConfig.getVidAsProp) 53 | assert(writeNebulaConfig.getSpace.equals("test")) 54 | } 55 | 56 | test("test wrong policy") { 57 | try { 58 | WriteNebulaVertexConfig 59 | .builder() 60 | .withSpace("test") 61 | .withTag("tag") 62 | .withVidField("vId") 63 | .withVidPolicy("wrong_policy") 64 | .build() 65 | } catch { 66 | case e: java.lang.AssertionError => assert(true) 67 | } 68 | } 69 | 70 | test("test wrong batch") { 71 | try { 72 | WriteNebulaVertexConfig 73 | .builder() 74 | .withSpace("test") 75 | .withTag("tag") 76 | .withVidField("vId") 77 | .withVidPolicy("hash") 78 | .withBatch(-1) 79 | .build() 80 | } catch { 81 | case e: java.lang.AssertionError => assert(true) 82 | } 83 | } 84 | 85 | test("test ReadNebulaConfig") { 86 | try { 87 | ReadNebulaConfig 88 | .builder() 89 | .withSpace("test") 90 | .withLabel("tagName") 91 | .withNoColumn(true) 92 | .withReturnCols(List("col")) 93 | .build() 94 | } catch { 95 | case e: java.lang.AssertionError => assert(false) 96 | } 97 | } 98 | 99 | } 100 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.vesoft 8 | nebula-spark 9 | pom 10 | 2.5-SNAPSHOT 11 | 12 | 13 | UTF-8 14 | 15 | 16 | 17 | nebula-spark 18 | Nebula Spark Utils 19 | https://github.com/vesoft-inc/nebula-spark-utils 20 | 21 | scm:git:https://github.com/vesoft-inc/nebula 22 | https://github.com/vesoft-inc/nebula 23 | scm:git:https://github.com/vesoft-inc/nebula 24 | 25 | 26 | 27 | Apache License, Version 2.0 28 | https://www.apache.org/licenses/LICENSE-2.0.txt 29 | repo 30 | license 31 | 32 | 33 | 34 | 35 | 36 | nebula 37 | Nebula Graph 38 | nebula-spark-utils@vesoft-inc.com 39 | vesoft 40 | 41 | architect 42 | developer 43 | 44 | 45 | 46 | 47 | 48 | nebula-exchange 49 | nebula-spark-connector 50 | nebula-algorithm 51 | example 52 | 53 | 54 | 55 | 56 | release 57 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 58 | 59 | 60 | snapshots 61 | https://oss.sonatype.org/content/repositories/snapshots/ 62 | 63 | 64 | 65 | 66 | 67 | 68 | org.apache.maven.plugins 69 | maven-gpg-plugin 70 | 1.6 71 | 72 | 73 | verify 74 | 75 | sign 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | --------------------------------------------------------------------------------
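To close, a minimal read-side sketch that pairs with the ReadNebulaConfig builder exercised in NebulaConfigSuite, assuming the implicit read.nebula(...).loadVerticesToDF() entry point from the connector's package object and the withPartitionNum builder option as documented in the connector's README (neither is reproduced in the files shown here); the address, space, tag, and column names are placeholders.

import org.apache.spark.sql.{DataFrame, SparkSession}
import com.vesoft.nebula.connector.{NebulaConnectionConfig, ReadNebulaConfig}
import com.vesoft.nebula.connector.connector.NebulaDataFrameReader

object VertexReadSketch {
  // Scans the "person" tag from space "test" into a DataFrame.
  def readPersons(spark: SparkSession): DataFrame = {
    val connectionConfig = NebulaConnectionConfig
      .builder()
      .withMetaAddress("127.0.0.1:9559")  // placeholder meta address
      .build()

    val readConfig = ReadNebulaConfig
      .builder()
      .withSpace("test")             // graph space to read from
      .withLabel("person")           // tag (or edge type) to scan
      .withNoColumn(false)           // false: also read property columns
      .withReturnCols(List("name"))  // an empty list would read all properties
      .withPartitionNum(10)          // Spark partitions used for the scan (assumed builder option)
      .build()

    spark.read.nebula(connectionConfig, readConfig).loadVerticesToDF()
  }
}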