├── .github └── workflows │ ├── ISSUE_TEMPLATE.md │ ├── PULL_REQUEST_TEMPLATE.md │ ├── check_label.yml │ ├── pull_request.yml │ ├── release.yml │ └── snapshot.yml ├── .gitignore ├── .scalafmt.conf ├── .travis.yml ├── LICENSE ├── LICENSES └── Apache-2.0.txt ├── README-CN.md ├── README.md ├── bench ├── EXCHANGE_CONFIG │ ├── app_sf1.conf │ ├── app_sf100.conf │ ├── app_sf1000_sst_without_header.conf │ └── app_sf30.conf ├── NEBULA_DDL │ ├── SPACE_SF1 │ ├── SPACE_SF100 │ └── SPACE_SF30 └── exchange-test.md ├── codecov.yml ├── conf-template ├── client_import │ ├── bigquery_datasource.conf │ ├── csv_datasource.conf │ └── hive_datasource.conf └── sst_import │ ├── csv_datasource.conf │ └── hive_datasource.conf ├── exchange-common ├── pom.xml └── src │ ├── main │ ├── java │ │ └── com │ │ │ └── vesoft │ │ │ └── exchange │ │ │ └── common │ │ │ └── FileMigrate.java │ ├── resources │ │ ├── config_template │ │ │ ├── csv.conf │ │ │ ├── hbase.conf │ │ │ ├── hive.conf │ │ │ ├── jdbc.conf │ │ │ ├── json.conf │ │ │ ├── kafka.conf │ │ │ ├── neo4j.conf │ │ │ ├── orc.conf │ │ │ └── parquet.conf │ │ └── log4j.properties │ └── scala │ │ └── com │ │ └── vesoft │ │ └── exchange │ │ └── common │ │ ├── CheckPointHandler.scala │ │ ├── ErrorHandler.scala │ │ ├── GenerateConfigTemplate.scala │ │ ├── GraphProvider.scala │ │ ├── MetaProvider.scala │ │ ├── Package.scala │ │ ├── PasswordEncryption.scala │ │ ├── config │ │ ├── Configs.scala │ │ ├── SchemaConfigs.scala │ │ ├── SinkConfigs.scala │ │ └── SourceConfigs.scala │ │ ├── processor │ │ ├── Processor.scala │ │ └── ReloadProcessor.scala │ │ ├── utils │ │ ├── ConfigTemplateUtils.scala │ │ ├── HDFSUtils.scala │ │ ├── NebulaPartitioner.scala │ │ ├── NebulaUtils.scala │ │ └── SparkValidate.scala │ │ └── writer │ │ ├── FileBaseWriter.scala │ │ ├── ServerBaseWriter.scala │ │ └── Writer.scala │ └── test │ ├── resources │ ├── application.conf │ ├── docker-compose.yaml │ ├── edge.csv │ ├── process_application.conf │ └── vertex.csv │ └── scala │ └── com │ └── vesoft │ └── exchange │ └── common │ ├── GraphProviderSuite.scala │ ├── MetaProviderSuite.scala │ ├── NebulaGraphMock.scala │ ├── config │ └── ConfigsSuite.scala │ ├── processor │ └── ProcessorSuite.scala │ ├── utils │ ├── NebulaUtilsSuite.scala │ └── SparkValidateSuite.scala │ └── writer │ ├── FileBaseWriterSuite.scala │ └── ServerBaseWriterSuite.scala ├── nebula-exchange_spark_2.2 ├── pom.xml └── src │ ├── main │ ├── resources │ │ └── log4j.properties │ └── scala │ │ └── com │ │ └── vesoft │ │ └── nebula │ │ └── exchange │ │ ├── Exchange.scala │ │ ├── processor │ │ ├── EdgeProcessor.scala │ │ └── VerticesProcessor.scala │ │ └── reader │ │ ├── FileBaseReader.scala │ │ ├── Reader.scala │ │ ├── ServerBaseReader.scala │ │ └── StreamingBaseReader.scala │ └── test │ └── scala │ └── com │ └── vesoft │ └── nebula │ └── exchange │ └── processor │ ├── EdgeProcessorSuite.scala │ └── VerticesProcessorSuite.scala ├── nebula-exchange_spark_2.4 ├── .gitignore ├── pom.xml └── src │ ├── main │ ├── resources │ │ ├── application.conf │ │ ├── application_encrypt_password.conf │ │ └── log4j.properties │ └── scala │ │ └── com │ │ └── vesoft │ │ └── nebula │ │ └── exchange │ │ ├── Exchange.scala │ │ ├── processor │ │ ├── EdgeProcessor.scala │ │ └── VerticesProcessor.scala │ │ ├── reader │ │ ├── FileBaseReader.scala │ │ ├── Reader.scala │ │ ├── ServerBaseReader.scala │ │ └── StreamingBaseReader.scala │ │ └── utils │ │ └── Neo4jUtils.scala │ └── test │ ├── resources │ ├── application.conf │ ├── docker-compose.yaml │ ├── edge.csv │ ├── 
process_application.conf │ └── vertex.csv │ └── scala │ └── com │ └── vesoft │ └── nebula │ └── exchange │ └── processor │ ├── EdgeProcessorSuite.scala │ └── VerticesProcessorSuite.scala ├── nebula-exchange_spark_3.0 ├── pom.xml └── src │ ├── main │ ├── resources │ │ └── log4j.properties │ └── scala │ │ └── com │ │ └── vesoft │ │ └── nebula │ │ └── exchange │ │ ├── Exchange.scala │ │ ├── processor │ │ ├── EdgeProcessor.scala │ │ └── VerticesProcessor.scala │ │ └── reader │ │ ├── FileBaseReader.scala │ │ ├── Reader.scala │ │ ├── ServerBaseReader.scala │ │ └── StreamingBaseReader.scala │ └── test │ └── scala │ └── com │ └── vesoft │ └── nebula │ └── exchange │ └── processor │ ├── EdgeProcessorSuite.scala │ └── VerticesProcessorSuite.scala └── pom.xml /.github/workflows/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | #### Expected behavior 2 | 3 | #### Actual behavior 4 | 5 | #### Steps to reproduce 6 | 7 | #### JVM version (e.g. `java -version`) 8 | 9 | #### Scala version (e.g. `scala -version`) 10 | 11 | #### OS version (e.g. `uname -a`) 12 | -------------------------------------------------------------------------------- /.github/workflows/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Motivation: 2 | 3 | Why you're making that change and what is the problem you're trying to solve. 4 | 5 | Modification: 6 | 7 | Describe the modifications you've done. 8 | 9 | Result: 10 | 11 | Fixes #. 12 | -------------------------------------------------------------------------------- /.github/workflows/check_label.yml: -------------------------------------------------------------------------------- 1 | name: Auto label 2 | 3 | on: 4 | issues: 5 | types: 6 | - reopened 7 | - opened 8 | - labeled 9 | - unlabeled 10 | - closed 11 | 12 | env: 13 | GH_PAT: ${{ secrets.GITHUB_TOKEN }} 14 | EVENT: ${{ toJSON(github.event)}} 15 | EVENT_NAME: ${{ github.event_name}} 16 | 17 | jobs: 18 | sync: 19 | name: auto label 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: HarrisChu/auto_label@v1 23 | -------------------------------------------------------------------------------- /.github/workflows/pull_request.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build a Java project with Maven 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-maven 3 | 4 | name: pull_request 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: 11 | - master 12 | - 'v[0-9]+.*' 13 | 14 | jobs: 15 | build: 16 | 17 | runs-on: ubuntu-latest 18 | 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Set up JDK 1.8 22 | uses: actions/setup-java@v1 23 | with: 24 | java-version: 1.8 25 | 26 | - name: Cache the Maven packages to speed up build 27 | uses: actions/cache@v2 28 | with: 29 | path: ~/.m2/repository 30 | key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} 31 | restore-keys: ${{ runner.os }}-maven- 32 | 33 | - name: Install nebula-graph 34 | run: | 35 | mkdir tmp 36 | pushd tmp 37 | git clone https://github.com/vesoft-inc/nebula-docker-compose.git 38 | pushd nebula-docker-compose/ 39 | cp ../../exchange-common/src/test/resources/docker-compose.yaml . 
40 | docker-compose up -d 41 | sleep 10 42 | popd 43 | popd 44 | 45 | - name: Build with Maven 46 | run: | 47 | mvn clean package -pl nebula-exchange_spark_2.2 -am -Pscala-2.11 -Pspark-2.2 48 | mvn clean package -pl nebula-exchange_spark_2.4 -am -Pscala-2.11 -Pspark-2.4 49 | mvn clean package -pl nebula-exchange_spark_3.0 -am -Pscala-2.12 -Pspark-3.0 50 | 51 | - uses: codecov/codecov-action@v2 52 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build a Java project with Maven 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-maven 3 | 4 | name: release 5 | 6 | on: 7 | release: 8 | types: published 9 | 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Set up JDK 1.8 17 | uses: actions/setup-java@v1 18 | with: 19 | java-version: 1.8 20 | 21 | - name: Cache the Maven packages to speed up build 22 | uses: actions/cache@v2 23 | with: 24 | path: ~/.m2/repository 25 | key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} 26 | restore-keys: ${{ runner.os }}-maven- 27 | 28 | - name: Install nebula-graph 29 | run: | 30 | mkdir tmp 31 | pushd tmp 32 | git clone https://github.com/vesoft-inc/nebula-docker-compose.git 33 | pushd nebula-docker-compose/ 34 | cp ../../exchange-common/src/test/resources/docker-compose.yaml . 35 | docker-compose up -d 36 | sleep 10 37 | popd 38 | popd 39 | 40 | - name: Build with Maven 41 | run: | 42 | mvn clean package -pl nebula-exchange_spark_2.2 -am -Pscala-2.11 -Pspark-2.2 43 | mvn clean package -pl nebula-exchange_spark_2.4 -am -Pscala-2.11 -Pspark-2.4 44 | mvn clean package -pl nebula-exchange_spark_3.0 -am -Pscala-2.12 -Pspark-3.0 45 | 46 | - name: Get the version 47 | id: get_version 48 | run: | 49 | tag=$(echo ${{ github.ref }} | rev | cut -d/ -f1 | rev) 50 | tagnum=$(echo $tag | sed 's/^v//') 51 | echo "::set-output name=tag::$tag" 52 | echo "::set-output name=tagnum::$tagnum" 53 | shell: bash 54 | 55 | - name: upload to release assets 56 | uses: softprops/action-gh-release@v1 57 | env: 58 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 59 | with: 60 | files: | 61 | nebula-exchange_spark_2.2/target/nebula-exchange_spark_2.2-${{ steps.get_version.outputs.tagnum }}.jar 62 | nebula-exchange_spark_2.4/target/nebula-exchange_spark_2.4-${{ steps.get_version.outputs.tagnum }}.jar 63 | nebula-exchange_spark_3.0/target/nebula-exchange_spark_3.0-${{ steps.get_version.outputs.tagnum }}.jar 64 | 65 | - name: upload to oss 66 | run: | 67 | wget http://gosspublic.alicdn.com/ossutil/1.7.8/ossutil64 68 | chmod 755 ossutil64 69 | ./ossutil64 -e ${{ secrets.OSS_ENDPOINT }} \ 70 | -i ${{ secrets.OSS_ID }} \ 71 | -k ${{ secrets.OSS_SECRET }} \ 72 | -f cp nebula-exchange_spark_2.2/target/nebula-exchange_spark_2.2-${{ steps.get_version.outputs.tagnum}}.jar oss://nebula-graph/maven2/nebula-exchange/${{ steps.get_version.outputs.tagnum }}/ 73 | ./ossutil64 -e ${{ secrets.OSS_ENDPOINT }} \ 74 | -i ${{ secrets.OSS_ID }} \ 75 | -k ${{ secrets.OSS_SECRET }} \ 76 | -f cp nebula-exchange_spark_2.4/target/nebula-exchange_spark_2.4-${{ steps.get_version.outputs.tagnum }}.jar oss://nebula-graph/maven2/nebula-exchange/${{ steps.get_version.outputs.tagnum }}/ 77 | ./ossutil64 -e ${{ secrets.OSS_ENDPOINT }} \ 78 | -i ${{ secrets.OSS_ID }} \ 79 | -k ${{ secrets.OSS_SECRET }} \ 80 | -f cp 
nebula-exchange_spark_3.0/target/nebula-exchange_spark_3.0-${{ steps.get_version.outputs.tagnum }}.jar oss://nebula-graph/maven2/nebula-exchange/${{ steps.get_version.outputs.tagnum }}/ 81 | -------------------------------------------------------------------------------- /.github/workflows/snapshot.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build a Java project with Maven 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-maven 3 | 4 | name: snapshot 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | schedule: 10 | - cron: '0 6 * * *' 11 | 12 | jobs: 13 | deploy: 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Set up JDK 1.8 19 | uses: actions/setup-java@v1 20 | with: 21 | java-version: 1.8 22 | 23 | - name: Cache the Maven packages to speed up build 24 | uses: actions/cache@v2 25 | with: 26 | path: ~/.m2/repository 27 | key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} 28 | restore-keys: ${{ runner.os }}-maven- 29 | 30 | - name: Install nebula-graph 31 | run: | 32 | mkdir tmp 33 | pushd tmp 34 | git clone https://github.com/vesoft-inc/nebula-docker-compose.git 35 | pushd nebula-docker-compose/ 36 | cp ../../exchange-common/src/test/resources/docker-compose.yaml . 37 | docker-compose up -d 38 | sleep 10 39 | popd 40 | popd 41 | 42 | - name: Build with Maven 43 | run: | 44 | mvn clean package -pl nebula-exchange_spark_2.2 -am -Pscala-2.11 -Pspark-2.2 45 | mvn clean package -pl nebula-exchange_spark_2.4 -am -Pscala-2.11 -Pspark-2.4 46 | mvn clean package -pl nebula-exchange_spark_3.0 -am -Pscala-2.12 -Pspark-3.0 47 | 48 | - name: upload Exchange with Spark 2.2 to snapshot assets 49 | uses: actions/upload-artifact@v2 50 | with: 51 | name: nebula-exchange_spark_2.2-3.0-SNAPSHOT.jar 52 | path: 53 | nebula-exchange_spark_2.2/target/nebula-exchange_spark_2.2-3.0-SNAPSHOT.jar 54 | 55 | - name: upload Exchange with Spark 2.4 to snapshot assets 56 | uses: actions/upload-artifact@v2 57 | with: 58 | name: nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar 59 | path: 60 | nebula-exchange_spark_2.4/target/nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar 61 | 62 | - name: upload Exchange with Spark 3.0 to snapshot assets 63 | uses: actions/upload-artifact@v2 64 | with: 65 | name: nebula-exchange_spark_3.0-3.0-SNAPSHOT.jar 66 | path: 67 | nebula-exchange_spark_3.0/target/nebula-exchange_spark_3.0-3.0-SNAPSHOT.jar 68 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 23 | hs_err_pid* 24 | 25 | # build target 26 | target/ 27 | 28 | # IDE 29 | .idea/ 30 | .eclipse/ 31 | *.iml 32 | .project 33 | .bloop 34 | .metals 35 | .settings 36 | .vscode 37 | .classpath 38 | .factorypath 39 | 40 | spark-importer.ipr 41 | spark-importer.iws 42 | 43 | # mac 44 | .DS_Store 45 | -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | align = 
more 2 | maxColumn = 100 3 | docstrings = ScalaDoc 4 | assumeStandardLibraryStripMargin = true -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 vesoft inc. All rights reserved. 2 | # 3 | # This source code is licensed under Apache 2.0 License. 4 | 5 | language: java 6 | 7 | jdk: 8 | - oraclejdk11 9 | - openjdk8 10 | - openjdk11 11 | 12 | install: mvn clean compile package install -Dgpg.skip -Dmaven.javadoc.skip=true 13 | -------------------------------------------------------------------------------- /README-CN.md: -------------------------------------------------------------------------------- 1 | # 欢迎使用 NebulaGraph Exchange 2 | 3 | [English](https://github.com/vesoft-inc/nebula-exchange/blob/master/README.md) 4 | 5 | NebulaGraph Exchange(以下简称 Exchange)是一款 Apache Spark™ 应用,用于在分布式环境中将集群中的数据批量迁移到 6 | NebulaGraph 中,它能支持多种不同格式的批式数据和流式数据的迁移,它还支持直接与 SST File 方式的 7 | NebulaGraph 写入。 8 | 9 | Exchange 支持的 Spark 版本包括 2.2、2.4 和 10 | 3.0,对应的工具包名分别为 `nebula-exchange_spark_2.2`、`nebula-exchange_spark_2.4` 11 | 和 `nebula-exchange_spark_3.0`。 12 | 13 | > 注意: 14 | > - 3.4.0 版本不支持 kafka 和 pulsar, 若需将 kafka 或 pulsar 数据导入 NebulaGraph,请使用 3.0.0 或 15 | 3.3.0 或 3.5.0 版本。 16 | > - 本仓库仅支持 NebulaGraph 2.x 和 3.x,如果您在使用 NebulaGraph 17 | v1.x,请使用 [NebulaExchange v1.0](https://github.com/vesoft-inc/nebula-java/tree/v1.0/tools/exchange) 18 | ,或参考 Exchange 1.0 19 | 的使用文档[NebulaExchange 用户手册](https://docs.nebula-graph.com.cn/3.6.0/import-export/nebula-exchange/about-exchange/ex-ug-what-is-exchange/ "点击前往 Nebula Graph 网站")。 20 | 21 | 22 | ## 如何获取 23 | 24 | 1. 编译打包最新的 Exchange。 25 | 26 | ```bash 27 | $ git clone https://github.com/vesoft-inc/nebula-exchange.git 28 | $ cd nebula-exchange 29 | $ mvn clean package -Dmaven.test.skip=true -Dgpg.skip -Dmaven.javadoc.skip=true -pl nebula-exchange_spark_2.2 -am -Pscala-2.11 -Pspark-2.2 30 | $ mvn clean package -Dmaven.test.skip=true -Dgpg.skip -Dmaven.javadoc.skip=true -pl nebula-exchange_spark_2.4 -am -Pscala-2.11 -Pspark-2.4 31 | $ mvn clean package -Dmaven.test.skip=true -Dgpg.skip -Dmaven.javadoc.skip=true -pl nebula-exchange_spark_3.0 -am -Pscala-2.12 -Pspark-3.0 32 | ``` 33 | 34 | 编译打包完成后,可以: 35 | - 在 nebula-exchange/nebula-exchange_spark_2.2/target/ 目录下找到 36 | nebula-exchange_spark_2.2-3.0-SNAPSHOT.jar 文件; 37 | - 在 nebula-exchange/nebula-exchange_spark_2.4/target/ 目录下找到 38 | nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar 文件; 39 | - 以及在 nebula-exchange/nebula-exchange_spark_3.0/target/ 目录下找到 40 | nebula-exchange_spark_3.0-3.0-SNAPSHOT.jar 文件。 41 | 42 | 3. 
在官网或 GitHub 下载 43 | 44 | **正式版本** 45 | 46 | [GitHub Releases](https://github.com/vesoft-inc/nebula-exchange/releases) 47 | 或者 [Downloads](https://www.nebula-graph.com.cn/release?exchange=) 48 | 49 | **快照版本** 50 | 51 | 进入[GitHub Actions Artifacts](https://github.com/vesoft-inc/nebula-exchange/actions/workflows/snapshot.yml) 52 | 页面点击任意 workflow 后,从 Artifacts 中,根据需求下载下载。 53 | 54 | ## 自动生成示例配置文件 55 | 56 | 通过如下命令,指定要导入的数据源,即可获得该数据源所对应的配置文件示例。 57 | ```agsl 58 | java -cp nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar com.vesoft.exchange.common.GenerateConfigTemplate -s {source} -p 59 | {target-path-to-save-config-file} 60 | ``` 61 | 62 | ## 加密 NebulaGraph 密码 63 | ```agsl 64 | spark-submit --master local --class com.vesoft.exchange.common.PasswordEncryption nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -p {password} 65 | ``` 66 | 加密 密码 nebula,输出结果包括RSA 公钥、私钥和加密后的password,示例: 67 | ```agsl 68 | =================== public key begin =================== 69 | MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCLl7LaNSEXlZo2hYiJqzxgyFBQdkxbQXYU/xQthsBJwjOPhkiY37nokzKnjNlp6mv5ZUomqxLsoNQHEJ6BZD4VPiaiElFAkTD+gyul1v8f3A446Fr2rnVLogWHnz8ECPt7X8jwmpiKOXkOPIhqU5E0Cua+Kk0nnVosbos/VShfiQIDAQAB 70 | =================== public key end =================== 71 | 72 | 73 | =================== private key begin =================== 74 | MIICeAIBADANBgkqhkiG9w0BAQEFAASCAmIwggJeAgEAAoGBAIuXsto1IReVmjaFiImrPGDIUFB2TFtBdhT/FC2GwEnCM4+GSJjfueiTMqeM2Wnqa/llSiarEuyg1AcQnoFkPhU+JqISUUCRMP6DK6XW/x/cDjjoWvaudUuiBYefPwQI+3tfyPCamIo5eQ48iGpTkTQK5r4qTSedWixuiz9VKF+JAgMBAAECgYADWbfEPwQ1UbTq3Bej3kVLuWMcG0rH4fFYnaq5UQOqgYvFRR7W9H+80lOj6+CIB0ViLgkylmaU4WNVbBOx3VsUFFWSqIIIviKubg8m8ey7KAd9X2wMEcUHi4JyS2+/WSacaXYS5LOmMevvuaOwLEV0QmyM+nNGRIjUdzCLR1935QJBAM+IF8YD5GnoAPPjGIDS1Ljhu/u/Gj6/YBCQKSHQ5+HxHEKjQ/YxQZ/otchmMZanYelf1y+byuJX3NZ04/KSGT8CQQCsMaoFO2rF5M84HpAXPi6yH2chbtz0VTKZworwUnpmMVbNUojf4VwzAyOhT1U5o0PpFbpi+NqQhC63VUN5k003AkEArI8vnVGNMlZbvG7e5/bmM9hWs2viSbxdB0inOtv2g1M1OV+B2gp405ru0/PNVcRV0HQFfCuhVfTSxmspQoAihwJBAJW6EZa/FZbB4JVxreUoAr6Lo8dkeOhT9M3SZbGWZivaFxot/Cp/8QXCYwbuzrJxjqlsZUeOD6694Uk08JkURn0CQQC8V6aRa8ylMhLJFkGkMDHLqHcQCmY53Kd73mUu4+mjMJLZh14zQD9ydFtc0lbLXTeBAMWV3uEdeLhRvdAo3OwV 75 | =================== private key end =================== 76 | 77 | 78 | =================== encrypted password begin =================== 79 | Io+3y3mLOMnZJJNUPHZ8pKb4VfTvg6wUh6jSu5xdmLAoX/59tK1HTwoN40aOOWJwa1a5io7S4JqcX/jEcAorw7pelITr+F4oB0AMCt71d+gJuu3/lw9bjUEl9tF4Raj82y2Dg39wYbagN84fZMgCD63TPiDIevSr6+MFKASpGrY= 80 | =================== encrypted password end =================== 81 | check: the real password decrypted by private key and encrypted password is: nebula 82 | ``` 83 | 84 | ## 版本匹配 85 | 86 | Exchange 和 NebulaGraph 的版本对应关系如下: 87 | 88 | | Exchange Version | NebulaGraph Version | Spark Version | 89 | |:------------------------------------------:|:-------------------:|:-------------------------------:| 90 | | nebula-exchange-2.0.0.jar | 2.0.0, 2.0.1 | 2.4.* | 91 | | nebula-exchange-2.0.1.jar | 2.0.0, 2.0.1 | 2.4.* | 92 | | nebula-exchange-2.1.0.jar | 2.0.0, 2.0.1 | 2.4.* | 93 | | nebula-exchange-2.5.0.jar | 2.5.0, 2.5.1 | 2.4.* | 94 | | nebula-exchange-2.5.1.jar | 2.5.0, 2.5.1 | 2.4.* | 95 | | nebula-exchange-2.5.2.jar | 2.5.0, 2.5.1 | 2.4.* | 96 | | nebula-exchange-2.6.0.jar | 2.6.0, 2.6.1 | 2.4.* | 97 | | nebula-exchange-2.6.1.jar | 2.6.0, 2.6.1 | 2.4.* | 98 | | nebula-exchange-2.6.2.jar | 2.6.0, 2.6.1 | 2.4.* | 99 | | nebula-exchange-2.6.3.jar | 2.6.0, 2.6.1 | 2.4.* | 100 | | nebula-exchange_spark_2.2-3.x.x.jar | 3.x.x | 2.2.* | 101 | | 
nebula-exchange_spark_2.4-3.x.x.jar | 3.x.x | 2.4.* | 102 | | nebula-exchange_spark_3.0-3.x.x.jar | 3.x.x | `3.0.*`,`3.1.*`,`3.2.*`,`3.3.*` | 103 | | nebula-exchange_spark_2.2-3.0-SNAPSHOT.jar | nightly | 2.2.* | 104 | | nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar | nightly | 2.4.* | 105 | | nebula-exchange_spark_3.0-3.0-SNAPSHOT.jar | nightly | `3.0.*`,`3.1.*`,`3.2.*`,`3.3.*` | 106 | 107 | ## 使用说明 108 | 109 | 特性 & 注意事项: 110 | 111 | *1. Nebula Graph 2.0 支持 String 类型和 Integer 类型的点 id 。* 112 | 113 | *2. Exchange 2.0 新增 null、Date、DateTime、Time 类型数据的导入( DateTime 是 UTC 时区,非 Local time)。* 114 | 115 | *3. Exchange 2.0 支持 Hive on Spark 以外的 Hive 数据源,需在配置文件中配置 Hive 116 | 源,具体配置示例参考 [application.conf](https://github.com/vesoft-inc/nebula-exchange/blob/master/exchange-common/src/test/resources/application.conf) 117 | 中 Hive 的配置。* 118 | 119 | *4. Exchange 2.0 将导入失败的 INSERT 语句进行落盘,存于配置文件的 error/output 路径中。* 120 | 121 | *5. Exchange 2.5.0 支持SST导入,但不支持属性的 default 值。* 122 | 123 | *6. 124 | 配置文件参考 [application.conf](https://github.com/vesoft-inc/nebula-exchange/blob/master/exchange-common/src/test/resources/application.conf)。* 125 | 126 | *7. Exchange 2.0 的导入命令:* 127 | 128 | ``` 129 | $SPARK_HOME/bin/spark-submit --class com.vesoft.nebula.exchange.Exchange --master local nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c /path/to/application.conf 130 | ``` 131 | 132 | 如果数据源有HIVE,则导入命令最后还需要加 `-h` 表示启用HIVE数据源。 133 | 134 | 注:在Yarn-Cluster模式下提交 Exchange,请使用如下提交命令: 135 | 136 | ``` 137 | $SPARK_HOME/bin/spark-submit --class com.vesoft.nebula.exchange.Exchange \ 138 | --master yarn-cluster \ 139 | --files application.conf \ 140 | --conf spark.driver.extraClassPath=./ \ 141 | --conf spark.executor.extraClassPath=./ \ 142 | nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar \ 143 | -c application.conf 144 | ``` 145 | 146 | 注:使用 Nebula Exchange 进行 SST 文件生成时,会涉及到 Spark 的 shuffle 操作,请注意在提交命令中增加 147 | spark.sql.shuffle.partition 的配置: 148 | 149 | ``` 150 | $SPARK_HOME/bin/spark-submit --class com.vesoft.nebula.exchange.Exchange \ 151 | --master local \ 152 | --conf spark.sql.shuffle.partitions=200 \ 153 | nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar \ 154 | -c application.conf 155 | ``` 156 | *8. 自 3.7.0 版本,Exchange支持配置RSA加密后的NebulaGraph密码,并支持生成加密的密码。* 157 | 158 | 关于 Nebula Exchange 的更多说明,请参考 Exchange 2.0 159 | 的[使用手册](https://docs.nebula-graph.com.cn/2.6.2/nebula-exchange/about-exchange/ex-ug-what-is-exchange/) 。 160 | 161 | ## 贡献 162 | 163 | Nebula Exchange 2.0 是一个完全开源的项目,欢迎开源爱好者通过以下方式参与: 164 | 165 | - 前往 [Nebula Graph 论坛](https://discuss.nebula-graph.com.cn/ "点击前往“Nebula Graph 论坛") 上参与 166 | Issue 讨论,如答疑、提供想法或者报告无法解决的问题 167 | - 撰写或改进文档 168 | - 提交优化代码 169 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NebulaGraph Exchange 2 | 3 | [中文版](https://github.com/vesoft-inc/nebula-exchange/blob/master/README-CN.md) 4 | 5 | NebulaGraph Exchange (referred to as Exchange) is an Apache Spark™ application used to migrate data 6 | in bulk from different sources to NebulaGraph in a distributed way(Spark). It supports a variety of 7 | batch or streaming data sources and allows direct writing to NebulaGraph through side-loading (SST 8 | Files). 9 | 10 | Exchange supports Spark versions 2.2, 2.4, and 3.0 along with their respective toolkits 11 | named: `nebula-exchange_spark_2.2`, `nebula-exchange_spark_2.4`, and `nebula-exchange_spark_3.0`. 
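Before submitting a job, it helps to confirm which Spark and Scala versions your cluster runs so you can pick the matching artifact. The check below is a minimal sketch and assumes a standard Spark installation with `$SPARK_HOME` set.

```bash
# Minimal sketch: print the Spark and Scala versions of the installation used for spark-submit,
# then choose the matching artifact, e.g. Spark 2.4.x / Scala 2.11 -> nebula-exchange_spark_2.4,
# Spark 3.x / Scala 2.12 -> nebula-exchange_spark_3.0 (profiles as in the build commands below).
$SPARK_HOME/bin/spark-submit --version
```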
12 | 13 | > Note: 14 | > - Exchange 3.4.0 does not support Apache Kafka and Apache Pulsar. Please use Exchange of version 15 | 3.0.0, 3.3.0, or 3.5.0 to load data from Apache Kafka or Apache Pulsar to NebulaGraph for now. 16 | > - This repo covers only NebulaGraph 2.x and 3.x; for NebulaGraph v1.x, please 17 | use [NebulaGraph Exchange v1.0](https://github.com/vesoft-inc/nebula-java/tree/v1.0/tools/exchange). 18 | 19 | ## Build or Download Exchange 20 | 21 | 1. Build the latest Exchange 22 | 23 | ```bash 24 | $ git clone https://github.com/vesoft-inc/nebula-exchange.git 25 | $ cd nebula-exchange 26 | $ mvn clean package -Dmaven.test.skip=true -Dgpg.skip -Dmaven.javadoc.skip=true -pl nebula-exchange_spark_2.2 -am -Pscala-2.11 -Pspark-2.2 27 | $ mvn clean package -Dmaven.test.skip=true -Dgpg.skip -Dmaven.javadoc.skip=true -pl nebula-exchange_spark_2.4 -am -Pscala-2.11 -Pspark-2.4 28 | $ mvn clean package -Dmaven.test.skip=true -Dgpg.skip -Dmaven.javadoc.skip=true -pl nebula-exchange_spark_3.0 -am -Pscala-2.12 -Pspark-3.0 29 | ``` 30 | 31 | After packaging, the newly generated JAR files can be found in the following paths: 32 | - nebula-exchange/nebula-exchange_spark_2.2/target/ contains 33 | nebula-exchange_spark_2.2-3.0-SNAPSHOT.jar 34 | - nebula-exchange/nebula-exchange_spark_2.4/target/ contains 35 | nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar 36 | - nebula-exchange/nebula-exchange_spark_3.0/target/ contains 37 | nebula-exchange_spark_3.0-3.0-SNAPSHOT.jar 38 | 39 | 2. Download from the GitHub artifact 40 | 41 | **Released Version:** 42 | 43 | [GitHub Releases](https://github.com/vesoft-inc/nebula-exchange/releases) 44 | or [Downloads](https://www.nebula-graph.io/release?exchange=) 45 | 46 | **Snapshot Version:** 47 | 48 | [GitHub Actions Artifacts](https://github.com/vesoft-inc/nebula-exchange/actions/workflows/snapshot.yml) 49 | 50 | ## Get Started 51 | 52 | Here is an example command to run the Exchange: 53 | 54 | ```bash 55 | $SPARK_HOME/bin/spark-submit --class com.vesoft.nebula.exchange.Exchange --master local nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c /path/to/application.conf 56 | ``` 57 | 58 | And when the source is **Hive**, run: 59 | 60 | ```bash 61 | $SPARK_HOME/bin/spark-submit --class com.vesoft.nebula.exchange.Exchange --master local nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c /path/to/application.conf -h 62 | ``` 63 | 64 | Run the Exchange in **Yarn-Cluster** mode: 65 | 66 | ```bash 67 | $SPARK_HOME/bin/spark-submit --class com.vesoft.nebula.exchange.Exchange \ 68 | --master yarn-cluster \ 69 | --files application.conf \ 70 | --conf spark.driver.extraClassPath=./ \ 71 | --conf spark.executor.extraClassPath=./ \ 72 | nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar \ 73 | -c application.conf 74 | ``` 75 | 76 | Note: When using Exchange to generate SST files, please add `spark.sql.shuffle.partitions` 77 | in `--conf` for Spark's shuffle operation: 78 | 79 | ``` 80 | $SPARK_HOME/bin/spark-submit --class com.vesoft.nebula.exchange.Exchange \ 81 | --master local \ 82 | --conf spark.sql.shuffle.partitions=200 \ 83 | nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar \ 84 | -c application.conf 85 | ``` 86 | 87 | For more details, please refer 88 | to [NebulaGraph Exchange Docs](https://docs.nebula-graph.io/master/import-export/nebula-exchange/about-exchange/ex-ug-what-is-exchange/) 89 | 90 | ## How to get the config file 91 | 92 | You can get the template config file for your data source through the following command: 93 | 94 | ```agsl 95 | java -cp nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar 
com.vesoft.exchange.common.GenerateConfigTemplate -s {source} -p 96 | {target-path-to-save-config-file} 97 | ``` 98 | 99 | Such as your datasource is csv, and want to save the template config file in /tmp/, please run: 100 | 101 | ```agsl 102 | java -cp nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar com.vesoft.exchange.common.GenerateConfigTemplate -s csv -p /tmp 103 | ``` 104 | 105 | ## encrypt NebulaGraph's password 106 | ```agsl 107 | spark-submit --master local --class com.vesoft.exchange.common.PasswordEncryption nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -p {password} 108 | ``` 109 | When encrypt the password `nebula`, the output includes RSA public key, private key, encrypted password: 110 | ```agsl 111 | =================== public key begin =================== 112 | MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCLl7LaNSEXlZo2hYiJqzxgyFBQdkxbQXYU/xQthsBJwjOPhkiY37nokzKnjNlp6mv5ZUomqxLsoNQHEJ6BZD4VPiaiElFAkTD+gyul1v8f3A446Fr2rnVLogWHnz8ECPt7X8jwmpiKOXkOPIhqU5E0Cua+Kk0nnVosbos/VShfiQIDAQAB 113 | =================== public key end =================== 114 | 115 | 116 | =================== private key begin =================== 117 | MIICeAIBADANBgkqhkiG9w0BAQEFAASCAmIwggJeAgEAAoGBAIuXsto1IReVmjaFiImrPGDIUFB2TFtBdhT/FC2GwEnCM4+GSJjfueiTMqeM2Wnqa/llSiarEuyg1AcQnoFkPhU+JqISUUCRMP6DK6XW/x/cDjjoWvaudUuiBYefPwQI+3tfyPCamIo5eQ48iGpTkTQK5r4qTSedWixuiz9VKF+JAgMBAAECgYADWbfEPwQ1UbTq3Bej3kVLuWMcG0rH4fFYnaq5UQOqgYvFRR7W9H+80lOj6+CIB0ViLgkylmaU4WNVbBOx3VsUFFWSqIIIviKubg8m8ey7KAd9X2wMEcUHi4JyS2+/WSacaXYS5LOmMevvuaOwLEV0QmyM+nNGRIjUdzCLR1935QJBAM+IF8YD5GnoAPPjGIDS1Ljhu/u/Gj6/YBCQKSHQ5+HxHEKjQ/YxQZ/otchmMZanYelf1y+byuJX3NZ04/KSGT8CQQCsMaoFO2rF5M84HpAXPi6yH2chbtz0VTKZworwUnpmMVbNUojf4VwzAyOhT1U5o0PpFbpi+NqQhC63VUN5k003AkEArI8vnVGNMlZbvG7e5/bmM9hWs2viSbxdB0inOtv2g1M1OV+B2gp405ru0/PNVcRV0HQFfCuhVfTSxmspQoAihwJBAJW6EZa/FZbB4JVxreUoAr6Lo8dkeOhT9M3SZbGWZivaFxot/Cp/8QXCYwbuzrJxjqlsZUeOD6694Uk08JkURn0CQQC8V6aRa8ylMhLJFkGkMDHLqHcQCmY53Kd73mUu4+mjMJLZh14zQD9ydFtc0lbLXTeBAMWV3uEdeLhRvdAo3OwV 118 | =================== private key end =================== 119 | 120 | 121 | =================== encrypted password begin =================== 122 | Io+3y3mLOMnZJJNUPHZ8pKb4VfTvg6wUh6jSu5xdmLAoX/59tK1HTwoN40aOOWJwa1a5io7S4JqcX/jEcAorw7pelITr+F4oB0AMCt71d+gJuu3/lw9bjUEl9tF4Raj82y2Dg39wYbagN84fZMgCD63TPiDIevSr6+MFKASpGrY= 123 | =================== encrypted password end =================== 124 | check: the real password decrypted by private key and encrypted password is: nebula 125 | ``` 126 | 127 | ## Version Compatibility Matrix 128 | 129 | Here is the version correspondence between Exchange and NebulaGraph: 130 | 131 | | Exchange Version | Nebula Version | Spark Version | 132 | |:------------------------------------------:|:--------------:|:-------------------------------:| 133 | | nebula-exchange-2.0.0.jar | 2.0.0, 2.0.1 | 2.4.* | 134 | | nebula-exchange-2.0.1.jar | 2.0.0, 2.0.1 | 2.4.* | 135 | | nebula-exchange-2.1.0.jar | 2.0.0, 2.0.1 | 2.4.* | 136 | | nebula-exchange-2.5.0.jar | 2.5.0, 2.5.1 | 2.4.* | 137 | | nebula-exchange-2.5.1.jar | 2.5.0, 2.5.1 | 2.4.* | 138 | | nebula-exchange-2.5.2.jar | 2.5.0, 2.5.1 | 2.4.* | 139 | | nebula-exchange-2.6.0.jar | 2.6.0, 2.6.1 | 2.4.* | 140 | | nebula-exchange-2.6.1.jar | 2.6.0, 2.6.1 | 2.4.* | 141 | | nebula-exchange-2.6.2.jar | 2.6.0, 2.6.1 | 2.4.* | 142 | | nebula-exchange-2.6.3.jar | 2.6.0, 2.6.1 | 2.4.* | 143 | | nebula-exchange_spark_2.2-3.x.x.jar | 3.x.x | 2.2.* | 144 | | nebula-exchange_spark_2.4-3.x.x.jar | 3.x.x | 2.4.* | 145 | | nebula-exchange_spark_3.0-3.x.x.jar | 3.x.x | 
`3.0.*`,`3.1.*`,`3.2.*`,`3.3.*` | 146 | | nebula-exchange_spark_2.2-3.0-SNAPSHOT.jar | nightly | 2.2.* | 147 | | nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar | nightly | 2.4.* | 148 | | nebula-exchange_spark_3.0-3.0-SNAPSHOT.jar | nightly | `3.0.*`,`3.1.*`,`3.2.*`,`3.3.*` | 149 | 150 | ## Feature History 151 | 152 | 1. *Since 2.0* Exchange allows for the import of vertex data with both String and Integer type IDs. 153 | 2. *Since 2.0* Exchange also supports importing data of various types, including Null, Date, 154 | DateTime (using UTC instead of local time), and Time. 155 | 3. *Since 2.0* In addition to Hive on Spark, Exchange can import data from other Hive sources as 156 | well. 157 | 4. *Since 2.0* If there are failures during the data import process, Exchange supports recording and 158 | retrying the INSERT statement. 159 | 5. *Since 2.5* While SST import is supported by Exchange, property default values are not yet 160 | supported. 161 | 6. *Since 3.0* Exchange is compatible with Spark 2.2, Spark 2.4, and Spark 3.0. 162 | 7. *Since 3.7* Exchange supports to config the encrypted NebulaGraph password and supports to generate the encryption password. 163 | Refer 164 | to [application.conf](https://github.com/vesoft-inc/nebula-exchange/blob/master/exchange-common/src/test/resources/application.conf) 165 | as an example to edit the configuration file. 166 | -------------------------------------------------------------------------------- /bench/NEBULA_DDL/SPACE_SF1: -------------------------------------------------------------------------------- 1 | create space sf1(vid_type=int64,partition_num=100,replica_factor=3); 2 | USE sf1; 3 | CREATE TAG IF NOT EXISTS `Place`(`name` string,`url` string,`type` string); 4 | CREATE TAG IF NOT EXISTS `Comment`(`creationDate` string,`locationIP` string,`browserUsed` string,`content` string,`length` int); 5 | CREATE TAG IF NOT EXISTS `Organisation`(`type` string,`name` string,`url` string); 6 | CREATE TAG IF NOT EXISTS `Person`(`firstName` string,`lastName` string,`gender` string,`birthday` string,`creationDate` string,`locationIP` string,`browserUsed` string); 7 | CREATE TAG IF NOT EXISTS `Tagclass`(`name` string,`url` string); 8 | CREATE TAG IF NOT EXISTS `Forum`(`title` string,`creationDate` string); 9 | CREATE TAG IF NOT EXISTS `Post`(`imageFile` string,`creationDate` string,`locationIP` string,`browserUsed` string,`language` string,`content` string,`length` int); 10 | CREATE TAG IF NOT EXISTS `Tag`(`name` string,`url` string); 11 | CREATE EDGE IF NOT EXISTS `IS_PART_OF`(); 12 | CREATE EDGE IF NOT EXISTS `LIKES`(`creationDate` string); 13 | CREATE EDGE IF NOT EXISTS `HAS_CREATOR`(); 14 | CREATE EDGE IF NOT EXISTS `HAS_INTEREST`(); 15 | CREATE EDGE IF NOT EXISTS `IS_SUBCLASS_OF`(); 16 | CREATE EDGE IF NOT EXISTS `IS_LOCATED_IN`(); 17 | CREATE EDGE IF NOT EXISTS `HAS_MODERATOR`(); 18 | CREATE EDGE IF NOT EXISTS `HAS_TAG`(); 19 | CREATE EDGE IF NOT EXISTS `WORK_AT`(`workFrom` int); 20 | CREATE EDGE IF NOT EXISTS `REPLY_OF`(); 21 | CREATE EDGE IF NOT EXISTS `STUDY_AT`(`classYear` int); 22 | CREATE EDGE IF NOT EXISTS `CONTAINER_OF`(); 23 | CREATE EDGE IF NOT EXISTS `HAS_MEMBER`(`joinDate` string); 24 | CREATE EDGE IF NOT EXISTS `KNOWS`(`creationDate` string); 25 | CREATE EDGE IF NOT EXISTS `HAS_TYPE`(); 26 | -------------------------------------------------------------------------------- /bench/NEBULA_DDL/SPACE_SF100: -------------------------------------------------------------------------------- 1 | create space 
sf100(vid_type=int64,partition_num=100,replica_factor=3); 2 | USE sf100; 3 | CREATE TAG IF NOT EXISTS `Place`(`name` string,`url` string,`type` string); 4 | CREATE TAG IF NOT EXISTS `Comment`(`creationDate` string,`locationIP` string,`browserUsed` string,`content` string,`length` int); 5 | CREATE TAG IF NOT EXISTS `Organisation`(`type` string,`name` string,`url` string); 6 | CREATE TAG IF NOT EXISTS `Person`(`firstName` string,`lastName` string,`gender` string,`birthday` string,`creationDate` string,`locationIP` string,`browserUsed` string); 7 | CREATE TAG IF NOT EXISTS `Tagclass`(`name` string,`url` string); 8 | CREATE TAG IF NOT EXISTS `Forum`(`title` string,`creationDate` string); 9 | CREATE TAG IF NOT EXISTS `Post`(`imageFile` string,`creationDate` string,`locationIP` string,`browserUsed` string,`language` string,`content` string,`length` int); 10 | CREATE TAG IF NOT EXISTS `Tag`(`name` string,`url` string); 11 | CREATE EDGE IF NOT EXISTS `IS_PART_OF`(); 12 | CREATE EDGE IF NOT EXISTS `LIKES`(`creationDate` string); 13 | CREATE EDGE IF NOT EXISTS `HAS_CREATOR`(); 14 | CREATE EDGE IF NOT EXISTS `HAS_INTEREST`(); 15 | CREATE EDGE IF NOT EXISTS `IS_SUBCLASS_OF`(); 16 | CREATE EDGE IF NOT EXISTS `IS_LOCATED_IN`(); 17 | CREATE EDGE IF NOT EXISTS `HAS_MODERATOR`(); 18 | CREATE EDGE IF NOT EXISTS `HAS_TAG`(); 19 | CREATE EDGE IF NOT EXISTS `WORK_AT`(`workFrom` int); 20 | CREATE EDGE IF NOT EXISTS `REPLY_OF`(); 21 | CREATE EDGE IF NOT EXISTS `STUDY_AT`(`classYear` int); 22 | CREATE EDGE IF NOT EXISTS `CONTAINER_OF`(); 23 | CREATE EDGE IF NOT EXISTS `HAS_MEMBER`(`joinDate` string); 24 | CREATE EDGE IF NOT EXISTS `KNOWS`(`creationDate` string); 25 | CREATE EDGE IF NOT EXISTS `HAS_TYPE`(); 26 | -------------------------------------------------------------------------------- /bench/NEBULA_DDL/SPACE_SF30: -------------------------------------------------------------------------------- 1 | create space sf30(vid_type=int64,partition_num=100,replica_factor=3); 2 | USE sf30; 3 | CREATE TAG IF NOT EXISTS `Place`(`name` string,`url` string,`type` string); 4 | CREATE TAG IF NOT EXISTS `Comment`(`creationDate` string,`locationIP` string,`browserUsed` string,`content` string,`length` int); 5 | CREATE TAG IF NOT EXISTS `Organisation`(`type` string,`name` string,`url` string); 6 | CREATE TAG IF NOT EXISTS `Person`(`firstName` string,`lastName` string,`gender` string,`birthday` string,`creationDate` string,`locationIP` string,`browserUsed` string); 7 | CREATE TAG IF NOT EXISTS `Tagclass`(`name` string,`url` string); 8 | CREATE TAG IF NOT EXISTS `Forum`(`title` string,`creationDate` string); 9 | CREATE TAG IF NOT EXISTS `Post`(`imageFile` string,`creationDate` string,`locationIP` string,`browserUsed` string,`language` string,`content` string,`length` int); 10 | CREATE TAG IF NOT EXISTS `Tag`(`name` string,`url` string); 11 | CREATE EDGE IF NOT EXISTS `IS_PART_OF`(); 12 | CREATE EDGE IF NOT EXISTS `LIKES`(`creationDate` string); 13 | CREATE EDGE IF NOT EXISTS `HAS_CREATOR`(); 14 | CREATE EDGE IF NOT EXISTS `HAS_INTEREST`(); 15 | CREATE EDGE IF NOT EXISTS `IS_SUBCLASS_OF`(); 16 | CREATE EDGE IF NOT EXISTS `IS_LOCATED_IN`(); 17 | CREATE EDGE IF NOT EXISTS `HAS_MODERATOR`(); 18 | CREATE EDGE IF NOT EXISTS `HAS_TAG`(); 19 | CREATE EDGE IF NOT EXISTS `WORK_AT`(`workFrom` int); 20 | CREATE EDGE IF NOT EXISTS `REPLY_OF`(); 21 | CREATE EDGE IF NOT EXISTS `STUDY_AT`(`classYear` int); 22 | CREATE EDGE IF NOT EXISTS `CONTAINER_OF`(); 23 | CREATE EDGE IF NOT EXISTS `HAS_MEMBER`(`joinDate` string); 24 | CREATE EDGE IF NOT 
EXISTS `KNOWS`(`creationDate` string); 25 | CREATE EDGE IF NOT EXISTS `HAS_TYPE`(); 26 | -------------------------------------------------------------------------------- /bench/exchange-test.md: -------------------------------------------------------------------------------- 1 | # Nebula-Exchange test result 2 | We use the LDBC dataset to test the Exchange client import performance. 3 | 4 | # prepare 5 | * The Nebula Schema DDL is configured in bench/NEBULA_DDL. 6 | 7 | * The exchange config file is configured in bench/EXCHANGE_CONFIG. 8 | 9 | # import command 10 | 11 | for space sf1, the command is: 12 | ``` 13 | spark-submit --master "spark://127.0.0.1:7077" \ 14 | --driver-memory=2G \ 15 | --num-executors=3 \ 16 | --executor-memory=10G \ 17 | --executor-cores=20 \ 18 | --class com.vesoft.nebula.exchange.Exchange \ 19 | nebula-exchange-2.6.0.jar -c app_sf1.conf 20 | ``` 21 | 22 | for space sf30, the command is: 23 | 24 | ``` 25 | spark-submit --master "spark://127.0.0.1:7077" \ 26 | --driver-memory=2G \ 27 | --num-executors=3 \ 28 | --executor-memory=30G \ 29 | --executor-cores=20 \ 30 | --class com.vesoft.nebula.exchange.Exchange \ 31 | nebula-exchange-2.6.0.jar -c app_sf30.conf 32 | ``` 33 | 34 | for space sf100, the command is: 35 | ``` 36 | spark-submit --master "spark://127.0.0.1:7077" \ 37 | --driver-memory=2G \ 38 | --num-executors=3 \ 39 | --executor-memory=30G \ 40 | --executor-cores=20 \ 41 | --class com.vesoft.nebula.exchange.Exchange \ 42 | nebula-exchange-2.6.0.jar -c app_sf100.conf 43 | ``` 44 | 45 | # import result 46 | Here are the import results: 47 | 48 | When the space has 1 replica and auto-compact is enabled: 49 | 50 | | Dataset | Data Amount |cores|executor-memory|spark-partition|batch size|duration| speed | 51 | |:--------:|:--------------------------------:|:---:|:-------------:|:-------------:|:--------:|:------:|:--------:| 52 | |LDBC sf1 | vertex:3165488 edge:17256029 | 60 | 10G | 60 | 2000 | 56s | 360,000/s | 53 | |LDBC sf30 | vertex:88673640 edge:540915215 | 60 | 20G | 60 | 2000 | 7.5min |1,399,086/s| 54 | |LDBC sf100| vertex:282386021 edge:1775513185 | 60 | 30G | 60 | 2000 | 27min |1,270,303/s| 55 | 56 | When the space has 1 replica and auto-compact is disabled: 57 | 58 | | Dataset | Data Amount |cores|executor-memory|spark-partition|batch size|duration| speed | 59 | |:--------:|:--------------------------------:|:---:|:-------------:|:-------------:|:--------:|:------:|:--------:| 60 | |LDBC sf1 | vertex:3165488 edge:17256029 | 60 | 10G | 60 | 2000 | 49s | 416,765/s| 61 | |LDBC sf30 | vertex:88673640 edge:540915215 | 60 | 20G | 60 | 2000 | 6.3min|1,665,578/s| 62 | |LDBC sf100| vertex:282386021 edge:1775513185 | 60 | 30G | 60 | 2000 | 22min |1,559,014/s| 63 | 64 | After data import, space sf100 with one replica will take additional time to finish the manual compaction. 65 | 66 | 67 | 68 | When the space has 3 replicas and auto-compact is disabled: 69 | 70 | | Dataset | Data Amount |cores|executor-memory|spark-partition|batch size|duration| speed | 71 | |:---------:|:--------------------------------:|:---:|:-------------:|:-------------:|:--------:|:------:|:-------:| 72 | |LDBC sf1 | vertex:3165488 edge:17256029 | 60 | 10G | 60 | 2000 | 58s |352,095/s | 73 | |LDBC sf30 | vertex:88673640 edge:540915215 | 60 | 20G | 60 | 2000 | 17min |617,243/s| 74 | |LDBC sf100 | vertex:282386021 edge:1775513185 | 60 | 30G | 60 | 2000 | 42min |816,623/s| 75 | 76 | After data import, space sf100 with three replicas will take 1.1h to finish the manual compaction. 
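For reference, the manual compaction mentioned above is submitted to the space after the import finishes. The snippet below is only a hedged sketch: it assumes nebula-console is installed, that your NebulaGraph version supports the `SUBMIT JOB COMPACT` statement, and the graphd address and credentials are placeholders for this benchmark setup.

```bash
# Hedged sketch: trigger manual compaction for space sf100 after the import finishes.
# Address, user, and password are placeholders, not values taken from the benchmark environment.
nebula-console -addr 127.0.0.1 -port 9669 -u root -p nebula \
  -e 'USE sf100; SUBMIT JOB COMPACT;'
# Progress can then be checked with `SHOW JOBS;` in the same space.
```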
77 | 78 | # other information 79 | > The Spark cluster and nebula cluster are separated 80 | 81 | > Spark cluster has three workers, nebula cluster has three metad, three graphd and three storaged. 82 | 83 | > The clusters have 10 Gigabit Network, each nebula machine has 1.5T SSD disk and 256G memory. 84 | 85 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 vesoft inc. All rights reserved. 2 | # 3 | # This source code is licensed under Apache 2.0 License. 4 | 5 | # For more configuration details: 6 | # https://docs.codecov.io/docs/codecov-yaml 7 | 8 | # validate the configuration: 9 | # curl -X POST --data-binary @codecov.yml https://codecov.io/validate 10 | 11 | codecov: 12 | require_ci_to_pass: false 13 | -------------------------------------------------------------------------------- /conf-template/client_import/bigquery_datasource.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --executor-cores=20 \ 7 | # --jars $(echo /bigquery-jdbc-dependency-path/*.jar | tr ' ' ',') 8 | # --class com.vesoft.nebula.exchange.Exchange \ 9 | # nebula-exchange-3.0-SNAPSHOT.jar -c bigquery_datasource.conf 10 | 11 | # you can get all dependency jars for bigquery from https://cloud.google.com/bigquery/docs/reference/odbc-jdbc-drivers?hl=zh-cn#jdbc_release_1331004 12 | { 13 | # Spark config 14 | spark: { 15 | app: { 16 | name: NebulaGraph Exchange 17 | } 18 | } 19 | 20 | # Nebula Graph config 21 | nebula: { 22 | address:{ 23 | graph: ["127.0.0.1:9669"] 24 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 25 | # use `SHOW meta leader` to see your meta leader's address 26 | meta: ["127.0.0.1:9559"] 27 | } 28 | user: root 29 | pswd: nebula 30 | space: test 31 | 32 | # nebula client connection parameters 33 | connection { 34 | # socket connect & execute timeout, unit: millisecond 35 | timeout: 30000 36 | } 37 | 38 | error: { 39 | # max number of failures, if the number of failures is bigger than max, then exit the application. 40 | max: 32 41 | # failed data will be recorded in output path, format with ngql 42 | output: /tmp/errors 43 | } 44 | 45 | # use google's RateLimiter to limit the requests send to NebulaGraph 46 | rate: { 47 | # the stable throughput of RateLimiter 48 | limit: 1024 49 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 50 | # if it can't be obtained within the specified timeout, then give up the request. 51 | timeout: 1000 52 | } 53 | } 54 | 55 | # Processing tags 56 | tags: [ 57 | { 58 | name: tag-name-1 59 | type: { 60 | source: jdbc 61 | sink: client 62 | } 63 | 64 | # bigquery url, the auth way if configed in url. In this example, OAuthPvtKeyPath=/tmp/bq-reader-sa-key.json file should be accessible for all spark workers. 
65 | url:"jdbc:bigquery://https://www.googleapis.com/bigquery/v2:443;ProjectId=nebula-cloud-test;OAuthType=0;OAuthServiceAcctEmail=bq-reader@nebula-cloud-test.iam.gserviceaccount.com;OAuthPvtKeyPath=/tmp/bq-reader-sa-key.json" 66 | # JDBC driver 67 | driver:"com.simba.googlebigquery.jdbc.Driver" 68 | 69 | user:"bq-reader@nebula-cloud-test.iam.gserviceaccount.com" 70 | password:"not_used_but_required" 71 | 72 | sentence:"select id, firstName, lastName, gender from dataset.person" 73 | 74 | fields: [firstName, lastName, gender] 75 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 76 | vertex: { 77 | field: id 78 | } 79 | batch: 2000 80 | partition: 60 81 | } 82 | ] 83 | } 84 | -------------------------------------------------------------------------------- /conf-template/client_import/csv_datasource.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --executor-cores=20 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange-3.0-SNAPSHOT.jar -c csv_datasource.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address:{ 21 | graph: ["127.0.0.1:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta: ["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | # nebula client connection parameters 31 | connection { 32 | # socket connect & execute timeout, unit: millisecond 33 | timeout: 30000 34 | } 35 | 36 | error: { 37 | # max number of failures, if the number of failures is bigger than max, then exit the application. 38 | max: 32 39 | # failed data will be recorded in output path, format with ngql 40 | output: /tmp/errors 41 | } 42 | 43 | # use google's RateLimiter to limit the requests send to NebulaGraph 44 | rate: { 45 | # the stable throughput of RateLimiter 46 | limit: 1024 47 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 48 | # if it can't be obtained within the specified timeout, then give up the request. 49 | timeout: 1000 50 | } 51 | } 52 | 53 | # Processing tags 54 | tags: [ 55 | { 56 | name: tag-name-1 57 | type: { 58 | source: csv 59 | sink: client 60 | } 61 | # if your file in not in hdfs, config "file:///path/test.csv" 62 | path: "hdfs://ip:port/path/test.csv" 63 | # if your csv file has no header, then use _c0,_c1,_c2,.. 
to indicate fields 64 | fields: [csv-field-0, csv-field-1, csv-field-2] 65 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 66 | vertex: { 67 | field: csv-field-0 68 | # add the prefix for vertex id value, eg: original id is 12345, and the real id will be: tag1_12345 69 | prefix:"tag1" 70 | udf: { 71 | separator: "_" 72 | oldColNames: [parquet-field-0, parquet-field-1] 73 | newColName: new-parquet-field 74 | } 75 | } 76 | 77 | separator: "," 78 | header: true 79 | batch: 2000 80 | partition: 60 81 | } 82 | ] 83 | 84 | # process edges 85 | edges: [ 86 | { 87 | name: edge-name-1 88 | type: { 89 | source: csv 90 | sink: client 91 | } 92 | path: "hdfs://ip:port/path/test.csv" 93 | fields: [csv-field-0, csv-field-1, csv-field-2] 94 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 95 | source: { 96 | field: csv-field-0 97 | # add the prefix for source id value, eg: original id is 12345, and the real id will be: edge1_12345 98 | prefix:"edge1" 99 | } 100 | target: csv-field-1 101 | ranking: csv-field-2 102 | separator: "," 103 | header: true 104 | batch: 2000 105 | partition: 60 106 | } 107 | ] 108 | } 109 | -------------------------------------------------------------------------------- /conf-template/client_import/hive_datasource.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --executor-cores=20 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange-3.0-SNAPSHOT.jar -c hive_datasource.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address:{ 21 | graph:["127.0.0.1:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta:["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | # nebula client connection parameters 31 | connection { 32 | # socket connect & execute timeout, unit: millisecond 33 | timeout: 30000 34 | } 35 | 36 | error: { 37 | # max number of failures, if the number of failures is bigger than max, then exit the application. 38 | max: 32 39 | # failed data will be recorded in output path, format with ngql 40 | output: /tmp/errors 41 | } 42 | 43 | # use google's RateLimiter to limit the requests send to NebulaGraph 44 | rate: { 45 | # the stable throughput of RateLimiter 46 | limit: 1024 47 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 48 | # if it can't be obtained within the specified timeout, then give up the request. 
49 | timeout: 1000 50 | } 51 | } 52 | 53 | # Processing tags 54 | tags: [ 55 | { 56 | name: tag-name-1 57 | type: { 58 | source: hive 59 | sink: client 60 | } 61 | exec: "select hive-field0, hive-field1, hive-field2 from database.table" 62 | fields: [hive-field-0, hive-field-1, hive-field-2] 63 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 64 | vertex: hive-field-0 65 | batch: 2000 66 | partition: 60 67 | } 68 | ] 69 | 70 | # process edges 71 | edges: [ 72 | { 73 | name: edge-name-1 74 | type: { 75 | source: hive 76 | sink: client 77 | } 78 | exec: "select hive-field0, hive-field1, hive-field2 from database.table" 79 | fields: [ hive-field-0, hive-field-1, hive-field-2] 80 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 81 | source: hive-field-0 82 | target: hive-field-1 83 | ranking: hive-filed-2 84 | batch: 2000 85 | partition: 60 86 | } 87 | ] 88 | } 89 | -------------------------------------------------------------------------------- /conf-template/sst_import/csv_datasource.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --executor-cores=20 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange-3.0-SNAPSHOT.jar -c csv_datasource.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address:{ 21 | graph:["127.0.0.1:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta:["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | path:{ 31 | # any path that owns read and write access is ok 32 | local:"/tmp" 33 | remote:"/sst" 34 | hdfs.namenode: "hdfs://name_node:9000" 35 | } 36 | 37 | # nebula client connection parameters 38 | connection { 39 | # socket connect & execute timeout, unit: millisecond 40 | timeout: 30000 41 | } 42 | 43 | error: { 44 | # max number of failures, if the number of failures is bigger than max, then exit the application. 45 | max: 32 46 | # failed data will be recorded in output path, format with ngql 47 | output: /tmp/errors 48 | } 49 | 50 | # use google's RateLimiter to limit the requests send to NebulaGraph 51 | rate: { 52 | # the stable throughput of RateLimiter 53 | limit: 1024 54 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 55 | # if it can't be obtained within the specified timeout, then give up the request. 56 | timeout: 1000 57 | } 58 | } 59 | 60 | # Processing tags 61 | tags: [ 62 | { 63 | name: tag-name-1 64 | type: { 65 | source: csv 66 | sink: sst 67 | } 68 | # if your file in not in hdfs, config "file:///path/test.csv" 69 | path: "hdfs://ip:port/path/test.csv" 70 | # if your csv file has no header, then use _c0,_c1,_c2,.. 
to indicate fields 71 | fields: [csv-field-0, csv-field-1, csv-field-2] 72 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 73 | vertex: csv-field-0 74 | separator: "," 75 | header: true 76 | batch: 2000 77 | partition: 60 78 | } 79 | ] 80 | 81 | # process edges 82 | edges: [ 83 | { 84 | name: edge-name-1 85 | type: { 86 | source: csv 87 | sink: sst 88 | } 89 | path: "hdfs://ip:port/path/test.csv" 90 | fields: [csv-field-0, csv-field-1, csv-field-2] 91 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 92 | source: csv-field-0 93 | target: csv-field-1 94 | ranking: csv-field-2 95 | separator: "," 96 | header: true 97 | batch: 2000 98 | partition: 60 99 | } 100 | ] 101 | } 102 | -------------------------------------------------------------------------------- /conf-template/sst_import/hive_datasource.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --executor-cores=20 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange-3.0-SNAPSHOT.jar -c hive_datasource.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address:{ 21 | graph:["127.0.0.1:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta:["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | path:{ 31 | # any path that owns read and write access is ok 32 | local:"/tmp" 33 | remote:"/sst" 34 | hdfs.namenode: "hdfs://name_node:9000" 35 | } 36 | 37 | # nebula client connection parameters 38 | connection { 39 | # socket connect & execute timeout, unit: millisecond 40 | timeout: 30000 41 | } 42 | 43 | error: { 44 | # max number of failures, if the number of failures is bigger than max, then exit the application. 45 | max: 32 46 | # failed data will be recorded in output path, format with ngql 47 | output: /tmp/errors 48 | } 49 | 50 | # use google's RateLimiter to limit the requests send to NebulaGraph 51 | rate: { 52 | # the stable throughput of RateLimiter 53 | limit: 1024 54 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 55 | # if it can't be obtained within the specified timeout, then give up the request. 
56 | timeout: 1000 57 | } 58 | } 59 | 60 | # Processing tags 61 | tags: [ 62 | { 63 | name: tag-name-1 64 | type: { 65 | source: hive 66 | sink: sst 67 | } 68 | exec: "select hive-field0, hive-field1, hive-field2 from database.table" 69 | fields: [hive-field-0, hive-field-1, hive-field-2] 70 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 71 | vertex: hive-field-0 72 | batch: 2000 73 | partition: 60 74 | } 75 | ] 76 | 77 | # process edges 78 | edges: [ 79 | { 80 | name: edge-name-1 81 | type: { 82 | source: hive 83 | sink: sst 84 | } 85 | exec: "select hive-field0, hive-field1, hive-field2 from database.table" 86 | fields: [ hive-field-0, hive-field-1, hive-field-2] 87 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 88 | source: hive-field-0 89 | target: hive-field-1 90 | ranking: hive-filed-2 91 | batch: 2000 92 | partition: 60 93 | } 94 | ] 95 | } 96 | -------------------------------------------------------------------------------- /exchange-common/src/main/java/com/vesoft/exchange/common/FileMigrate.java: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2023 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange.common; 7 | 8 | import java.io.BufferedReader; 9 | import java.io.BufferedWriter; 10 | import java.io.File; 11 | import java.io.FileWriter; 12 | import java.io.IOException; 13 | import java.io.InputStream; 14 | import java.io.InputStreamReader; 15 | 16 | public class FileMigrate { 17 | //Logger log = Logger.getLogger(FileMigrate.class); 18 | 19 | 20 | /** 21 | * migrate the source file to target path 22 | * 23 | * @param sourceFile template config file 24 | * @param path target path to save the config info 25 | */ 26 | public void saveConfig(String sourceFile, String path) { 27 | InputStream inputStream = 28 | this.getClass().getClassLoader().getResourceAsStream(sourceFile); 29 | if (inputStream == null) { 30 | System.exit(-1); 31 | } 32 | File file = new File(path); 33 | if (file.exists()) { 34 | file.delete(); 35 | } 36 | FileWriter writer = null; 37 | BufferedWriter bufferedWriter = null; 38 | BufferedReader reader = null; 39 | try { 40 | writer = new FileWriter(path); 41 | bufferedWriter = new BufferedWriter(writer); 42 | 43 | reader = new BufferedReader(new InputStreamReader(inputStream)); 44 | String line = null; 45 | while ((line = reader.readLine()) != null) { 46 | bufferedWriter.write(line); 47 | bufferedWriter.write("\n"); 48 | } 49 | } catch (IOException e) { 50 | System.out.println("Failed to migrate the template conf file:" + e.getMessage()); 51 | e.printStackTrace(); 52 | } finally { 53 | try { 54 | if (bufferedWriter != null) { 55 | bufferedWriter.close(); 56 | } 57 | if (reader != null) { 58 | reader.close(); 59 | } 60 | } catch (IOException e) { 61 | System.out.println("Failed to close the writer or reader:" + e.getMessage()); 62 | e.printStackTrace(); 63 | } 64 | } 65 | 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /exchange-common/src/main/resources/config_template/csv.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --total-executor-cores=60 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # 
nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c csv.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address:{ 21 | graph: ["127.0.0.1:9669","127.0.0.2:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta: ["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | # nebula client connection parameters 31 | connection { 32 | # socket connect & execute timeout, unit: millisecond 33 | timeout: 30000 34 | } 35 | 36 | error: { 37 | # max number of failures, if the number of failures is bigger than max, then exit the application. 38 | max: 32 39 | # failed data will be recorded in output path, format with ngql 40 | output: "hdfs://127.0.0.1:9000/tmp/errors" 41 | } 42 | 43 | # use google's RateLimiter to limit the requests send to NebulaGraph 44 | rate: { 45 | # the stable throughput of RateLimiter 46 | limit: 1024 47 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 48 | # if it can't be obtained within the specified timeout, then give up the request. 49 | timeout: 1000 50 | } 51 | } 52 | 53 | # Processing tags 54 | tags: [ 55 | { 56 | name: tag-name 57 | type: { 58 | source: csv 59 | sink: client 60 | } 61 | # if your file in not in hdfs, config "file:///path/test.csv" 62 | path: "hdfs://ip:port/path/test.csv" 63 | # if your csv file has no header, then use _c0,_c1,_c2,.. to indicate fields 64 | fields: [csv-field-1, csv-field-2, csv-field-3] 65 | nebula.fields: [nebula-field-1, nebula-field-2, nebula-field-3] 66 | vertex: { 67 | field: csv-field-0 68 | } 69 | separator: "," 70 | header: true 71 | batch: 2000 72 | partition: 60 73 | } 74 | ] 75 | 76 | # process edges 77 | edges: [ 78 | { 79 | name: edge-name 80 | type: { 81 | source: csv 82 | sink: client 83 | } 84 | path: "hdfs://ip:port/path/test.csv" 85 | fields: [csv-field-2, csv-field-3, csv-field-4] 86 | nebula.fields: [nebula-field-1, nebula-field-2, nebula-field-3] 87 | source: { 88 | field: csv-field-0 89 | } 90 | target: { 91 | field: csv-field-1 92 | } 93 | #ranking: csv-field-2 94 | separator: "," 95 | header: true 96 | batch: 2000 97 | partition: 60 98 | } 99 | ] 100 | } 101 | -------------------------------------------------------------------------------- /exchange-common/src/main/resources/config_template/hbase.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --total-executor-cores=60 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c hbase.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address: { 21 | graph: ["127.0.0.1:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 
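      # Note (assumption based on how Exchange creates meta/graph clients on the Spark side): these
      # addresses must be reachable from both the Spark driver and the executors, so inside k8s prefer
      # an address that is routable from the Spark cluster rather than a pod-internal one.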
23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta: ["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | # nebula client connection parameters 31 | connection { 32 | # socket connect & execute timeout, unit: millisecond 33 | timeout: 30000 34 | } 35 | 36 | error: { 37 | # max number of failures, if the number of failures is bigger than max, then exit the application. 38 | max: 32 39 | # failed data will be recorded in output path, format with ngql 40 | output: /tmp/errors 41 | } 42 | 43 | # use google's RateLimiter to limit the requests send to NebulaGraph 44 | rate: { 45 | # the stable throughput of RateLimiter 46 | limit: 1024 47 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 48 | # if it can't be obtained within the specified timeout, then give up the request. 49 | timeout: 1000 50 | } 51 | } 52 | 53 | # Processing tags 54 | tags: [ 55 | { 56 | name: tag-name-1 57 | type: { 58 | source: hbase 59 | sink: client 60 | } 61 | host: 127.0.0.1 62 | port: 2181 63 | table: hbase-table 64 | columnFamily: hbase-table-cloumnfamily 65 | fields: [hbase-field-0, hbase-field-1, hbase-field-2] 66 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 67 | # if fields or vertex contains rowkey, please configure it as "rowkey". 68 | vertex: rowkey 69 | batch: 2000 70 | partition: 60 71 | } 72 | ] 73 | 74 | # process edges 75 | edges: [ 76 | { 77 | name: edge-name-1 78 | type: { 79 | source: hbase 80 | sink: client 81 | } 82 | host: 127.0.0.1 83 | port: 2181 84 | table: hbase-table 85 | columnFamily: hbase-table-cloumnfamily 86 | fields: [hbase-field-0, hbase-field-1, hbase-field-2] 87 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 88 | source: hbase-field-0 89 | target: hbase-field-1 90 | ranking: hbase-filed-2 91 | batch: 2000 92 | partition: 60 93 | } 94 | ] 95 | } 96 | -------------------------------------------------------------------------------- /exchange-common/src/main/resources/config_template/hive.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --executor-cores=20 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c hive.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address: { 21 | graph: ["127.0.0.1:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta: ["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | # nebula client connection parameters 31 | connection { 32 | # socket connect & execute timeout, unit: millisecond 33 | timeout: 30000 34 | } 35 | 36 | error: { 37 | # max number of failures, if the number of failures is bigger than max, then exit the application. 
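      # Note: failed statements are written to the `output` path below as plain nGQL, one statement per
      # line, so a partially failed run can be replayed later with Exchange's reload mode (the exact
      # CLI flag, commonly -r or --reload, is an assumption here; check the docs for your release).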
38 | max: 32 39 | # failed data will be recorded in output path, format with ngql 40 | output: /tmp/errors 41 | } 42 | 43 | # use google's RateLimiter to limit the requests send to NebulaGraph 44 | rate: { 45 | # the stable throughput of RateLimiter 46 | limit: 1024 47 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 48 | # if it can't be obtained within the specified timeout, then give up the request. 49 | timeout: 1000 50 | } 51 | } 52 | 53 | # Processing tags 54 | tags: [ 55 | { 56 | name: tag-name-1 57 | type: { 58 | source: hive 59 | sink: client 60 | } 61 | exec: "select hive-field0, hive-field1, hive-field2 from database.table" 62 | fields: [hive-field-0, hive-field-1, hive-field-2] 63 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 64 | vertex: hive-field-0 65 | batch: 2000 66 | partition: 60 67 | } 68 | ] 69 | 70 | # process edges 71 | edges: [ 72 | { 73 | name: edge-name-1 74 | type: { 75 | source: hive 76 | sink: client 77 | } 78 | exec: "select hive-field0, hive-field1, hive-field2 from database.table" 79 | fields: [hive-field-0, hive-field-1, hive-field-2] 80 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 81 | source: hive-field-0 82 | target: hive-field-1 83 | ranking: hive-filed-2 84 | batch: 2000 85 | partition: 60 86 | } 87 | ] 88 | } 89 | -------------------------------------------------------------------------------- /exchange-common/src/main/resources/config_template/jdbc.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --executor-cores=20 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c jdbc.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address: { 21 | graph: ["127.0.0.1:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta: ["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | # nebula client connection parameters 31 | connection { 32 | # socket connect & execute timeout, unit: millisecond 33 | timeout: 30000 34 | } 35 | 36 | error: { 37 | # max number of failures, if the number of failures is bigger than max, then exit the application. 38 | max: 32 39 | # failed data will be recorded in output path, format with ngql 40 | output: /tmp/errors 41 | } 42 | 43 | # use google's RateLimiter to limit the requests send to NebulaGraph 44 | rate: { 45 | # the stable throughput of RateLimiter 46 | limit: 1024 47 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 48 | # if it can't be obtained within the specified timeout, then give up the request. 
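      # Illustrative note for the jdbc blocks below: the samples use an Oracle source; for MySQL you
      # would typically swap in something like
      #   url: "jdbc:mysql://host:3306/database"
      #   driver: "com.mysql.cj.jdbc.Driver"   # Connector/J 8.x (5.x uses com.mysql.jdbc.Driver)
      # and put the driver jar on the spark-submit --jars / --driver-class-path.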
49 | timeout: 1000 50 | } 51 | } 52 | 53 | # Processing tags 54 | tags: [ 55 | { 56 | name: tag-name-1 57 | type: { 58 | source: jdbc 59 | sink: client 60 | } 61 | url: "jdbc:oracle:thin:@host:1521:db" 62 | driver: "oracle.jdbc.driver.OracleDriver" 63 | user: "root" 64 | password: "nebula" 65 | sentence: "select oracle-field-0, oracle-field-1, oracle-field-2 from table" 66 | fields: [db-field-0, db-field-1, db-field-2] 67 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 68 | vertex: db-field-0 69 | batch: 2000 70 | partition: 60 71 | } 72 | ] 73 | 74 | # process edges 75 | edges: [ 76 | { 77 | name: edge-name-1 78 | type: { 79 | source: jdbc 80 | sink: client 81 | } 82 | url: "jdbc:oracle:thin:@host:1521:db" 83 | driver: "oracle.jdbc.driver.OracleDriver" 84 | user: "root" 85 | password: "nebula" 86 | sentence: "select db-field-0, db-field-1, db-field-2 from table" 87 | fields: [db-field-0, db-field-1, db-field-2] 88 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 89 | source: db-field-0 90 | target: db-field-1 91 | #ranking: db-filed-2 92 | batch: 2000 93 | partition: 60 94 | } 95 | ] 96 | } 97 | -------------------------------------------------------------------------------- /exchange-common/src/main/resources/config_template/json.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --total-executor-cores=60 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c json.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address:{ 21 | graph: ["127.0.0.1:9669","127.0.0.2:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta: ["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | # nebula client connection parameters 31 | connection { 32 | # socket connect & execute timeout, unit: millisecond 33 | timeout: 30000 34 | } 35 | 36 | error: { 37 | # max number of failures, if the number of failures is bigger than max, then exit the application. 38 | max: 32 39 | # failed data will be recorded in output path, format with ngql 40 | output: "hdfs://127.0.0.1:9000/tmp/errors" 41 | } 42 | 43 | # use google's RateLimiter to limit the requests send to NebulaGraph 44 | rate: { 45 | # the stable throughput of RateLimiter 46 | limit: 1024 47 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 48 | # if it can't be obtained within the specified timeout, then give up the request. 
49 | timeout: 1000 50 | } 51 | } 52 | 53 | # Processing tags 54 | tags: [ 55 | { 56 | name: tag-name 57 | type: { 58 | source: json 59 | sink: client 60 | } 61 | # if your file in not in hdfs, config "file:///path/test.json" 62 | path: "hdfs://ip:port/path/test.json" 63 | fields: [json-field-1, json-field-2, json-field-3] 64 | nebula.fields: [nebula-field-1, nebula-field-2, nebula-field-3] 65 | vertex: { 66 | field: json-field-0 67 | } 68 | batch: 2000 69 | partition: 60 70 | } 71 | ] 72 | 73 | # process edges 74 | edges: [ 75 | { 76 | name: edge-name 77 | type: { 78 | source: json 79 | sink: client 80 | } 81 | path: "hdfs://ip:port/path/test.json" 82 | fields: [json-field-2, json-field-3, json-field-4] 83 | nebula.fields: [nebula-field-1, nebula-field-2, nebula-field-3] 84 | source: { 85 | field: json-field-0 86 | } 87 | target: { 88 | field: json-field-1 89 | } 90 | #ranking: json-field-2 91 | batch: 2000 92 | partition: 60 93 | } 94 | ] 95 | } 96 | -------------------------------------------------------------------------------- /exchange-common/src/main/resources/config_template/kafka.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --executor-cores=20 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c kafka.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address:{ 21 | graph:["127.0.0.1:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta:["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | # nebula client connection parameters 31 | connection { 32 | # socket connect & execute timeout, unit: millisecond 33 | timeout: 30000 34 | } 35 | 36 | error: { 37 | # max number of failures, if the number of failures is bigger than max, then exit the application. 38 | max: 32 39 | # failed data will be recorded in output path, format with ngql 40 | output: /tmp/errors 41 | } 42 | 43 | # use google's RateLimiter to limit the requests send to NebulaGraph 44 | rate: { 45 | # the stable throughput of RateLimiter 46 | limit: 1024 47 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 48 | # if it can't be obtained within the specified timeout, then give up the request. 
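      # Note: with a streaming source like Kafka the failed-statement files under error.output are not
      # overwritten between micro-batches; ErrorHandler.save in exchange-common writes an extra
      # `<path>_append_<uuid>` file when the target path already exists, precisely for this case.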
49 | timeout: 1000 50 | } 51 | } 52 | 53 | # Processing tags 54 | tags: [ 55 | { 56 | name: tag-name-1 57 | type: { 58 | source: kafka 59 | sink: client 60 | } 61 | service: "kafka.service.address" 62 | topic: "topic-name" 63 | fields: [kafka-field-0, kafka-field-1, kafka-field-2] 64 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 65 | vertex: kafka-field-0 66 | batch: 2000 67 | partition: 60 68 | } 69 | ] 70 | 71 | # process edges 72 | edges: [ 73 | { 74 | name: edge-name-1 75 | type: { 76 | source: kafka 77 | sink: client 78 | } 79 | service: "kafka.service.address" 80 | topic: "topic-name" 81 | fields: [ kafka-field-3, kafka-field-4, kafka-field-5] 82 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 83 | source: kafka-field-0 84 | target: kafka-field-1 85 | #ranking: kafka-filed-2 86 | batch: 2000 87 | partition: 60 88 | } 89 | ] 90 | } 91 | -------------------------------------------------------------------------------- /exchange-common/src/main/resources/config_template/neo4j.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --executor-cores=20 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c neo4j.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address:{ 21 | graph:["127.0.0.1:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta:["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | # nebula client connection parameters 31 | connection { 32 | # socket connect & execute timeout, unit: millisecond 33 | timeout: 30000 34 | } 35 | 36 | error: { 37 | # max number of failures, if the number of failures is bigger than max, then exit the application. 38 | max: 32 39 | # failed data will be recorded in output path, format with ngql 40 | output: /tmp/errors 41 | } 42 | 43 | # use google's RateLimiter to limit the requests send to NebulaGraph 44 | rate: { 45 | # the stable throughput of RateLimiter 46 | limit: 1024 47 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 48 | # if it can't be obtained within the specified timeout, then give up the request. 
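      # Note: the sample Cypher in the tag/edge blocks below ends with `order by id(r)`; keeping a
      # deterministic ORDER BY is what allows the Neo4j checkpoint/resume support (CheckPointHandler in
      # exchange-common) to skip rows already imported. That reading of the offset mechanism is an
      # assumption; verify it against the Neo4j reader of your Exchange version.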
49 | timeout: 1000 50 | } 51 | } 52 | 53 | # Processing tags 54 | tags: [ 55 | { 56 | name: tag-name-1 57 | type: { 58 | source: neo4j 59 | sink: client 60 | } 61 | server: "bolt://127.0.0.1:7687" 62 | user: neo4j 63 | password: neo4j 64 | exec: "match (a:vertex_label)-[r:edge_label]->(b:vertex_label) return a.neo4j-source-field, b.neo4j-target-field, r.neo4j-field-0 as neo4j-field-0, r.neo4j-field-1 as neo4j-field-1 order by id(r)" 65 | fields: [neo4j-field-0, neo4j-field-1, neo4j-field-2] 66 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 67 | vertex: neo4j-field-0 68 | batch: 2000 69 | partition: 60 70 | } 71 | ] 72 | 73 | # process edges 74 | edges: [ 75 | { 76 | name: edge-name-1 77 | type: { 78 | source: neo4j 79 | sink: client 80 | } 81 | server: "bolt://127.0.0.1:7687" 82 | user: neo4j 83 | password: neo4j 84 | exec: "match (a:vertex_label)-[r:edge_label]->(b:vertex_label) return a.neo4j-source-field, b.neo4j-target-field, r.neo4j-field-0 as neo4j-field-0, r.neo4j-field-1 as neo4j-field-1 order by id(r)" 85 | fields: [ neo4j-field-0, neo4j-field-1, neo4j-field-2] 86 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 87 | source: neo4j-field-0 88 | target: neo4j-field-1 89 | #ranking: neo4j-filed-2 90 | batch: 2000 91 | partition: 60 92 | } 93 | ] 94 | } 95 | -------------------------------------------------------------------------------- /exchange-common/src/main/resources/config_template/orc.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --total-executor-cores=60 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c orc.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address:{ 21 | graph: ["127.0.0.1:9669","127.0.0.2:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta: ["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | # nebula client connection parameters 31 | connection { 32 | # socket connect & execute timeout, unit: millisecond 33 | timeout: 30000 34 | } 35 | 36 | error: { 37 | # max number of failures, if the number of failures is bigger than max, then exit the application. 38 | max: 32 39 | # failed data will be recorded in output path, format with ngql 40 | output: "hdfs://127.0.0.1:9000/tmp/errors" 41 | } 42 | 43 | # use google's RateLimiter to limit the requests send to NebulaGraph 44 | rate: { 45 | # the stable throughput of RateLimiter 46 | limit: 1024 47 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 48 | # if it can't be obtained within the specified timeout, then give up the request. 
49 | timeout: 1000 50 | } 51 | } 52 | 53 | # Processing tags 54 | tags: [ 55 | { 56 | name: tag-name 57 | type: { 58 | source: orc 59 | sink: client 60 | } 61 | # if your file in not in hdfs, config "file:///path/test.orc" 62 | path: "hdfs://ip:port/path/test.orc" 63 | fields: [orc-field-1, orc-field-2, orc-field-3] 64 | nebula.fields: [nebula-field-1, nebula-field-2, nebula-field-3] 65 | vertex: { 66 | field: orc-field-0 67 | } 68 | batch: 2000 69 | partition: 60 70 | } 71 | ] 72 | 73 | # process edges 74 | edges: [ 75 | { 76 | name: edge-name 77 | type: { 78 | source: orc 79 | sink: client 80 | } 81 | path: "hdfs://ip:port/path/test.orc" 82 | fields: [orc-field-2, orc-field-3, orc-field-4] 83 | nebula.fields: [nebula-field-1, nebula-field-2, nebula-field-3] 84 | source: { 85 | field: orc-field-0 86 | } 87 | target: { 88 | field: orc-field-1 89 | } 90 | #ranking: orc-field-2 91 | batch: 2000 92 | partition: 60 93 | } 94 | ] 95 | } 96 | -------------------------------------------------------------------------------- /exchange-common/src/main/resources/config_template/parquet.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --total-executor-cores=60 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c parquet.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address:{ 21 | graph: ["127.0.0.1:9669","127.0.0.2:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta: ["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | # nebula client connection parameters 31 | connection { 32 | # socket connect & execute timeout, unit: millisecond 33 | timeout: 30000 34 | } 35 | 36 | error: { 37 | # max number of failures, if the number of failures is bigger than max, then exit the application. 38 | max: 32 39 | # failed data will be recorded in output path, format with ngql 40 | output: "hdfs://127.0.0.1:9000/tmp/errors" 41 | } 42 | 43 | # use google's RateLimiter to limit the requests send to NebulaGraph 44 | rate: { 45 | # the stable throughput of RateLimiter 46 | limit: 1024 47 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 48 | # if it can't be obtained within the specified timeout, then give up the request. 
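      # Note for this parquet template: in the tag/edge blocks below, set type.source to parquet and
      # point path at parquet files, e.g. (illustrative)
      #   source: parquet
      #   path: "hdfs://ip:port/path/test.parquet"
      # the field names shown are only placeholders.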
49 | timeout: 1000 50 | } 51 | } 52 | 53 | # Processing tags 54 | tags: [ 55 | { 56 | name: tag-name 57 | type: { 58 | source: orc 59 | sink: client 60 | } 61 | # if your file in not in hdfs, config "file:///path/test.orc" 62 | path: "hdfs://ip:port/path/test.orc" 63 | fields: [orc-field-1, orc-field-2, orc-field-3] 64 | nebula.fields: [nebula-field-1, nebula-field-2, nebula-field-3] 65 | vertex: { 66 | field: orc-field-0 67 | } 68 | batch: 2000 69 | partition: 60 70 | } 71 | ] 72 | 73 | # process edges 74 | edges: [ 75 | { 76 | name: edge-name 77 | type: { 78 | source: orc 79 | sink: client 80 | } 81 | path: "hdfs://ip:port/path/test.orc" 82 | fields: [orc-field-2, orc-field-3, orc-field-4] 83 | nebula.fields: [nebula-field-1, nebula-field-2, nebula-field-3] 84 | source: { 85 | field: orc-field-0 86 | } 87 | target: { 88 | field: orc-field-1 89 | } 90 | #ranking: orc-field-2 91 | batch: 2000 92 | partition: 60 93 | } 94 | ] 95 | } 96 | -------------------------------------------------------------------------------- /exchange-common/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Global logging configuration 2 | log4j.rootLogger=INFO, stdout 3 | # Console output... 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.stdout.layout.ConversionPattern=%5p [%t] - %m%n 7 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/CheckPointHandler.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange.common 7 | 8 | import com.vesoft.exchange.common.config.{SchemaConfigEntry, SourceCategory} 9 | import com.vesoft.exchange.common.utils.HDFSUtils 10 | import org.apache.spark.TaskContext 11 | 12 | /** 13 | * CheckPointHandler handle the checkpoint files for Neo4j and Janusgraph 14 | */ 15 | object CheckPointHandler { 16 | 17 | def checkSupportResume(value: SourceCategory.Value): Boolean = { 18 | value match { 19 | case SourceCategory.NEO4J => true 20 | case SourceCategory.JANUS_GRAPH => true 21 | case _ => false 22 | } 23 | } 24 | 25 | def getPathAndOffset(schemaConfig: SchemaConfigEntry, 26 | breakPointCount: Long): Option[(String, Long)] = { 27 | val partitionId = TaskContext.getPartitionId() 28 | if (checkSupportResume(schemaConfig.dataSourceConfigEntry.category) && schemaConfig.checkPointPath.isDefined) { 29 | val path = s"${schemaConfig.checkPointPath.get}/${schemaConfig.name}.${partitionId}" 30 | val offset = breakPointCount + fetchOffset(path) 31 | Some((path, offset)) 32 | } else { 33 | None 34 | } 35 | } 36 | 37 | def fetchOffset(path: String): Long = { 38 | HDFSUtils.getContent(path).toLong 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/ErrorHandler.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
4 | */ 5 | 6 | package com.vesoft.exchange.common 7 | 8 | import org.apache.hadoop.conf.Configuration 9 | import org.apache.hadoop.fs.{FileSystem, Path} 10 | import org.apache.log4j.Logger 11 | 12 | import java.util.UUID 13 | import scala.collection.mutable.ArrayBuffer 14 | 15 | object ErrorHandler { 16 | @transient 17 | private[this] val LOG = Logger.getLogger(this.getClass) 18 | 19 | /** 20 | * clean all the failed data for error path before reload. 21 | * 22 | * @param path path to clean 23 | */ 24 | def clear(path: String): Unit = { 25 | try { 26 | val fileSystem = FileSystem.get(new Configuration()) 27 | val filesStatus = fileSystem.listStatus(new Path(path)) 28 | for (file <- filesStatus) { 29 | if (!file.getPath.getName.startsWith("reload.")) { 30 | fileSystem.delete(file.getPath, true) 31 | } 32 | } 33 | } catch { 34 | case e: Throwable => { 35 | LOG.error(s"$path cannot be clean, but this error does not affect the import result, " + 36 | s"you can only focus on the reload files.", 37 | e) 38 | } 39 | } 40 | } 41 | 42 | /** 43 | * save the failed execute statement. 44 | * 45 | * @param buffer buffer saved failed ngql 46 | * @param path path to write these buffer ngql 47 | */ 48 | def save(buffer: ArrayBuffer[String], path: String): Unit = { 49 | val targetPath = new Path(path) 50 | val fileSystem = targetPath.getFileSystem(new Configuration()) 51 | val errors = if (fileSystem.exists(targetPath)) { 52 | val newPath = s"${path}_append_${UUID.randomUUID().toString}" 53 | LOG.info(s"create reload path $newPath") 54 | // For kafka, the error ngql need to append to a same file instead of overwrite 55 | fileSystem.create(new Path(newPath)) 56 | } else { 57 | LOG.info(s"create reload path $path") 58 | fileSystem.create(targetPath) 59 | } 60 | 61 | try { 62 | for (error <- buffer) { 63 | errors.write(error.getBytes) 64 | errors.writeBytes("\n") 65 | } 66 | } finally { 67 | errors.close() 68 | } 69 | } 70 | 71 | /** 72 | * check if path exists 73 | * 74 | * @param path error path 75 | * @return true if path exists 76 | */ 77 | def existError(path: String): Boolean = { 78 | val errorPath = new Path(path) 79 | val fileSystem = errorPath.getFileSystem(new Configuration()) 80 | fileSystem.exists(new Path(path)) 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/GenerateConfigTemplate.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2023 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
4 | */ 5 | 6 | package com.vesoft.exchange.common 7 | 8 | import com.vesoft.exchange.common.config.SourceCategory 9 | import org.apache.commons.cli.{ 10 | CommandLine, 11 | CommandLineParser, 12 | HelpFormatter, 13 | Option, 14 | Options, 15 | ParseException, 16 | PosixParser 17 | } 18 | 19 | object GenerateConfigTemplate { 20 | 21 | def main(args: Array[String]): Unit = { 22 | val sourceOption = new Option("s", "dataSource", true, "data source type") 23 | sourceOption.setRequired(true) 24 | 25 | val pathOption = new Option("p", "path", true, "target path to save the template config file") 26 | pathOption.setRequired(true) 27 | 28 | val options = new Options 29 | options.addOption(sourceOption) 30 | options.addOption(pathOption) 31 | 32 | var cli: CommandLine = null 33 | val cliParser: CommandLineParser = new PosixParser() 34 | val helpFormatter = new HelpFormatter 35 | try { 36 | cli = cliParser.parse(options, args) 37 | } catch { 38 | case e: ParseException => 39 | helpFormatter.printHelp(">>>> options", options) 40 | e.printStackTrace() 41 | System.exit(1) 42 | } 43 | val source: String = cli.getOptionValue("s") 44 | val path: String = cli.getOptionValue("p") 45 | 46 | getConfigTemplate(source, path) 47 | } 48 | 49 | def getConfigTemplate(source: String, path: String): Unit = { 50 | val sourceCategory = SourceCategory.withName(source.trim.toUpperCase) 51 | 52 | val fileMigrate = new FileMigrate 53 | sourceCategory match { 54 | case SourceCategory.CSV => 55 | fileMigrate.saveConfig("config_template/csv.conf", path + "/csv.conf") 56 | case SourceCategory.JSON => 57 | fileMigrate.saveConfig("config_template/json.conf", path + "/json.conf") 58 | case SourceCategory.ORC => 59 | fileMigrate.saveConfig("config_template/orc.conf", path + "/orc.conf") 60 | case SourceCategory.PARQUET => 61 | fileMigrate.saveConfig("config_template/parquet.conf", path + "/parquet.conf") 62 | case SourceCategory.HIVE => 63 | fileMigrate.saveConfig("config_template/hive.conf", path + "/hive.conf") 64 | case SourceCategory.JDBC | SourceCategory.MYSQL | SourceCategory.CLICKHOUSE | 65 | SourceCategory.MAXCOMPUTE | SourceCategory.ORC | SourceCategory.POSTGRESQL => 66 | fileMigrate.saveConfig("config_template/jdbc.conf", path + "/jdbc.conf") 67 | case SourceCategory.NEO4J => 68 | fileMigrate.saveConfig("config_template/neo4j.conf", path + "/neo4j.conf") 69 | case _ => throw new IllegalArgumentException(s"does not support datasource $sourceCategory") 70 | } 71 | } 72 | 73 | } 74 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/GraphProvider.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
4 | */ 5 | 6 | package com.vesoft.exchange.common 7 | 8 | import com.google.common.net.HostAndPort 9 | import com.vesoft.exchange.common.config.{SslConfigEntry, SslType, UserConfigEntry} 10 | import com.vesoft.nebula.client.graph.NebulaPoolConfig 11 | import com.vesoft.nebula.client.graph.data.{ 12 | CASignedSSLParam, 13 | HostAddress, 14 | ResultSet, 15 | SSLParam, 16 | SelfSignedSSLParam 17 | } 18 | import com.vesoft.nebula.client.graph.net.{NebulaPool, Session} 19 | import org.apache.log4j.Logger 20 | 21 | import scala.collection.JavaConverters._ 22 | import scala.collection.mutable.ListBuffer 23 | 24 | /** 25 | * GraphProvider for Nebula Graph Service 26 | */ 27 | class GraphProvider(addresses: List[HostAddress], 28 | timeout: Int, 29 | sslConfigEntry: SslConfigEntry) 30 | extends AutoCloseable 31 | with Serializable { 32 | private[this] lazy val LOG = Logger.getLogger(this.getClass) 33 | 34 | @transient val nebulaPoolConfig = new NebulaPoolConfig 35 | @transient val pool: NebulaPool = new NebulaPool 36 | val randAddr = scala.util.Random.shuffle(addresses) 37 | 38 | nebulaPoolConfig.setTimeout(timeout) 39 | 40 | // com.vesoft.exchange.common.config graph ssl 41 | nebulaPoolConfig.setEnableSsl(sslConfigEntry.enableGraph) 42 | if (sslConfigEntry.enableGraph) { 43 | var sslParam: SSLParam = null 44 | if (sslConfigEntry.signType == SslType.CA) { 45 | val ca = sslConfigEntry.caSignParam 46 | sslParam = new CASignedSSLParam(ca.caCrtFilePath, ca.crtFilePath, ca.keyFilePath) 47 | } else { 48 | val self = sslConfigEntry.selfSignParam 49 | sslParam = new SelfSignedSSLParam(self.crtFilePath, self.keyFilePath, self.password) 50 | } 51 | nebulaPoolConfig.setSslParam(sslParam) 52 | } 53 | 54 | pool.init(randAddr.asJava, nebulaPoolConfig) 55 | 56 | def getGraphClient(userConfigEntry: UserConfigEntry): Session = { 57 | pool.getSession(userConfigEntry.user, userConfigEntry.password, true); 58 | } 59 | 60 | def releaseGraphClient(session: Session): Unit = { 61 | session.release() 62 | } 63 | 64 | override def close(): Unit = { 65 | pool.close() 66 | } 67 | 68 | def switchSpace(session: Session, space: String): (HostAddress, ResultSet) = { 69 | val switchStatment = s"use $space" 70 | LOG.info(s">>>>>> switch space $space") 71 | val result = submit(session, switchStatment) 72 | result 73 | } 74 | 75 | def submit(session: Session, statement: String): (HostAddress, ResultSet) = { 76 | val result = session.execute(statement) 77 | (session.getGraphHost, result) 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/MetaProvider.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
4 | */ 5 | 6 | package com.vesoft.exchange.common 7 | 8 | import com.google.common.net.HostAndPort 9 | import com.vesoft.exchange.common.config.{SslConfigEntry, SslType, Type} 10 | import com.vesoft.nebula.PropertyType 11 | import com.vesoft.nebula.client.graph.data.{ 12 | CASignedSSLParam, 13 | HostAddress, 14 | SSLParam, 15 | SelfSignedSSLParam 16 | } 17 | import com.vesoft.nebula.client.meta.MetaClient 18 | import com.vesoft.nebula.meta.{EdgeItem, TagItem} 19 | import org.apache.log4j.Logger 20 | 21 | import scala.collection.JavaConverters._ 22 | import scala.collection.mutable 23 | import scala.collection.mutable.ListBuffer 24 | 25 | /** 26 | * MetaProvider provide nebula graph meta query operations. 27 | */ 28 | class MetaProvider(addresses: List[HostAddress], 29 | timeout: Int, 30 | retry: Int, 31 | sslConfigEntry: SslConfigEntry) 32 | extends AutoCloseable 33 | with Serializable { 34 | private[this] lazy val LOG = Logger.getLogger(this.getClass) 35 | 36 | private var metaClient: MetaClient = null 37 | var sslParam: SSLParam = null 38 | // com.vesoft.exchange.common.config meta ssl 39 | if (sslConfigEntry.enableMeta) { 40 | if (sslConfigEntry.signType == SslType.CA) { 41 | val ca = sslConfigEntry.caSignParam 42 | sslParam = new CASignedSSLParam(ca.caCrtFilePath, ca.crtFilePath, ca.keyFilePath) 43 | } else { 44 | val self = sslConfigEntry.selfSignParam 45 | sslParam = new SelfSignedSSLParam(self.crtFilePath, self.keyFilePath, self.password) 46 | } 47 | metaClient = new MetaClient(addresses.asJava, timeout, retry, retry, true, sslParam) 48 | } else { 49 | metaClient = new MetaClient(addresses.asJava, timeout, retry, retry) 50 | } 51 | 52 | metaClient.connect() 53 | 54 | def getPartNumber(space: String): Int = { 55 | metaClient.getPartsAlloc(space).size() 56 | } 57 | 58 | def getVidType(space: String): VidType.Value = { 59 | val vidType = metaClient.getSpace(space).getProperties.getVid_type.getType 60 | if (vidType == PropertyType.FIXED_STRING) { 61 | return VidType.STRING 62 | } 63 | VidType.INT 64 | } 65 | 66 | def getTagSchema(space: String, tag: String): Map[String, Integer] = { 67 | val tagSchema = metaClient.getTag(space, tag) 68 | val schema = new mutable.HashMap[String, Integer] 69 | 70 | val columns = tagSchema.getColumns 71 | for (colDef <- columns.asScala) { 72 | schema.put(new String(colDef.getName), colDef.getType.getType.getValue) 73 | } 74 | schema.toMap 75 | } 76 | 77 | def getEdgeSchema(space: String, edge: String): Map[String, Integer] = { 78 | val edgeSchema = metaClient.getEdge(space, edge) 79 | val schema = new mutable.HashMap[String, Integer] 80 | 81 | val columns = edgeSchema.getColumns 82 | for (colDef <- columns.asScala) { 83 | schema.put(new String(colDef.getName), colDef.getType.getType.getValue) 84 | } 85 | schema.toMap 86 | } 87 | 88 | def getLabelType(space: String, label: String): Type.Value = { 89 | val tags = metaClient.getTags(space) 90 | for (tag <- tags.asScala) { 91 | if (new String(tag.getTag_name).equals(label)) { 92 | return Type.VERTEX 93 | } 94 | } 95 | val edges = metaClient.getEdges(space) 96 | for (edge <- edges.asScala) { 97 | if (new String(edge.getEdge_name).equals(label)) { 98 | return Type.EDGE 99 | } 100 | } 101 | null 102 | } 103 | 104 | def getSpaceVidLen(space: String): Int = { 105 | val spaceItem = metaClient.getSpace(space); 106 | if (spaceItem == null) { 107 | throw new IllegalArgumentException(s"space $space does not exist.") 108 | } 109 | spaceItem.getProperties.getVid_type.getType_length 110 | } 111 | 112 | def 
getTagItem(space: String, tag: String): TagItem = { 113 | val tagItemList = metaClient.getTags(space).asScala 114 | for (tagItem: TagItem <- tagItemList) { 115 | if (new String(tagItem.tag_name).equals(tag)) { 116 | return tagItem 117 | } 118 | } 119 | throw new IllegalArgumentException(s"tag ${space}.${tag} does not exist.") 120 | } 121 | 122 | def getEdgeItem(space: String, edge: String): EdgeItem = { 123 | val edgeItemList = metaClient.getEdges(space).asScala 124 | for (edgeItem: EdgeItem <- edgeItemList) { 125 | if (new String(edgeItem.edge_name).equals(edge)) { 126 | return edgeItem 127 | } 128 | } 129 | throw new IllegalArgumentException(s"edge ${space}.${edge} does not exist.") 130 | } 131 | 132 | override def close(): Unit = { 133 | metaClient.close() 134 | } 135 | 136 | } 137 | 138 | object VidType extends Enumeration { 139 | type Type = Value 140 | 141 | val STRING = Value("STRING") 142 | val INT = Value("INT") 143 | } 144 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/Package.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange 7 | 8 | import com.google.common.base.Optional 9 | import com.google.common.util.concurrent.ListenableFuture 10 | import com.vesoft.exchange.common.utils.NebulaUtils 11 | 12 | import scala.collection.mutable.ListBuffer 13 | 14 | package object common { 15 | 16 | type GraphSpaceID = Int 17 | type PartitionID = Int 18 | type TagID = Int 19 | type EdgeType = Int 20 | type SchemaID = (TagID, EdgeType) 21 | type TagVersion = Long 22 | type EdgeVersion = Long 23 | type SchemaVersion = (TagVersion, EdgeVersion) 24 | type VertexID = Long 25 | type VertexIDSlice = String 26 | type EdgeRank = Long 27 | type PropertyNames = List[String] 28 | type PropertyValues = List[Any] 29 | type ProcessResult = ListBuffer[WriterResult] 30 | type WriterResult = ListenableFuture[Optional[Integer]] 31 | 32 | case class Vertex(vertexID: VertexIDSlice, values: PropertyValues) { 33 | 34 | def propertyValues = values.mkString(", ") 35 | 36 | override def toString: String = { 37 | s"Vertex ID: ${vertexID}, " + 38 | s"Values: ${values.mkString(", ")}" 39 | } 40 | } 41 | 42 | case class Vertices(names: PropertyNames, 43 | values: List[Vertex], 44 | policy: Option[KeyPolicy.Value] = None) { 45 | 46 | def propertyNames: String = NebulaUtils.escapePropName(names).mkString(",") 47 | 48 | override def toString: String = { 49 | s"Vertices: " + 50 | s"Property Names: ${names.mkString(", ")}" + 51 | s"Vertex Values: ${values.mkString(", ")} " + 52 | s"with policy ${policy}" 53 | } 54 | } 55 | 56 | case class Edge(source: VertexIDSlice, 57 | destination: VertexIDSlice, 58 | ranking: Option[EdgeRank], 59 | values: PropertyValues) { 60 | 61 | def this(source: VertexIDSlice, destination: VertexIDSlice, values: PropertyValues) = { 62 | this(source, destination, None, values) 63 | } 64 | 65 | def propertyValues: String = values.mkString(", ") 66 | 67 | override def toString: String = { 68 | val rank = if (ranking.isEmpty) 0 else ranking.get 69 | s"Edge: ${source}->${destination}@${rank} values: ${propertyValues}" 70 | } 71 | } 72 | 73 | case class Edges(names: PropertyNames, 74 | values: List[Edge], 75 | sourcePolicy: Option[KeyPolicy.Value] = None, 76 | targetPolicy: Option[KeyPolicy.Value] = None) { 77 | 
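  // Sketch of how this container is consumed (hedged; the exact statement text lives in
  // ServerBaseWriter): an Edges batch groups Edge values sharing one `names` list and is rendered into
  // a single statement roughly of the form
  //   INSERT EDGE `edge_name`(`prop1`, `prop2`) VALUES "src"->"dst"@rank:(v1, v2), ...
  // with sourcePolicy/targetPolicy optionally wrapping the ids in hash()/uuid() (see KeyPolicy below).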
def propertyNames: String = NebulaUtils.escapePropName(names).mkString(",") 78 | 79 | override def toString: String = { 80 | "Edges:" + 81 | s"Property Names: ${names.mkString(", ")}" + 82 | s"with source policy ${sourcePolicy}" + 83 | s"with target policy ${targetPolicy}" 84 | } 85 | } 86 | 87 | object KeyPolicy extends Enumeration { 88 | type POLICY = Value 89 | val HASH = Value("hash") 90 | val UUID = Value("uuid") 91 | } 92 | 93 | case class Offset(start: Long, size: Long) 94 | } 95 | 96 | final case class Argument(config: String = "application.conf", 97 | hive: Boolean = false, 98 | directly: Boolean = false, 99 | dry: Boolean = false, 100 | reload: String = "", 101 | variable: Boolean = false, 102 | param: String = "") 103 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/PasswordEncryption.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2023 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange.common 7 | 8 | import com.vesoft.exchange.Argument 9 | 10 | import java.security.spec.{PKCS8EncodedKeySpec, X509EncodedKeySpec} 11 | import java.security.{KeyFactory, KeyPairGenerator, SecureRandom} 12 | import java.util.Base64 13 | import javax.crypto.Cipher 14 | import javax.crypto.spec.SecretKeySpec 15 | 16 | object PasswordEncryption { 17 | private val algorithm = "RSA" 18 | private val charset = "UTF-8" 19 | 20 | def main(args: Array[String]): Unit = { 21 | val passwdOption = new scopt.OptionParser[PasswordConfig]("encrypt password") { 22 | head("encrypt password") 23 | 24 | opt[String]('p', "passwd") 25 | .required() 26 | .valueName("passwd") 27 | .action((x, c) => c.copy(password = x)) 28 | .text("your real password") 29 | }.parse(args, PasswordConfig()) 30 | 31 | require(passwdOption.isDefined && passwdOption.get.password != null, "lack of password parameter") 32 | 33 | val password:String = passwdOption.get.password 34 | 35 | val (encryptedPasswd, privateKey) = encryptPassword(password) 36 | println(s"=================== private key begin ===================") 37 | println(privateKey) 38 | println(s"=================== private key end ===================\n\n") 39 | 40 | println(s"=================== encrypted password begin ===================") 41 | println(encryptedPasswd) 42 | println(s"=================== encrypted password end ===================") 43 | 44 | println(s"check: the real password decrypted by private key and encrypted password is: ${decryptPassword(encryptedPasswd, privateKey)}") 45 | } 46 | 47 | /** 48 | * encrypt the password 49 | * 50 | * @param password real password 51 | * @return (encryptedPasswd, privateKey) 52 | */ 53 | def encryptPassword(password: String): (String, String) = { 54 | val keyPairGenerator = KeyPairGenerator.getInstance(algorithm) 55 | keyPairGenerator.initialize(1024, new SecureRandom()) 56 | val keyPair = keyPairGenerator.generateKeyPair() 57 | val privateKey = keyPair.getPrivate 58 | val privateKeyStr = new String(Base64.getEncoder.encode(privateKey.getEncoded), charset) 59 | val publicKey = keyPair.getPublic 60 | val publicKeyStr = new String(Base64.getEncoder.encode(publicKey.getEncoded), charset) 61 | println(s"=================== public key begin ===================") 62 | println(publicKeyStr) 63 | println(s"=================== public key end ===================\n\n") 64 | 65 | // encrypt the 
password 66 | val encoded = Base64.getDecoder.decode(publicKeyStr) 67 | val rsaPublicKey = KeyFactory.getInstance(algorithm).generatePublic(new X509EncodedKeySpec(encoded)) 68 | val cipher = Cipher.getInstance(algorithm) 69 | cipher.init(Cipher.ENCRYPT_MODE, rsaPublicKey) 70 | val encodePasswd = new String(Base64.getEncoder.encode(cipher.doFinal(password.getBytes(charset))), charset) 71 | (encodePasswd, privateKeyStr) 72 | } 73 | 74 | /** 75 | * decrypt the encrypted password with private key 76 | * 77 | * @param encryptedPassword encrypted password 78 | * @param privateKey rsa private key 79 | * @return real password 80 | */ 81 | def decryptPassword(encryptedPassword: String, privateKey: String): String = { 82 | val encryptedPasswdBytes = Base64.getDecoder.decode(encryptedPassword) 83 | val decodedPrivateKey = Base64.getDecoder.decode(privateKey) 84 | val rsaPrivateKey = KeyFactory.getInstance(algorithm).generatePrivate(new PKCS8EncodedKeySpec(decodedPrivateKey)) 85 | val cipher = Cipher.getInstance(algorithm) 86 | cipher.init(Cipher.DECRYPT_MODE, rsaPrivateKey) 87 | val password = new String(cipher.doFinal(encryptedPasswdBytes), charset) 88 | password 89 | } 90 | 91 | 92 | } 93 | 94 | final case class PasswordConfig(password: String = null) 95 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/config/SchemaConfigs.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange.common.config 7 | 8 | import com.vesoft.exchange.common.KeyPolicy 9 | 10 | /** 11 | * SchemaConfigEntry is tag/edge super class use to save some basic parameter for importer. 
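 * Two concrete implementations follow in this file: TagConfigEntry for vertex import and
 * EdgeConfigEntry for edge import.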
12 | */ 13 | sealed trait SchemaConfigEntry { 14 | 15 | /** nebula tag or edge name */ 16 | def name: String 17 | 18 | /** see{@link DataSourceConfigEntry}*/ 19 | def dataSourceConfigEntry: DataSourceConfigEntry 20 | 21 | /** see{@link DataSinkConfigEntry}*/ 22 | def dataSinkConfigEntry: DataSinkConfigEntry 23 | 24 | /** data source fields which are going to be import to nebula as properties */ 25 | def fields: List[String] 26 | 27 | /** nebula properties which are going to fill value with data source value*/ 28 | def nebulaFields: List[String] 29 | 30 | /** vertex or edge amount of one batch import */ 31 | def batch: Int 32 | 33 | /** spark partition */ 34 | def partition: Int 35 | 36 | /** check point path */ 37 | def checkPointPath: Option[String] 38 | 39 | /** write mode */ 40 | def writeMode: WriteMode.Mode 41 | } 42 | 43 | /** 44 | * 45 | * @param name 46 | * @param dataSourceConfigEntry 47 | * @param dataSinkConfigEntry 48 | * @param fields 49 | * @param nebulaFields 50 | * @param vertexField 51 | * @param vertexPolicy 52 | * @param batch 53 | * @param partition 54 | * @param checkPointPath 55 | */ 56 | case class TagConfigEntry(override val name: String, 57 | override val dataSourceConfigEntry: DataSourceConfigEntry, 58 | override val dataSinkConfigEntry: DataSinkConfigEntry, 59 | override val fields: List[String], 60 | override val nebulaFields: List[String], 61 | override val writeMode: WriteMode.Mode, 62 | vertexField: String, 63 | vertexPolicy: Option[KeyPolicy.Value], 64 | vertexPrefix: String, 65 | override val batch: Int, 66 | override val partition: Int, 67 | override val checkPointPath: Option[String], 68 | repartitionWithNebula: Boolean = true, 69 | enableTagless: Boolean = false, 70 | ignoreIndex: Boolean = false, 71 | deleteEdge: Boolean = false, 72 | vertexUdf: Option[UdfConfigEntry] = None, 73 | filterConfig: Option[FilterConfigEntry] = None) 74 | extends SchemaConfigEntry { 75 | require(name.trim.nonEmpty, "tag name cannot be empty") 76 | require(vertexField.trim.nonEmpty, "tag vertex id cannot be empty") 77 | require(batch > 0, "batch config must be larger than 0") 78 | require(fields.size == nebulaFields.size, 79 | "fields and nebula.fields must have the same element number") 80 | 81 | override def toString: String = { 82 | s"Tag name: $name, " + 83 | s"source: $dataSourceConfigEntry, " + 84 | s"sink: $dataSinkConfigEntry, " + 85 | s"writeMode: $writeMode, " + 86 | s"vertex field: $vertexField, " + 87 | s"vertex policy: $vertexPolicy, " + 88 | s"batch: $batch, " + 89 | s"partition: $partition, " + 90 | s"repartitionWithNebula: $repartitionWithNebula, " + 91 | s"enableTagless: $enableTagless, " + 92 | s"ignoreIndex: $ignoreIndex, " + 93 | s"vertexUdf: $vertexUdf, " + 94 | s"filter: $filterConfig." 
95 | } 96 | } 97 | 98 | /** 99 | * 100 | * @param name 101 | * @param dataSourceConfigEntry 102 | * @param dataSinkConfigEntry 103 | * @param fields 104 | * @param nebulaFields 105 | * @param sourceField 106 | * @param sourcePolicy 107 | * @param rankingField 108 | * @param targetField 109 | * @param targetPolicy 110 | * @param isGeo 111 | * @param latitude 112 | * @param longitude 113 | * @param batch 114 | * @param partition 115 | * @param checkPointPath 116 | */ 117 | case class EdgeConfigEntry(override val name: String, 118 | override val dataSourceConfigEntry: DataSourceConfigEntry, 119 | override val dataSinkConfigEntry: DataSinkConfigEntry, 120 | override val fields: List[String], 121 | override val nebulaFields: List[String], 122 | override val writeMode: WriteMode.Mode, 123 | sourceField: String, 124 | sourcePolicy: Option[KeyPolicy.Value], 125 | sourcePrefix: String, 126 | rankingField: Option[String], 127 | targetField: String, 128 | targetPolicy: Option[KeyPolicy.Value], 129 | targetPrefix: String, 130 | isGeo: Boolean, 131 | latitude: Option[String], 132 | longitude: Option[String], 133 | override val batch: Int, 134 | override val partition: Int, 135 | override val checkPointPath: Option[String], 136 | repartitionWithNebula: Boolean = false, 137 | ignoreIndex: Boolean = false, 138 | srcVertexUdf: Option[UdfConfigEntry] = None, 139 | dstVertexUdf: Option[UdfConfigEntry] = None, 140 | filterConfig: Option[FilterConfigEntry] = None) 141 | extends SchemaConfigEntry { 142 | require(name.trim.nonEmpty, "edge name cannot be empty") 143 | require(sourceField.trim.nonEmpty, "edge source id cannot be empty") 144 | require(targetField.trim.nonEmpty, "edge target id cannot be empty") 145 | require(batch > 0, "batch config must be larger than 0") 146 | require(fields.size == nebulaFields.size, 147 | "fields and nebula.fields must have the same element number") 148 | 149 | override def toString: String = { 150 | if (isGeo) { 151 | s"Edge name: $name, " + 152 | s"source: $dataSourceConfigEntry, " + 153 | s"sink: $dataSinkConfigEntry, " + 154 | s"writeMode: $writeMode, " + 155 | s"latitude: $latitude, " + 156 | s"longitude: $longitude, " + 157 | s"source field: $sourceField, " + 158 | s"source policy: $sourcePolicy, " + 159 | s"ranking: $rankingField, " + 160 | s"target field: $targetField, " + 161 | s"target policy: $targetPolicy, " + 162 | s"batch: $batch, " + 163 | s"partition: $partition, " + 164 | s"ignoreIndex: $ignoreIndex, " + 165 | s"srcVertexUdf: $srcVertexUdf" + 166 | s"dstVertexUdf: $dstVertexUdf." 167 | } else { 168 | s"Edge name: $name, " + 169 | s"source: $dataSourceConfigEntry, " + 170 | s"sink: $dataSinkConfigEntry, " + 171 | s"writeMode: $writeMode, " + 172 | s"source field: $sourceField, " + 173 | s"source policy: $sourcePolicy, " + 174 | s"ranking: $rankingField, " + 175 | s"target field: $targetField, " + 176 | s"target policy: $targetPolicy, " + 177 | s"batch: $batch, " + 178 | s"partition: $partition, " + 179 | s"ignoreIndex: $ignoreIndex, " + 180 | s"srcVertexUdf: $srcVertexUdf" + 181 | s"dstVertexUdf: $dstVertexUdf." 182 | } 183 | } 184 | } 185 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/config/SinkConfigs.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
4 | */ 5 | 6 | package com.vesoft.exchange.common.config 7 | 8 | /** 9 | * SinkCategory is used to expression the writer's type. 10 | */ 11 | object SinkCategory extends Enumeration { 12 | type Type = Value 13 | 14 | val CLIENT = Value("CLIENT") 15 | val SST = Value("SST") 16 | } 17 | 18 | class SinkCategory 19 | 20 | /** 21 | * DataSinkConfigEntry 22 | */ 23 | sealed trait DataSinkConfigEntry { 24 | def category: SinkCategory.Value 25 | } 26 | 27 | /** 28 | * FileBaseSinkConfigEntry 29 | */ 30 | case class FileBaseSinkConfigEntry(override val category: SinkCategory.Value, 31 | localPath: String, 32 | remotePath: String, 33 | fsName: Option[String]) 34 | extends DataSinkConfigEntry { 35 | 36 | override def toString: String = { 37 | val fullRemotePath = 38 | if (fsName.isDefined) s"${fsName.get}$remotePath" 39 | else remotePath 40 | s"File sink: from ${localPath} to $fullRemotePath" 41 | } 42 | } 43 | 44 | /** 45 | * NebulaSinkConfigEntry use to specified the nebula service's address. 46 | */ 47 | case class NebulaSinkConfigEntry(override val category: SinkCategory.Value, addresses: List[String]) 48 | extends DataSinkConfigEntry { 49 | override def toString: String = { 50 | s"Nebula sink addresses: ${addresses.mkString("[", ", ", "]")}" 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/processor/ReloadProcessor.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange.common.processor 7 | 8 | import com.vesoft.exchange.common.{ErrorHandler, GraphProvider} 9 | import com.vesoft.exchange.common.GraphProvider 10 | import com.vesoft.exchange.common.config.Configs 11 | import com.vesoft.exchange.common.writer.NebulaGraphClientWriter 12 | import org.apache.log4j.Logger 13 | import org.apache.spark.{SparkEnv, TaskContext} 14 | import org.apache.spark.sql.{DataFrame, Row} 15 | import org.apache.spark.util.LongAccumulator 16 | 17 | import java.util.regex.Pattern 18 | import scala.collection.mutable.ArrayBuffer 19 | 20 | class ReloadProcessor(data: DataFrame, 21 | config: Configs, 22 | batchSuccess: LongAccumulator, 23 | batchFailure: LongAccumulator, 24 | recordSuccess: LongAccumulator) 25 | extends Processor { 26 | @transient 27 | private[this] lazy val LOG = Logger.getLogger(this.getClass) 28 | 29 | override def process(): Unit = { 30 | data.foreachPartition((rows: Iterator[Row]) => processEachPartition(rows)) 31 | } 32 | 33 | private def processEachPartition(iterator: Iterator[Row]): Unit = { 34 | val graphProvider = 35 | new GraphProvider(config.databaseConfig.getGraphAddress, 36 | config.connectionConfig.timeout, 37 | config.sslConfig) 38 | 39 | val writer = new NebulaGraphClientWriter(config.databaseConfig, 40 | config.userConfig, 41 | config.rateConfig, 42 | null, 43 | graphProvider, 44 | config.executionConfig) 45 | 46 | val errorBuffer = ArrayBuffer[String]() 47 | 48 | writer.prepare() 49 | // batch write 50 | val startTime = System.currentTimeMillis 51 | iterator.foreach { row => 52 | val ngql = row.getString(0) 53 | val failStatement = writer.writeNgql(ngql) 54 | if (failStatement == null) { 55 | batchSuccess.add(1) 56 | recordSuccess.add(1) 57 | } else { 58 | errorBuffer.append(failStatement) 59 | batchFailure.add(1) 60 | } 61 | } 62 | if (errorBuffer.nonEmpty) { 63 | 
ErrorHandler.save( 64 | errorBuffer, 65 | s"${config.errorConfig.errorPath}/${SparkEnv.get.blockManager.conf.getAppId}/reload.${TaskContext 66 | .getPartitionId()}") 67 | errorBuffer.clear() 68 | } 69 | LOG.info(s">>>>> data reload in partition ${TaskContext 70 | .getPartitionId()} cost ${System.currentTimeMillis() - startTime}ms") 71 | writer.close() 72 | graphProvider.close() 73 | } 74 | 75 | /** 76 | * compute the record amount of ngql 77 | * @param ngql nebula insert ngql 78 | */ 79 | private def computeRecordNumber(ngql: String): Int = { 80 | val substring = ": (" 81 | var count = 0 82 | var index = 0 83 | while (index != -1) { 84 | count += 1 85 | index = ngql.indexOf(substring, index) 86 | if (index != (-1)) { 87 | index += substring.length 88 | } 89 | } 90 | count 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/utils/ConfigTemplateUtils.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2023 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange.common.utils 7 | 8 | import com.vesoft.exchange.common.FileMigrate 9 | import com.vesoft.exchange.common.config.SourceCategory 10 | 11 | import java.io.{BufferedInputStream, BufferedOutputStream, File, FileOutputStream, InputStream} 12 | 13 | object ConfigTemplateUtils { 14 | 15 | def getConfigTemplate(source: String, path: String): Unit = { 16 | val sourceCategory = SourceCategory.withName(source.trim.toUpperCase) 17 | 18 | val fileMigrate = new FileMigrate 19 | sourceCategory match { 20 | case SourceCategory.CSV => 21 | fileMigrate.saveConfig("config_template/csv.conf", path + "/csv.conf") 22 | case SourceCategory.JSON => 23 | fileMigrate.saveConfig("config_template/json.conf", path + "/json.conf") 24 | case SourceCategory.ORC => 25 | fileMigrate.saveConfig("config_template/orc.conf", path + "/orc.conf") 26 | case SourceCategory.PARQUET => 27 | fileMigrate.saveConfig("config_template/parquet.conf", path + "/parquet.conf") 28 | case SourceCategory.HIVE => 29 | fileMigrate.saveConfig("config_template/hive.conf", path + "/hive.conf") 30 | case SourceCategory.HBASE=> 31 | fileMigrate.saveConfig("config_template/hbase.conf", path + "/hbase.conf") 32 | case SourceCategory.JDBC | SourceCategory.MYSQL | SourceCategory.CLICKHOUSE | 33 | SourceCategory.MAXCOMPUTE | SourceCategory.ORC | SourceCategory.POSTGRESQL => 34 | fileMigrate.saveConfig("config_template/jdbc.conf", path + "/jdbc.conf") 35 | case SourceCategory.NEO4J => 36 | fileMigrate.saveConfig("config_template/neo4j.conf", path + "/neo4j.conf") 37 | case SourceCategory.KAFKA | SourceCategory.PULSAR => 38 | fileMigrate.saveConfig("config_template/kafka.conf", path + "/kafka.conf") 39 | case _ => throw new IllegalArgumentException(s"does not support datasource $sourceCategory") 40 | } 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/utils/HDFSUtils.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
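A usage sketch for the template generator above (the output directory is hypothetical):

    // writes the bundled config_template/csv.conf resource to /tmp/exchange-templates/csv.conf
    ConfigTemplateUtils.getConfigTemplate("csv", "/tmp/exchange-templates")
    // JDBC-style sources (mysql, clickhouse, maxcompute, postgresql) all resolve to jdbc.conf
    ConfigTemplateUtils.getConfigTemplate("mysql", "/tmp/exchange-templates")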
4 | */ 5 | 6 | package com.vesoft.exchange.common.utils 7 | 8 | import java.io.File 9 | import java.nio.charset.Charset 10 | 11 | import org.apache.hadoop.conf.Configuration 12 | import org.apache.hadoop.fs.{FileSystem, Path} 13 | import org.apache.log4j.Logger 14 | 15 | import scala.io.Source 16 | 17 | object HDFSUtils { 18 | private[this] val LOG = Logger.getLogger(this.getClass) 19 | 20 | def getFileSystem(namenode: String = null): FileSystem = { 21 | val conf = new Configuration() 22 | if (namenode != null) { 23 | conf.set("fs.default.name", namenode) 24 | conf.set("fs.defaultFS", namenode) 25 | } 26 | FileSystem.get(conf) 27 | } 28 | 29 | def list(path: String): List[String] = { 30 | val system = getFileSystem() 31 | system.listStatus(new Path(path)).map(_.getPath.getName).toList 32 | } 33 | 34 | def exists(path: String): Boolean = { 35 | val system = getFileSystem() 36 | system.exists(new Path(path)) 37 | } 38 | 39 | def getContent(path: String): String = { 40 | val system = getFileSystem() 41 | val inputStream = system.open(new Path(path)) 42 | Source.fromInputStream(inputStream).mkString 43 | } 44 | 45 | def saveContent(path: String, 46 | content: String, 47 | charset: Charset = Charset.defaultCharset()): Unit = { 48 | val system = getFileSystem() 49 | val outputStream = system.create(new Path(path)) 50 | try { 51 | outputStream.write(content.getBytes(charset)) 52 | } finally { 53 | outputStream.close() 54 | } 55 | } 56 | 57 | def upload(localPath: String, remotePath: String, namenode: String = null): Unit = { 58 | try { 59 | val localFile = new File(localPath) 60 | if (!localFile.exists() || localFile.length() <= 0) { 61 | return 62 | } 63 | } catch { 64 | case e: Throwable => 65 | LOG.warn("check for empty local file error, but you can ignore this check error. " + 66 | "If there is empty sst file in your hdfs, please delete it manually", 67 | e) 68 | } 69 | val system = getFileSystem(namenode) 70 | system.copyFromLocalFile(new Path(localPath), new Path(remotePath)) 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/utils/NebulaPartitioner.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange.common.utils 7 | 8 | import java.nio.{ByteBuffer, ByteOrder} 9 | import org.apache.spark.Partitioner 10 | 11 | class NebulaPartitioner(partitions: Int) extends Partitioner { 12 | require(partitions >= 0, s"Number of partitions ($partitions) cannot be negative.") 13 | 14 | override def numPartitions: Int = partitions 15 | 16 | override def getPartition(key: Any): Int = { 17 | var part = ByteBuffer 18 | .wrap(key.asInstanceOf[Array[Byte]], 0, 4) 19 | .order(ByteOrder.nativeOrder) 20 | .getInt >> 8 21 | if (part <= 0) { 22 | part = part + partitions 23 | } 24 | part - 1 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/utils/NebulaUtils.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
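A brief usage sketch for HDFSUtils above (the paths and namenode address are hypothetical):

    HDFSUtils.saveContent("/tmp/checkpoint/demo.0", "12345")   // write one checkpoint offset
    val offset = HDFSUtils.getContent("/tmp/checkpoint/demo.0").trim.toLong
    HDFSUtils.upload("/tmp/sst/1-0-1.sst", "/sst/1/1-0-1.sst", "hdfs://namenode:9000")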
4 | */ 5 | 6 | package com.vesoft.exchange.common.utils 7 | 8 | import com.google.common.base.{CharMatcher, Strings} 9 | 10 | import java.nio.charset.Charset 11 | import java.nio.ByteBuffer 12 | import java.nio.ByteOrder 13 | import com.google.common.primitives.UnsignedLong 14 | import com.vesoft.exchange.common.MetaProvider 15 | import com.vesoft.exchange.common.VidType 16 | import com.vesoft.exchange.common.config.{SchemaConfigEntry, Type} 17 | import com.vesoft.nebula.client.graph.data.HostAddress 18 | import org.apache.commons.codec.digest.MurmurHash2 19 | import org.apache.log4j.Logger 20 | 21 | import scala.collection.JavaConversions.seqAsJavaList 22 | import scala.collection.mutable 23 | import scala.collection.mutable.ListBuffer 24 | 25 | object NebulaUtils { 26 | val DEFAULT_EMPTY_VALUE: String = "_NEBULA_EMPTY" 27 | 28 | private[this] val LOG = Logger.getLogger(this.getClass) 29 | 30 | def getDataSourceFieldType(sourceConfig: SchemaConfigEntry, 31 | space: String, 32 | metaProvider: MetaProvider): Map[String, Int] = { 33 | val nebulaFields = sourceConfig.nebulaFields 34 | val sourceFields = sourceConfig.fields 35 | val label = sourceConfig.name 36 | 37 | var nebulaSchemaMap: Map[String, Integer] = null 38 | val dataType: Type.Value = metaProvider.getLabelType(space, label) 39 | if (dataType == null) { 40 | throw new IllegalArgumentException(s"label $label does not exist.") 41 | } 42 | if (dataType == Type.VERTEX) { 43 | nebulaSchemaMap = metaProvider.getTagSchema(space, label) 44 | } else { 45 | nebulaSchemaMap = metaProvider.getEdgeSchema(space, label) 46 | } 47 | 48 | val sourceSchemaMap: mutable.Map[String, Int] = mutable.HashMap[String, Int]() 49 | for (i <- nebulaFields.indices) { 50 | val nebulaField = nebulaFields.get(i) 51 | if (!nebulaSchemaMap.contains(nebulaField)) { 52 | throw new IllegalArgumentException( 53 | s"property name $nebulaField is not defined in NebulaGraph") 54 | } 55 | sourceSchemaMap.put(sourceFields.get(i), nebulaSchemaMap(nebulaField)) 56 | } 57 | sourceSchemaMap.toMap 58 | } 59 | 60 | def isNumic(str: String): Boolean = { 61 | val newStr: String = if (str.startsWith("-")) { 62 | str.substring(1) 63 | } else { str } 64 | 65 | for (char <- newStr.toCharArray) { 66 | if (!Character.isDigit(char)) return false 67 | } 68 | true 69 | } 70 | 71 | def escapeUtil(str: String): String = { 72 | var s = str 73 | if (s.contains("\\")) { 74 | s = s.replaceAll("\\\\", "\\\\\\\\") 75 | } 76 | if (s.contains("\t")) { 77 | s = s.replaceAll("\t", "\\\\t") 78 | } 79 | if (s.contains("\n")) { 80 | s = s.replaceAll("\n", "\\\\n") 81 | } 82 | if (s.contains("\"")) { 83 | s = s.replaceAll("\"", "\\\\\"") 84 | } 85 | if (s.contains("\'")) { 86 | s = s.replaceAll("\'", "\\\\'") 87 | } 88 | if (s.contains("\r")) { 89 | s = s.replaceAll("\r", "\\\\r") 90 | } 91 | if (s.contains("\b")) { 92 | s = s.replaceAll("\b", "\\\\b") 93 | } 94 | s 95 | } 96 | 97 | def getPartitionId(id: String, partitionSize: Int, vidType: VidType.Value): Int = { 98 | val hashValue: Long = if (vidType == VidType.STRING) { 99 | // todo charset must be the same with Nebula Space 100 | val byteId = id.getBytes(Charset.forName("UTF-8")) 101 | if (byteId.length == 8) { 102 | //byte array to long, need to take care of endianess 103 | ByteBuffer.wrap(byteId).order(ByteOrder.nativeOrder).getLong 104 | } else { 105 | MurmurHash2.hash64(byteId, byteId.length, 0xc70f6907) 106 | } 107 | } else { 108 | id.toLong 109 | } 110 | val unsignedValue = UnsignedLong.fromLongBits(hashValue) 111 | val partSize = 
UnsignedLong.fromLongBits(partitionSize) 112 | unsignedValue.mod(partSize).intValue + 1 113 | } 114 | 115 | def escapePropName(nebulaFields: List[String]): List[String] = { 116 | val propNames: ListBuffer[String] = new ListBuffer[String] 117 | for (key <- nebulaFields) { 118 | val sb = new StringBuilder() 119 | sb.append("`") 120 | sb.append(key) 121 | sb.append("`") 122 | propNames.append(sb.toString()) 123 | } 124 | propNames.toList 125 | } 126 | 127 | def getAddressFromString(addr: String): HostAddress = { 128 | if (addr == null) { 129 | throw new IllegalArgumentException("wrong address format.") 130 | } 131 | var host: String = null 132 | var portString: String = null 133 | 134 | if (addr.startsWith("[")) { 135 | val hostAndPort = getHostAndPortFromBracketedHost(addr) 136 | host = hostAndPort._1 137 | portString = hostAndPort._2 138 | } else { 139 | val colonPos = addr.indexOf(":") 140 | if (colonPos >= 0 && addr.indexOf(":", colonPos + 1) == -1) { 141 | host = addr.substring(0, colonPos) 142 | portString = addr.substring(colonPos + 1) 143 | } else { 144 | host = addr 145 | } 146 | } 147 | 148 | var port = -1; 149 | if (!Strings.isNullOrEmpty(portString)) { 150 | for (c <- portString.toCharArray) { 151 | if (!Character.isDigit(c)) { 152 | throw new IllegalArgumentException(s"Port must be numeric: $addr") 153 | } 154 | } 155 | port = Integer.parseInt(portString) 156 | if (port < 0 || port > 65535) { 157 | throw new IllegalArgumentException(s"Port number out of range: $addr") 158 | } 159 | } 160 | new HostAddress(host, port) 161 | } 162 | 163 | def getHostAndPortFromBracketedHost(addr: String): (String, String) = { 164 | val colonIndex = addr.indexOf(":") 165 | val closeBracketIndex = addr.lastIndexOf("]") 166 | if (colonIndex < 0 || closeBracketIndex < colonIndex) { 167 | throw new IllegalArgumentException(s"invalid bracketed host/port: $addr") 168 | } 169 | val host: String = addr.substring(1, closeBracketIndex) 170 | if (closeBracketIndex + 1 == addr.length) { 171 | return (host, "") 172 | } else { 173 | if (addr.charAt(closeBracketIndex + 1) != ':') { 174 | throw new IllegalArgumentException(s"only a colon may follow a close bracket: $addr") 175 | } 176 | for (i <- closeBracketIndex + 2 until addr.length) { 177 | if (!Character.isDigit(addr.charAt(i))) { 178 | throw new IllegalArgumentException(s"Port must be numeric: $addr") 179 | } 180 | } 181 | } 182 | (host, addr.substring(closeBracketIndex + 2)) 183 | } 184 | } 185 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/utils/SparkValidate.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange.common.utils 7 | 8 | object SparkValidate { 9 | def validate(sparkVersion: String, supportedVersions: String*): Unit = { 10 | if (sparkVersion != "UNKNOWN" && !supportedVersions.exists(sparkVersion.matches)) { 11 | throw new RuntimeException( 12 | s"""Your current spark version ${sparkVersion} is not supported by the current NebulaGraph Exchange. 13 | | please visit https://github.com/vesoft-inc/nebula-exchange#version-match to know which Exchange you need. 
14 | | """.stripMargin) 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/writer/FileBaseWriter.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange.common.writer 7 | 8 | import java.nio.{ByteBuffer, ByteOrder} 9 | import java.nio.file.{Files, Paths} 10 | 11 | import com.vesoft.exchange.common.config.FileBaseSinkConfigEntry 12 | import com.vesoft.exchange.common.utils.HDFSUtils 13 | import org.apache.spark.TaskContext 14 | import org.apache.spark.sql.Row 15 | import org.apache.spark.util.LongAccumulator 16 | import org.rocksdb.{EnvOptions, Options, RocksDB, SstFileWriter} 17 | import org.slf4j.LoggerFactory 18 | 19 | /** 20 | * NebulaSSTWriter 21 | */ 22 | class NebulaSSTWriter(path: String) extends Writer { 23 | var isOpen = false 24 | 25 | private val LOG = LoggerFactory.getLogger(getClass) 26 | 27 | try { 28 | RocksDB.loadLibrary() 29 | LOG.info(">>>>> Loading RocksDB successfully") 30 | } catch { 31 | case _: Exception => 32 | LOG.error(">>>>> Can't load RocksDB library!") 33 | } 34 | 35 | // TODO More Config ... 36 | val options = new Options() 37 | .setCreateIfMissing(true) 38 | 39 | val env = new EnvOptions() 40 | var writer: SstFileWriter = _ 41 | 42 | override def prepare(): Unit = { 43 | writer = new SstFileWriter(env, options) 44 | writer.open(path) 45 | isOpen = true 46 | } 47 | 48 | def write(key: Array[Byte], value: Array[Byte]): Unit = { 49 | writer.put(key, value) 50 | } 51 | 52 | override def close(): Unit = { 53 | if (isOpen) { 54 | writer.finish() 55 | writer.close() 56 | } 57 | options.close() 58 | env.close() 59 | } 60 | 61 | } 62 | 63 | class GenerateSstFile extends Serializable { 64 | private val LOG = LoggerFactory.getLogger(getClass) 65 | 66 | def writeSstFiles(iterator: Iterator[Row], 67 | fileBaseConfig: FileBaseSinkConfigEntry, 68 | partitionNum: Int, 69 | namenode: String, 70 | batchFailure: LongAccumulator): Unit = { 71 | val taskID = TaskContext.get().taskAttemptId() 72 | var writer: NebulaSSTWriter = null 73 | var currentPart = -1 74 | var currentPrefix = -1 75 | val localPath = fileBaseConfig.localPath 76 | val remotePath = fileBaseConfig.remotePath 77 | try { 78 | iterator.foreach { vertex => 79 | val key = vertex.getAs[Array[Byte]](0) 80 | val value = vertex.getAs[Array[Byte]](1) 81 | var part = ByteBuffer 82 | .wrap(key, 0, 4) 83 | .order(ByteOrder.nativeOrder) 84 | .getInt >> 8 85 | if (part <= 0) { 86 | part = part + partitionNum 87 | } 88 | // extract the prefix value for vertex key, there's two values 89 | // 1: vertex key with tag, 7: vertex key without tag 90 | val prefix: Int = ByteBuffer.wrap(key, 0, 1).get 91 | 92 | if (part != currentPart || prefix != currentPrefix) { 93 | if (writer != null) { 94 | writer.close() 95 | val localFile = s"$localPath/$currentPart-$taskID-$currentPrefix.sst" 96 | HDFSUtils.upload(localFile, 97 | s"$remotePath/${currentPart}/$currentPart-$taskID-$currentPrefix.sst", 98 | namenode) 99 | Files.delete(Paths.get(localFile)) 100 | } 101 | currentPart = part 102 | currentPrefix = prefix 103 | val tmp = s"$localPath/$currentPart-$taskID-$currentPrefix.sst" 104 | writer = new NebulaSSTWriter(tmp) 105 | writer.prepare() 106 | } 107 | writer.write(key, value) 108 | } 109 | } catch { 110 | case e: Throwable => { 
111 | LOG.error(">>>>> sst file write error,", e) 112 | batchFailure.add(1) 113 | } 114 | } finally { 115 | if (writer != null) { 116 | writer.close() 117 | val localFile = s"$localPath/$currentPart-$taskID-$currentPrefix.sst" 118 | HDFSUtils.upload(localFile, 119 | s"$remotePath/${currentPart}/$currentPart-$taskID-$currentPrefix.sst", 120 | namenode) 121 | Files.delete(Paths.get(localFile)) 122 | } 123 | } 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/writer/Writer.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange.common.writer 7 | 8 | /** 9 | * 10 | */ 11 | trait Writer extends Serializable { 12 | 13 | def prepare(): Unit 14 | 15 | def close() 16 | } 17 | -------------------------------------------------------------------------------- /exchange-common/src/test/resources/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.4' 2 | services: 3 | metad0: 4 | image: vesoft/nebula-metad:nightly 5 | environment: 6 | USER: root 7 | TZ: "${TZ}" 8 | command: 9 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 10 | - --local_ip=172.28.1.1 11 | - --ws_ip=172.28.1.1 12 | - --port=9559 13 | - --data_path=/data/meta 14 | - --log_dir=/logs 15 | - --v=0 16 | - --minloglevel=0 17 | - --heartbeat_interval_secs=2 18 | healthcheck: 19 | test: ["CMD", "curl", "-f", "http://172.28.1.1:11000/status"] 20 | interval: 30s 21 | timeout: 10s 22 | retries: 3 23 | start_period: 20s 24 | ports: 25 | - "9559:9559" 26 | - 11000 27 | - 11002 28 | volumes: 29 | - ./data/meta0:/data/meta:Z 30 | - ./logs/meta0:/logs:Z 31 | networks: 32 | nebula-net: 33 | ipv4_address: 172.28.1.1 34 | restart: on-failure 35 | cap_add: 36 | - SYS_PTRACE 37 | 38 | metad1: 39 | image: vesoft/nebula-metad:nightly 40 | environment: 41 | USER: root 42 | TZ: "${TZ}" 43 | command: 44 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 45 | - --local_ip=172.28.1.2 46 | - --ws_ip=172.28.1.2 47 | - --port=9559 48 | - --data_path=/data/meta 49 | - --log_dir=/logs 50 | - --v=0 51 | - --minloglevel=0 52 | - --heartbeat_interval_secs=2 53 | healthcheck: 54 | test: ["CMD", "curl", "-f", "http://172.28.1.2:11000/status"] 55 | interval: 30s 56 | timeout: 10s 57 | retries: 3 58 | start_period: 20s 59 | ports: 60 | - "9560:9559" 61 | - 11000 62 | - 11002 63 | volumes: 64 | - ./data/meta1:/data/meta:Z 65 | - ./logs/meta1:/logs:Z 66 | networks: 67 | nebula-net: 68 | ipv4_address: 172.28.1.2 69 | restart: on-failure 70 | cap_add: 71 | - SYS_PTRACE 72 | 73 | metad2: 74 | image: vesoft/nebula-metad:nightly 75 | environment: 76 | USER: root 77 | TZ: "${TZ}" 78 | command: 79 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 80 | - --local_ip=172.28.1.3 81 | - --ws_ip=172.28.1.3 82 | - --port=9559 83 | - --data_path=/data/meta 84 | - --log_dir=/logs 85 | - --v=0 86 | - --minloglevel=0 87 | - --heartbeat_interval_secs=2 88 | healthcheck: 89 | test: ["CMD", "curl", "-f", "http://172.28.1.3:11000/status"] 90 | interval: 30s 91 | timeout: 10s 92 | retries: 3 93 | start_period: 20s 94 | ports: 95 | - "9561:9559" 96 | - 11000 97 | - 11002 98 | volumes: 99 | - ./data/meta2:/data/meta:Z 100 | - ./logs/meta2:/logs:Z 101 | networks: 102 
| nebula-net: 103 | ipv4_address: 172.28.1.3 104 | restart: on-failure 105 | cap_add: 106 | - SYS_PTRACE 107 | 108 | storaged0: 109 | image: vesoft/nebula-storaged:nightly 110 | environment: 111 | USER: root 112 | TZ: "${TZ}" 113 | command: 114 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 115 | - --local_ip=172.28.2.1 116 | - --ws_ip=172.28.2.1 117 | - --port=9779 118 | - --data_path=/data/storage 119 | - --log_dir=/logs 120 | - --v=0 121 | - --minloglevel=0 122 | - --heartbeat_interval_secs=2 123 | depends_on: 124 | - metad0 125 | - metad1 126 | - metad2 127 | healthcheck: 128 | test: ["CMD", "curl", "-f", "http://172.28.2.1:12000/status"] 129 | interval: 30s 130 | timeout: 10s 131 | retries: 3 132 | start_period: 20s 133 | ports: 134 | - "9779:9779" 135 | - 12000 136 | - 12002 137 | volumes: 138 | - ./data/storage0:/data/storage:Z 139 | - ./logs/storage0:/logs:Z 140 | networks: 141 | nebula-net: 142 | ipv4_address: 172.28.2.1 143 | restart: on-failure 144 | cap_add: 145 | - SYS_PTRACE 146 | 147 | storaged1: 148 | image: vesoft/nebula-storaged:nightly 149 | environment: 150 | USER: root 151 | TZ: "${TZ}" 152 | command: 153 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 154 | - --local_ip=172.28.2.2 155 | - --ws_ip=172.28.2.2 156 | - --port=9779 157 | - --data_path=/data/storage 158 | - --log_dir=/logs 159 | - --v=0 160 | - --minloglevel=0 161 | - --heartbeat_interval_secs=2 162 | depends_on: 163 | - metad0 164 | - metad1 165 | - metad2 166 | healthcheck: 167 | test: ["CMD", "curl", "-f", "http://172.28.2.2:12000/status"] 168 | interval: 30s 169 | timeout: 10s 170 | retries: 3 171 | start_period: 20s 172 | ports: 173 | - "9780:9779" 174 | - 12000 175 | - 12002 176 | volumes: 177 | - ./data/storage1:/data/storage:Z 178 | - ./logs/storage1:/logs:Z 179 | networks: 180 | nebula-net: 181 | ipv4_address: 172.28.2.2 182 | restart: on-failure 183 | cap_add: 184 | - SYS_PTRACE 185 | 186 | storaged2: 187 | image: vesoft/nebula-storaged:nightly 188 | environment: 189 | USER: root 190 | TZ: "${TZ}" 191 | command: 192 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 193 | - --local_ip=172.28.2.3 194 | - --ws_ip=172.28.2.3 195 | - --port=9779 196 | - --data_path=/data/storage 197 | - --log_dir=/logs 198 | - --v=0 199 | - --minloglevel=0 200 | - --heartbeat_interval_secs=2 201 | depends_on: 202 | - metad0 203 | - metad1 204 | - metad2 205 | healthcheck: 206 | test: ["CMD", "curl", "-f", "http://172.28.2.3:12000/status"] 207 | interval: 30s 208 | timeout: 10s 209 | retries: 3 210 | start_period: 20s 211 | ports: 212 | - "9781:9779" 213 | - 12000 214 | - 12002 215 | volumes: 216 | - ./data/storage2:/data/storage:Z 217 | - ./logs/storage2:/logs:Z 218 | networks: 219 | nebula-net: 220 | ipv4_address: 172.28.2.3 221 | restart: on-failure 222 | cap_add: 223 | - SYS_PTRACE 224 | 225 | graphd0: 226 | image: vesoft/nebula-graphd:nightly 227 | environment: 228 | USER: root 229 | TZ: "${TZ}" 230 | command: 231 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 232 | - --port=9669 233 | - --ws_ip=172.28.3.1 234 | - --log_dir=/logs 235 | - --v=0 236 | - --minloglevel=0 237 | - --heartbeat_interval_secs=2 238 | depends_on: 239 | - metad0 240 | - metad1 241 | - metad2 242 | healthcheck: 243 | test: ["CMD", "curl", "-f", "http://172.28.3.1:13000/status"] 244 | interval: 30s 245 | timeout: 10s 246 | retries: 3 247 | start_period: 20s 248 | ports: 249 | - "9669:9669" 250 | - 13000 251 | - 13002 252 | volumes: 253 | - 
./logs/graph0:/logs:Z 254 | networks: 255 | nebula-net: 256 | ipv4_address: 172.28.3.1 257 | restart: on-failure 258 | cap_add: 259 | - SYS_PTRACE 260 | 261 | graphd1: 262 | image: vesoft/nebula-graphd:nightly 263 | environment: 264 | USER: root 265 | TZ: "${TZ}" 266 | command: 267 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 268 | - --port=9669 269 | - --ws_ip=172.28.3.2 270 | - --log_dir=/logs 271 | - --v=0 272 | - --minloglevel=0 273 | - --heartbeat_interval_secs=2 274 | depends_on: 275 | - metad0 276 | - metad1 277 | - metad2 278 | healthcheck: 279 | test: ["CMD", "curl", "-f", "http://172.28.3.2:13000/status"] 280 | interval: 30s 281 | timeout: 10s 282 | retries: 3 283 | start_period: 20s 284 | ports: 285 | - "9670:9669" 286 | - 13000 287 | - 13002 288 | volumes: 289 | - ./logs/graph1:/logs:Z 290 | networks: 291 | nebula-net: 292 | ipv4_address: 172.28.3.2 293 | restart: on-failure 294 | cap_add: 295 | - SYS_PTRACE 296 | 297 | graphd2: 298 | image: vesoft/nebula-graphd:nightly 299 | environment: 300 | USER: root 301 | TZ: "${TZ}" 302 | command: 303 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 304 | - --port=9669 305 | - --ws_ip=172.28.3.3 306 | - --log_dir=/logs 307 | - --v=0 308 | - --minloglevel=0 309 | - --heartbeat_interval_secs=2 310 | depends_on: 311 | - metad0 312 | - metad1 313 | - metad2 314 | healthcheck: 315 | test: ["CMD", "curl", "-f", "http://172.28.3.3:13000/status"] 316 | interval: 30s 317 | timeout: 10s 318 | retries: 3 319 | start_period: 20s 320 | ports: 321 | - "9671:9669" 322 | - 13000 323 | - 13002 324 | volumes: 325 | - ./logs/graph2:/logs:Z 326 | networks: 327 | nebula-net: 328 | ipv4_address: 172.28.3.3 329 | restart: on-failure 330 | cap_add: 331 | - SYS_PTRACE 332 | 333 | console: 334 | image: vesoft/nebula-console:nightly 335 | entrypoint: "" 336 | command: 337 | - sh 338 | - -c 339 | - | 340 | sleep 3 && 341 | nebula-console -addr graphd0 -port 9669 -u root -p nebula -e 'ADD HOSTS "172.28.2.1":9779,"172.28.2.2":9779,"172.28.2.3":9779' && 342 | sleep 36000 343 | depends_on: 344 | - graphd0 345 | networks: 346 | - nebula-net 347 | 348 | networks: 349 | nebula-net: 350 | ipam: 351 | driver: default 352 | config: 353 | - subnet: 172.28.0.0/16 354 | -------------------------------------------------------------------------------- /exchange-common/src/test/resources/edge.csv: -------------------------------------------------------------------------------- 1 | src,dst,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14 2 | 101,102,Tom,tom,10,20,30,40,2021-01-27,2021-01-01T12:10:10,43535232,true,1.0,2.0,10:10:10,POINT(1 2) 3 | 102,103,Jina,Jina,11,21,31,41,2021-01-28,2021-01-02T12:10:10,43535232,false,1.1,2.1,11:10:10,POINT(3 4) 4 | 103,101,Tim,Tim,12,22,32,42,2021-01-29,2021-01-03T12:10:10,43535232,false,1.2,2.2,12:10:10,POINT(5 6) 5 | 104,106,张三,张三,13,23,33,43,2021-01-30,2021-01-04T12:10:10,43535232,true,1.3,2.3,13:10:10,POINT(6 7) 6 | 105,107,李四,李四,14,24,34,44,2021-02-01,2021-01-05T12:10:10,43535232,false,1.4,2.4,14:10:10,POINT(1 5) 7 | 106,108,王五,王五,15,25,35,45,2021-02-02,2021-01-06T12:10:10,0,false,1.5,2.5,15:10:10,"LINESTRING(1 3, 4.7 73.23)" 8 | 107,101,Jina,Jina,16,26,36,46,2021-02-03,2021-01-07T12:10:10,43535232,true,1.6,2.6,16:10:10,"LINESTRING(1 3, 4.7 73.23)" 9 | 108,109,Jina,Jina,17,27,37,47,2021-02-04,2021-01-08T12:10:10,43535232,false,1.7,2.7,17:10:10,"LINESTRING(1 3, 4.7 73.23)" 10 | 
109,110,Jina,Jina,18,28,38,48,2021-02-05,2021-01-09T12:10:10,43535232,true,1.8,2.8,18:10:10,"LINESTRING(1 3, 4.7 73.23)" 11 | 110,-101,Jina,Jina,19,29,39,49,2021-02-06,2021-01-10T12:10:10,43535232,false,1.9,2.9,19:10:10,"LINESTRING(1 3, 4.7 73.23)" 12 | -101,102,Jina,Jina,20,30,40,50,2021-02-07,2021-02-11T12:10:10,43535232,false,2.0,3.0,20:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 13 | -102,-103,Jina,Jina,21,31,41,51,2021-02-08,2021-03-12T12:10:10,43535232,false,2.1,3.1,21:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 14 | -103,-101,Jina,Jina,22,32,42,52,2021-02-09,2021-04-13T12:10:10,43535232,false,2.2,3.2,22:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 15 | -------------------------------------------------------------------------------- /exchange-common/src/test/resources/process_application.conf: -------------------------------------------------------------------------------- 1 | { 2 | # Spark relation com.vesoft.exchange.common.config 3 | spark: { 4 | app: { 5 | name: Nebula Exchange 2.0 6 | } 7 | 8 | master:local 9 | 10 | driver: { 11 | cores: 1 12 | maxResultSize: 1G 13 | } 14 | 15 | executor: { 16 | memory:1G 17 | } 18 | 19 | cores:{ 20 | max: 16 21 | } 22 | } 23 | 24 | # if the hive is hive-on-spark with derby mode, you can ignore this hive configure 25 | # get the com.vesoft.exchange.common.config values from file $HIVE_HOME/conf/hive-site.xml or hive-default.xml 26 | 27 | hive: { 28 | warehouse: "hdfs://NAMENODE_IP:9000/apps/svr/hive-xxx/warehouse/" 29 | connectionURL: "jdbc:mysql://your_ip:3306/hive_spark?characterEncoding=UTF-8" 30 | connectionDriverName: "com.mysql.jdbc.Driver" 31 | connectionUserName: "user" 32 | connectionPassword: "password" 33 | } 34 | 35 | # Nebula Graph relation com.vesoft.exchange.common.config 36 | nebula: { 37 | address:{ 38 | graph:["127.0.0.1:9669", "127.0.0.1:9670", "127.0.0.1:9671"] 39 | meta:["127.0.0.1:9559", "127.0.0.1:9560", "127.0.0.1:9561"] 40 | } 41 | user: root 42 | pswd: nebula 43 | space: test_string 44 | 45 | # parameters for SST import, not required 46 | path:{ 47 | local:"/tmp" 48 | remote:"/sst" 49 | hdfs.namenode: "hdfs://name_node:9000" 50 | } 51 | 52 | # nebula client connection parameters 53 | connection { 54 | timeout: 3000 55 | retry: 3 56 | } 57 | 58 | # nebula client execution parameters 59 | execution { 60 | retry: 3 61 | } 62 | 63 | error: { 64 | # max number of failures, if the number of failures is bigger than max, then exit the application. 65 | max: 32 66 | # failed import job will be recorded in output path 67 | output: /tmp/errors 68 | } 69 | 70 | # use google's RateLimiter to limit the requests send to NebulaGraph 71 | rate: { 72 | # the stable throughput of RateLimiter 73 | limit: 1024 74 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 75 | # if it can't be obtained within the specified timeout, then give up the request. 76 | timeout: 1000 77 | } 78 | } 79 | 80 | # Processing tags 81 | # There are tag com.vesoft.exchange.common.config examples for different dataSources. 82 | tags: [ 83 | { 84 | name: person 85 | type: { 86 | source: csv 87 | sink: client 88 | } 89 | path: "file://src/test/resources/data.csv" 90 | fields: [col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14] 91 | nebula.fields: [col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14] 92 | vertex: { 93 | field:id 94 | #policy:hash 95 | } 96 | header:true 97 | batch: 2 98 | partition: 5 99 | } 100 | ] 101 | 102 | # There are tag com.vesoft.exchange.common.config examples for different dataSources. 
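# (Annotation added for clarity; not present in the original resource file.)
# Each entry below maps one CSV source to a NebulaGraph edge type: source.field and
# target.field name the CSV columns holding the source and target vertex ids, batch is
# the number of edges written per statement, and partition is the number of Spark
# partitions used for the import.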
103 | edges: [ 104 | { 105 | name: friend 106 | type: { 107 | source: csv 108 | sink: client 109 | } 110 | path: "file://src/test/resources/data.csv" 111 | fields: [col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14] 112 | nebula.fields: [col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14] 113 | source: { 114 | field:src 115 | #policy:hash 116 | } 117 | target: { 118 | field:dst 119 | #policy:hash 120 | } 121 | header:true 122 | batch: 2 123 | partition: 5 124 | } 125 | ] 126 | } 127 | -------------------------------------------------------------------------------- /exchange-common/src/test/resources/vertex.csv: -------------------------------------------------------------------------------- 1 | id,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14 2 | 101,Tom,tom,10,20,30,40,2021-01-27,2021-01-01T12:10:10,43535232,true,1.0,2.0,10:10:10,POINT(1 2) 3 | 102,Jina,Jina,11,21,31,41,2021-01-28,2021-01-02T12:10:10,43535232,false,1.1,2.1,11:10:10,POINT(3 4) 4 | 103,Tim,Tim,12,22,32,42,2021-01-29,2021-01-03T12:10:10,43535232,false,1.2,2.2,12:10:10,POINT(5 6) 5 | 104,张三,张三,13,23,33,43,2021-01-30,2021-01-04T12:10:10,43535232,true,1.3,2.3,13:10:10,POINT(6 7) 6 | 105,李四,李四,14,24,34,44,2021-02-01,2021-01-05T12:10:10,43535232,false,1.4,2.4,14:10:10,POINT(1 5) 7 | 106,王五,王五,15,25,35,45,2021-02-02,2021-01-06T12:10:10,0,false,1.5,2.5,15:10:10,"LINESTRING(1 3, 4.7 73.23)" 8 | 107,Jina,Jina,16,26,36,46,2021-02-03,2021-01-07T12:10:10,43535232,true,1.6,2.6,16:10:10,"LINESTRING(1 3, 4.7 73.23)" 9 | 108,Jina,Jina,17,27,37,47,2021-02-04,2021-01-08T12:10:10,43535232,false,1.7,2.7,17:10:10,"LINESTRING(1 3, 4.7 73.23)" 10 | 109,Jina,Jina,18,28,38,48,2021-02-05,2021-01-09T12:10:10,43535232,true,1.8,2.8,18:10:10,"LINESTRING(1 3, 4.7 73.23)" 11 | 1010,Jina,Jina,19,29,39,49,2021-02-06,2021-01-10T12:10:10,43535232,false,1.9,2.9,19:10:10,"LINESTRING(1 3, 4.7 73.23)" 12 | -101,Jina,Jina,20,30,40,50,2021-02-07,2021-02-11T12:10:10,43535232,false,2.0,3.0,20:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 13 | -102,Jina,Jina,21,31,41,51,2021-02-08,2021-03-12T12:10:10,43535232,false,2.1,3.1,21:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 14 | -103,Jina,Jina,22,32,42,52,2021-02-09,2021-04-13T12:10:10,43535232,false,2.2,3.2,22:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 15 | -------------------------------------------------------------------------------- /exchange-common/src/test/scala/com/vesoft/exchange/common/GraphProviderSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
4 | */ 5 | 6 | package com.vesoft.exchange.common 7 | 8 | import com.google.common.net.HostAndPort 9 | import com.vesoft.exchange.common.config.{SslConfigEntry, SslType, UserConfigEntry} 10 | import com.vesoft.nebula.client.graph.data.HostAddress 11 | import com.vesoft.nebula.client.graph.exception.AuthFailedException 12 | import com.vesoft.nebula.client.graph.net.Session 13 | import org.junit.{After, Before, Test} 14 | import org.scalatest.Assertions.assertThrows 15 | 16 | class GraphProviderSuite { 17 | var graphProvider: GraphProvider = _ 18 | var session: Session = _ 19 | val userConfig = UserConfigEntry("root", "nebula") 20 | 21 | @Before 22 | def setUp(): Unit = { 23 | val mockData = new NebulaGraphMock 24 | mockData.mockStringIdGraph() 25 | mockData.mockIntIdGraph() 26 | mockData.close() 27 | 28 | val sslConfig = SslConfigEntry(false, false, SslType.CA, null, null) 29 | graphProvider = 30 | new GraphProvider(List(new HostAddress("127.0.0.1", 9669)), 5000, sslConfig) 31 | } 32 | 33 | @After 34 | def tearDown(): Unit = { 35 | graphProvider.close() 36 | } 37 | 38 | @Test 39 | def switchSpaceSuite(): Unit = { 40 | session = graphProvider.getGraphClient(userConfig) 41 | assert(graphProvider.switchSpace(session, "test_string")._2.isSucceeded) 42 | assert(graphProvider.switchSpace(session, "test_int")._2.isSucceeded) 43 | graphProvider.releaseGraphClient(session) 44 | } 45 | 46 | @Test 47 | def submitSuite(): Unit = { 48 | session = graphProvider.getGraphClient(userConfig) 49 | assert(graphProvider.submit(session, "show hosts")._2.isSucceeded) 50 | graphProvider.releaseGraphClient(session) 51 | } 52 | 53 | @Test 54 | def switchSpaceWithoutPermissionSuite(): Unit = { 55 | val wrongUserConfig = UserConfigEntry("user", "12345") 56 | assertThrows[AuthFailedException](graphProvider.getGraphClient(wrongUserConfig)) 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /exchange-common/src/test/scala/com/vesoft/exchange/common/MetaProviderSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
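A condensed usage sketch of GraphProvider outside JUnit, using the same local endpoint and credentials as the suite above (it assumes a reachable NebulaGraph instance):

    val ssl      = SslConfigEntry(false, false, SslType.CA, null, null)
    val provider = new GraphProvider(List(new HostAddress("127.0.0.1", 9669)), 5000, ssl)
    val session  = provider.getGraphClient(UserConfigEntry("root", "nebula"))
    assert(provider.switchSpace(session, "test_string")._2.isSucceeded)
    assert(provider.submit(session, "SHOW HOSTS")._2.isSucceeded)
    provider.releaseGraphClient(session)
    provider.close()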
4 | */ 5 | 6 | package com.vesoft.exchange.common 7 | 8 | import com.google.common.net.HostAndPort 9 | import com.vesoft.exchange.common.config.{SslConfigEntry, SslType, Type} 10 | import com.vesoft.nebula.client.graph.data.HostAddress 11 | import com.vesoft.nebula.client.meta.exception.ExecuteFailedException 12 | import org.junit.{After, Before, Test} 13 | import org.scalatest.Assertions.assertThrows 14 | 15 | class MetaProviderSuite { 16 | 17 | var metaProvider: MetaProvider = _ 18 | @Before 19 | def setUp(): Unit = { 20 | val mockData = new NebulaGraphMock 21 | mockData.mockStringIdGraph() 22 | mockData.mockIntIdGraph() 23 | mockData.close() 24 | 25 | val sslConfig = SslConfigEntry(false, false, SslType.CA, null, null) 26 | metaProvider = new MetaProvider(List(new HostAddress("127.0.0.1", 9559)), 5000, 1, sslConfig) 27 | } 28 | 29 | @After 30 | def tearDown(): Unit = { 31 | if (metaProvider != null) 32 | metaProvider.close() 33 | } 34 | 35 | @Test 36 | def getPartNumberSuite(): Unit = { 37 | assert(metaProvider.getPartNumber("test_string") == 10) 38 | assert(metaProvider.getPartNumber("test_int") == 10) 39 | } 40 | 41 | @Test 42 | def getVidTypeSuite(): Unit = { 43 | assert(metaProvider.getVidType("test_string") == VidType.STRING) 44 | assert(metaProvider.getVidType("test_int") == VidType.INT) 45 | } 46 | 47 | @Test 48 | def getTagSchemaSuite(): Unit = { 49 | val tagSchema = metaProvider.getTagSchema("test_string", "person") 50 | assert(tagSchema.size == 14) 51 | } 52 | 53 | @Test 54 | def getEdgeSchemaSuite(): Unit = { 55 | val edgeSchema = metaProvider.getEdgeSchema("test_string", "friend") 56 | assert(edgeSchema.size == 14) 57 | } 58 | 59 | @Test 60 | def getLabelTypeSuite(): Unit = { 61 | assert(metaProvider.getLabelType("test_string", "person") == Type.VERTEX) 62 | assert(metaProvider.getLabelType("test_string", "friend") == Type.EDGE) 63 | assert(metaProvider.getLabelType("test_int", "person") == Type.VERTEX) 64 | assert(metaProvider.getLabelType("test_int", "friend") == Type.EDGE) 65 | } 66 | 67 | @Test 68 | def getSpaceVidLenSuite(): Unit = { 69 | assert(metaProvider.getSpaceVidLen("test_string") == 8) 70 | assert(metaProvider.getSpaceVidLen("test_int") == 8) 71 | assertThrows[ExecuteFailedException](metaProvider.getSpaceVidLen("not_exist_space")) 72 | } 73 | 74 | @Test 75 | def getTagItemSuite(): Unit = { 76 | val tagItem = metaProvider.getTagItem("test_string", "person") 77 | assert(new String(tagItem.tag_name).equals("person")) 78 | } 79 | 80 | @Test 81 | def getNoExistTagSuite(): Unit = { 82 | assertThrows[IllegalArgumentException](metaProvider.getTagItem("test_string", "no_exist_tag")) 83 | } 84 | 85 | @Test 86 | def getEdgeItemSuite(): Unit = { 87 | val edgeItem = metaProvider.getEdgeItem("test_string", "friend") 88 | assert(new String(edgeItem.edge_name).equals("friend")) 89 | } 90 | 91 | @Test 92 | def getNoExistEdgeSuite(): Unit = { 93 | assertThrows[IllegalArgumentException](metaProvider.getEdgeItem("test_string", "no_exist_edge")) 94 | } 95 | 96 | } 97 | -------------------------------------------------------------------------------- /exchange-common/src/test/scala/com/vesoft/exchange/common/utils/SparkValidateSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
4 | */ 5 | 6 | package com.vesoft.exchange.common.utils 7 | 8 | import org.junit.Test 9 | import org.scalatest.Assertions.assertThrows 10 | 11 | class SparkValidateSuite { 12 | 13 | @Test 14 | def validateSuite(): Unit = { 15 | SparkValidate.validate("2.2.0", "2.2.*") 16 | SparkValidate.validate("2.4.4", "2.4.*") 17 | SparkValidate.validate("3.0.0", "3.0.*", "3.1.*", "3.2.*", "3.3.*") 18 | assertThrows[RuntimeException](SparkValidate.validate("2.4.0", "2.2.*")) 19 | assertThrows[RuntimeException](SparkValidate.validate("2.2.0", "2.4.*")) 20 | assertThrows[RuntimeException]( 21 | SparkValidate.validate("2.4.0", "3.0.*", "3.1.*", "3.2.*", "3.3.*")) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /exchange-common/src/test/scala/com/vesoft/exchange/common/writer/FileBaseWriterSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange.common.writer 7 | 8 | import com.vesoft.exchange.common.config.{FileBaseSinkConfigEntry, SinkCategory} 9 | import org.apache.spark.sql.{Dataset, Encoders, Row, SparkSession} 10 | import org.junit.Test 11 | 12 | class FileBaseWriterSuite { 13 | 14 | @Test 15 | def writeSstFilesSuite(): Unit = { 16 | val spark = SparkSession.builder().master("local").getOrCreate() 17 | import spark.implicits._ 18 | // generate byte[] key using encoder's getVertexKey, space:"test", tag: "person" 19 | val key1 = "01a40200310000000000000000000000000000000000000002000000" // id: "1" 20 | val key2 = "01170000320000000000000000000000000000000000000002000000" // id: "2" 21 | val key3 = "01fe0000330000000000000000000000000000000000000002000000" // id: "3" 22 | val key4 = "01a90300340000000000000000000000000000000000000002000000" // id: "4" 23 | val key5 = "01220200350000000000000000000000000000000000000002000000" // id: "5" 24 | val value = "abc" 25 | // construct test dataset 26 | val data: Dataset[(Array[Byte], Array[Byte])] = spark.sparkContext 27 | .parallelize( 28 | List(key1.getBytes(), key2.getBytes(), key3.getBytes(), key4.getBytes(), key5.getBytes())) 29 | .map(line => (line, value.getBytes())) 30 | .toDF("key", "value") 31 | .map { row => 32 | (row.getAs[Array[Byte]](0), row.getAs[Array[Byte]](1)) 33 | }(Encoders.tuple(Encoders.BINARY, Encoders.BINARY)) 34 | 35 | val generateSstFile = new GenerateSstFile 36 | 37 | val fileBaseConfig = 38 | FileBaseSinkConfigEntry(SinkCategory.SST, "/tmp", "/tmp/remote", None) 39 | val batchFailure = spark.sparkContext.longAccumulator(s"batchFailure.test}") 40 | 41 | data 42 | .toDF("key", "value") 43 | .sortWithinPartitions("key") 44 | .foreachPartition { iterator: Iterator[Row] => 45 | generateSstFile.writeSstFiles(iterator, fileBaseConfig, 10, null, batchFailure) 46 | } 47 | assert(batchFailure.value == 0) 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /exchange-common/src/test/scala/com/vesoft/exchange/common/writer/ServerBaseWriterSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
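A note on the matching rule exercised above: the supported versions are regular expressions checked with String.matches, so validation is a pattern match rather than an exact comparison (values here are illustrative):

    SparkValidate.validate("2.4.8", "2.4.*")    // passes, because "2.4.8".matches("2.4.*") is true
    // SparkValidate.validate("3.3.0", "2.4.*") would throw a RuntimeException, as the suite shows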
4 | */ 5 | 6 | package com.vesoft.exchange.common.writer 7 | 8 | import com.vesoft.exchange.common 9 | import com.vesoft.exchange.common.{Edge, Edges, Vertex, Vertices} 10 | import org.junit.Test 11 | 12 | import scala.collection.mutable.ListBuffer 13 | 14 | class ServerBaseWriterSuite extends ServerBaseWriter { 15 | 16 | @Test 17 | def toExecuteSentenceSuiteForVertex(): Unit = { 18 | val vertices: ListBuffer[Vertex] = new ListBuffer[Vertex] 19 | val tagName = "person" 20 | val propNames = List("name", "age", "gender", "high", "weight") 21 | 22 | val props1 = List("\"Tom\"", 10, 0, 172.5, 55) 23 | val props2 = List("\"Jena\"", 12, 1, 165.5, 45) 24 | vertices.append(Vertex("\"vid1\"", props1)) 25 | vertices.append(Vertex("\"vid2\"", props2)) 26 | val nebulaVertices = Vertices(propNames, vertices.toList) 27 | 28 | val sentence = toExecuteSentence(tagName, nebulaVertices, false) 29 | val expectSentence = 30 | "INSERT VERTEX `person`(`name`,`age`,`gender`,`high`,`weight`) VALUES " + 31 | "\"vid1\": (\"Tom\", 10, 0, 172.5, 55), " + 32 | "\"vid2\": (\"Jena\", 12, 1, 165.5, 45)" 33 | assert(sentence.equals(expectSentence)) 34 | } 35 | 36 | @Test 37 | def toDeleteExecuteSentenceSuiteForVertex(): Unit = { 38 | val vertices: ListBuffer[Vertex] = new ListBuffer[Vertex] 39 | val propNames = List("name", "age", "gender", "high", "weight") 40 | 41 | val props1 = List("\"Tom\"", 10, 0, 172.5, 55) 42 | val props2 = List("\"Jena\"", 12, 1, 165.5, 45) 43 | vertices.append(Vertex("\"vid1\"", props1)) 44 | vertices.append(Vertex("\"vid2\"", props2)) 45 | val nebulaVertices = Vertices(propNames, vertices.toList) 46 | 47 | val sentence = toDeleteExecuteSentence(nebulaVertices, false) 48 | val expectSentence = 49 | "DELETE VERTEX \"vid1\", \"vid2\"" 50 | assert(sentence.equals(expectSentence)) 51 | } 52 | 53 | @Test 54 | def toUpdateExecuteSentenceSuiteForVertex(): Unit = { 55 | val vertices: ListBuffer[Vertex] = new ListBuffer[Vertex] 56 | val propNames = List("col_string", 57 | "col_fixed_string", 58 | "col_bool", 59 | "col_int", 60 | "col_int64", 61 | "col_double", 62 | "col_date", 63 | "col_geo") 64 | 65 | val props1 = 66 | List("\"name\"", "\"name\"", true, 10, 100L, 1.0, "2021-11-12", "LINESTRING(1 2, 3 4)") 67 | val props2 = List("\"name2\"", 68 | "\"name2\"", 69 | false, 70 | 11, 71 | 101L, 72 | 2.0, 73 | "2021-11-13", 74 | "POLYGON((0 1, 1 2, 2 3, 0 1))") 75 | 76 | vertices.append(Vertex("\"vid1\"", props1)) 77 | vertices.append(Vertex("\"vid2\"", props2)) 78 | val nebulaVertices = Vertices(propNames, vertices.toList, None) 79 | 80 | val sentence = toUpdateExecuteSentence("person", nebulaVertices) 81 | val expectSentence = 82 | "UPDATE VERTEX ON `person` \"vid1\" SET `col_string`=\"name\",`col_fixed_string`=\"name\"," + 83 | "`col_bool`=true,`col_int`=10,`col_int64`=100,`col_double`=1.0,`col_date`=2021-11-12," + 84 | "`col_geo`=LINESTRING(1 2, 3 4);UPDATE VERTEX ON `person` \"vid2\" SET " + 85 | "`col_string`=\"name2\",`col_fixed_string`=\"name2\",`col_bool`=false,`col_int`=11," + 86 | "`col_int64`=101,`col_double`=2.0,`col_date`=2021-11-13," + 87 | "`col_geo`=POLYGON((0 1, 1 2, 2 3, 0 1))" 88 | assert(expectSentence.equals(sentence)) 89 | } 90 | 91 | @Test 92 | def toExecuteSentenceSuiteForVertexWithSymbol(): Unit = { 93 | val vertices: ListBuffer[Vertex] = new ListBuffer[Vertex] 94 | val tagName = "person,test_with^symbol#" 95 | val propNames = List("name_1", "age-1", "gender&1", "high%1", "weight,1") 96 | 97 | val props1 = List("\"Tom\"", 10, 0, 172.5, 55) 98 | val props2 = List("\"Jena\"", 12, 1, 
165.5, 45) 99 | vertices.append(Vertex("\"vid_1\"", props1)) 100 | vertices.append(Vertex("\"vid,2\"", props2)) 101 | val nebulaVertices = Vertices(propNames, vertices.toList) 102 | 103 | val sentence = toExecuteSentence(tagName, nebulaVertices, false) 104 | val expectSentence = 105 | "INSERT VERTEX `person,test_with^symbol#`(`name_1`,`age-1`,`gender&1`,`high%1`,`weight,1`) VALUES " + 106 | "\"vid_1\": (\"Tom\", 10, 0, 172.5, 55), " + 107 | "\"vid,2\": (\"Jena\", 12, 1, 165.5, 45)" 108 | assert(sentence.equals(expectSentence)) 109 | } 110 | 111 | @Test 112 | def toExecuteSentenceSuiteForEdge(): Unit = { 113 | val edges: ListBuffer[Edge] = new ListBuffer[Edge] 114 | val edgeType = "friend" 115 | val propNames = List("src_name", "dst_name", "time", "address", "relation") 116 | 117 | val props1 = List("\"Tom\"", "\"Jena\"", "2022-08-25", "hangzhou", "friend") 118 | val props2 = List("\"Jena\"", "\"Bob\"", "2022-08-25", "shanghai", "friend") 119 | edges.append(Edge("\"vid1\"", "\"vid2\"", Some(0L), props1)) 120 | edges.append(Edge("\"vid2\"", "\"vid3\"", Some(1L), props2)) 121 | val nebulaEdges = Edges(propNames, edges.toList) 122 | val sentence = toExecuteSentence(edgeType, nebulaEdges, false) 123 | val expectSentence = "INSERT EDGE `friend`(`src_name`,`dst_name`,`time`,`address`,`relation`) VALUES" + 124 | " \"vid1\"->\"vid2\"@0: (\"Tom\", \"Jena\", 2022-08-25, hangzhou, friend), " + 125 | "\"vid2\"->\"vid3\"@1: (\"Jena\", \"Bob\", 2022-08-25, shanghai, friend)" 126 | assert(sentence.equals(expectSentence)) 127 | } 128 | 129 | @Test 130 | def toDeleteExecuteSentenceSuiteForEdge(): Unit = { 131 | val edges: ListBuffer[Edge] = new ListBuffer[Edge] 132 | val edgeType = "friend" 133 | val propNames = List("src_name", "dst_name", "time", "address", "relation") 134 | 135 | val props1 = List("\"Tom\"", "\"Jena\"", "2022-08-25", "hangzhou", "friend") 136 | val props2 = List("\"Jena\"", "\"Bob\"", "2022-08-25", "shanghai", "friend") 137 | edges.append(Edge("\"vid1\"", "\"vid2\"", Some(0L), props1)) 138 | edges.append(Edge("\"vid2\"", "\"vid3\"", Some(1L), props2)) 139 | val nebulaEdges = Edges(propNames, edges.toList) 140 | val sentence = toDeleteExecuteSentence(edgeType, nebulaEdges) 141 | val expectSentence = "DELETE EDGE `friend` " + 142 | "\"vid1\"->\"vid2\"@0, " + 143 | "\"vid2\"->\"vid3\"@1" 144 | println(sentence) 145 | println(expectSentence) 146 | assert(sentence.equals(expectSentence)) 147 | } 148 | 149 | @Test 150 | def toUpdateExecuteSuiteForEdge(): Unit = { 151 | val edges: ListBuffer[Edge] = new ListBuffer[Edge] 152 | val propNames = List("col_string", 153 | "col_fixed_string", 154 | "col_bool", 155 | "col_int", 156 | "col_int64", 157 | "col_double", 158 | "col_date", 159 | "col_geo") 160 | val props1 = List("\"Tom\"", "\"Tom\"", true, 10, 100L, 1.0, "2021-11-12", "POINT(1 2)") 161 | val props2 = List("\"Bob\"", "\"Bob\"", false, 20, 200L, 2.0, "2021-05-01", "POINT(2 3)") 162 | edges.append(Edge("\"vid1\"", "\"vid2\"", Some(1L), props1)) 163 | edges.append(Edge("\"vid2\"", "\"vid1\"", Some(2L), props2)) 164 | 165 | val nebulaEdges = Edges(propNames, edges.toList, None, None) 166 | val sentence = toUpdateExecuteSentence("friend", nebulaEdges) 167 | val expectSentence = 168 | "UPDATE EDGE ON `friend` \"vid1\"->\"vid2\"@1 SET `col_string`=\"Tom\"," + 169 | "`col_fixed_string`=\"Tom\",`col_bool`=true,`col_int`=10,`col_int64`=100," + 170 | "`col_double`=1.0,`col_date`=2021-11-12,`col_geo`=POINT(1 2);" + 171 | "UPDATE EDGE ON `friend` \"vid2\"->\"vid1\"@2 SET `col_string`=\"Bob\"," + 172 | 
"`col_fixed_string`=\"Bob\",`col_bool`=false,`col_int`=20,`col_int64`=200," + 173 | "`col_double`=2.0,`col_date`=2021-05-01,`col_geo`=POINT(2 3)" 174 | assert(expectSentence.equals(sentence)) 175 | } 176 | 177 | @Test 178 | def toExecuteSentenceSuiteForEdgeWithSymbol(): Unit = { 179 | val edges: ListBuffer[Edge] = new ListBuffer[Edge] 180 | val edgeType = "friend" 181 | val propNames = List("src_name", "dst_name", "time", "address", "relation") 182 | 183 | val props1 = List("\"Tom\"", "\"Jena\"", "2022-08-25", "hangzhou", "friend") 184 | val props2 = List("\"Jena\"", "\"Bob\"", "2022-08-25", "shanghai", "friend") 185 | edges.append(Edge("\"vid_1\"", "\"vid_2\"", Some(0L), props1)) 186 | edges.append(Edge("\"vid_2,test-1\"", "\"vid&3^test*a\"", Some(1L), props2)) 187 | val nebulaEdges = Edges(propNames, edges.toList) 188 | val sentence = toExecuteSentence(edgeType, nebulaEdges, false) 189 | val expectSentence = "INSERT EDGE `friend`(`src_name`,`dst_name`,`time`,`address`,`relation`) VALUES " + 190 | "\"vid_1\"->\"vid_2\"@0: (\"Tom\", \"Jena\", 2022-08-25, hangzhou, friend), " + 191 | "\"vid_2,test-1\"->\"vid&3^test*a\"@1: (\"Jena\", \"Bob\", 2022-08-25, shanghai, friend)" 192 | assert(sentence.equals(expectSentence)) 193 | } 194 | 195 | 196 | override def writeVertices(vertices: Vertices, ignoreIndex: Boolean): List[String] = ??? 197 | 198 | override def writeEdges(edges: common.Edges, ignoreIndex: Boolean): List[String] = ??? 199 | 200 | override def writeNgql(ngql: String): String = ??? 201 | 202 | override def prepare(): Unit = ??? 203 | 204 | override def close(): Unit = ??? 205 | } 206 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.2/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Global logging configuration 2 | log4j.rootLogger=INFO, stdout 3 | # Console output... 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.stdout.layout.ConversionPattern=%5p [%t] - %m%n 7 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/reader/FileBaseReader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.nebula.exchange.reader 7 | 8 | import com.vesoft.exchange.common.config.FileBaseSourceConfigEntry 9 | import com.vesoft.exchange.common.utils.NebulaUtils.DEFAULT_EMPTY_VALUE 10 | import org.apache.spark.sql.catalyst.encoders.RowEncoder 11 | import org.apache.spark.sql.types.StructType 12 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 13 | 14 | /** 15 | * The FileBaseReader is the abstract class for HDFS file reader. 16 | * 17 | * @param session 18 | * @param path 19 | */ 20 | abstract class FileBaseReader(val session: SparkSession, val path: String) extends Reader { 21 | 22 | require(path.trim.nonEmpty) 23 | 24 | override def close(): Unit = { 25 | session.close() 26 | } 27 | } 28 | 29 | /** 30 | * The ParquetReader extend the FileBaseReader and support read parquet file from HDFS. 
31 | * 32 | * @param session 33 | * @param parquetConfig 34 | */ 35 | class ParquetReader(override val session: SparkSession, parquetConfig: FileBaseSourceConfigEntry) 36 | extends FileBaseReader(session, parquetConfig.path) { 37 | 38 | override def read(): DataFrame = { 39 | session.read.parquet(path) 40 | } 41 | } 42 | 43 | /** 44 | * The ORCReader extend the FileBaseReader and support read orc file from HDFS. 45 | * 46 | * @param session 47 | * @param orcConfig 48 | */ 49 | class ORCReader(override val session: SparkSession, orcConfig: FileBaseSourceConfigEntry) 50 | extends FileBaseReader(session, orcConfig.path) { 51 | 52 | override def read(): DataFrame = { 53 | session.read.orc(path) 54 | } 55 | } 56 | 57 | /** 58 | * The JSONReader extend the FileBaseReader and support read json file from HDFS. 59 | * 60 | * @param session 61 | * @param jsonConfig 62 | */ 63 | class JSONReader(override val session: SparkSession, jsonConfig: FileBaseSourceConfigEntry) 64 | extends FileBaseReader(session, jsonConfig.path) { 65 | 66 | override def read(): DataFrame = { 67 | session.read.json(path) 68 | } 69 | } 70 | 71 | /** 72 | * The CSVReader extend the FileBaseReader and support read csv file from HDFS. 73 | * All types of the structure are StringType. 74 | * 75 | * @param session 76 | * @param csvConfig 77 | */ 78 | class CSVReader(override val session: SparkSession, csvConfig: FileBaseSourceConfigEntry) 79 | extends FileBaseReader(session, csvConfig.path) { 80 | 81 | override def read(): DataFrame = { 82 | session.read 83 | .option("delimiter", csvConfig.separator.get) 84 | .option("header", csvConfig.header.get) 85 | .option("emptyValue", DEFAULT_EMPTY_VALUE) 86 | .csv(path) 87 | } 88 | } 89 | 90 | /** 91 | * The CustomReader extend the FileBaseReader and support read text file from HDFS. 92 | * Transformation is a function convert a line into Row. 93 | * The structure of the row should be specified. 94 | * 95 | * @param session 96 | * @param customConfig 97 | * @param transformation 98 | * @param structType 99 | */ 100 | abstract class CustomReader(override val session: SparkSession, 101 | customConfig: FileBaseSourceConfigEntry, 102 | transformation: String => Row, 103 | filter: Row => Boolean, 104 | structType: StructType) 105 | extends FileBaseReader(session, customConfig.path) { 106 | 107 | override def read(): DataFrame = { 108 | val encoder = RowEncoder.apply(structType) 109 | session.read 110 | .text(path) 111 | .filter(!_.getString(0).isEmpty) 112 | .map(row => transformation(row.getString(0)))(encoder) 113 | .filter(filter) 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/reader/Reader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.nebula.exchange.reader 7 | 8 | import com.vesoft.exchange.common.Offset 9 | import com.vesoft.exchange.common.utils.HDFSUtils 10 | import org.apache.spark.sql.{DataFrame, SparkSession} 11 | 12 | /** 13 | * The Reader is used to create a DataFrame from the source, such as Hive or HDFS. 
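A minimal usage sketch for the file readers above (csvConfig stands for an already-parsed FileBaseSourceConfigEntry describing a CSV source; it is not constructed here):

    val spark = SparkSession.builder().master("local").getOrCreate()
    val df    = new CSVReader(spark, csvConfig).read()
    // every column arrives as StringType and empty cells are read as "_NEBULA_EMPTY"
    df.show(5)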
14 | */ 15 | trait Reader extends Serializable { 16 | def session: SparkSession 17 | 18 | def read(): DataFrame 19 | 20 | def close(): Unit 21 | } 22 | 23 | trait CheckPointSupport extends Serializable { 24 | 25 | def getOffsets(totalCount: Long, 26 | parallel: Int, 27 | checkPointPath: Option[String], 28 | checkPointNamePrefix: String): List[Offset] = { 29 | if (totalCount <= 0) 30 | throw new RuntimeException(s"${checkPointNamePrefix}: return data count<=0") 31 | 32 | val batchSizes = List.fill((totalCount % parallel).toInt)(totalCount / parallel + 1) ::: List 33 | .fill((parallel - totalCount % parallel).toInt)(totalCount / parallel) 34 | 35 | val startOffsets = batchSizes.scanLeft(0L)(_ + _).init 36 | 37 | val checkPointOffsets = checkPointPath match { 38 | case Some(path) => 39 | val files = Range(0, parallel).map(i => s"${path}/${checkPointNamePrefix}.${i}").toList 40 | if (files.forall(HDFSUtils.exists)) 41 | files.map(HDFSUtils.getContent(_).trim.toLong).sorted 42 | else startOffsets 43 | case _ => startOffsets 44 | } 45 | 46 | if (checkPointOffsets.zip(startOffsets).exists(x => x._1 < x._2)) 47 | throw new RuntimeException( 48 | s"Check Point file maybe previous. Please delete ${checkPointPath}/${checkPointNamePrefix}.* file") 49 | 50 | val eachPartitionLimit = { 51 | batchSizes 52 | .zip(startOffsets.zip(checkPointOffsets)) 53 | .map(x => { 54 | x._1 - (x._2._2 - x._2._1) 55 | }) 56 | } 57 | val offsets = checkPointOffsets.zip(eachPartitionLimit).map(x => Offset(x._1, x._2)) 58 | if (offsets.exists(_.size < 0L)) 59 | throw new RuntimeException( 60 | s"Check point file maybe broken. Please delete ${checkPointPath}/${checkPointNamePrefix}.* file") 61 | offsets 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/reader/StreamingBaseReader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
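A worked example of the checkpoint arithmetic above: with totalCount = 10 and parallel = 3, batchSizes is List(4, 3, 3) and startOffsets is List(0, 4, 7), so without checkpoint files getOffsets returns Offset(0, 4), Offset(4, 3), Offset(7, 3), each pairing a start offset with the number of rows that partition should read. If checkpoint files record offsets 2, 4 and 7, the first partition's remaining size shrinks to 2 and the result becomes Offset(2, 2), Offset(4, 3), Offset(7, 3).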
4 | */ 5 | 6 | package com.vesoft.nebula.exchange.reader 7 | 8 | import com.vesoft.exchange.common.config.{KafkaSourceConfigEntry, PulsarSourceConfigEntry} 9 | import org.apache.spark.sql.types.StringType 10 | import org.apache.spark.sql.{DataFrame, SparkSession} 11 | 12 | /** 13 | * Spark Streaming 14 | * 15 | * @param session 16 | */ 17 | abstract class StreamingBaseReader(override val session: SparkSession) extends Reader { 18 | 19 | override def close(): Unit = { 20 | session.close() 21 | } 22 | } 23 | 24 | /** 25 | * 26 | * @param session 27 | * @param kafkaConfig 28 | * @param targetFields 29 | */ 30 | class KafkaReader(override val session: SparkSession, 31 | kafkaConfig: KafkaSourceConfigEntry, 32 | targetFields: List[String]) 33 | extends StreamingBaseReader(session) { 34 | 35 | require( 36 | kafkaConfig.server.trim.nonEmpty && kafkaConfig.topic.trim.nonEmpty && targetFields.nonEmpty) 37 | 38 | override def read(): DataFrame = { 39 | import org.apache.spark.sql.functions._ 40 | import session.implicits._ 41 | val fields = targetFields.distinct 42 | val reader = 43 | session.readStream 44 | .format("kafka") 45 | .option("kafka.bootstrap.servers", kafkaConfig.server) 46 | .option("subscribe", kafkaConfig.topic) 47 | .option("startingOffsets", kafkaConfig.startingOffsets) 48 | 49 | if (kafkaConfig.securityProtocol.isDefined) { 50 | reader.option("kafka.security.protocol", kafkaConfig.securityProtocol.get) 51 | reader.option("kafka.sasl.mechanism", kafkaConfig.mechanism.get) 52 | } 53 | if (kafkaConfig.kerberos) { 54 | reader.option("kafka.sasl.kerberos.service.name", kafkaConfig.kerberosServiceName) 55 | } 56 | 57 | val maxOffsetsPerTrigger = kafkaConfig.maxOffsetsPerTrigger 58 | if (maxOffsetsPerTrigger.isDefined) 59 | reader.option("maxOffsetsPerTrigger", maxOffsetsPerTrigger.get) 60 | 61 | reader 62 | .load() 63 | .select($"value".cast(StringType)) 64 | .select(json_tuple($"value", fields: _*)) 65 | .toDF(fields: _*) 66 | 67 | } 68 | } 69 | 70 | /** 71 | * 72 | * @param session 73 | * @param pulsarConfig 74 | */ 75 | class PulsarReader(override val session: SparkSession, pulsarConfig: PulsarSourceConfigEntry) 76 | extends StreamingBaseReader(session) { 77 | 78 | override def read(): DataFrame = { 79 | session.readStream 80 | .format("pulsar") 81 | .option("service.url", pulsarConfig.serviceUrl) 82 | .option("admin.url", pulsarConfig.adminUrl) 83 | .options(pulsarConfig.options) 84 | .load() 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.4/.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 23 | hs_err_pid* 24 | 25 | # build target 26 | target/ 27 | 28 | # IDE 29 | .idea/ 30 | .eclipse/ 31 | *.iml 32 | 33 | spark-importer.ipr 34 | spark-importer.iws 35 | 36 | .DS_Store 37 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.4/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Global logging configuration 2 | log4j.rootLogger=INFO, stdout 3 | # Console output... 
4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.stdout.layout.ConversionPattern=%5p [%t] - %m%n 7 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/reader/FileBaseReader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.nebula.exchange.reader 7 | 8 | import com.vesoft.exchange.common.config.FileBaseSourceConfigEntry 9 | import com.vesoft.exchange.common.utils.NebulaUtils.DEFAULT_EMPTY_VALUE 10 | import org.apache.spark.sql.catalyst.encoders.RowEncoder 11 | import org.apache.spark.sql.types.StructType 12 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 13 | 14 | /** 15 | * The FileBaseReader is the abstract class for HDFS file reader. 16 | * 17 | * @param session 18 | * @param path 19 | */ 20 | abstract class FileBaseReader(val session: SparkSession, val path: String) extends Reader { 21 | 22 | require(path.trim.nonEmpty) 23 | 24 | override def close(): Unit = { 25 | session.close() 26 | } 27 | } 28 | 29 | /** 30 | * The ParquetReader extends the FileBaseReader and supports reading Parquet files from HDFS. 31 | * 32 | * @param session 33 | * @param parquetConfig 34 | */ 35 | class ParquetReader(override val session: SparkSession, parquetConfig: FileBaseSourceConfigEntry) 36 | extends FileBaseReader(session, parquetConfig.path) { 37 | 38 | override def read(): DataFrame = { 39 | session.read.parquet(path) 40 | } 41 | } 42 | 43 | /** 44 | * The ORCReader extends the FileBaseReader and supports reading ORC files from HDFS. 45 | * 46 | * @param session 47 | * @param orcConfig 48 | */ 49 | class ORCReader(override val session: SparkSession, orcConfig: FileBaseSourceConfigEntry) 50 | extends FileBaseReader(session, orcConfig.path) { 51 | 52 | override def read(): DataFrame = { 53 | session.read.orc(path) 54 | } 55 | } 56 | 57 | /** 58 | * The JSONReader extends the FileBaseReader and supports reading JSON files from HDFS. 59 | * 60 | * @param session 61 | * @param jsonConfig 62 | */ 63 | class JSONReader(override val session: SparkSession, jsonConfig: FileBaseSourceConfigEntry) 64 | extends FileBaseReader(session, jsonConfig.path) { 65 | 66 | override def read(): DataFrame = { 67 | session.read.json(path) 68 | } 69 | } 70 | 71 | /** 72 | * The CSVReader extends the FileBaseReader and supports reading CSV files from HDFS. 73 | * All fields of the resulting structure are StringType. 74 | * 75 | * @param session 76 | * @param csvConfig 77 | */ 78 | class CSVReader(override val session: SparkSession, csvConfig: FileBaseSourceConfigEntry) 79 | extends FileBaseReader(session, csvConfig.path) { 80 | 81 | override def read(): DataFrame = { 82 | session.read 83 | .option("delimiter", csvConfig.separator.get) 84 | .option("header", csvConfig.header.get) 85 | .option("emptyValue", DEFAULT_EMPTY_VALUE) 86 | .csv(path) 87 | } 88 | } 89 | 90 | /** 91 | * The CustomReader extends the FileBaseReader and supports reading text files from HDFS. 92 | * Transformation is a function that converts a line into a Row. 93 | * The structure of the Row should be specified.
94 | * 95 | * @param session 96 | * @param customConfig 97 | * @param transformation 98 | * @param structType 99 | */ 100 | abstract class CustomReader(override val session: SparkSession, 101 | customConfig: FileBaseSourceConfigEntry, 102 | transformation: String => Row, 103 | filter: Row => Boolean, 104 | structType: StructType) 105 | extends FileBaseReader(session, customConfig.path) { 106 | 107 | override def read(): DataFrame = { 108 | val encoder = RowEncoder.apply(structType) 109 | session.read 110 | .text(path) 111 | .filter(!_.getString(0).isEmpty) 112 | .map(row => transformation(row.getString(0)))(encoder) 113 | .filter(filter) 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/reader/Reader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.nebula.exchange.reader 7 | 8 | import com.vesoft.exchange.common.Offset 9 | import com.vesoft.exchange.common.utils.HDFSUtils 10 | import org.apache.spark.sql.{DataFrame, SparkSession} 11 | 12 | /** 13 | * The Reader is used to create a DataFrame from the source, such as Hive or HDFS. 14 | */ 15 | trait Reader extends Serializable { 16 | def session: SparkSession 17 | 18 | def read(): DataFrame 19 | 20 | def close(): Unit 21 | } 22 | 23 | trait CheckPointSupport extends Serializable { 24 | 25 | def getOffsets(totalCount: Long, 26 | parallel: Int, 27 | checkPointPath: Option[String], 28 | checkPointNamePrefix: String): List[Offset] = { 29 | if (totalCount <= 0) 30 | throw new RuntimeException(s"${checkPointNamePrefix}: the source returned data count <= 0") 31 | 32 | val batchSizes = List.fill((totalCount % parallel).toInt)(totalCount / parallel + 1) ::: List 33 | .fill((parallel - totalCount % parallel).toInt)(totalCount / parallel) 34 | 35 | val startOffsets = batchSizes.scanLeft(0L)(_ + _).init 36 | 37 | val checkPointOffsets = checkPointPath match { 38 | case Some(path) => 39 | val files = Range(0, parallel).map(i => s"${path}/${checkPointNamePrefix}.${i}").toList 40 | if (files.forall(HDFSUtils.exists)) 41 | files.map(HDFSUtils.getContent(_).trim.toLong).sorted 42 | else startOffsets 43 | case _ => startOffsets 44 | } 45 | 46 | if (checkPointOffsets.zip(startOffsets).exists(x => x._1 < x._2)) 47 | throw new RuntimeException( 48 | s"The checkpoint files may be left over from a previous run. Please delete the ${checkPointPath}/${checkPointNamePrefix}.* files") 49 | 50 | val eachPartitionLimit = { 51 | batchSizes 52 | .zip(startOffsets.zip(checkPointOffsets)) 53 | .map(x => { 54 | x._1 - (x._2._2 - x._2._1) 55 | }) 56 | } 57 | val offsets = checkPointOffsets.zip(eachPartitionLimit).map(x => Offset(x._1, x._2)) 58 | if (offsets.exists(_.size < 0L)) 59 | throw new RuntimeException( 60 | s"The checkpoint files may be broken. Please delete the ${checkPointPath}/${checkPointNamePrefix}.* files") 61 | offsets 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/reader/StreamingBaseReader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License.
4 | */ 5 | 6 | package com.vesoft.nebula.exchange.reader 7 | 8 | import com.vesoft.exchange.common.config.{KafkaSourceConfigEntry, PulsarSourceConfigEntry} 9 | import org.apache.spark.sql.types.StringType 10 | import org.apache.spark.sql.{DataFrame, SparkSession} 11 | 12 | import scala.collection.mutable 13 | 14 | /** 15 | * Spark Streaming 16 | * 17 | * @param session 18 | */ 19 | abstract class StreamingBaseReader(override val session: SparkSession) extends Reader { 20 | 21 | override def close(): Unit = { 22 | session.close() 23 | } 24 | } 25 | 26 | /** 27 | * 28 | * @param session 29 | * @param kafkaConfig 30 | * @param targetFields 31 | */ 32 | class KafkaReader(override val session: SparkSession, 33 | kafkaConfig: KafkaSourceConfigEntry, 34 | targetFields: List[String]) 35 | extends StreamingBaseReader(session) { 36 | 37 | require( 38 | kafkaConfig.server.trim.nonEmpty && kafkaConfig.topic.trim.nonEmpty && targetFields.nonEmpty) 39 | 40 | override def read(): DataFrame = { 41 | import org.apache.spark.sql.functions._ 42 | import session.implicits._ 43 | val fields = targetFields.distinct 44 | val reader = 45 | session.readStream 46 | .format("kafka") 47 | .option("kafka.bootstrap.servers", kafkaConfig.server) 48 | .option("subscribe", kafkaConfig.topic) 49 | .option("startingOffsets", kafkaConfig.startingOffsets) 50 | 51 | if(kafkaConfig.securityProtocol.isDefined){ 52 | reader.option("kafka.security.protocol", kafkaConfig.securityProtocol.get) 53 | reader.option("kafka.sasl.mechanism", kafkaConfig.mechanism.get) 54 | } 55 | if(kafkaConfig.kerberos){ 56 | reader.option("kafka.sasl.kerberos.service.name", kafkaConfig.kerberosServiceName) 57 | } 58 | val maxOffsetsPerTrigger = kafkaConfig.maxOffsetsPerTrigger 59 | if (maxOffsetsPerTrigger.isDefined) 60 | reader.option("maxOffsetsPerTrigger", maxOffsetsPerTrigger.get) 61 | 62 | reader 63 | .load() 64 | .select($"value".cast(StringType)) 65 | .select(json_tuple($"value", fields: _*)) 66 | .toDF(fields: _*) 67 | 68 | } 69 | } 70 | 71 | /** 72 | * 73 | * @param session 74 | * @param pulsarConfig 75 | */ 76 | class PulsarReader(override val session: SparkSession, pulsarConfig: PulsarSourceConfigEntry) 77 | extends StreamingBaseReader(session) { 78 | 79 | override def read(): DataFrame = { 80 | session.readStream 81 | .format("pulsar") 82 | .option("service.url", pulsarConfig.serviceUrl) 83 | .option("admin.url", pulsarConfig.adminUrl) 84 | .options(pulsarConfig.options) 85 | .load() 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/utils/Neo4jUtils.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
4 | */ 5 | 6 | package com.vesoft.nebula.exchange.utils 7 | 8 | import org.neo4j.driver.Value 9 | 10 | object Neo4jUtils { 11 | 12 | def convertNeo4jData(value: Value): String = { 13 | value.`type`().name() match { 14 | case "NULL" => { 15 | null 16 | } 17 | case "STRING" => { 18 | value.asString() 19 | } 20 | case "INTEGER" => { 21 | value.asLong().toString 22 | } 23 | case "FLOAT" | "DOUBLE" => { 24 | value.asDouble().toString 25 | } 26 | case "BOOLEAN" => { 27 | value.asBoolean().toString 28 | } 29 | case "DATE" | "LOCAL_DATE" => { 30 | value.asLocalDate().toString 31 | } 32 | case "DATE_TIME" | "LOCAL_DATE_TIME" => { 33 | value.asLocalDateTime().toString 34 | } 35 | case "TIME" | "LOCAL_TIME" => { 36 | value.asLocalTime().toString 37 | } 38 | case "BYTES" => { 39 | new String(value.asByteArray()) 40 | } 41 | case "LIST" => { 42 | value.asList().toString 43 | } 44 | case "MAP" => { 45 | value.asMap().toString 46 | } 47 | case _ => { 48 | value.toString 49 | } 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.4/src/test/resources/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.4' 2 | services: 3 | metad0: 4 | image: vesoft/nebula-metad:nightly 5 | environment: 6 | USER: root 7 | TZ: "${TZ}" 8 | command: 9 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 10 | - --local_ip=172.28.1.1 11 | - --ws_ip=172.28.1.1 12 | - --port=9559 13 | - --data_path=/data/meta 14 | - --log_dir=/logs 15 | - --v=0 16 | - --minloglevel=0 17 | - --heartbeat_interval_secs=2 18 | healthcheck: 19 | test: ["CMD", "curl", "-f", "http://172.28.1.1:11000/status"] 20 | interval: 30s 21 | timeout: 10s 22 | retries: 3 23 | start_period: 20s 24 | ports: 25 | - "9559:9559" 26 | - 11000 27 | - 11002 28 | volumes: 29 | - ./data/meta0:/data/meta:Z 30 | - ./logs/meta0:/logs:Z 31 | networks: 32 | nebula-net: 33 | ipv4_address: 172.28.1.1 34 | restart: on-failure 35 | cap_add: 36 | - SYS_PTRACE 37 | 38 | metad1: 39 | image: vesoft/nebula-metad:nightly 40 | environment: 41 | USER: root 42 | TZ: "${TZ}" 43 | command: 44 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 45 | - --local_ip=172.28.1.2 46 | - --ws_ip=172.28.1.2 47 | - --port=9559 48 | - --data_path=/data/meta 49 | - --log_dir=/logs 50 | - --v=0 51 | - --minloglevel=0 52 | - --heartbeat_interval_secs=2 53 | healthcheck: 54 | test: ["CMD", "curl", "-f", "http://172.28.1.2:11000/status"] 55 | interval: 30s 56 | timeout: 10s 57 | retries: 3 58 | start_period: 20s 59 | ports: 60 | - "9560:9559" 61 | - 11000 62 | - 11002 63 | volumes: 64 | - ./data/meta1:/data/meta:Z 65 | - ./logs/meta1:/logs:Z 66 | networks: 67 | nebula-net: 68 | ipv4_address: 172.28.1.2 69 | restart: on-failure 70 | cap_add: 71 | - SYS_PTRACE 72 | 73 | metad2: 74 | image: vesoft/nebula-metad:nightly 75 | environment: 76 | USER: root 77 | TZ: "${TZ}" 78 | command: 79 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 80 | - --local_ip=172.28.1.3 81 | - --ws_ip=172.28.1.3 82 | - --port=9559 83 | - --data_path=/data/meta 84 | - --log_dir=/logs 85 | - --v=0 86 | - --minloglevel=0 87 | - --heartbeat_interval_secs=2 88 | healthcheck: 89 | test: ["CMD", "curl", "-f", "http://172.28.1.3:11000/status"] 90 | interval: 30s 91 | timeout: 10s 92 | retries: 3 93 | start_period: 20s 94 | ports: 95 | - "9561:9559" 96 | - 11000 97 | - 11002 98 | volumes: 99 | - ./data/meta2:/data/meta:Z 100 | - 
./logs/meta2:/logs:Z 101 | networks: 102 | nebula-net: 103 | ipv4_address: 172.28.1.3 104 | restart: on-failure 105 | cap_add: 106 | - SYS_PTRACE 107 | 108 | storaged0: 109 | image: vesoft/nebula-storaged:nightly 110 | environment: 111 | USER: root 112 | TZ: "${TZ}" 113 | command: 114 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 115 | - --local_ip=172.28.2.1 116 | - --ws_ip=172.28.2.1 117 | - --port=9779 118 | - --data_path=/data/storage 119 | - --log_dir=/logs 120 | - --v=0 121 | - --minloglevel=0 122 | - --heartbeat_interval_secs=2 123 | depends_on: 124 | - metad0 125 | - metad1 126 | - metad2 127 | healthcheck: 128 | test: ["CMD", "curl", "-f", "http://172.28.2.1:12000/status"] 129 | interval: 30s 130 | timeout: 10s 131 | retries: 3 132 | start_period: 20s 133 | ports: 134 | - "9779:9779" 135 | - 12000 136 | - 12002 137 | volumes: 138 | - ./data/storage0:/data/storage:Z 139 | - ./logs/storage0:/logs:Z 140 | networks: 141 | nebula-net: 142 | ipv4_address: 172.28.2.1 143 | restart: on-failure 144 | cap_add: 145 | - SYS_PTRACE 146 | 147 | storaged1: 148 | image: vesoft/nebula-storaged:nightly 149 | environment: 150 | USER: root 151 | TZ: "${TZ}" 152 | command: 153 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 154 | - --local_ip=172.28.2.2 155 | - --ws_ip=172.28.2.2 156 | - --port=9779 157 | - --data_path=/data/storage 158 | - --log_dir=/logs 159 | - --v=0 160 | - --minloglevel=0 161 | - --heartbeat_interval_secs=2 162 | depends_on: 163 | - metad0 164 | - metad1 165 | - metad2 166 | healthcheck: 167 | test: ["CMD", "curl", "-f", "http://172.28.2.2:12000/status"] 168 | interval: 30s 169 | timeout: 10s 170 | retries: 3 171 | start_period: 20s 172 | ports: 173 | - "9780:9779" 174 | - 12000 175 | - 12002 176 | volumes: 177 | - ./data/storage1:/data/storage:Z 178 | - ./logs/storage1:/logs:Z 179 | networks: 180 | nebula-net: 181 | ipv4_address: 172.28.2.2 182 | restart: on-failure 183 | cap_add: 184 | - SYS_PTRACE 185 | 186 | storaged2: 187 | image: vesoft/nebula-storaged:nightly 188 | environment: 189 | USER: root 190 | TZ: "${TZ}" 191 | command: 192 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 193 | - --local_ip=172.28.2.3 194 | - --ws_ip=172.28.2.3 195 | - --port=9779 196 | - --data_path=/data/storage 197 | - --log_dir=/logs 198 | - --v=0 199 | - --minloglevel=0 200 | - --heartbeat_interval_secs=2 201 | depends_on: 202 | - metad0 203 | - metad1 204 | - metad2 205 | healthcheck: 206 | test: ["CMD", "curl", "-f", "http://172.28.2.3:12000/status"] 207 | interval: 30s 208 | timeout: 10s 209 | retries: 3 210 | start_period: 20s 211 | ports: 212 | - "9781:9779" 213 | - 12000 214 | - 12002 215 | volumes: 216 | - ./data/storage2:/data/storage:Z 217 | - ./logs/storage2:/logs:Z 218 | networks: 219 | nebula-net: 220 | ipv4_address: 172.28.2.3 221 | restart: on-failure 222 | cap_add: 223 | - SYS_PTRACE 224 | 225 | graphd0: 226 | image: vesoft/nebula-graphd:nightly 227 | environment: 228 | USER: root 229 | TZ: "${TZ}" 230 | command: 231 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 232 | - --port=9669 233 | - --ws_ip=172.28.3.1 234 | - --log_dir=/logs 235 | - --v=0 236 | - --minloglevel=0 237 | - --heartbeat_interval_secs=2 238 | depends_on: 239 | - metad0 240 | - metad1 241 | - metad2 242 | healthcheck: 243 | test: ["CMD", "curl", "-f", "http://172.28.3.1:13000/status"] 244 | interval: 30s 245 | timeout: 10s 246 | retries: 3 247 | start_period: 20s 248 | ports: 249 | - "9669:9669" 250 | - 13000 251 | 
- 13002 252 | volumes: 253 | - ./logs/graph0:/logs:Z 254 | networks: 255 | nebula-net: 256 | ipv4_address: 172.28.3.1 257 | restart: on-failure 258 | cap_add: 259 | - SYS_PTRACE 260 | 261 | graphd1: 262 | image: vesoft/nebula-graphd:nightly 263 | environment: 264 | USER: root 265 | TZ: "${TZ}" 266 | command: 267 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 268 | - --port=9669 269 | - --ws_ip=172.28.3.2 270 | - --log_dir=/logs 271 | - --v=0 272 | - --minloglevel=0 273 | - --heartbeat_interval_secs=2 274 | depends_on: 275 | - metad0 276 | - metad1 277 | - metad2 278 | healthcheck: 279 | test: ["CMD", "curl", "-f", "http://172.28.3.2:13000/status"] 280 | interval: 30s 281 | timeout: 10s 282 | retries: 3 283 | start_period: 20s 284 | ports: 285 | - "9670:9669" 286 | - 13000 287 | - 13002 288 | volumes: 289 | - ./logs/graph1:/logs:Z 290 | networks: 291 | nebula-net: 292 | ipv4_address: 172.28.3.2 293 | restart: on-failure 294 | cap_add: 295 | - SYS_PTRACE 296 | 297 | graphd2: 298 | image: vesoft/nebula-graphd:nightly 299 | environment: 300 | USER: root 301 | TZ: "${TZ}" 302 | command: 303 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 304 | - --port=9669 305 | - --ws_ip=172.28.3.3 306 | - --log_dir=/logs 307 | - --v=0 308 | - --minloglevel=0 309 | - --heartbeat_interval_secs=2 310 | depends_on: 311 | - metad0 312 | - metad1 313 | - metad2 314 | healthcheck: 315 | test: ["CMD", "curl", "-f", "http://172.28.3.3:13000/status"] 316 | interval: 30s 317 | timeout: 10s 318 | retries: 3 319 | start_period: 20s 320 | ports: 321 | - "9671:9669" 322 | - 13000 323 | - 13002 324 | volumes: 325 | - ./logs/graph2:/logs:Z 326 | networks: 327 | nebula-net: 328 | ipv4_address: 172.28.3.3 329 | restart: on-failure 330 | cap_add: 331 | - SYS_PTRACE 332 | 333 | console: 334 | image: vesoft/nebula-console:nightly 335 | entrypoint: "" 336 | command: 337 | - sh 338 | - -c 339 | - | 340 | sleep 3 && 341 | nebula-console -addr graphd0 -port 9669 -u root -p nebula -e 'ADD HOSTS "172.28.2.1":9779,"172.28.2.2":9779,"172.28.2.3":9779' && 342 | sleep 36000 343 | depends_on: 344 | - graphd0 345 | networks: 346 | - nebula-net 347 | 348 | networks: 349 | nebula-net: 350 | ipam: 351 | driver: default 352 | config: 353 | - subnet: 172.28.0.0/16 354 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.4/src/test/resources/edge.csv: -------------------------------------------------------------------------------- 1 | src,dst,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14 2 | 101,102,Tom,tom,10,20,30,40,2021-01-27,2021-01-01T12:10:10,43535232,true,1.0,2.0,10:10:10,POINT(1 2) 3 | 102,103,Jina,Jina,11,21,31,41,2021-01-28,2021-01-02T12:10:10,43535232,false,1.1,2.1,11:10:10,POINT(3 4) 4 | 103,101,Tim,Tim,12,22,32,42,2021-01-29,2021-01-03T12:10:10,43535232,false,1.2,2.2,12:10:10,POINT(5 6) 5 | 104,106,张三,张三,13,23,33,43,2021-01-30,2021-01-04T12:10:10,43535232,true,1.3,2.3,13:10:10,POINT(6 7) 6 | 105,107,李四,李四,14,24,34,44,2021-02-01,2021-01-05T12:10:10,43535232,false,1.4,2.4,14:10:10,POINT(1 5) 7 | 106,108,王五,王五,15,25,35,45,2021-02-02,2021-01-06T12:10:10,0,false,1.5,2.5,15:10:10,"LINESTRING(1 3, 4.7 73.23)" 8 | 107,101,Jina,Jina,16,26,36,46,2021-02-03,2021-01-07T12:10:10,43535232,true,1.6,2.6,16:10:10,"LINESTRING(1 3, 4.7 73.23)" 9 | 108,109,Jina,Jina,17,27,37,47,2021-02-04,2021-01-08T12:10:10,43535232,false,1.7,2.7,17:10:10,"LINESTRING(1 3, 4.7 73.23)" 10 | 
109,110,Jina,Jina,18,28,38,48,2021-02-05,2021-01-09T12:10:10,43535232,true,1.8,2.8,18:10:10,"LINESTRING(1 3, 4.7 73.23)" 11 | 110,-101,Jina,Jina,19,29,39,49,2021-02-06,2021-01-10T12:10:10,43535232,false,1.9,2.9,19:10:10,"LINESTRING(1 3, 4.7 73.23)" 12 | -101,102,Jina,Jina,20,30,40,50,2021-02-07,2021-02-11T12:10:10,43535232,false,2.0,3.0,20:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 13 | -102,-103,Jina,Jina,21,31,41,51,2021-02-08,2021-03-12T12:10:10,43535232,false,2.1,3.1,21:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 14 | -103,-101,Jina,Jina,22,32,42,52,2021-02-09,2021-04-13T12:10:10,43535232,false,2.2,3.2,22:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 15 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.4/src/test/resources/process_application.conf: -------------------------------------------------------------------------------- 1 | { 2 | # Spark related config 3 | spark: { 4 | app: { 5 | name: Nebula Exchange 2.0 6 | } 7 | 8 | master:local 9 | 10 | driver: { 11 | cores: 1 12 | maxResultSize: 1G 13 | } 14 | 15 | executor: { 16 | memory:1G 17 | } 18 | 19 | cores:{ 20 | max: 16 21 | } 22 | } 23 | 24 | # if the hive is hive-on-spark with derby mode, you can ignore this hive config 25 | # get the config values from the file $HIVE_HOME/conf/hive-site.xml or hive-default.xml 26 | 27 | hive: { 28 | warehouse: "hdfs://NAMENODE_IP:9000/apps/svr/hive-xxx/warehouse/" 29 | connectionURL: "jdbc:mysql://your_ip:3306/hive_spark?characterEncoding=UTF-8" 30 | connectionDriverName: "com.mysql.jdbc.Driver" 31 | connectionUserName: "user" 32 | connectionPassword: "password" 33 | } 34 | 35 | # NebulaGraph related config 36 | nebula: { 37 | address:{ 38 | graph:["127.0.0.1:9669", "127.0.0.1:9670", "127.0.0.1:9671"] 39 | meta:["127.0.0.1:9559", "127.0.0.1:9560", "127.0.0.1:9561"] 40 | } 41 | user: root 42 | pswd: nebula 43 | space: test_string 44 | 45 | # parameters for SST import, not required 46 | path:{ 47 | local:"/tmp" 48 | remote:"/sst" 49 | hdfs.namenode: "hdfs://name_node:9000" 50 | } 51 | 52 | # nebula client connection parameters 53 | connection { 54 | timeout: 3000 55 | retry: 3 56 | } 57 | 58 | # nebula client execution parameters 59 | execution { 60 | retry: 3 61 | } 62 | 63 | error: { 64 | # max number of failures; if the number of failures is bigger than max, the application exits. 65 | max: 32 66 | # failed import jobs will be recorded in the output path 67 | output: /tmp/errors 68 | } 69 | 70 | # use google's RateLimiter to limit the requests sent to NebulaGraph 71 | rate: { 72 | # the stable throughput of RateLimiter 73 | limit: 1024 74 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 75 | # if it can't be obtained within the specified timeout, then give up the request. 76 | timeout: 1000 77 | } 78 | } 79 | 80 | # Processing tags 81 | # These are tag config examples for different data sources. 82 | tags: [ 83 | { 84 | name: person 85 | type: { 86 | source: csv 87 | sink: client 88 | } 89 | path: "file://src/test/resources/data.csv" 90 | fields: [col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14] 91 | nebula.fields: [col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14] 92 | vertex: { 93 | field:id 94 | #policy:hash 95 | } 96 | header:true 97 | batch: 2 98 | partition: 5 99 | } 100 | ] 101 | 102 | # These are edge config examples for different data sources.
103 | edges: [ 104 | { 105 | name: friend 106 | type: { 107 | source: csv 108 | sink: client 109 | } 110 | path: "file://src/test/resources/data.csv" 111 | fields: [col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14] 112 | nebula.fields: [col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14] 113 | source: { 114 | field:src 115 | #policy:hash 116 | } 117 | target: { 118 | field:dst 119 | #policy:hash 120 | } 121 | header:true 122 | batch: 2 123 | partition: 5 124 | } 125 | ] 126 | } 127 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.4/src/test/resources/vertex.csv: -------------------------------------------------------------------------------- 1 | id,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14 2 | 101,Tom,tom,10,20,30,40,2021-01-27,2021-01-01T12:10:10,43535232,true,1.0,2.0,10:10:10,POINT(1 2) 3 | 102,Jina,Jina,11,21,31,41,2021-01-28,2021-01-02T12:10:10,43535232,false,1.1,2.1,11:10:10,POINT(3 4) 4 | 103,Tim,Tim,12,22,32,42,2021-01-29,2021-01-03T12:10:10,43535232,false,1.2,2.2,12:10:10,POINT(5 6) 5 | 104,张三,张三,13,23,33,43,2021-01-30,2021-01-04T12:10:10,43535232,true,1.3,2.3,13:10:10,POINT(6 7) 6 | 105,李四,李四,14,24,34,44,2021-02-01,2021-01-05T12:10:10,43535232,false,1.4,2.4,14:10:10,POINT(1 5) 7 | 106,王五,王五,15,25,35,45,2021-02-02,2021-01-06T12:10:10,0,false,1.5,2.5,15:10:10,"LINESTRING(1 3, 4.7 73.23)" 8 | 107,Jina,Jina,16,26,36,46,2021-02-03,2021-01-07T12:10:10,43535232,true,1.6,2.6,16:10:10,"LINESTRING(1 3, 4.7 73.23)" 9 | 108,Jina,Jina,17,27,37,47,2021-02-04,2021-01-08T12:10:10,43535232,false,1.7,2.7,17:10:10,"LINESTRING(1 3, 4.7 73.23)" 10 | 109,Jina,Jina,18,28,38,48,2021-02-05,2021-01-09T12:10:10,43535232,true,1.8,2.8,18:10:10,"LINESTRING(1 3, 4.7 73.23)" 11 | 1010,Jina,Jina,19,29,39,49,2021-02-06,2021-01-10T12:10:10,43535232,false,1.9,2.9,19:10:10,"LINESTRING(1 3, 4.7 73.23)" 12 | -101,Jina,Jina,20,30,40,50,2021-02-07,2021-02-11T12:10:10,43535232,false,2.0,3.0,20:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 13 | -102,Jina,Jina,21,31,41,51,2021-02-08,2021-03-12T12:10:10,43535232,false,2.1,3.1,21:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 14 | -103,Jina,Jina,22,32,42,52,2021-02-09,2021-04-13T12:10:10,43535232,false,2.2,3.2,22:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 15 | -------------------------------------------------------------------------------- /nebula-exchange_spark_3.0/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Global logging configuration 2 | log4j.rootLogger=INFO, stdout 3 | # Console output... 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.stdout.layout.ConversionPattern=%5p [%t] - %m%n 7 | -------------------------------------------------------------------------------- /nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/reader/FileBaseReader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
4 | */ 5 | 6 | package com.vesoft.nebula.exchange.reader 7 | 8 | import com.vesoft.exchange.common.config.FileBaseSourceConfigEntry 9 | import com.vesoft.exchange.common.utils.NebulaUtils.DEFAULT_EMPTY_VALUE 10 | import org.apache.spark.sql.catalyst.encoders.RowEncoder 11 | import org.apache.spark.sql.types.StructType 12 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 13 | 14 | /** 15 | * The FileBaseReader is the abstract class for HDFS file reader. 16 | * 17 | * @param session 18 | * @param path 19 | */ 20 | abstract class FileBaseReader(val session: SparkSession, val path: String) extends Reader { 21 | 22 | require(path.trim.nonEmpty) 23 | 24 | override def close(): Unit = { 25 | session.close() 26 | } 27 | } 28 | 29 | /** 30 | * The ParquetReader extends the FileBaseReader and supports reading Parquet files from HDFS. 31 | * 32 | * @param session 33 | * @param parquetConfig 34 | */ 35 | class ParquetReader(override val session: SparkSession, parquetConfig: FileBaseSourceConfigEntry) 36 | extends FileBaseReader(session, parquetConfig.path) { 37 | 38 | override def read(): DataFrame = { 39 | session.read.parquet(path) 40 | } 41 | } 42 | 43 | /** 44 | * The ORCReader extends the FileBaseReader and supports reading ORC files from HDFS. 45 | * 46 | * @param session 47 | * @param orcConfig 48 | */ 49 | class ORCReader(override val session: SparkSession, orcConfig: FileBaseSourceConfigEntry) 50 | extends FileBaseReader(session, orcConfig.path) { 51 | 52 | override def read(): DataFrame = { 53 | session.read.orc(path) 54 | } 55 | } 56 | 57 | /** 58 | * The JSONReader extends the FileBaseReader and supports reading JSON files from HDFS. 59 | * 60 | * @param session 61 | * @param jsonConfig 62 | */ 63 | class JSONReader(override val session: SparkSession, jsonConfig: FileBaseSourceConfigEntry) 64 | extends FileBaseReader(session, jsonConfig.path) { 65 | 66 | override def read(): DataFrame = { 67 | session.read.json(path) 68 | } 69 | } 70 | 71 | /** 72 | * The CSVReader extends the FileBaseReader and supports reading CSV files from HDFS. 73 | * All fields of the resulting structure are StringType. 74 | * 75 | * @param session 76 | * @param csvConfig 77 | */ 78 | class CSVReader(override val session: SparkSession, csvConfig: FileBaseSourceConfigEntry) 79 | extends FileBaseReader(session, csvConfig.path) { 80 | 81 | override def read(): DataFrame = { 82 | session.read 83 | .option("delimiter", csvConfig.separator.get) 84 | .option("header", csvConfig.header.get) 85 | .option("emptyValue", DEFAULT_EMPTY_VALUE) 86 | .csv(path) 87 | } 88 | } 89 | 90 | /** 91 | * The CustomReader extends the FileBaseReader and supports reading text files from HDFS. 92 | * Transformation is a function that converts a line into a Row. 93 | * The structure of the Row should be specified.
94 | * 95 | * @param session 96 | * @param customConfig 97 | * @param transformation 98 | * @param structType 99 | */ 100 | abstract class CustomReader(override val session: SparkSession, 101 | customConfig: FileBaseSourceConfigEntry, 102 | transformation: String => Row, 103 | filter: Row => Boolean, 104 | structType: StructType) 105 | extends FileBaseReader(session, customConfig.path) { 106 | 107 | override def read(): DataFrame = { 108 | val encoder = RowEncoder.apply(structType) 109 | session.read 110 | .text(path) 111 | .filter(!_.getString(0).isEmpty) 112 | .map(row => transformation(row.getString(0)))(encoder) 113 | .filter(filter) 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/reader/Reader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.nebula.exchange.reader 7 | 8 | import com.vesoft.exchange.common.Offset 9 | import com.vesoft.exchange.common.utils.HDFSUtils 10 | import org.apache.spark.sql.{DataFrame, SparkSession} 11 | 12 | /** 13 | * The Reader is used to create a DataFrame from the source, such as Hive or HDFS. 14 | */ 15 | trait Reader extends Serializable { 16 | def session: SparkSession 17 | 18 | def read(): DataFrame 19 | 20 | def close(): Unit 21 | } 22 | 23 | trait CheckPointSupport extends Serializable { 24 | 25 | def getOffsets(totalCount: Long, 26 | parallel: Int, 27 | checkPointPath: Option[String], 28 | checkPointNamePrefix: String): List[Offset] = { 29 | if (totalCount <= 0) 30 | throw new RuntimeException(s"${checkPointNamePrefix}: the source returned data count <= 0") 31 | 32 | val batchSizes = List.fill((totalCount % parallel).toInt)(totalCount / parallel + 1) ::: List 33 | .fill((parallel - totalCount % parallel).toInt)(totalCount / parallel) 34 | 35 | val startOffsets = batchSizes.scanLeft(0L)(_ + _).init 36 | 37 | val checkPointOffsets = checkPointPath match { 38 | case Some(path) => 39 | val files = Range(0, parallel).map(i => s"${path}/${checkPointNamePrefix}.${i}").toList 40 | if (files.forall(HDFSUtils.exists)) 41 | files.map(HDFSUtils.getContent(_).trim.toLong).sorted 42 | else startOffsets 43 | case _ => startOffsets 44 | } 45 | 46 | if (checkPointOffsets.zip(startOffsets).exists(x => x._1 < x._2)) 47 | throw new RuntimeException( 48 | s"The checkpoint files may be left over from a previous run. Please delete the ${checkPointPath}/${checkPointNamePrefix}.* files") 49 | 50 | val eachPartitionLimit = { 51 | batchSizes 52 | .zip(startOffsets.zip(checkPointOffsets)) 53 | .map(x => { 54 | x._1 - (x._2._2 - x._2._1) 55 | }) 56 | } 57 | val offsets = checkPointOffsets.zip(eachPartitionLimit).map(x => Offset(x._1, x._2)) 58 | if (offsets.exists(_.size < 0L)) 59 | throw new RuntimeException( 60 | s"The checkpoint files may be broken. Please delete the ${checkPointPath}/${checkPointNamePrefix}.* files") 61 | offsets 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/reader/StreamingBaseReader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License.
4 | */ 5 | 6 | package com.vesoft.nebula.exchange.reader 7 | 8 | import com.vesoft.exchange.common.config.{KafkaSourceConfigEntry, PulsarSourceConfigEntry} 9 | import org.apache.spark.sql.types.StringType 10 | import org.apache.spark.sql.{DataFrame, SparkSession} 11 | 12 | /** 13 | * Spark Streaming 14 | * 15 | * @param session 16 | */ 17 | abstract class StreamingBaseReader(override val session: SparkSession) extends Reader { 18 | 19 | override def close(): Unit = { 20 | session.close() 21 | } 22 | } 23 | 24 | /** 25 | * 26 | * @param session 27 | * @param kafkaConfig 28 | * @param targetFields 29 | */ 30 | class KafkaReader(override val session: SparkSession, 31 | kafkaConfig: KafkaSourceConfigEntry, 32 | targetFields: List[String]) 33 | extends StreamingBaseReader(session) { 34 | 35 | require( 36 | kafkaConfig.server.trim.nonEmpty && kafkaConfig.topic.trim.nonEmpty && targetFields.nonEmpty) 37 | 38 | override def read(): DataFrame = { 39 | import org.apache.spark.sql.functions._ 40 | import session.implicits._ 41 | val fields = targetFields.distinct 42 | val reader = 43 | session.readStream 44 | .format("kafka") 45 | .option("kafka.bootstrap.servers", kafkaConfig.server) 46 | .option("subscribe", kafkaConfig.topic) 47 | .option("startingOffsets", kafkaConfig.startingOffsets) 48 | 49 | if (kafkaConfig.securityProtocol.isDefined) { 50 | reader.option("kafka.security.protocol", kafkaConfig.securityProtocol.get) 51 | reader.option("kafka.sasl.mechanism", kafkaConfig.mechanism.get) 52 | } 53 | if (kafkaConfig.kerberos) { 54 | reader.option("kafka.sasl.kerberos.service.name", kafkaConfig.kerberosServiceName) 55 | } 56 | 57 | val maxOffsetsPerTrigger = kafkaConfig.maxOffsetsPerTrigger 58 | if (maxOffsetsPerTrigger.isDefined) 59 | reader.option("maxOffsetsPerTrigger", maxOffsetsPerTrigger.get) 60 | 61 | reader 62 | .load() 63 | .select($"value".cast(StringType)) 64 | .select(json_tuple($"value", fields: _*)) 65 | .toDF(fields: _*) 66 | 67 | } 68 | } 69 | 70 | /** 71 | * 72 | * @param session 73 | * @param pulsarConfig 74 | */ 75 | class PulsarReader(override val session: SparkSession, pulsarConfig: PulsarSourceConfigEntry) 76 | extends StreamingBaseReader(session) { 77 | 78 | override def read(): DataFrame = { 79 | session.readStream 80 | .format("pulsar") 81 | .option("service.url", pulsarConfig.serviceUrl) 82 | .option("admin.url", pulsarConfig.adminUrl) 83 | .options(pulsarConfig.options) 84 | .load() 85 | } 86 | } 87 | --------------------------------------------------------------------------------
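The offset arithmetic in CheckPointSupport.getOffsets above is easy to misread: the first (totalCount % parallel) partitions each take one extra row, and the running sum of the batch sizes gives the start offsets. The standalone Scala sketch below re-derives only that arithmetic for a toy input (totalCount = 10, parallel = 3, no checkpoint files); the object name and the sample values are assumptions made for this illustration and are not part of the repository.

// Illustrative sketch of the partitioning arithmetic used by CheckPointSupport.getOffsets.
// It reads no checkpoint files; all values are invented for the example.
object OffsetArithmeticSketch {
  def main(args: Array[String]): Unit = {
    val totalCount = 10L // assumed total number of rows in the source
    val parallel   = 3   // assumed number of partitions

    // Same formula as getOffsets: the first (totalCount % parallel) partitions get one extra row.
    val batchSizes = List.fill((totalCount % parallel).toInt)(totalCount / parallel + 1) :::
      List.fill((parallel - totalCount % parallel).toInt)(totalCount / parallel)
    // batchSizes == List(4, 3, 3)

    // The running sum of the batch sizes yields each partition's start offset.
    val startOffsets = batchSizes.scanLeft(0L)(_ + _).init
    // startOffsets == List(0, 4, 7)

    startOffsets.zip(batchSizes).foreach { case (start, size) =>
      println(s"partition starts at offset $start and reads $size rows")
    }
  }
}

When no checkpoint files exist, getOffsets returns exactly these pairs wrapped in Offset(start, size); the checkpoint branch only replaces the start offsets with the values read back from HDFS and shrinks the per-partition sizes accordingly.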
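KafkaReader.read() only assembles a streaming DataFrame; starting and draining the query is left to the caller. As a rough illustration of how such a frame could be consumed, the sketch below builds an equivalent Kafka-backed frame and writes it to the console sink. The broker address, topic, field list and checkpoint path are assumptions invented for this sketch, it needs the spark-sql-kafka connector on the classpath, and it is not how Exchange itself wires the readers.

// Illustrative only: drains a Kafka-backed streaming DataFrame, shaped like the one
// returned by KafkaReader.read(), to the console. Broker, topic, fields and paths are assumed.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.json_tuple
import org.apache.spark.sql.types.StringType

object KafkaStreamSketch {
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder()
      .appName("kafka-stream-sketch")
      .master("local[*]")
      .getOrCreate()
    import session.implicits._

    val fields = List("id", "name") // assumed JSON fields inside the Kafka message value

    val frame = session.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "127.0.0.1:9092") // assumed broker
      .option("subscribe", "person")                       // assumed topic
      .option("startingOffsets", "latest")
      .load()
      .select($"value".cast(StringType))
      .select(json_tuple($"value", fields: _*))
      .toDF(fields: _*)

    val query = frame.writeStream
      .outputMode("append")
      .format("console")
      .option("checkpointLocation", "/tmp/kafka-sketch-checkpoint") // assumed path
      .start()

    query.awaitTermination()
  }
}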