├── .github └── workflows │ ├── ISSUE_TEMPLATE.md │ ├── PULL_REQUEST_TEMPLATE.md │ ├── check_label.yml │ ├── pull_request.yml │ ├── release.yml │ └── snapshot.yml ├── .gitignore ├── .scalafmt.conf ├── .travis.yml ├── LICENSE ├── LICENSES └── Apache-2.0.txt ├── README-CN.md ├── README.md ├── bench ├── EXCHANGE_CONFIG │ ├── app_sf1.conf │ ├── app_sf100.conf │ ├── app_sf1000_sst_without_header.conf │ └── app_sf30.conf ├── NEBULA_DDL │ ├── SPACE_SF1 │ ├── SPACE_SF100 │ └── SPACE_SF30 └── exchange-test.md ├── codecov.yml ├── conf-template ├── client_import │ ├── bigquery_datasource.conf │ ├── csv_datasource.conf │ └── hive_datasource.conf └── sst_import │ ├── csv_datasource.conf │ └── hive_datasource.conf ├── exchange-common ├── pom.xml └── src │ ├── main │ ├── java │ │ └── com │ │ │ └── vesoft │ │ │ └── exchange │ │ │ └── common │ │ │ └── FileMigrate.java │ ├── resources │ │ ├── config_template │ │ │ ├── csv.conf │ │ │ ├── hbase.conf │ │ │ ├── hive.conf │ │ │ ├── jdbc.conf │ │ │ ├── json.conf │ │ │ ├── kafka.conf │ │ │ ├── neo4j.conf │ │ │ ├── orc.conf │ │ │ └── parquet.conf │ │ └── log4j.properties │ └── scala │ │ └── com │ │ └── vesoft │ │ └── exchange │ │ └── common │ │ ├── CheckPointHandler.scala │ │ ├── ErrorHandler.scala │ │ ├── GenerateConfigTemplate.scala │ │ ├── GraphProvider.scala │ │ ├── MetaProvider.scala │ │ ├── Package.scala │ │ ├── PasswordEncryption.scala │ │ ├── config │ │ ├── Configs.scala │ │ ├── SchemaConfigs.scala │ │ ├── SinkConfigs.scala │ │ └── SourceConfigs.scala │ │ ├── processor │ │ ├── Processor.scala │ │ └── ReloadProcessor.scala │ │ ├── utils │ │ ├── ConfigTemplateUtils.scala │ │ ├── HDFSUtils.scala │ │ ├── NebulaPartitioner.scala │ │ ├── NebulaUtils.scala │ │ └── SparkValidate.scala │ │ └── writer │ │ ├── FileBaseWriter.scala │ │ ├── ServerBaseWriter.scala │ │ └── Writer.scala │ └── test │ ├── resources │ ├── application.conf │ ├── docker-compose.yaml │ ├── edge.csv │ ├── process_application.conf │ └── vertex.csv │ └── scala │ └── com │ └── vesoft │ └── exchange │ └── common │ ├── GraphProviderSuite.scala │ ├── MetaProviderSuite.scala │ ├── NebulaGraphMock.scala │ ├── config │ └── ConfigsSuite.scala │ ├── processor │ └── ProcessorSuite.scala │ ├── utils │ ├── NebulaUtilsSuite.scala │ └── SparkValidateSuite.scala │ └── writer │ ├── FileBaseWriterSuite.scala │ └── ServerBaseWriterSuite.scala ├── nebula-exchange_spark_2.2 ├── pom.xml └── src │ ├── main │ ├── resources │ │ └── log4j.properties │ └── scala │ │ └── com │ │ └── vesoft │ │ └── nebula │ │ └── exchange │ │ ├── Exchange.scala │ │ ├── processor │ │ ├── EdgeProcessor.scala │ │ └── VerticesProcessor.scala │ │ └── reader │ │ ├── FileBaseReader.scala │ │ ├── Reader.scala │ │ ├── ServerBaseReader.scala │ │ └── StreamingBaseReader.scala │ └── test │ └── scala │ └── com │ └── vesoft │ └── nebula │ └── exchange │ └── processor │ ├── EdgeProcessorSuite.scala │ └── VerticesProcessorSuite.scala ├── nebula-exchange_spark_2.4 ├── .gitignore ├── pom.xml └── src │ ├── main │ ├── resources │ │ ├── application.conf │ │ ├── application_encrypt_password.conf │ │ └── log4j.properties │ └── scala │ │ └── com │ │ └── vesoft │ │ └── nebula │ │ └── exchange │ │ ├── Exchange.scala │ │ ├── processor │ │ ├── EdgeProcessor.scala │ │ └── VerticesProcessor.scala │ │ ├── reader │ │ ├── FileBaseReader.scala │ │ ├── Reader.scala │ │ ├── ServerBaseReader.scala │ │ └── StreamingBaseReader.scala │ │ └── utils │ │ └── Neo4jUtils.scala │ └── test │ ├── resources │ ├── application.conf │ ├── docker-compose.yaml │ ├── edge.csv │ ├── 
process_application.conf │ └── vertex.csv │ └── scala │ └── com │ └── vesoft │ └── nebula │ └── exchange │ └── processor │ ├── EdgeProcessorSuite.scala │ └── VerticesProcessorSuite.scala ├── nebula-exchange_spark_3.0 ├── pom.xml └── src │ ├── main │ ├── resources │ │ └── log4j.properties │ └── scala │ │ └── com │ │ └── vesoft │ │ └── nebula │ │ └── exchange │ │ ├── Exchange.scala │ │ ├── processor │ │ ├── EdgeProcessor.scala │ │ └── VerticesProcessor.scala │ │ └── reader │ │ ├── FileBaseReader.scala │ │ ├── Reader.scala │ │ ├── ServerBaseReader.scala │ │ └── StreamingBaseReader.scala │ └── test │ └── scala │ └── com │ └── vesoft │ └── nebula │ └── exchange │ └── processor │ ├── EdgeProcessorSuite.scala │ └── VerticesProcessorSuite.scala └── pom.xml /.github/workflows/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | #### Expected behavior 2 | 3 | #### Actual behavior 4 | 5 | #### Steps to reproduce 6 | 7 | #### JVM version (e.g. `java -version`) 8 | 9 | #### Scala version (e.g. `scala -version`) 10 | 11 | #### OS version (e.g. `uname -a`) 12 | -------------------------------------------------------------------------------- /.github/workflows/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Motivation: 2 | 3 | Why you're making that change and what is the problem you're trying to solve. 4 | 5 | Modification: 6 | 7 | Describe the modifications you've done. 8 | 9 | Result: 10 | 11 | Fixes #. 12 | -------------------------------------------------------------------------------- /.github/workflows/check_label.yml: -------------------------------------------------------------------------------- 1 | name: Auto label 2 | 3 | on: 4 | issues: 5 | types: 6 | - reopened 7 | - opened 8 | - labeled 9 | - unlabeled 10 | - closed 11 | 12 | env: 13 | GH_PAT: ${{ secrets.GITHUB_TOKEN }} 14 | EVENT: ${{ toJSON(github.event)}} 15 | EVENT_NAME: ${{ github.event_name}} 16 | 17 | jobs: 18 | sync: 19 | name: auto label 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: HarrisChu/auto_label@v1 23 | -------------------------------------------------------------------------------- /.github/workflows/pull_request.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build a Java project with Maven 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-maven 3 | 4 | name: pull_request 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: 11 | - master 12 | - 'v[0-9]+.*' 13 | 14 | jobs: 15 | build: 16 | 17 | runs-on: ubuntu-latest 18 | 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Set up JDK 1.8 22 | uses: actions/setup-java@v1 23 | with: 24 | java-version: 1.8 25 | 26 | - name: Cache the Maven packages to speed up build 27 | uses: actions/cache@v2 28 | with: 29 | path: ~/.m2/repository 30 | key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} 31 | restore-keys: ${{ runner.os }}-maven- 32 | 33 | - name: Install nebula-graph 34 | run: | 35 | mkdir tmp 36 | pushd tmp 37 | git clone https://github.com/vesoft-inc/nebula-docker-compose.git 38 | pushd nebula-docker-compose/ 39 | cp ../../exchange-common/src/test/resources/docker-compose.yaml . 
40 | docker-compose up -d 41 | sleep 10 42 | popd 43 | popd 44 | 45 | - name: Build with Maven 46 | run: | 47 | mvn clean package -pl nebula-exchange_spark_2.2 -am -Pscala-2.11 -Pspark-2.2 48 | mvn clean package -pl nebula-exchange_spark_2.4 -am -Pscala-2.11 -Pspark-2.4 49 | mvn clean package -pl nebula-exchange_spark_3.0 -am -Pscala-2.12 -Pspark-3.0 50 | 51 | - uses: codecov/codecov-action@v2 52 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build a Java project with Maven 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-maven 3 | 4 | name: release 5 | 6 | on: 7 | release: 8 | types: published 9 | 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Set up JDK 1.8 17 | uses: actions/setup-java@v1 18 | with: 19 | java-version: 1.8 20 | 21 | - name: Cache the Maven packages to speed up build 22 | uses: actions/cache@v2 23 | with: 24 | path: ~/.m2/repository 25 | key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} 26 | restore-keys: ${{ runner.os }}-maven- 27 | 28 | - name: Install nebula-graph 29 | run: | 30 | mkdir tmp 31 | pushd tmp 32 | git clone https://github.com/vesoft-inc/nebula-docker-compose.git 33 | pushd nebula-docker-compose/ 34 | cp ../../exchange-common/src/test/resources/docker-compose.yaml . 35 | docker-compose up -d 36 | sleep 10 37 | popd 38 | popd 39 | 40 | - name: Build with Maven 41 | run: | 42 | mvn clean package -pl nebula-exchange_spark_2.2 -am -Pscala-2.11 -Pspark-2.2 43 | mvn clean package -pl nebula-exchange_spark_2.4 -am -Pscala-2.11 -Pspark-2.4 44 | mvn clean package -pl nebula-exchange_spark_3.0 -am -Pscala-2.12 -Pspark-3.0 45 | 46 | - name: Get the version 47 | id: get_version 48 | run: | 49 | tag=$(echo ${{ github.ref }} | rev | cut -d/ -f1 | rev) 50 | tagnum=$(echo $tag | sed 's/^v//') 51 | echo "::set-output name=tag::$tag" 52 | echo "::set-output name=tagnum::$tagnum" 53 | shell: bash 54 | 55 | - name: upload to release assets 56 | uses: softprops/action-gh-release@v1 57 | env: 58 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 59 | with: 60 | files: | 61 | nebula-exchange_spark_2.2/target/nebula-exchange_spark_2.2-${{ steps.get_version.outputs.tagnum }}.jar 62 | nebula-exchange_spark_2.4/target/nebula-exchange_spark_2.4-${{ steps.get_version.outputs.tagnum }}.jar 63 | nebula-exchange_spark_3.0/target/nebula-exchange_spark_3.0-${{ steps.get_version.outputs.tagnum }}.jar 64 | 65 | - name: upload to oss 66 | run: | 67 | wget http://gosspublic.alicdn.com/ossutil/1.7.8/ossutil64 68 | chmod 755 ossutil64 69 | ./ossutil64 -e ${{ secrets.OSS_ENDPOINT }} \ 70 | -i ${{ secrets.OSS_ID }} \ 71 | -k ${{ secrets.OSS_SECRET }} \ 72 | -f cp nebula-exchange_spark_2.2/target/nebula-exchange_spark_2.2-${{ steps.get_version.outputs.tagnum}}.jar oss://nebula-graph/maven2/nebula-exchange/${{ steps.get_version.outputs.tagnum }}/ 73 | ./ossutil64 -e ${{ secrets.OSS_ENDPOINT }} \ 74 | -i ${{ secrets.OSS_ID }} \ 75 | -k ${{ secrets.OSS_SECRET }} \ 76 | -f cp nebula-exchange_spark_2.4/target/nebula-exchange_spark_2.4-${{ steps.get_version.outputs.tagnum }}.jar oss://nebula-graph/maven2/nebula-exchange/${{ steps.get_version.outputs.tagnum }}/ 77 | ./ossutil64 -e ${{ secrets.OSS_ENDPOINT }} \ 78 | -i ${{ secrets.OSS_ID }} \ 79 | -k ${{ secrets.OSS_SECRET }} \ 80 | -f cp 
nebula-exchange_spark_3.0/target/nebula-exchange_spark_3.0-${{ steps.get_version.outputs.tagnum }}.jar oss://nebula-graph/maven2/nebula-exchange/${{ steps.get_version.outputs.tagnum }}/ 81 | -------------------------------------------------------------------------------- /.github/workflows/snapshot.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build a Java project with Maven 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-maven 3 | 4 | name: snapshot 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | schedule: 10 | - cron: '0 6 * * *' 11 | 12 | jobs: 13 | deploy: 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Set up JDK 1.8 19 | uses: actions/setup-java@v1 20 | with: 21 | java-version: 1.8 22 | 23 | - name: Cache the Maven packages to speed up build 24 | uses: actions/cache@v2 25 | with: 26 | path: ~/.m2/repository 27 | key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} 28 | restore-keys: ${{ runner.os }}-maven- 29 | 30 | - name: Install nebula-graph 31 | run: | 32 | mkdir tmp 33 | pushd tmp 34 | git clone https://github.com/vesoft-inc/nebula-docker-compose.git 35 | pushd nebula-docker-compose/ 36 | cp ../../exchange-common/src/test/resources/docker-compose.yaml . 37 | docker-compose up -d 38 | sleep 10 39 | popd 40 | popd 41 | 42 | - name: Build with Maven 43 | run: | 44 | mvn clean package -pl nebula-exchange_spark_2.2 -am -Pscala-2.11 -Pspark-2.2 45 | mvn clean package -pl nebula-exchange_spark_2.4 -am -Pscala-2.11 -Pspark-2.4 46 | mvn clean package -pl nebula-exchange_spark_3.0 -am -Pscala-2.12 -Pspark-3.0 47 | 48 | - name: upload Exchange with Spark 2.2 to snapshot assets 49 | uses: actions/upload-artifact@v2 50 | with: 51 | name: nebula-exchange_spark_2.2-3.0-SNAPSHOT.jar 52 | path: 53 | nebula-exchange_spark_2.2/target/nebula-exchange_spark_2.2-3.0-SNAPSHOT.jar 54 | 55 | - name: upload Exchange with Spark 2.4 to snapshot assets 56 | uses: actions/upload-artifact@v2 57 | with: 58 | name: nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar 59 | path: 60 | nebula-exchange_spark_2.4/target/nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar 61 | 62 | - name: upload Exchange with Spark 3.0 to snapshot assets 63 | uses: actions/upload-artifact@v2 64 | with: 65 | name: nebula-exchange_spark_3.0-3.0-SNAPSHOT.jar 66 | path: 67 | nebula-exchange_spark_3.0/target/nebula-exchange_spark_3.0-3.0-SNAPSHOT.jar 68 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 23 | hs_err_pid* 24 | 25 | # build target 26 | target/ 27 | 28 | # IDE 29 | .idea/ 30 | .eclipse/ 31 | *.iml 32 | .project 33 | .bloop 34 | .metals 35 | .settings 36 | .vscode 37 | .classpath 38 | .factorypath 39 | 40 | spark-importer.ipr 41 | spark-importer.iws 42 | 43 | # mac 44 | .DS_Store 45 | -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | align = 
more 2 | maxColumn = 100 3 | docstrings = ScalaDoc 4 | assumeStandardLibraryStripMargin = true -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 vesoft inc. All rights reserved. 2 | # 3 | # This source code is licensed under Apache 2.0 License. 4 | 5 | language: java 6 | 7 | jdk: 8 | - oraclejdk11 9 | - openjdk8 10 | - openjdk11 11 | 12 | install: mvn clean compile package install -Dgpg.skip -Dmaven.javadoc.skip=true 13 | -------------------------------------------------------------------------------- /README-CN.md: -------------------------------------------------------------------------------- 1 | # 欢迎使用 NebulaGraph Exchange 2 | 3 | [English](https://github.com/vesoft-inc/nebula-exchange/blob/master/README.md) 4 | 5 | NebulaGraph Exchange(以下简称 Exchange)是一款 Apache Spark™ 应用,用于在分布式环境中将集群中的数据批量迁移到 6 | NebulaGraph 中,它能支持多种不同格式的批式数据和流式数据的迁移,它还支持直接与 SST File 方式的 7 | NebulaGraph 写入。 8 | 9 | Exchange 支持的 Spark 版本包括 2.2、2.4 和 10 | 3.0,对应的工具包名分别为 `nebula-exchange_spark_2.2`、`nebula-exchange_spark_2.4` 11 | 和 `nebula-exchange_spark_3.0`。 12 | 13 | > 注意: 14 | > - 3.4.0 版本不支持 kafka 和 pulsar, 若需将 kafka 或 pulsar 数据导入 NebulaGraph,请使用 3.0.0 或 15 | 3.3.0 或 3.5.0 版本。 16 | > - 本仓库仅支持 NebulaGraph 2.x 和 3.x,如果您在使用 NebulaGraph 17 | v1.x,请使用 [NebulaExchange v1.0](https://github.com/vesoft-inc/nebula-java/tree/v1.0/tools/exchange) 18 | ,或参考 Exchange 1.0 19 | 的使用文档[NebulaExchange 用户手册](https://docs.nebula-graph.com.cn/3.6.0/import-export/nebula-exchange/about-exchange/ex-ug-what-is-exchange/ "点击前往 Nebula Graph 网站")。 20 | 21 | 22 | ## 如何获取 23 | 24 | 1. 编译打包最新的 Exchange。 25 | 26 | ```bash 27 | $ git clone https://github.com/vesoft-inc/nebula-exchange.git 28 | $ cd nebula-exchange 29 | $ mvn clean package -Dmaven.test.skip=true -Dgpg.skip -Dmaven.javadoc.skip=true -pl nebula-exchange_spark_2.2 -am -Pscala-2.11 -Pspark-2.2 30 | $ mvn clean package -Dmaven.test.skip=true -Dgpg.skip -Dmaven.javadoc.skip=true -pl nebula-exchange_spark_2.4 -am -Pscala-2.11 -Pspark-2.4 31 | $ mvn clean package -Dmaven.test.skip=true -Dgpg.skip -Dmaven.javadoc.skip=true -pl nebula-exchange_spark_3.0 -am -Pscala-2.12 -Pspark-3.0 32 | ``` 33 | 34 | 编译打包完成后,可以: 35 | - 在 nebula-exchange/nebula-exchange_spark_2.2/target/ 目录下找到 36 | nebula-exchange_spark_2.2-3.0-SNAPSHOT.jar 文件; 37 | - 在 nebula-exchange/nebula-exchange_spark_2.4/target/ 目录下找到 38 | nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar 文件; 39 | - 以及在 nebula-exchange/nebula-exchange_spark_3.0/target/ 目录下找到 40 | nebula-exchange_spark_3.0-3.0-SNAPSHOT.jar 文件。 41 | 42 | 3. 
在官网或 GitHub 下载 43 | 44 | **正式版本** 45 | 46 | [GitHub Releases](https://github.com/vesoft-inc/nebula-exchange/releases) 47 | 或者 [Downloads](https://www.nebula-graph.com.cn/release?exchange=) 48 | 49 | **快照版本** 50 | 51 | 进入[GitHub Actions Artifacts](https://github.com/vesoft-inc/nebula-exchange/actions/workflows/snapshot.yml) 52 | 页面点击任意 workflow 后,从 Artifacts 中,根据需求下载下载。 53 | 54 | ## 自动生成示例配置文件 55 | 56 | 通过如下命令,指定要导入的数据源,即可获得该数据源所对应的配置文件示例。 57 | ```agsl 58 | java -cp nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar com.vesoft.exchange.common.GenerateConfigTemplate -s {source} -p 59 | {target-path-to-save-config-file} 60 | ``` 61 | 62 | ## 加密 NebulaGraph 密码 63 | ```agsl 64 | spark-submit --master local --class com.vesoft.exchange.common.PasswordEncryption nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -p {password} 65 | ``` 66 | 加密 密码 nebula,输出结果包括RSA 公钥、私钥和加密后的password,示例: 67 | ```agsl 68 | =================== public key begin =================== 69 | MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCLl7LaNSEXlZo2hYiJqzxgyFBQdkxbQXYU/xQthsBJwjOPhkiY37nokzKnjNlp6mv5ZUomqxLsoNQHEJ6BZD4VPiaiElFAkTD+gyul1v8f3A446Fr2rnVLogWHnz8ECPt7X8jwmpiKOXkOPIhqU5E0Cua+Kk0nnVosbos/VShfiQIDAQAB 70 | =================== public key end =================== 71 | 72 | 73 | =================== private key begin =================== 74 | MIICeAIBADANBgkqhkiG9w0BAQEFAASCAmIwggJeAgEAAoGBAIuXsto1IReVmjaFiImrPGDIUFB2TFtBdhT/FC2GwEnCM4+GSJjfueiTMqeM2Wnqa/llSiarEuyg1AcQnoFkPhU+JqISUUCRMP6DK6XW/x/cDjjoWvaudUuiBYefPwQI+3tfyPCamIo5eQ48iGpTkTQK5r4qTSedWixuiz9VKF+JAgMBAAECgYADWbfEPwQ1UbTq3Bej3kVLuWMcG0rH4fFYnaq5UQOqgYvFRR7W9H+80lOj6+CIB0ViLgkylmaU4WNVbBOx3VsUFFWSqIIIviKubg8m8ey7KAd9X2wMEcUHi4JyS2+/WSacaXYS5LOmMevvuaOwLEV0QmyM+nNGRIjUdzCLR1935QJBAM+IF8YD5GnoAPPjGIDS1Ljhu/u/Gj6/YBCQKSHQ5+HxHEKjQ/YxQZ/otchmMZanYelf1y+byuJX3NZ04/KSGT8CQQCsMaoFO2rF5M84HpAXPi6yH2chbtz0VTKZworwUnpmMVbNUojf4VwzAyOhT1U5o0PpFbpi+NqQhC63VUN5k003AkEArI8vnVGNMlZbvG7e5/bmM9hWs2viSbxdB0inOtv2g1M1OV+B2gp405ru0/PNVcRV0HQFfCuhVfTSxmspQoAihwJBAJW6EZa/FZbB4JVxreUoAr6Lo8dkeOhT9M3SZbGWZivaFxot/Cp/8QXCYwbuzrJxjqlsZUeOD6694Uk08JkURn0CQQC8V6aRa8ylMhLJFkGkMDHLqHcQCmY53Kd73mUu4+mjMJLZh14zQD9ydFtc0lbLXTeBAMWV3uEdeLhRvdAo3OwV 75 | =================== private key end =================== 76 | 77 | 78 | =================== encrypted password begin =================== 79 | Io+3y3mLOMnZJJNUPHZ8pKb4VfTvg6wUh6jSu5xdmLAoX/59tK1HTwoN40aOOWJwa1a5io7S4JqcX/jEcAorw7pelITr+F4oB0AMCt71d+gJuu3/lw9bjUEl9tF4Raj82y2Dg39wYbagN84fZMgCD63TPiDIevSr6+MFKASpGrY= 80 | =================== encrypted password end =================== 81 | check: the real password decrypted by private key and encrypted password is: nebula 82 | ``` 83 | 84 | ## 版本匹配 85 | 86 | Exchange 和 NebulaGraph 的版本对应关系如下: 87 | 88 | | Exchange Version | NebulaGraph Version | Spark Version | 89 | |:------------------------------------------:|:-------------------:|:-------------------------------:| 90 | | nebula-exchange-2.0.0.jar | 2.0.0, 2.0.1 | 2.4.* | 91 | | nebula-exchange-2.0.1.jar | 2.0.0, 2.0.1 | 2.4.* | 92 | | nebula-exchange-2.1.0.jar | 2.0.0, 2.0.1 | 2.4.* | 93 | | nebula-exchange-2.5.0.jar | 2.5.0, 2.5.1 | 2.4.* | 94 | | nebula-exchange-2.5.1.jar | 2.5.0, 2.5.1 | 2.4.* | 95 | | nebula-exchange-2.5.2.jar | 2.5.0, 2.5.1 | 2.4.* | 96 | | nebula-exchange-2.6.0.jar | 2.6.0, 2.6.1 | 2.4.* | 97 | | nebula-exchange-2.6.1.jar | 2.6.0, 2.6.1 | 2.4.* | 98 | | nebula-exchange-2.6.2.jar | 2.6.0, 2.6.1 | 2.4.* | 99 | | nebula-exchange-2.6.3.jar | 2.6.0, 2.6.1 | 2.4.* | 100 | | nebula-exchange_spark_2.2-3.x.x.jar | 3.x.x | 2.2.* | 101 | | 
nebula-exchange_spark_2.4-3.x.x.jar | 3.x.x | 2.4.* | 102 | | nebula-exchange_spark_3.0-3.x.x.jar | 3.x.x | `3.0.*`,`3.1.*`,`3.2.*`,`3.3.*` | 103 | | nebula-exchange_spark_2.2-3.0-SNAPSHOT.jar | nightly | 2.2.* | 104 | | nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar | nightly | 2.4.* | 105 | | nebula-exchange_spark_3.0-3.0-SNAPSHOT.jar | nightly | `3.0.*`,`3.1.*`,`3.2.*`,`3.3.*` | 106 | 107 | ## 使用说明 108 | 109 | 特性 & 注意事项: 110 | 111 | *1. Nebula Graph 2.0 支持 String 类型和 Integer 类型的点 id 。* 112 | 113 | *2. Exchange 2.0 新增 null、Date、DateTime、Time 类型数据的导入( DateTime 是 UTC 时区,非 Local time)。* 114 | 115 | *3. Exchange 2.0 支持 Hive on Spark 以外的 Hive 数据源,需在配置文件中配置 Hive 116 | 源,具体配置示例参考 [application.conf](https://github.com/vesoft-inc/nebula-exchange/blob/master/exchange-common/src/test/resources/application.conf) 117 | 中 Hive 的配置。* 118 | 119 | *4. Exchange 2.0 将导入失败的 INSERT 语句进行落盘,存于配置文件的 error/output 路径中。* 120 | 121 | *5. Exchange 2.5.0 支持SST导入,但不支持属性的 default 值。* 122 | 123 | *6. 124 | 配置文件参考 [application.conf](https://github.com/vesoft-inc/nebula-exchange/blob/master/exchange-common/src/test/resources/application.conf)。* 125 | 126 | *7. Exchange 2.0 的导入命令:* 127 | 128 | ``` 129 | $SPARK_HOME/bin/spark-submit --class com.vesoft.nebula.exchange.Exchange --master local nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c /path/to/application.conf 130 | ``` 131 | 132 | 如果数据源有HIVE,则导入命令最后还需要加 `-h` 表示启用HIVE数据源。 133 | 134 | 注:在Yarn-Cluster模式下提交 Exchange,请使用如下提交命令: 135 | 136 | ``` 137 | $SPARK_HOME/bin/spark-submit --class com.vesoft.nebula.exchange.Exchange \ 138 | --master yarn-cluster \ 139 | --files application.conf \ 140 | --conf spark.driver.extraClassPath=./ \ 141 | --conf spark.executor.extraClassPath=./ \ 142 | nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar \ 143 | -c application.conf 144 | ``` 145 | 146 | 注:使用 Nebula Exchange 进行 SST 文件生成时,会涉及到 Spark 的 shuffle 操作,请注意在提交命令中增加 147 | spark.sql.shuffle.partition 的配置: 148 | 149 | ``` 150 | $SPARK_HOME/bin/spark-submit --class com.vesoft.nebula.exchange.Exchange \ 151 | --master local \ 152 | --conf spark.sql.shuffle.partitions=200 \ 153 | nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar \ 154 | -c application.conf 155 | ``` 156 | *8. 自 3.7.0 版本,Exchange支持配置RSA加密后的NebulaGraph密码,并支持生成加密的密码。* 157 | 158 | 关于 Nebula Exchange 的更多说明,请参考 Exchange 2.0 159 | 的[使用手册](https://docs.nebula-graph.com.cn/2.6.2/nebula-exchange/about-exchange/ex-ug-what-is-exchange/) 。 160 | 161 | ## 贡献 162 | 163 | Nebula Exchange 2.0 是一个完全开源的项目,欢迎开源爱好者通过以下方式参与: 164 | 165 | - 前往 [Nebula Graph 论坛](https://discuss.nebula-graph.com.cn/ "点击前往“Nebula Graph 论坛") 上参与 166 | Issue 讨论,如答疑、提供想法或者报告无法解决的问题 167 | - 撰写或改进文档 168 | - 提交优化代码 169 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NebulaGraph Exchange 2 | 3 | [中文版](https://github.com/vesoft-inc/nebula-exchange/blob/master/README-CN.md) 4 | 5 | NebulaGraph Exchange (referred to as Exchange) is an Apache Spark™ application used to migrate data 6 | in bulk from different sources to NebulaGraph in a distributed way(Spark). It supports a variety of 7 | batch or streaming data sources and allows direct writing to NebulaGraph through side-loading (SST 8 | Files). 9 | 10 | Exchange supports Spark versions 2.2, 2.4, and 3.0 along with their respective toolkits 11 | named: `nebula-exchange_spark_2.2`, `nebula-exchange_spark_2.4`, and `nebula-exchange_spark_3.0`. 
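Before submitting a job, it helps to confirm which Spark and Scala versions your cluster runs so you can pick the matching artifact. The check below is a minimal sketch and assumes a standard Spark installation with `$SPARK_HOME` set.

```bash
# Minimal sketch: print the Spark and Scala versions of the installation used for spark-submit,
# then choose the matching artifact, e.g. Spark 2.4.x / Scala 2.11 -> nebula-exchange_spark_2.4,
# Spark 3.x / Scala 2.12 -> nebula-exchange_spark_3.0 (profiles as in the build commands below).
$SPARK_HOME/bin/spark-submit --version
```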
12 | 13 | > Note: 14 | > - Exchange 3.4.0 does not support Apache Kafka and Apache Pulsar. Please use Exchange of version 15 | 3.0.0, 3.3.0, or 3.5.0 to load data from Apache Kafka or Apache Pulsar to NebulaGraph for now. 16 | > - This repo covers only NebulaGraph 2.x and 3.x; for NebulaGraph v1.x, please 17 | use [NebulaGraph Exchange v1.0](https://github.com/vesoft-inc/nebula-java/tree/v1.0/tools/exchange). 18 | 19 | ## Build or Download Exchange 20 | 21 | 1. Build the latest Exchange 22 | 23 | ```bash 24 | $ git clone https://github.com/vesoft-inc/nebula-exchange.git 25 | $ cd nebula-exchange 26 | $ mvn clean package -Dmaven.test.skip=true -Dgpg.skip -Dmaven.javadoc.skip=true -pl nebula-exchange_spark_2.2 -am -Pscala-2.11 -Pspark-2.2 27 | $ mvn clean package -Dmaven.test.skip=true -Dgpg.skip -Dmaven.javadoc.skip=true -pl nebula-exchange_spark_2.4 -am -Pscala-2.11 -Pspark-2.4 28 | $ mvn clean package -Dmaven.test.skip=true -Dgpg.skip -Dmaven.javadoc.skip=true -pl nebula-exchange_spark_3.0 -am -Pscala-2.12 -Pspark-3.0 29 | ``` 30 | 31 | After packaging, the newly generated JAR files can be found in the following paths: 32 | - nebula-exchange/nebula-exchange_spark_2.2/target/ contains 33 | nebula-exchange_spark_2.2-3.0-SNAPSHOT.jar 34 | - nebula-exchange/nebula-exchange_spark_2.4/target/ contains 35 | nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar 36 | - nebula-exchange/nebula-exchange_spark_3.0/target/ contains 37 | nebula-exchange_spark_3.0-3.0-SNAPSHOT.jar 38 | 39 | 2. Download from the GitHub artifact 40 | 41 | **Released Version:** 42 | 43 | [GitHub Releases](https://github.com/vesoft-inc/nebula-exchange/releases) 44 | or [Downloads](https://www.nebula-graph.io/release?exchange=) 45 | 46 | **Snapshot Version:** 47 | 48 | [GitHub Actions Artifacts](https://github.com/vesoft-inc/nebula-exchange/actions/workflows/snapshot.yml) 49 | 50 | ## Get Started 51 | 52 | Here is an example command to run the Exchange: 53 | 54 | ```bash 55 | $SPARK_HOME/bin/spark-submit --class com.vesoft.nebula.exchange.Exchange --master local nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c /path/to/application.conf 56 | ``` 57 | 58 | And when the source is **Hive**, run: 59 | 60 | ```bash 61 | $SPARK_HOME/bin/spark-submit --class com.vesoft.nebula.exchange.Exchange --master local nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c /path/to/application.conf -h 62 | ``` 63 | 64 | Run the Exchange in **Yarn-Cluster** mode: 65 | 66 | ```bash 67 | $SPARK_HOME/bin/spark-submit --class com.vesoft.nebula.exchange.Exchange \ 68 | --master yarn-cluster \ 69 | --files application.conf \ 70 | --conf spark.driver.extraClassPath=./ \ 71 | --conf spark.executor.extraClassPath=./ \ 72 | nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar \ 73 | -c application.conf 74 | ``` 75 | 76 | Note: When using Exchange to generate SST files, please add `spark.sql.shuffle.partitions` 77 | in `--conf` for Spark's shuffle operation: 78 | 79 | ``` 80 | $SPARK_HOME/bin/spark-submit --class com.vesoft.nebula.exchange.Exchange \ 81 | --master local \ 82 | --conf spark.sql.shuffle.partitions=200 \ 83 | nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar \ 84 | -c application.conf 85 | ``` 86 | 87 | For more details, please refer 88 | to [NebulaGraph Exchange Docs](https://docs.nebula-graph.io/master/import-export/nebula-exchange/about-exchange/ex-ug-what-is-exchange/) 89 | 90 | ## How to get the config file 91 | 92 | You can get the template config file for your data source through the following command: 93 | 94 | ```agsl 95 | java -cp nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar 
com.vesoft.exchange.common.GenerateConfigTemplate -s {source} -p 96 | {target-path-to-save-config-file} 97 | ``` 98 | 99 | Such as your datasource is csv, and want to save the template config file in /tmp/, please run: 100 | 101 | ```agsl 102 | java -cp nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar com.vesoft.exchange.common.GenerateConfigTemplate -s csv -p /tmp 103 | ``` 104 | 105 | ## encrypt NebulaGraph's password 106 | ```agsl 107 | spark-submit --master local --class com.vesoft.exchange.common.PasswordEncryption nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -p {password} 108 | ``` 109 | When encrypt the password `nebula`, the output includes RSA public key, private key, encrypted password: 110 | ```agsl 111 | =================== public key begin =================== 112 | MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCLl7LaNSEXlZo2hYiJqzxgyFBQdkxbQXYU/xQthsBJwjOPhkiY37nokzKnjNlp6mv5ZUomqxLsoNQHEJ6BZD4VPiaiElFAkTD+gyul1v8f3A446Fr2rnVLogWHnz8ECPt7X8jwmpiKOXkOPIhqU5E0Cua+Kk0nnVosbos/VShfiQIDAQAB 113 | =================== public key end =================== 114 | 115 | 116 | =================== private key begin =================== 117 | MIICeAIBADANBgkqhkiG9w0BAQEFAASCAmIwggJeAgEAAoGBAIuXsto1IReVmjaFiImrPGDIUFB2TFtBdhT/FC2GwEnCM4+GSJjfueiTMqeM2Wnqa/llSiarEuyg1AcQnoFkPhU+JqISUUCRMP6DK6XW/x/cDjjoWvaudUuiBYefPwQI+3tfyPCamIo5eQ48iGpTkTQK5r4qTSedWixuiz9VKF+JAgMBAAECgYADWbfEPwQ1UbTq3Bej3kVLuWMcG0rH4fFYnaq5UQOqgYvFRR7W9H+80lOj6+CIB0ViLgkylmaU4WNVbBOx3VsUFFWSqIIIviKubg8m8ey7KAd9X2wMEcUHi4JyS2+/WSacaXYS5LOmMevvuaOwLEV0QmyM+nNGRIjUdzCLR1935QJBAM+IF8YD5GnoAPPjGIDS1Ljhu/u/Gj6/YBCQKSHQ5+HxHEKjQ/YxQZ/otchmMZanYelf1y+byuJX3NZ04/KSGT8CQQCsMaoFO2rF5M84HpAXPi6yH2chbtz0VTKZworwUnpmMVbNUojf4VwzAyOhT1U5o0PpFbpi+NqQhC63VUN5k003AkEArI8vnVGNMlZbvG7e5/bmM9hWs2viSbxdB0inOtv2g1M1OV+B2gp405ru0/PNVcRV0HQFfCuhVfTSxmspQoAihwJBAJW6EZa/FZbB4JVxreUoAr6Lo8dkeOhT9M3SZbGWZivaFxot/Cp/8QXCYwbuzrJxjqlsZUeOD6694Uk08JkURn0CQQC8V6aRa8ylMhLJFkGkMDHLqHcQCmY53Kd73mUu4+mjMJLZh14zQD9ydFtc0lbLXTeBAMWV3uEdeLhRvdAo3OwV 118 | =================== private key end =================== 119 | 120 | 121 | =================== encrypted password begin =================== 122 | Io+3y3mLOMnZJJNUPHZ8pKb4VfTvg6wUh6jSu5xdmLAoX/59tK1HTwoN40aOOWJwa1a5io7S4JqcX/jEcAorw7pelITr+F4oB0AMCt71d+gJuu3/lw9bjUEl9tF4Raj82y2Dg39wYbagN84fZMgCD63TPiDIevSr6+MFKASpGrY= 123 | =================== encrypted password end =================== 124 | check: the real password decrypted by private key and encrypted password is: nebula 125 | ``` 126 | 127 | ## Version Compatibility Matrix 128 | 129 | Here is the version correspondence between Exchange and NebulaGraph: 130 | 131 | | Exchange Version | Nebula Version | Spark Version | 132 | |:------------------------------------------:|:--------------:|:-------------------------------:| 133 | | nebula-exchange-2.0.0.jar | 2.0.0, 2.0.1 | 2.4.* | 134 | | nebula-exchange-2.0.1.jar | 2.0.0, 2.0.1 | 2.4.* | 135 | | nebula-exchange-2.1.0.jar | 2.0.0, 2.0.1 | 2.4.* | 136 | | nebula-exchange-2.5.0.jar | 2.5.0, 2.5.1 | 2.4.* | 137 | | nebula-exchange-2.5.1.jar | 2.5.0, 2.5.1 | 2.4.* | 138 | | nebula-exchange-2.5.2.jar | 2.5.0, 2.5.1 | 2.4.* | 139 | | nebula-exchange-2.6.0.jar | 2.6.0, 2.6.1 | 2.4.* | 140 | | nebula-exchange-2.6.1.jar | 2.6.0, 2.6.1 | 2.4.* | 141 | | nebula-exchange-2.6.2.jar | 2.6.0, 2.6.1 | 2.4.* | 142 | | nebula-exchange-2.6.3.jar | 2.6.0, 2.6.1 | 2.4.* | 143 | | nebula-exchange_spark_2.2-3.x.x.jar | 3.x.x | 2.2.* | 144 | | nebula-exchange_spark_2.4-3.x.x.jar | 3.x.x | 2.4.* | 145 | | nebula-exchange_spark_3.0-3.x.x.jar | 3.x.x | 
`3.0.*`,`3.1.*`,`3.2.*`,`3.3.*` | 146 | | nebula-exchange_spark_2.2-3.0-SNAPSHOT.jar | nightly | 2.2.* | 147 | | nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar | nightly | 2.4.* | 148 | | nebula-exchange_spark_3.0-3.0-SNAPSHOT.jar | nightly | `3.0.*`,`3.1.*`,`3.2.*`,`3.3.*` | 149 | 150 | ## Feature History 151 | 152 | 1. *Since 2.0* Exchange allows for the import of vertex data with both String and Integer type IDs. 153 | 2. *Since 2.0* Exchange also supports importing data of various types, including Null, Date, 154 | DateTime (using UTC instead of local time), and Time. 155 | 3. *Since 2.0* In addition to Hive on Spark, Exchange can import data from other Hive sources as 156 | well. 157 | 4. *Since 2.0* If there are failures during the data import process, Exchange supports recording and 158 | retrying the INSERT statement. 159 | 5. *Since 2.5* While SST import is supported by Exchange, property default values are not yet 160 | supported. 161 | 6. *Since 3.0* Exchange is compatible with Spark 2.2, Spark 2.4, and Spark 3.0. 162 | 7. *Since 3.7* Exchange supports to config the encrypted NebulaGraph password and supports to generate the encryption password. 163 | Refer 164 | to [application.conf](https://github.com/vesoft-inc/nebula-exchange/blob/master/exchange-common/src/test/resources/application.conf) 165 | as an example to edit the configuration file. 166 | -------------------------------------------------------------------------------- /bench/NEBULA_DDL/SPACE_SF1: -------------------------------------------------------------------------------- 1 | create space sf1(vid_type=int64,partition_num=100,replica_factor=3); 2 | USE sf1; 3 | CREATE TAG IF NOT EXISTS `Place`(`name` string,`url` string,`type` string); 4 | CREATE TAG IF NOT EXISTS `Comment`(`creationDate` string,`locationIP` string,`browserUsed` string,`content` string,`length` int); 5 | CREATE TAG IF NOT EXISTS `Organisation`(`type` string,`name` string,`url` string); 6 | CREATE TAG IF NOT EXISTS `Person`(`firstName` string,`lastName` string,`gender` string,`birthday` string,`creationDate` string,`locationIP` string,`browserUsed` string); 7 | CREATE TAG IF NOT EXISTS `Tagclass`(`name` string,`url` string); 8 | CREATE TAG IF NOT EXISTS `Forum`(`title` string,`creationDate` string); 9 | CREATE TAG IF NOT EXISTS `Post`(`imageFile` string,`creationDate` string,`locationIP` string,`browserUsed` string,`language` string,`content` string,`length` int); 10 | CREATE TAG IF NOT EXISTS `Tag`(`name` string,`url` string); 11 | CREATE EDGE IF NOT EXISTS `IS_PART_OF`(); 12 | CREATE EDGE IF NOT EXISTS `LIKES`(`creationDate` string); 13 | CREATE EDGE IF NOT EXISTS `HAS_CREATOR`(); 14 | CREATE EDGE IF NOT EXISTS `HAS_INTEREST`(); 15 | CREATE EDGE IF NOT EXISTS `IS_SUBCLASS_OF`(); 16 | CREATE EDGE IF NOT EXISTS `IS_LOCATED_IN`(); 17 | CREATE EDGE IF NOT EXISTS `HAS_MODERATOR`(); 18 | CREATE EDGE IF NOT EXISTS `HAS_TAG`(); 19 | CREATE EDGE IF NOT EXISTS `WORK_AT`(`workFrom` int); 20 | CREATE EDGE IF NOT EXISTS `REPLY_OF`(); 21 | CREATE EDGE IF NOT EXISTS `STUDY_AT`(`classYear` int); 22 | CREATE EDGE IF NOT EXISTS `CONTAINER_OF`(); 23 | CREATE EDGE IF NOT EXISTS `HAS_MEMBER`(`joinDate` string); 24 | CREATE EDGE IF NOT EXISTS `KNOWS`(`creationDate` string); 25 | CREATE EDGE IF NOT EXISTS `HAS_TYPE`(); 26 | -------------------------------------------------------------------------------- /bench/NEBULA_DDL/SPACE_SF100: -------------------------------------------------------------------------------- 1 | create space 
sf100(vid_type=int64,partition_num=100,replica_factor=3); 2 | USE sf100; 3 | CREATE TAG IF NOT EXISTS `Place`(`name` string,`url` string,`type` string); 4 | CREATE TAG IF NOT EXISTS `Comment`(`creationDate` string,`locationIP` string,`browserUsed` string,`content` string,`length` int); 5 | CREATE TAG IF NOT EXISTS `Organisation`(`type` string,`name` string,`url` string); 6 | CREATE TAG IF NOT EXISTS `Person`(`firstName` string,`lastName` string,`gender` string,`birthday` string,`creationDate` string,`locationIP` string,`browserUsed` string); 7 | CREATE TAG IF NOT EXISTS `Tagclass`(`name` string,`url` string); 8 | CREATE TAG IF NOT EXISTS `Forum`(`title` string,`creationDate` string); 9 | CREATE TAG IF NOT EXISTS `Post`(`imageFile` string,`creationDate` string,`locationIP` string,`browserUsed` string,`language` string,`content` string,`length` int); 10 | CREATE TAG IF NOT EXISTS `Tag`(`name` string,`url` string); 11 | CREATE EDGE IF NOT EXISTS `IS_PART_OF`(); 12 | CREATE EDGE IF NOT EXISTS `LIKES`(`creationDate` string); 13 | CREATE EDGE IF NOT EXISTS `HAS_CREATOR`(); 14 | CREATE EDGE IF NOT EXISTS `HAS_INTEREST`(); 15 | CREATE EDGE IF NOT EXISTS `IS_SUBCLASS_OF`(); 16 | CREATE EDGE IF NOT EXISTS `IS_LOCATED_IN`(); 17 | CREATE EDGE IF NOT EXISTS `HAS_MODERATOR`(); 18 | CREATE EDGE IF NOT EXISTS `HAS_TAG`(); 19 | CREATE EDGE IF NOT EXISTS `WORK_AT`(`workFrom` int); 20 | CREATE EDGE IF NOT EXISTS `REPLY_OF`(); 21 | CREATE EDGE IF NOT EXISTS `STUDY_AT`(`classYear` int); 22 | CREATE EDGE IF NOT EXISTS `CONTAINER_OF`(); 23 | CREATE EDGE IF NOT EXISTS `HAS_MEMBER`(`joinDate` string); 24 | CREATE EDGE IF NOT EXISTS `KNOWS`(`creationDate` string); 25 | CREATE EDGE IF NOT EXISTS `HAS_TYPE`(); 26 | -------------------------------------------------------------------------------- /bench/NEBULA_DDL/SPACE_SF30: -------------------------------------------------------------------------------- 1 | create space sf30(vid_type=int64,partition_num=100,replica_factor=3); 2 | USE sf30; 3 | CREATE TAG IF NOT EXISTS `Place`(`name` string,`url` string,`type` string); 4 | CREATE TAG IF NOT EXISTS `Comment`(`creationDate` string,`locationIP` string,`browserUsed` string,`content` string,`length` int); 5 | CREATE TAG IF NOT EXISTS `Organisation`(`type` string,`name` string,`url` string); 6 | CREATE TAG IF NOT EXISTS `Person`(`firstName` string,`lastName` string,`gender` string,`birthday` string,`creationDate` string,`locationIP` string,`browserUsed` string); 7 | CREATE TAG IF NOT EXISTS `Tagclass`(`name` string,`url` string); 8 | CREATE TAG IF NOT EXISTS `Forum`(`title` string,`creationDate` string); 9 | CREATE TAG IF NOT EXISTS `Post`(`imageFile` string,`creationDate` string,`locationIP` string,`browserUsed` string,`language` string,`content` string,`length` int); 10 | CREATE TAG IF NOT EXISTS `Tag`(`name` string,`url` string); 11 | CREATE EDGE IF NOT EXISTS `IS_PART_OF`(); 12 | CREATE EDGE IF NOT EXISTS `LIKES`(`creationDate` string); 13 | CREATE EDGE IF NOT EXISTS `HAS_CREATOR`(); 14 | CREATE EDGE IF NOT EXISTS `HAS_INTEREST`(); 15 | CREATE EDGE IF NOT EXISTS `IS_SUBCLASS_OF`(); 16 | CREATE EDGE IF NOT EXISTS `IS_LOCATED_IN`(); 17 | CREATE EDGE IF NOT EXISTS `HAS_MODERATOR`(); 18 | CREATE EDGE IF NOT EXISTS `HAS_TAG`(); 19 | CREATE EDGE IF NOT EXISTS `WORK_AT`(`workFrom` int); 20 | CREATE EDGE IF NOT EXISTS `REPLY_OF`(); 21 | CREATE EDGE IF NOT EXISTS `STUDY_AT`(`classYear` int); 22 | CREATE EDGE IF NOT EXISTS `CONTAINER_OF`(); 23 | CREATE EDGE IF NOT EXISTS `HAS_MEMBER`(`joinDate` string); 24 | CREATE EDGE IF NOT 
EXISTS `KNOWS`(`creationDate` string); 25 | CREATE EDGE IF NOT EXISTS `HAS_TYPE`(); 26 | -------------------------------------------------------------------------------- /bench/exchange-test.md: -------------------------------------------------------------------------------- 1 | # Nebula-Exchange test result 2 | We use the LDBC dataset to test the Exchange client import performance. 3 | 4 | # prepare 5 | * The Nebula Schema DDL is configured in bench/NEBULA_DDL. 6 | 7 | * The exchange config file is configured in bench/EXCHANGE_CONFIG. 8 | 9 | # import command 10 | 11 | for space sf1, the command is: 12 | ``` 13 | spark-submit --master "spark://127.0.0.1:7077" \ 14 | --driver-memory=2G \ 15 | --num-executors=3 \ 16 | --executor-memory=10G \ 17 | --executor-cores=20 \ 18 | --class com.vesoft.nebula.exchange.Exchange \ 19 | nebula-exchange-2.6.0.jar -c app_sf1.conf 20 | ``` 21 | 22 | for space sf30, the command is: 23 | 24 | ``` 25 | spark-submit --master "spark://127.0.0.1:7077" \ 26 | --driver-memory=2G \ 27 | --num-executors=3 \ 28 | --executor-memory=30G \ 29 | --executor-cores=20 \ 30 | --class com.vesoft.nebula.exchange.Exchange \ 31 | nebula-exchange-2.6.0.jar -c app_sf30.conf 32 | ``` 33 | 34 | for space sf100, the command is: 35 | ``` 36 | spark-submit --master "spark://127.0.0.1:7077" \ 37 | --driver-memory=2G \ 38 | --num-executors=3 \ 39 | --executor-memory=30G \ 40 | --executor-cores=20 \ 41 | --class com.vesoft.nebula.exchange.Exchange \ 42 | nebula-exchange-2.6.0.jar -c app_sf100.conf 43 | ``` 44 | 45 | # import result 46 | Here are the import results: 47 | 48 | When the space has 1 replica and auto-compact is enabled: 49 | 50 | | Dataset | Data Amount |cores|executor-memory|spark-partition|batch size|duration| speed | 51 | |:--------:|:--------------------------------:|:---:|:-------------:|:-------------:|:--------:|:------:|:--------:| 52 | |LDBC sf1 | vertex:3165488 edge:17256029 | 60 | 10G | 60 | 2000 | 56s | 360,000/s | 53 | |LDBC sf30 | vertex:88673640 edge:540915215 | 60 | 20G | 60 | 2000 | 7.5min |1,399,086/s| 54 | |LDBC sf100| vertex:282386021 edge:1775513185 | 60 | 30G | 60 | 2000 | 27min |1,270,303/s| 55 | 56 | When the space has 1 replica and auto-compact is disabled: 57 | 58 | | Dataset | Data Amount |cores|executor-memory|spark-partition|batch size|duration| speed | 59 | |:--------:|:--------------------------------:|:---:|:-------------:|:-------------:|:--------:|:------:|:--------:| 60 | |LDBC sf1 | vertex:3165488 edge:17256029 | 60 | 10G | 60 | 2000 | 49s | 416,765/s| 61 | |LDBC sf30 | vertex:88673640 edge:540915215 | 60 | 20G | 60 | 2000 | 6.3min|1,665,578/s| 62 | |LDBC sf100| vertex:282386021 edge:1775513185 | 60 | 30G | 60 | 2000 | 22min |1,559,014/s| 63 | 64 | After data import, space sf100 with one replica will take additional time to finish the manual compaction. 65 | 66 | 67 | 68 | When the space has 3 replicas and auto-compact is disabled: 69 | 70 | | Dataset | Data Amount |cores|executor-memory|spark-partition|batch size|duration| speed | 71 | |:---------:|:--------------------------------:|:---:|:-------------:|:-------------:|:--------:|:------:|:-------:| 72 | |LDBC sf1 | vertex:3165488 edge:17256029 | 60 | 10G | 60 | 2000 | 58s |352,095/s | 73 | |LDBC sf30 | vertex:88673640 edge:540915215 | 60 | 20G | 60 | 2000 | 17min |617,243/s| 74 | |LDBC sf100 | vertex:282386021 edge:1775513185 | 60 | 30G | 60 | 2000 | 42min |816,623/s| 75 | 76 | After data import, space sf100 with three replicas will take 1.1h to finish the manual compaction. 
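For reference, the manual compaction mentioned above is submitted to the space after the import finishes. The snippet below is only a hedged sketch: it assumes nebula-console is installed, that your NebulaGraph version supports the `SUBMIT JOB COMPACT` statement, and the graphd address and credentials are placeholders for this benchmark setup.

```bash
# Hedged sketch: trigger manual compaction for space sf100 after the import finishes.
# Address, user, and password are placeholders, not values taken from the benchmark environment.
nebula-console -addr 127.0.0.1 -port 9669 -u root -p nebula \
  -e 'USE sf100; SUBMIT JOB COMPACT;'
# Progress can then be checked with `SHOW JOBS;` in the same space.
```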
77 | 78 | # other information 79 | > The Spark cluster and nebula cluster are separated 80 | 81 | > Spark cluster has three workers, nebula cluster has three metad, three graphd and three storaged. 82 | 83 | > The clusters have 10 Gigabit Network, each nebula machine has 1.5T SSD disk and 256G memory. 84 | 85 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 vesoft inc. All rights reserved. 2 | # 3 | # This source code is licensed under Apache 2.0 License. 4 | 5 | # For more configuration details: 6 | # https://docs.codecov.io/docs/codecov-yaml 7 | 8 | # validate the configuration: 9 | # curl -X POST --data-binary @codecov.yml https://codecov.io/validate 10 | 11 | codecov: 12 | require_ci_to_pass: false 13 | -------------------------------------------------------------------------------- /conf-template/client_import/bigquery_datasource.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --executor-cores=20 \ 7 | # --jars $(echo /bigquery-jdbc-dependency-path/*.jar | tr ' ' ',') 8 | # --class com.vesoft.nebula.exchange.Exchange \ 9 | # nebula-exchange-3.0-SNAPSHOT.jar -c bigquery_datasource.conf 10 | 11 | # you can get all dependency jars for bigquery from https://cloud.google.com/bigquery/docs/reference/odbc-jdbc-drivers?hl=zh-cn#jdbc_release_1331004 12 | { 13 | # Spark config 14 | spark: { 15 | app: { 16 | name: NebulaGraph Exchange 17 | } 18 | } 19 | 20 | # Nebula Graph config 21 | nebula: { 22 | address:{ 23 | graph: ["127.0.0.1:9669"] 24 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 25 | # use `SHOW meta leader` to see your meta leader's address 26 | meta: ["127.0.0.1:9559"] 27 | } 28 | user: root 29 | pswd: nebula 30 | space: test 31 | 32 | # nebula client connection parameters 33 | connection { 34 | # socket connect & execute timeout, unit: millisecond 35 | timeout: 30000 36 | } 37 | 38 | error: { 39 | # max number of failures, if the number of failures is bigger than max, then exit the application. 40 | max: 32 41 | # failed data will be recorded in output path, format with ngql 42 | output: /tmp/errors 43 | } 44 | 45 | # use google's RateLimiter to limit the requests send to NebulaGraph 46 | rate: { 47 | # the stable throughput of RateLimiter 48 | limit: 1024 49 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 50 | # if it can't be obtained within the specified timeout, then give up the request. 51 | timeout: 1000 52 | } 53 | } 54 | 55 | # Processing tags 56 | tags: [ 57 | { 58 | name: tag-name-1 59 | type: { 60 | source: jdbc 61 | sink: client 62 | } 63 | 64 | # bigquery url, the auth way if configed in url. In this example, OAuthPvtKeyPath=/tmp/bq-reader-sa-key.json file should be accessible for all spark workers. 
65 | url:"jdbc:bigquery://https://www.googleapis.com/bigquery/v2:443;ProjectId=nebula-cloud-test;OAuthType=0;OAuthServiceAcctEmail=bq-reader@nebula-cloud-test.iam.gserviceaccount.com;OAuthPvtKeyPath=/tmp/bq-reader-sa-key.json" 66 | # JDBC driver 67 | driver:"com.simba.googlebigquery.jdbc.Driver" 68 | 69 | user:"bq-reader@nebula-cloud-test.iam.gserviceaccount.com" 70 | password:"not_used_but_required" 71 | 72 | sentence:"select id, firstName, lastName, gender from dataset.person" 73 | 74 | fields: [firstName, lastName, gender] 75 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 76 | vertex: { 77 | field: id 78 | } 79 | batch: 2000 80 | partition: 60 81 | } 82 | ] 83 | } 84 | -------------------------------------------------------------------------------- /conf-template/client_import/csv_datasource.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --executor-cores=20 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange-3.0-SNAPSHOT.jar -c csv_datasource.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address:{ 21 | graph: ["127.0.0.1:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta: ["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | # nebula client connection parameters 31 | connection { 32 | # socket connect & execute timeout, unit: millisecond 33 | timeout: 30000 34 | } 35 | 36 | error: { 37 | # max number of failures, if the number of failures is bigger than max, then exit the application. 38 | max: 32 39 | # failed data will be recorded in output path, format with ngql 40 | output: /tmp/errors 41 | } 42 | 43 | # use google's RateLimiter to limit the requests send to NebulaGraph 44 | rate: { 45 | # the stable throughput of RateLimiter 46 | limit: 1024 47 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 48 | # if it can't be obtained within the specified timeout, then give up the request. 49 | timeout: 1000 50 | } 51 | } 52 | 53 | # Processing tags 54 | tags: [ 55 | { 56 | name: tag-name-1 57 | type: { 58 | source: csv 59 | sink: client 60 | } 61 | # if your file in not in hdfs, config "file:///path/test.csv" 62 | path: "hdfs://ip:port/path/test.csv" 63 | # if your csv file has no header, then use _c0,_c1,_c2,.. 
to indicate fields 64 | fields: [csv-field-0, csv-field-1, csv-field-2] 65 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 66 | vertex: { 67 | field: csv-field-0 68 | # add the prefix for vertex id value, eg: original id is 12345, and the real id will be: tag1_12345 69 | prefix:"tag1" 70 | udf: { 71 | separator: "_" 72 | oldColNames: [parquet-field-0, parquet-field-1] 73 | newColName: new-parquet-field 74 | } 75 | } 76 | 77 | separator: "," 78 | header: true 79 | batch: 2000 80 | partition: 60 81 | } 82 | ] 83 | 84 | # process edges 85 | edges: [ 86 | { 87 | name: edge-name-1 88 | type: { 89 | source: csv 90 | sink: client 91 | } 92 | path: "hdfs://ip:port/path/test.csv" 93 | fields: [csv-field-0, csv-field-1, csv-field-2] 94 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 95 | source: { 96 | field: csv-field-0 97 | # add the prefix for source id value, eg: original id is 12345, and the real id will be: edge1_12345 98 | prefix:"edge1" 99 | } 100 | target: csv-field-1 101 | ranking: csv-field-2 102 | separator: "," 103 | header: true 104 | batch: 2000 105 | partition: 60 106 | } 107 | ] 108 | } 109 | -------------------------------------------------------------------------------- /conf-template/client_import/hive_datasource.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --executor-cores=20 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange-3.0-SNAPSHOT.jar -c hive_datasource.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address:{ 21 | graph:["127.0.0.1:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta:["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | # nebula client connection parameters 31 | connection { 32 | # socket connect & execute timeout, unit: millisecond 33 | timeout: 30000 34 | } 35 | 36 | error: { 37 | # max number of failures, if the number of failures is bigger than max, then exit the application. 38 | max: 32 39 | # failed data will be recorded in output path, format with ngql 40 | output: /tmp/errors 41 | } 42 | 43 | # use google's RateLimiter to limit the requests send to NebulaGraph 44 | rate: { 45 | # the stable throughput of RateLimiter 46 | limit: 1024 47 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 48 | # if it can't be obtained within the specified timeout, then give up the request. 
49 | timeout: 1000 50 | } 51 | } 52 | 53 | # Processing tags 54 | tags: [ 55 | { 56 | name: tag-name-1 57 | type: { 58 | source: hive 59 | sink: client 60 | } 61 | exec: "select hive-field0, hive-field1, hive-field2 from database.table" 62 | fields: [hive-field-0, hive-field-1, hive-field-2] 63 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 64 | vertex: hive-field-0 65 | batch: 2000 66 | partition: 60 67 | } 68 | ] 69 | 70 | # process edges 71 | edges: [ 72 | { 73 | name: edge-name-1 74 | type: { 75 | source: hive 76 | sink: client 77 | } 78 | exec: "select hive-field0, hive-field1, hive-field2 from database.table" 79 | fields: [ hive-field-0, hive-field-1, hive-field-2] 80 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 81 | source: hive-field-0 82 | target: hive-field-1 83 | ranking: hive-filed-2 84 | batch: 2000 85 | partition: 60 86 | } 87 | ] 88 | } 89 | -------------------------------------------------------------------------------- /conf-template/sst_import/csv_datasource.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --executor-cores=20 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange-3.0-SNAPSHOT.jar -c csv_datasource.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address:{ 21 | graph:["127.0.0.1:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta:["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | path:{ 31 | # any path that owns read and write access is ok 32 | local:"/tmp" 33 | remote:"/sst" 34 | hdfs.namenode: "hdfs://name_node:9000" 35 | } 36 | 37 | # nebula client connection parameters 38 | connection { 39 | # socket connect & execute timeout, unit: millisecond 40 | timeout: 30000 41 | } 42 | 43 | error: { 44 | # max number of failures, if the number of failures is bigger than max, then exit the application. 45 | max: 32 46 | # failed data will be recorded in output path, format with ngql 47 | output: /tmp/errors 48 | } 49 | 50 | # use google's RateLimiter to limit the requests send to NebulaGraph 51 | rate: { 52 | # the stable throughput of RateLimiter 53 | limit: 1024 54 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 55 | # if it can't be obtained within the specified timeout, then give up the request. 56 | timeout: 1000 57 | } 58 | } 59 | 60 | # Processing tags 61 | tags: [ 62 | { 63 | name: tag-name-1 64 | type: { 65 | source: csv 66 | sink: sst 67 | } 68 | # if your file in not in hdfs, config "file:///path/test.csv" 69 | path: "hdfs://ip:port/path/test.csv" 70 | # if your csv file has no header, then use _c0,_c1,_c2,.. 
to indicate fields 71 | fields: [csv-field-0, csv-field-1, csv-field-2] 72 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 73 | vertex: csv-field-0 74 | separator: "," 75 | header: true 76 | batch: 2000 77 | partition: 60 78 | } 79 | ] 80 | 81 | # process edges 82 | edges: [ 83 | { 84 | name: edge-name-1 85 | type: { 86 | source: csv 87 | sink: sst 88 | } 89 | path: "hdfs://ip:port/path/test.csv" 90 | fields: [csv-field-0, csv-field-1, csv-field-2] 91 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 92 | source: csv-field-0 93 | target: csv-field-1 94 | ranking: csv-field-2 95 | separator: "," 96 | header: true 97 | batch: 2000 98 | partition: 60 99 | } 100 | ] 101 | } 102 | -------------------------------------------------------------------------------- /conf-template/sst_import/hive_datasource.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --executor-cores=20 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange-3.0-SNAPSHOT.jar -c hive_datasource.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address:{ 21 | graph:["127.0.0.1:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta:["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | path:{ 31 | # any path that owns read and write access is ok 32 | local:"/tmp" 33 | remote:"/sst" 34 | hdfs.namenode: "hdfs://name_node:9000" 35 | } 36 | 37 | # nebula client connection parameters 38 | connection { 39 | # socket connect & execute timeout, unit: millisecond 40 | timeout: 30000 41 | } 42 | 43 | error: { 44 | # max number of failures, if the number of failures is bigger than max, then exit the application. 45 | max: 32 46 | # failed data will be recorded in output path, format with ngql 47 | output: /tmp/errors 48 | } 49 | 50 | # use google's RateLimiter to limit the requests send to NebulaGraph 51 | rate: { 52 | # the stable throughput of RateLimiter 53 | limit: 1024 54 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 55 | # if it can't be obtained within the specified timeout, then give up the request. 
56 | timeout: 1000 57 | } 58 | } 59 | 60 | # Processing tags 61 | tags: [ 62 | { 63 | name: tag-name-1 64 | type: { 65 | source: hive 66 | sink: sst 67 | } 68 | exec: "select hive-field0, hive-field1, hive-field2 from database.table" 69 | fields: [hive-field-0, hive-field-1, hive-field-2] 70 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 71 | vertex: hive-field-0 72 | batch: 2000 73 | partition: 60 74 | } 75 | ] 76 | 77 | # process edges 78 | edges: [ 79 | { 80 | name: edge-name-1 81 | type: { 82 | source: hive 83 | sink: sst 84 | } 85 | exec: "select hive-field0, hive-field1, hive-field2 from database.table" 86 | fields: [ hive-field-0, hive-field-1, hive-field-2] 87 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 88 | source: hive-field-0 89 | target: hive-field-1 90 | ranking: hive-filed-2 91 | batch: 2000 92 | partition: 60 93 | } 94 | ] 95 | } 96 | -------------------------------------------------------------------------------- /exchange-common/src/main/java/com/vesoft/exchange/common/FileMigrate.java: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2023 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange.common; 7 | 8 | import java.io.BufferedReader; 9 | import java.io.BufferedWriter; 10 | import java.io.File; 11 | import java.io.FileWriter; 12 | import java.io.IOException; 13 | import java.io.InputStream; 14 | import java.io.InputStreamReader; 15 | 16 | public class FileMigrate { 17 | //Logger log = Logger.getLogger(FileMigrate.class); 18 | 19 | 20 | /** 21 | * migrate the source file to target path 22 | * 23 | * @param sourceFile template config file 24 | * @param path target path to save the config info 25 | */ 26 | public void saveConfig(String sourceFile, String path) { 27 | InputStream inputStream = 28 | this.getClass().getClassLoader().getResourceAsStream(sourceFile); 29 | if (inputStream == null) { 30 | System.exit(-1); 31 | } 32 | File file = new File(path); 33 | if (file.exists()) { 34 | file.delete(); 35 | } 36 | FileWriter writer = null; 37 | BufferedWriter bufferedWriter = null; 38 | BufferedReader reader = null; 39 | try { 40 | writer = new FileWriter(path); 41 | bufferedWriter = new BufferedWriter(writer); 42 | 43 | reader = new BufferedReader(new InputStreamReader(inputStream)); 44 | String line = null; 45 | while ((line = reader.readLine()) != null) { 46 | bufferedWriter.write(line); 47 | bufferedWriter.write("\n"); 48 | } 49 | } catch (IOException e) { 50 | System.out.println("Failed to migrate the template conf file:" + e.getMessage()); 51 | e.printStackTrace(); 52 | } finally { 53 | try { 54 | if (bufferedWriter != null) { 55 | bufferedWriter.close(); 56 | } 57 | if (reader != null) { 58 | reader.close(); 59 | } 60 | } catch (IOException e) { 61 | System.out.println("Failed to close the writer or reader:" + e.getMessage()); 62 | e.printStackTrace(); 63 | } 64 | } 65 | 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /exchange-common/src/main/resources/config_template/csv.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --total-executor-cores=60 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # 
nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c csv.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address:{ 21 | graph: ["127.0.0.1:9669","127.0.0.2:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta: ["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | # nebula client connection parameters 31 | connection { 32 | # socket connect & execute timeout, unit: millisecond 33 | timeout: 30000 34 | } 35 | 36 | error: { 37 | # max number of failures, if the number of failures is bigger than max, then exit the application. 38 | max: 32 39 | # failed data will be recorded in output path, format with ngql 40 | output: "hdfs://127.0.0.1:9000/tmp/errors" 41 | } 42 | 43 | # use google's RateLimiter to limit the requests send to NebulaGraph 44 | rate: { 45 | # the stable throughput of RateLimiter 46 | limit: 1024 47 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 48 | # if it can't be obtained within the specified timeout, then give up the request. 49 | timeout: 1000 50 | } 51 | } 52 | 53 | # Processing tags 54 | tags: [ 55 | { 56 | name: tag-name 57 | type: { 58 | source: csv 59 | sink: client 60 | } 61 | # if your file in not in hdfs, config "file:///path/test.csv" 62 | path: "hdfs://ip:port/path/test.csv" 63 | # if your csv file has no header, then use _c0,_c1,_c2,.. to indicate fields 64 | fields: [csv-field-1, csv-field-2, csv-field-3] 65 | nebula.fields: [nebula-field-1, nebula-field-2, nebula-field-3] 66 | vertex: { 67 | field: csv-field-0 68 | } 69 | separator: "," 70 | header: true 71 | batch: 2000 72 | partition: 60 73 | } 74 | ] 75 | 76 | # process edges 77 | edges: [ 78 | { 79 | name: edge-name 80 | type: { 81 | source: csv 82 | sink: client 83 | } 84 | path: "hdfs://ip:port/path/test.csv" 85 | fields: [csv-field-2, csv-field-3, csv-field-4] 86 | nebula.fields: [nebula-field-1, nebula-field-2, nebula-field-3] 87 | source: { 88 | field: csv-field-0 89 | } 90 | target: { 91 | field: csv-field-1 92 | } 93 | #ranking: csv-field-2 94 | separator: "," 95 | header: true 96 | batch: 2000 97 | partition: 60 98 | } 99 | ] 100 | } 101 | -------------------------------------------------------------------------------- /exchange-common/src/main/resources/config_template/hbase.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --total-executor-cores=60 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c hbase.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address: { 21 | graph: ["127.0.0.1:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 
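      # Note (assumption based on how Exchange creates meta/graph clients on the Spark side): these
      # addresses must be reachable from both the Spark driver and the executors, so inside k8s prefer
      # an address that is routable from the Spark cluster rather than a pod-internal one.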
23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta: ["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | # nebula client connection parameters 31 | connection { 32 | # socket connect & execute timeout, unit: millisecond 33 | timeout: 30000 34 | } 35 | 36 | error: { 37 | # max number of failures, if the number of failures is bigger than max, then exit the application. 38 | max: 32 39 | # failed data will be recorded in output path, format with ngql 40 | output: /tmp/errors 41 | } 42 | 43 | # use google's RateLimiter to limit the requests send to NebulaGraph 44 | rate: { 45 | # the stable throughput of RateLimiter 46 | limit: 1024 47 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 48 | # if it can't be obtained within the specified timeout, then give up the request. 49 | timeout: 1000 50 | } 51 | } 52 | 53 | # Processing tags 54 | tags: [ 55 | { 56 | name: tag-name-1 57 | type: { 58 | source: hbase 59 | sink: client 60 | } 61 | host: 127.0.0.1 62 | port: 2181 63 | table: hbase-table 64 | columnFamily: hbase-table-cloumnfamily 65 | fields: [hbase-field-0, hbase-field-1, hbase-field-2] 66 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 67 | # if fields or vertex contains rowkey, please configure it as "rowkey". 68 | vertex: rowkey 69 | batch: 2000 70 | partition: 60 71 | } 72 | ] 73 | 74 | # process edges 75 | edges: [ 76 | { 77 | name: edge-name-1 78 | type: { 79 | source: hbase 80 | sink: client 81 | } 82 | host: 127.0.0.1 83 | port: 2181 84 | table: hbase-table 85 | columnFamily: hbase-table-cloumnfamily 86 | fields: [hbase-field-0, hbase-field-1, hbase-field-2] 87 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 88 | source: hbase-field-0 89 | target: hbase-field-1 90 | ranking: hbase-filed-2 91 | batch: 2000 92 | partition: 60 93 | } 94 | ] 95 | } 96 | -------------------------------------------------------------------------------- /exchange-common/src/main/resources/config_template/hive.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --executor-cores=20 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c hive.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address: { 21 | graph: ["127.0.0.1:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta: ["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | # nebula client connection parameters 31 | connection { 32 | # socket connect & execute timeout, unit: millisecond 33 | timeout: 30000 34 | } 35 | 36 | error: { 37 | # max number of failures, if the number of failures is bigger than max, then exit the application. 
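      # Note: failed statements are written to the `output` path below as plain nGQL, one statement per
      # line, so a partially failed run can be replayed later with Exchange's reload mode (the exact
      # CLI flag, commonly -r or --reload, is an assumption here; check the docs for your release).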
38 | max: 32 39 | # failed data will be recorded in output path, format with ngql 40 | output: /tmp/errors 41 | } 42 | 43 | # use google's RateLimiter to limit the requests send to NebulaGraph 44 | rate: { 45 | # the stable throughput of RateLimiter 46 | limit: 1024 47 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 48 | # if it can't be obtained within the specified timeout, then give up the request. 49 | timeout: 1000 50 | } 51 | } 52 | 53 | # Processing tags 54 | tags: [ 55 | { 56 | name: tag-name-1 57 | type: { 58 | source: hive 59 | sink: client 60 | } 61 | exec: "select hive-field0, hive-field1, hive-field2 from database.table" 62 | fields: [hive-field-0, hive-field-1, hive-field-2] 63 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 64 | vertex: hive-field-0 65 | batch: 2000 66 | partition: 60 67 | } 68 | ] 69 | 70 | # process edges 71 | edges: [ 72 | { 73 | name: edge-name-1 74 | type: { 75 | source: hive 76 | sink: client 77 | } 78 | exec: "select hive-field0, hive-field1, hive-field2 from database.table" 79 | fields: [hive-field-0, hive-field-1, hive-field-2] 80 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 81 | source: hive-field-0 82 | target: hive-field-1 83 | ranking: hive-filed-2 84 | batch: 2000 85 | partition: 60 86 | } 87 | ] 88 | } 89 | -------------------------------------------------------------------------------- /exchange-common/src/main/resources/config_template/jdbc.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --executor-cores=20 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c jdbc.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address: { 21 | graph: ["127.0.0.1:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta: ["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | # nebula client connection parameters 31 | connection { 32 | # socket connect & execute timeout, unit: millisecond 33 | timeout: 30000 34 | } 35 | 36 | error: { 37 | # max number of failures, if the number of failures is bigger than max, then exit the application. 38 | max: 32 39 | # failed data will be recorded in output path, format with ngql 40 | output: /tmp/errors 41 | } 42 | 43 | # use google's RateLimiter to limit the requests send to NebulaGraph 44 | rate: { 45 | # the stable throughput of RateLimiter 46 | limit: 1024 47 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 48 | # if it can't be obtained within the specified timeout, then give up the request. 
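      # Illustrative note for the jdbc blocks below: the samples use an Oracle source; for MySQL you
      # would typically swap in something like
      #   url: "jdbc:mysql://host:3306/database"
      #   driver: "com.mysql.cj.jdbc.Driver"   # Connector/J 8.x (5.x uses com.mysql.jdbc.Driver)
      # and put the driver jar on the spark-submit --jars / --driver-class-path.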
49 | timeout: 1000 50 | } 51 | } 52 | 53 | # Processing tags 54 | tags: [ 55 | { 56 | name: tag-name-1 57 | type: { 58 | source: jdbc 59 | sink: client 60 | } 61 | url: "jdbc:oracle:thin:@host:1521:db" 62 | driver: "oracle.jdbc.driver.OracleDriver" 63 | user: "root" 64 | password: "nebula" 65 | sentence: "select oracle-field-0, oracle-field-1, oracle-field-2 from table" 66 | fields: [db-field-0, db-field-1, db-field-2] 67 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 68 | vertex: db-field-0 69 | batch: 2000 70 | partition: 60 71 | } 72 | ] 73 | 74 | # process edges 75 | edges: [ 76 | { 77 | name: edge-name-1 78 | type: { 79 | source: jdbc 80 | sink: client 81 | } 82 | url: "jdbc:oracle:thin:@host:1521:db" 83 | driver: "oracle.jdbc.driver.OracleDriver" 84 | user: "root" 85 | password: "nebula" 86 | sentence: "select db-field-0, db-field-1, db-field-2 from table" 87 | fields: [db-field-0, db-field-1, db-field-2] 88 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 89 | source: db-field-0 90 | target: db-field-1 91 | #ranking: db-filed-2 92 | batch: 2000 93 | partition: 60 94 | } 95 | ] 96 | } 97 | -------------------------------------------------------------------------------- /exchange-common/src/main/resources/config_template/json.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --total-executor-cores=60 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c json.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address:{ 21 | graph: ["127.0.0.1:9669","127.0.0.2:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta: ["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | # nebula client connection parameters 31 | connection { 32 | # socket connect & execute timeout, unit: millisecond 33 | timeout: 30000 34 | } 35 | 36 | error: { 37 | # max number of failures, if the number of failures is bigger than max, then exit the application. 38 | max: 32 39 | # failed data will be recorded in output path, format with ngql 40 | output: "hdfs://127.0.0.1:9000/tmp/errors" 41 | } 42 | 43 | # use google's RateLimiter to limit the requests send to NebulaGraph 44 | rate: { 45 | # the stable throughput of RateLimiter 46 | limit: 1024 47 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 48 | # if it can't be obtained within the specified timeout, then give up the request. 
49 | timeout: 1000 50 | } 51 | } 52 | 53 | # Processing tags 54 | tags: [ 55 | { 56 | name: tag-name 57 | type: { 58 | source: json 59 | sink: client 60 | } 61 | # if your file in not in hdfs, config "file:///path/test.json" 62 | path: "hdfs://ip:port/path/test.json" 63 | fields: [json-field-1, json-field-2, json-field-3] 64 | nebula.fields: [nebula-field-1, nebula-field-2, nebula-field-3] 65 | vertex: { 66 | field: json-field-0 67 | } 68 | batch: 2000 69 | partition: 60 70 | } 71 | ] 72 | 73 | # process edges 74 | edges: [ 75 | { 76 | name: edge-name 77 | type: { 78 | source: json 79 | sink: client 80 | } 81 | path: "hdfs://ip:port/path/test.json" 82 | fields: [json-field-2, json-field-3, json-field-4] 83 | nebula.fields: [nebula-field-1, nebula-field-2, nebula-field-3] 84 | source: { 85 | field: json-field-0 86 | } 87 | target: { 88 | field: json-field-1 89 | } 90 | #ranking: json-field-2 91 | batch: 2000 92 | partition: 60 93 | } 94 | ] 95 | } 96 | -------------------------------------------------------------------------------- /exchange-common/src/main/resources/config_template/kafka.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --executor-cores=20 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c kafka.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address:{ 21 | graph:["127.0.0.1:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta:["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | # nebula client connection parameters 31 | connection { 32 | # socket connect & execute timeout, unit: millisecond 33 | timeout: 30000 34 | } 35 | 36 | error: { 37 | # max number of failures, if the number of failures is bigger than max, then exit the application. 38 | max: 32 39 | # failed data will be recorded in output path, format with ngql 40 | output: /tmp/errors 41 | } 42 | 43 | # use google's RateLimiter to limit the requests send to NebulaGraph 44 | rate: { 45 | # the stable throughput of RateLimiter 46 | limit: 1024 47 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 48 | # if it can't be obtained within the specified timeout, then give up the request. 
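      # Note: with a streaming source like Kafka the failed-statement files under error.output are not
      # overwritten between micro-batches; ErrorHandler.save in exchange-common writes an extra
      # `<path>_append_<uuid>` file when the target path already exists, precisely for this case.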
49 | timeout: 1000 50 | } 51 | } 52 | 53 | # Processing tags 54 | tags: [ 55 | { 56 | name: tag-name-1 57 | type: { 58 | source: kafka 59 | sink: client 60 | } 61 | service: "kafka.service.address" 62 | topic: "topic-name" 63 | fields: [kafka-field-0, kafka-field-1, kafka-field-2] 64 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 65 | vertex: kafka-field-0 66 | batch: 2000 67 | partition: 60 68 | } 69 | ] 70 | 71 | # process edges 72 | edges: [ 73 | { 74 | name: edge-name-1 75 | type: { 76 | source: kafka 77 | sink: client 78 | } 79 | service: "kafka.service.address" 80 | topic: "topic-name" 81 | fields: [ kafka-field-3, kafka-field-4, kafka-field-5] 82 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 83 | source: kafka-field-0 84 | target: kafka-field-1 85 | #ranking: kafka-filed-2 86 | batch: 2000 87 | partition: 60 88 | } 89 | ] 90 | } 91 | -------------------------------------------------------------------------------- /exchange-common/src/main/resources/config_template/neo4j.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --executor-cores=20 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c neo4j.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address:{ 21 | graph:["127.0.0.1:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta:["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | # nebula client connection parameters 31 | connection { 32 | # socket connect & execute timeout, unit: millisecond 33 | timeout: 30000 34 | } 35 | 36 | error: { 37 | # max number of failures, if the number of failures is bigger than max, then exit the application. 38 | max: 32 39 | # failed data will be recorded in output path, format with ngql 40 | output: /tmp/errors 41 | } 42 | 43 | # use google's RateLimiter to limit the requests send to NebulaGraph 44 | rate: { 45 | # the stable throughput of RateLimiter 46 | limit: 1024 47 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 48 | # if it can't be obtained within the specified timeout, then give up the request. 
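      # Note: the sample Cypher in the tag/edge blocks below ends with `order by id(r)`; keeping a
      # deterministic ORDER BY is what allows the Neo4j checkpoint/resume support (CheckPointHandler in
      # exchange-common) to skip rows already imported. That reading of the offset mechanism is an
      # assumption; verify it against the Neo4j reader of your Exchange version.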
49 | timeout: 1000 50 | } 51 | } 52 | 53 | # Processing tags 54 | tags: [ 55 | { 56 | name: tag-name-1 57 | type: { 58 | source: neo4j 59 | sink: client 60 | } 61 | server: "bolt://127.0.0.1:7687" 62 | user: neo4j 63 | password: neo4j 64 | exec: "match (a:vertex_label)-[r:edge_label]->(b:vertex_label) return a.neo4j-source-field, b.neo4j-target-field, r.neo4j-field-0 as neo4j-field-0, r.neo4j-field-1 as neo4j-field-1 order by id(r)" 65 | fields: [neo4j-field-0, neo4j-field-1, neo4j-field-2] 66 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 67 | vertex: neo4j-field-0 68 | batch: 2000 69 | partition: 60 70 | } 71 | ] 72 | 73 | # process edges 74 | edges: [ 75 | { 76 | name: edge-name-1 77 | type: { 78 | source: neo4j 79 | sink: client 80 | } 81 | server: "bolt://127.0.0.1:7687" 82 | user: neo4j 83 | password: neo4j 84 | exec: "match (a:vertex_label)-[r:edge_label]->(b:vertex_label) return a.neo4j-source-field, b.neo4j-target-field, r.neo4j-field-0 as neo4j-field-0, r.neo4j-field-1 as neo4j-field-1 order by id(r)" 85 | fields: [ neo4j-field-0, neo4j-field-1, neo4j-field-2] 86 | nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] 87 | source: neo4j-field-0 88 | target: neo4j-field-1 89 | #ranking: neo4j-filed-2 90 | batch: 2000 91 | partition: 60 92 | } 93 | ] 94 | } 95 | -------------------------------------------------------------------------------- /exchange-common/src/main/resources/config_template/orc.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --total-executor-cores=60 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c orc.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address:{ 21 | graph: ["127.0.0.1:9669","127.0.0.2:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta: ["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | # nebula client connection parameters 31 | connection { 32 | # socket connect & execute timeout, unit: millisecond 33 | timeout: 30000 34 | } 35 | 36 | error: { 37 | # max number of failures, if the number of failures is bigger than max, then exit the application. 38 | max: 32 39 | # failed data will be recorded in output path, format with ngql 40 | output: "hdfs://127.0.0.1:9000/tmp/errors" 41 | } 42 | 43 | # use google's RateLimiter to limit the requests send to NebulaGraph 44 | rate: { 45 | # the stable throughput of RateLimiter 46 | limit: 1024 47 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 48 | # if it can't be obtained within the specified timeout, then give up the request. 
49 | timeout: 1000 50 | } 51 | } 52 | 53 | # Processing tags 54 | tags: [ 55 | { 56 | name: tag-name 57 | type: { 58 | source: orc 59 | sink: client 60 | } 61 | # if your file in not in hdfs, config "file:///path/test.orc" 62 | path: "hdfs://ip:port/path/test.orc" 63 | fields: [orc-field-1, orc-field-2, orc-field-3] 64 | nebula.fields: [nebula-field-1, nebula-field-2, nebula-field-3] 65 | vertex: { 66 | field: orc-field-0 67 | } 68 | batch: 2000 69 | partition: 60 70 | } 71 | ] 72 | 73 | # process edges 74 | edges: [ 75 | { 76 | name: edge-name 77 | type: { 78 | source: orc 79 | sink: client 80 | } 81 | path: "hdfs://ip:port/path/test.orc" 82 | fields: [orc-field-2, orc-field-3, orc-field-4] 83 | nebula.fields: [nebula-field-1, nebula-field-2, nebula-field-3] 84 | source: { 85 | field: orc-field-0 86 | } 87 | target: { 88 | field: orc-field-1 89 | } 90 | #ranking: orc-field-2 91 | batch: 2000 92 | partition: 60 93 | } 94 | ] 95 | } 96 | -------------------------------------------------------------------------------- /exchange-common/src/main/resources/config_template/parquet.conf: -------------------------------------------------------------------------------- 1 | # Use the command to submit the exchange job: 2 | 3 | # spark-submit \ 4 | # --master "spark://master_ip:7077" \ 5 | # --driver-memory=2G --executor-memory=30G \ 6 | # --num-executors=3 --total-executor-cores=60 \ 7 | # --class com.vesoft.nebula.exchange.Exchange \ 8 | # nebula-exchange_spark_2.4-3.0-SNAPSHOT.jar -c parquet.conf 9 | 10 | { 11 | # Spark config 12 | spark: { 13 | app: { 14 | name: NebulaGraph Exchange 15 | } 16 | } 17 | 18 | # Nebula Graph config 19 | nebula: { 20 | address:{ 21 | graph: ["127.0.0.1:9669","127.0.0.2:9669"] 22 | # if your NebulaGraph server is in virtual network like k8s, please config the leader address of meta. 23 | # use `SHOW meta leader` to see your meta leader's address 24 | meta: ["127.0.0.1:9559"] 25 | } 26 | user: root 27 | pswd: nebula 28 | space: test 29 | 30 | # nebula client connection parameters 31 | connection { 32 | # socket connect & execute timeout, unit: millisecond 33 | timeout: 30000 34 | } 35 | 36 | error: { 37 | # max number of failures, if the number of failures is bigger than max, then exit the application. 38 | max: 32 39 | # failed data will be recorded in output path, format with ngql 40 | output: "hdfs://127.0.0.1:9000/tmp/errors" 41 | } 42 | 43 | # use google's RateLimiter to limit the requests send to NebulaGraph 44 | rate: { 45 | # the stable throughput of RateLimiter 46 | limit: 1024 47 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 48 | # if it can't be obtained within the specified timeout, then give up the request. 
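      # Note for this parquet template: in the tag/edge blocks below, set type.source to parquet and
      # point path at parquet files, e.g. (illustrative)
      #   source: parquet
      #   path: "hdfs://ip:port/path/test.parquet"
      # the field names shown are only placeholders.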
49 | timeout: 1000 50 | } 51 | } 52 | 53 | # Processing tags 54 | tags: [ 55 | { 56 | name: tag-name 57 | type: { 58 | source: orc 59 | sink: client 60 | } 61 | # if your file in not in hdfs, config "file:///path/test.orc" 62 | path: "hdfs://ip:port/path/test.orc" 63 | fields: [orc-field-1, orc-field-2, orc-field-3] 64 | nebula.fields: [nebula-field-1, nebula-field-2, nebula-field-3] 65 | vertex: { 66 | field: orc-field-0 67 | } 68 | batch: 2000 69 | partition: 60 70 | } 71 | ] 72 | 73 | # process edges 74 | edges: [ 75 | { 76 | name: edge-name 77 | type: { 78 | source: orc 79 | sink: client 80 | } 81 | path: "hdfs://ip:port/path/test.orc" 82 | fields: [orc-field-2, orc-field-3, orc-field-4] 83 | nebula.fields: [nebula-field-1, nebula-field-2, nebula-field-3] 84 | source: { 85 | field: orc-field-0 86 | } 87 | target: { 88 | field: orc-field-1 89 | } 90 | #ranking: orc-field-2 91 | batch: 2000 92 | partition: 60 93 | } 94 | ] 95 | } 96 | -------------------------------------------------------------------------------- /exchange-common/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Global logging configuration 2 | log4j.rootLogger=INFO, stdout 3 | # Console output... 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.stdout.layout.ConversionPattern=%5p [%t] - %m%n 7 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/CheckPointHandler.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange.common 7 | 8 | import com.vesoft.exchange.common.config.{SchemaConfigEntry, SourceCategory} 9 | import com.vesoft.exchange.common.utils.HDFSUtils 10 | import org.apache.spark.TaskContext 11 | 12 | /** 13 | * CheckPointHandler handle the checkpoint files for Neo4j and Janusgraph 14 | */ 15 | object CheckPointHandler { 16 | 17 | def checkSupportResume(value: SourceCategory.Value): Boolean = { 18 | value match { 19 | case SourceCategory.NEO4J => true 20 | case SourceCategory.JANUS_GRAPH => true 21 | case _ => false 22 | } 23 | } 24 | 25 | def getPathAndOffset(schemaConfig: SchemaConfigEntry, 26 | breakPointCount: Long): Option[(String, Long)] = { 27 | val partitionId = TaskContext.getPartitionId() 28 | if (checkSupportResume(schemaConfig.dataSourceConfigEntry.category) && schemaConfig.checkPointPath.isDefined) { 29 | val path = s"${schemaConfig.checkPointPath.get}/${schemaConfig.name}.${partitionId}" 30 | val offset = breakPointCount + fetchOffset(path) 31 | Some((path, offset)) 32 | } else { 33 | None 34 | } 35 | } 36 | 37 | def fetchOffset(path: String): Long = { 38 | HDFSUtils.getContent(path).toLong 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/ErrorHandler.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
4 | */ 5 | 6 | package com.vesoft.exchange.common 7 | 8 | import org.apache.hadoop.conf.Configuration 9 | import org.apache.hadoop.fs.{FileSystem, Path} 10 | import org.apache.log4j.Logger 11 | 12 | import java.util.UUID 13 | import scala.collection.mutable.ArrayBuffer 14 | 15 | object ErrorHandler { 16 | @transient 17 | private[this] val LOG = Logger.getLogger(this.getClass) 18 | 19 | /** 20 | * clean all the failed data for error path before reload. 21 | * 22 | * @param path path to clean 23 | */ 24 | def clear(path: String): Unit = { 25 | try { 26 | val fileSystem = FileSystem.get(new Configuration()) 27 | val filesStatus = fileSystem.listStatus(new Path(path)) 28 | for (file <- filesStatus) { 29 | if (!file.getPath.getName.startsWith("reload.")) { 30 | fileSystem.delete(file.getPath, true) 31 | } 32 | } 33 | } catch { 34 | case e: Throwable => { 35 | LOG.error(s"$path cannot be clean, but this error does not affect the import result, " + 36 | s"you can only focus on the reload files.", 37 | e) 38 | } 39 | } 40 | } 41 | 42 | /** 43 | * save the failed execute statement. 44 | * 45 | * @param buffer buffer saved failed ngql 46 | * @param path path to write these buffer ngql 47 | */ 48 | def save(buffer: ArrayBuffer[String], path: String): Unit = { 49 | val targetPath = new Path(path) 50 | val fileSystem = targetPath.getFileSystem(new Configuration()) 51 | val errors = if (fileSystem.exists(targetPath)) { 52 | val newPath = s"${path}_append_${UUID.randomUUID().toString}" 53 | LOG.info(s"create reload path $newPath") 54 | // For kafka, the error ngql need to append to a same file instead of overwrite 55 | fileSystem.create(new Path(newPath)) 56 | } else { 57 | LOG.info(s"create reload path $path") 58 | fileSystem.create(targetPath) 59 | } 60 | 61 | try { 62 | for (error <- buffer) { 63 | errors.write(error.getBytes) 64 | errors.writeBytes("\n") 65 | } 66 | } finally { 67 | errors.close() 68 | } 69 | } 70 | 71 | /** 72 | * check if path exists 73 | * 74 | * @param path error path 75 | * @return true if path exists 76 | */ 77 | def existError(path: String): Boolean = { 78 | val errorPath = new Path(path) 79 | val fileSystem = errorPath.getFileSystem(new Configuration()) 80 | fileSystem.exists(new Path(path)) 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/GenerateConfigTemplate.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2023 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
4 | */ 5 | 6 | package com.vesoft.exchange.common 7 | 8 | import com.vesoft.exchange.common.config.SourceCategory 9 | import org.apache.commons.cli.{ 10 | CommandLine, 11 | CommandLineParser, 12 | HelpFormatter, 13 | Option, 14 | Options, 15 | ParseException, 16 | PosixParser 17 | } 18 | 19 | object GenerateConfigTemplate { 20 | 21 | def main(args: Array[String]): Unit = { 22 | val sourceOption = new Option("s", "dataSource", true, "data source type") 23 | sourceOption.setRequired(true) 24 | 25 | val pathOption = new Option("p", "path", true, "target path to save the template config file") 26 | pathOption.setRequired(true) 27 | 28 | val options = new Options 29 | options.addOption(sourceOption) 30 | options.addOption(pathOption) 31 | 32 | var cli: CommandLine = null 33 | val cliParser: CommandLineParser = new PosixParser() 34 | val helpFormatter = new HelpFormatter 35 | try { 36 | cli = cliParser.parse(options, args) 37 | } catch { 38 | case e: ParseException => 39 | helpFormatter.printHelp(">>>> options", options) 40 | e.printStackTrace() 41 | System.exit(1) 42 | } 43 | val source: String = cli.getOptionValue("s") 44 | val path: String = cli.getOptionValue("p") 45 | 46 | getConfigTemplate(source, path) 47 | } 48 | 49 | def getConfigTemplate(source: String, path: String): Unit = { 50 | val sourceCategory = SourceCategory.withName(source.trim.toUpperCase) 51 | 52 | val fileMigrate = new FileMigrate 53 | sourceCategory match { 54 | case SourceCategory.CSV => 55 | fileMigrate.saveConfig("config_template/csv.conf", path + "/csv.conf") 56 | case SourceCategory.JSON => 57 | fileMigrate.saveConfig("config_template/json.conf", path + "/json.conf") 58 | case SourceCategory.ORC => 59 | fileMigrate.saveConfig("config_template/orc.conf", path + "/orc.conf") 60 | case SourceCategory.PARQUET => 61 | fileMigrate.saveConfig("config_template/parquet.conf", path + "/parquet.conf") 62 | case SourceCategory.HIVE => 63 | fileMigrate.saveConfig("config_template/hive.conf", path + "/hive.conf") 64 | case SourceCategory.JDBC | SourceCategory.MYSQL | SourceCategory.CLICKHOUSE | 65 | SourceCategory.MAXCOMPUTE | SourceCategory.ORC | SourceCategory.POSTGRESQL => 66 | fileMigrate.saveConfig("config_template/jdbc.conf", path + "/jdbc.conf") 67 | case SourceCategory.NEO4J => 68 | fileMigrate.saveConfig("config_template/neo4j.conf", path + "/neo4j.conf") 69 | case _ => throw new IllegalArgumentException(s"does not support datasource $sourceCategory") 70 | } 71 | } 72 | 73 | } 74 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/GraphProvider.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
4 | */ 5 | 6 | package com.vesoft.exchange.common 7 | 8 | import com.google.common.net.HostAndPort 9 | import com.vesoft.exchange.common.config.{SslConfigEntry, SslType, UserConfigEntry} 10 | import com.vesoft.nebula.client.graph.NebulaPoolConfig 11 | import com.vesoft.nebula.client.graph.data.{ 12 | CASignedSSLParam, 13 | HostAddress, 14 | ResultSet, 15 | SSLParam, 16 | SelfSignedSSLParam 17 | } 18 | import com.vesoft.nebula.client.graph.net.{NebulaPool, Session} 19 | import org.apache.log4j.Logger 20 | 21 | import scala.collection.JavaConverters._ 22 | import scala.collection.mutable.ListBuffer 23 | 24 | /** 25 | * GraphProvider for Nebula Graph Service 26 | */ 27 | class GraphProvider(addresses: List[HostAddress], 28 | timeout: Int, 29 | sslConfigEntry: SslConfigEntry) 30 | extends AutoCloseable 31 | with Serializable { 32 | private[this] lazy val LOG = Logger.getLogger(this.getClass) 33 | 34 | @transient val nebulaPoolConfig = new NebulaPoolConfig 35 | @transient val pool: NebulaPool = new NebulaPool 36 | val randAddr = scala.util.Random.shuffle(addresses) 37 | 38 | nebulaPoolConfig.setTimeout(timeout) 39 | 40 | // com.vesoft.exchange.common.config graph ssl 41 | nebulaPoolConfig.setEnableSsl(sslConfigEntry.enableGraph) 42 | if (sslConfigEntry.enableGraph) { 43 | var sslParam: SSLParam = null 44 | if (sslConfigEntry.signType == SslType.CA) { 45 | val ca = sslConfigEntry.caSignParam 46 | sslParam = new CASignedSSLParam(ca.caCrtFilePath, ca.crtFilePath, ca.keyFilePath) 47 | } else { 48 | val self = sslConfigEntry.selfSignParam 49 | sslParam = new SelfSignedSSLParam(self.crtFilePath, self.keyFilePath, self.password) 50 | } 51 | nebulaPoolConfig.setSslParam(sslParam) 52 | } 53 | 54 | pool.init(randAddr.asJava, nebulaPoolConfig) 55 | 56 | def getGraphClient(userConfigEntry: UserConfigEntry): Session = { 57 | pool.getSession(userConfigEntry.user, userConfigEntry.password, true); 58 | } 59 | 60 | def releaseGraphClient(session: Session): Unit = { 61 | session.release() 62 | } 63 | 64 | override def close(): Unit = { 65 | pool.close() 66 | } 67 | 68 | def switchSpace(session: Session, space: String): (HostAddress, ResultSet) = { 69 | val switchStatment = s"use $space" 70 | LOG.info(s">>>>>> switch space $space") 71 | val result = submit(session, switchStatment) 72 | result 73 | } 74 | 75 | def submit(session: Session, statement: String): (HostAddress, ResultSet) = { 76 | val result = session.execute(statement) 77 | (session.getGraphHost, result) 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/MetaProvider.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
4 | */ 5 | 6 | package com.vesoft.exchange.common 7 | 8 | import com.google.common.net.HostAndPort 9 | import com.vesoft.exchange.common.config.{SslConfigEntry, SslType, Type} 10 | import com.vesoft.nebula.PropertyType 11 | import com.vesoft.nebula.client.graph.data.{ 12 | CASignedSSLParam, 13 | HostAddress, 14 | SSLParam, 15 | SelfSignedSSLParam 16 | } 17 | import com.vesoft.nebula.client.meta.MetaClient 18 | import com.vesoft.nebula.meta.{EdgeItem, TagItem} 19 | import org.apache.log4j.Logger 20 | 21 | import scala.collection.JavaConverters._ 22 | import scala.collection.mutable 23 | import scala.collection.mutable.ListBuffer 24 | 25 | /** 26 | * MetaProvider provide nebula graph meta query operations. 27 | */ 28 | class MetaProvider(addresses: List[HostAddress], 29 | timeout: Int, 30 | retry: Int, 31 | sslConfigEntry: SslConfigEntry) 32 | extends AutoCloseable 33 | with Serializable { 34 | private[this] lazy val LOG = Logger.getLogger(this.getClass) 35 | 36 | private var metaClient: MetaClient = null 37 | var sslParam: SSLParam = null 38 | // com.vesoft.exchange.common.config meta ssl 39 | if (sslConfigEntry.enableMeta) { 40 | if (sslConfigEntry.signType == SslType.CA) { 41 | val ca = sslConfigEntry.caSignParam 42 | sslParam = new CASignedSSLParam(ca.caCrtFilePath, ca.crtFilePath, ca.keyFilePath) 43 | } else { 44 | val self = sslConfigEntry.selfSignParam 45 | sslParam = new SelfSignedSSLParam(self.crtFilePath, self.keyFilePath, self.password) 46 | } 47 | metaClient = new MetaClient(addresses.asJava, timeout, retry, retry, true, sslParam) 48 | } else { 49 | metaClient = new MetaClient(addresses.asJava, timeout, retry, retry) 50 | } 51 | 52 | metaClient.connect() 53 | 54 | def getPartNumber(space: String): Int = { 55 | metaClient.getPartsAlloc(space).size() 56 | } 57 | 58 | def getVidType(space: String): VidType.Value = { 59 | val vidType = metaClient.getSpace(space).getProperties.getVid_type.getType 60 | if (vidType == PropertyType.FIXED_STRING) { 61 | return VidType.STRING 62 | } 63 | VidType.INT 64 | } 65 | 66 | def getTagSchema(space: String, tag: String): Map[String, Integer] = { 67 | val tagSchema = metaClient.getTag(space, tag) 68 | val schema = new mutable.HashMap[String, Integer] 69 | 70 | val columns = tagSchema.getColumns 71 | for (colDef <- columns.asScala) { 72 | schema.put(new String(colDef.getName), colDef.getType.getType.getValue) 73 | } 74 | schema.toMap 75 | } 76 | 77 | def getEdgeSchema(space: String, edge: String): Map[String, Integer] = { 78 | val edgeSchema = metaClient.getEdge(space, edge) 79 | val schema = new mutable.HashMap[String, Integer] 80 | 81 | val columns = edgeSchema.getColumns 82 | for (colDef <- columns.asScala) { 83 | schema.put(new String(colDef.getName), colDef.getType.getType.getValue) 84 | } 85 | schema.toMap 86 | } 87 | 88 | def getLabelType(space: String, label: String): Type.Value = { 89 | val tags = metaClient.getTags(space) 90 | for (tag <- tags.asScala) { 91 | if (new String(tag.getTag_name).equals(label)) { 92 | return Type.VERTEX 93 | } 94 | } 95 | val edges = metaClient.getEdges(space) 96 | for (edge <- edges.asScala) { 97 | if (new String(edge.getEdge_name).equals(label)) { 98 | return Type.EDGE 99 | } 100 | } 101 | null 102 | } 103 | 104 | def getSpaceVidLen(space: String): Int = { 105 | val spaceItem = metaClient.getSpace(space); 106 | if (spaceItem == null) { 107 | throw new IllegalArgumentException(s"space $space does not exist.") 108 | } 109 | spaceItem.getProperties.getVid_type.getType_length 110 | } 111 | 112 | def 
getTagItem(space: String, tag: String): TagItem = { 113 | val tagItemList = metaClient.getTags(space).asScala 114 | for (tagItem: TagItem <- tagItemList) { 115 | if (new String(tagItem.tag_name).equals(tag)) { 116 | return tagItem 117 | } 118 | } 119 | throw new IllegalArgumentException(s"tag ${space}.${tag} does not exist.") 120 | } 121 | 122 | def getEdgeItem(space: String, edge: String): EdgeItem = { 123 | val edgeItemList = metaClient.getEdges(space).asScala 124 | for (edgeItem: EdgeItem <- edgeItemList) { 125 | if (new String(edgeItem.edge_name).equals(edge)) { 126 | return edgeItem 127 | } 128 | } 129 | throw new IllegalArgumentException(s"edge ${space}.${edge} does not exist.") 130 | } 131 | 132 | override def close(): Unit = { 133 | metaClient.close() 134 | } 135 | 136 | } 137 | 138 | object VidType extends Enumeration { 139 | type Type = Value 140 | 141 | val STRING = Value("STRING") 142 | val INT = Value("INT") 143 | } 144 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/Package.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange 7 | 8 | import com.google.common.base.Optional 9 | import com.google.common.util.concurrent.ListenableFuture 10 | import com.vesoft.exchange.common.utils.NebulaUtils 11 | 12 | import scala.collection.mutable.ListBuffer 13 | 14 | package object common { 15 | 16 | type GraphSpaceID = Int 17 | type PartitionID = Int 18 | type TagID = Int 19 | type EdgeType = Int 20 | type SchemaID = (TagID, EdgeType) 21 | type TagVersion = Long 22 | type EdgeVersion = Long 23 | type SchemaVersion = (TagVersion, EdgeVersion) 24 | type VertexID = Long 25 | type VertexIDSlice = String 26 | type EdgeRank = Long 27 | type PropertyNames = List[String] 28 | type PropertyValues = List[Any] 29 | type ProcessResult = ListBuffer[WriterResult] 30 | type WriterResult = ListenableFuture[Optional[Integer]] 31 | 32 | case class Vertex(vertexID: VertexIDSlice, values: PropertyValues) { 33 | 34 | def propertyValues = values.mkString(", ") 35 | 36 | override def toString: String = { 37 | s"Vertex ID: ${vertexID}, " + 38 | s"Values: ${values.mkString(", ")}" 39 | } 40 | } 41 | 42 | case class Vertices(names: PropertyNames, 43 | values: List[Vertex], 44 | policy: Option[KeyPolicy.Value] = None) { 45 | 46 | def propertyNames: String = NebulaUtils.escapePropName(names).mkString(",") 47 | 48 | override def toString: String = { 49 | s"Vertices: " + 50 | s"Property Names: ${names.mkString(", ")}" + 51 | s"Vertex Values: ${values.mkString(", ")} " + 52 | s"with policy ${policy}" 53 | } 54 | } 55 | 56 | case class Edge(source: VertexIDSlice, 57 | destination: VertexIDSlice, 58 | ranking: Option[EdgeRank], 59 | values: PropertyValues) { 60 | 61 | def this(source: VertexIDSlice, destination: VertexIDSlice, values: PropertyValues) = { 62 | this(source, destination, None, values) 63 | } 64 | 65 | def propertyValues: String = values.mkString(", ") 66 | 67 | override def toString: String = { 68 | val rank = if (ranking.isEmpty) 0 else ranking.get 69 | s"Edge: ${source}->${destination}@${rank} values: ${propertyValues}" 70 | } 71 | } 72 | 73 | case class Edges(names: PropertyNames, 74 | values: List[Edge], 75 | sourcePolicy: Option[KeyPolicy.Value] = None, 76 | targetPolicy: Option[KeyPolicy.Value] = None) { 77 | 
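  // Sketch of how this container is consumed (hedged; the exact statement text lives in
  // ServerBaseWriter): an Edges batch groups Edge values sharing one `names` list and is rendered into
  // a single statement roughly of the form
  //   INSERT EDGE `edge_name`(`prop1`, `prop2`) VALUES "src"->"dst"@rank:(v1, v2), ...
  // with sourcePolicy/targetPolicy optionally wrapping the ids in hash()/uuid() (see KeyPolicy below).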
def propertyNames: String = NebulaUtils.escapePropName(names).mkString(",") 78 | 79 | override def toString: String = { 80 | "Edges:" + 81 | s"Property Names: ${names.mkString(", ")}" + 82 | s"with source policy ${sourcePolicy}" + 83 | s"with target policy ${targetPolicy}" 84 | } 85 | } 86 | 87 | object KeyPolicy extends Enumeration { 88 | type POLICY = Value 89 | val HASH = Value("hash") 90 | val UUID = Value("uuid") 91 | } 92 | 93 | case class Offset(start: Long, size: Long) 94 | } 95 | 96 | final case class Argument(config: String = "application.conf", 97 | hive: Boolean = false, 98 | directly: Boolean = false, 99 | dry: Boolean = false, 100 | reload: String = "", 101 | variable: Boolean = false, 102 | param: String = "") 103 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/PasswordEncryption.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2023 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange.common 7 | 8 | import com.vesoft.exchange.Argument 9 | 10 | import java.security.spec.{PKCS8EncodedKeySpec, X509EncodedKeySpec} 11 | import java.security.{KeyFactory, KeyPairGenerator, SecureRandom} 12 | import java.util.Base64 13 | import javax.crypto.Cipher 14 | import javax.crypto.spec.SecretKeySpec 15 | 16 | object PasswordEncryption { 17 | private val algorithm = "RSA" 18 | private val charset = "UTF-8" 19 | 20 | def main(args: Array[String]): Unit = { 21 | val passwdOption = new scopt.OptionParser[PasswordConfig]("encrypt password") { 22 | head("encrypt password") 23 | 24 | opt[String]('p', "passwd") 25 | .required() 26 | .valueName("passwd") 27 | .action((x, c) => c.copy(password = x)) 28 | .text("your real password") 29 | }.parse(args, PasswordConfig()) 30 | 31 | require(passwdOption.isDefined && passwdOption.get.password != null, "lack of password parameter") 32 | 33 | val password:String = passwdOption.get.password 34 | 35 | val (encryptedPasswd, privateKey) = encryptPassword(password) 36 | println(s"=================== private key begin ===================") 37 | println(privateKey) 38 | println(s"=================== private key end ===================\n\n") 39 | 40 | println(s"=================== encrypted password begin ===================") 41 | println(encryptedPasswd) 42 | println(s"=================== encrypted password end ===================") 43 | 44 | println(s"check: the real password decrypted by private key and encrypted password is: ${decryptPassword(encryptedPasswd, privateKey)}") 45 | } 46 | 47 | /** 48 | * encrypt the password 49 | * 50 | * @param password real password 51 | * @return (encryptedPasswd, privateKey) 52 | */ 53 | def encryptPassword(password: String): (String, String) = { 54 | val keyPairGenerator = KeyPairGenerator.getInstance(algorithm) 55 | keyPairGenerator.initialize(1024, new SecureRandom()) 56 | val keyPair = keyPairGenerator.generateKeyPair() 57 | val privateKey = keyPair.getPrivate 58 | val privateKeyStr = new String(Base64.getEncoder.encode(privateKey.getEncoded), charset) 59 | val publicKey = keyPair.getPublic 60 | val publicKeyStr = new String(Base64.getEncoder.encode(publicKey.getEncoded), charset) 61 | println(s"=================== public key begin ===================") 62 | println(publicKeyStr) 63 | println(s"=================== public key end ===================\n\n") 64 | 65 | // encrypt the 
password 66 | val encoded = Base64.getDecoder.decode(publicKeyStr) 67 | val rsaPublicKey = KeyFactory.getInstance(algorithm).generatePublic(new X509EncodedKeySpec(encoded)) 68 | val cipher = Cipher.getInstance(algorithm) 69 | cipher.init(Cipher.ENCRYPT_MODE, rsaPublicKey) 70 | val encodePasswd = new String(Base64.getEncoder.encode(cipher.doFinal(password.getBytes(charset))), charset) 71 | (encodePasswd, privateKeyStr) 72 | } 73 | 74 | /** 75 | * decrypt the encrypted password with private key 76 | * 77 | * @param encryptedPassword encrypted password 78 | * @param privateKey rsa private key 79 | * @return real password 80 | */ 81 | def decryptPassword(encryptedPassword: String, privateKey: String): String = { 82 | val encryptedPasswdBytes = Base64.getDecoder.decode(encryptedPassword) 83 | val decodedPrivateKey = Base64.getDecoder.decode(privateKey) 84 | val rsaPrivateKey = KeyFactory.getInstance(algorithm).generatePrivate(new PKCS8EncodedKeySpec(decodedPrivateKey)) 85 | val cipher = Cipher.getInstance(algorithm) 86 | cipher.init(Cipher.DECRYPT_MODE, rsaPrivateKey) 87 | val password = new String(cipher.doFinal(encryptedPasswdBytes), charset) 88 | password 89 | } 90 | 91 | 92 | } 93 | 94 | final case class PasswordConfig(password: String = null) 95 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/config/SchemaConfigs.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange.common.config 7 | 8 | import com.vesoft.exchange.common.KeyPolicy 9 | 10 | /** 11 | * SchemaConfigEntry is tag/edge super class use to save some basic parameter for importer. 
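 * Two concrete implementations follow in this file: TagConfigEntry for vertex import and
 * EdgeConfigEntry for edge import.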
12 | */ 13 | sealed trait SchemaConfigEntry { 14 | 15 | /** nebula tag or edge name */ 16 | def name: String 17 | 18 | /** see{@link DataSourceConfigEntry}*/ 19 | def dataSourceConfigEntry: DataSourceConfigEntry 20 | 21 | /** see{@link DataSinkConfigEntry}*/ 22 | def dataSinkConfigEntry: DataSinkConfigEntry 23 | 24 | /** data source fields which are going to be import to nebula as properties */ 25 | def fields: List[String] 26 | 27 | /** nebula properties which are going to fill value with data source value*/ 28 | def nebulaFields: List[String] 29 | 30 | /** vertex or edge amount of one batch import */ 31 | def batch: Int 32 | 33 | /** spark partition */ 34 | def partition: Int 35 | 36 | /** check point path */ 37 | def checkPointPath: Option[String] 38 | 39 | /** write mode */ 40 | def writeMode: WriteMode.Mode 41 | } 42 | 43 | /** 44 | * 45 | * @param name 46 | * @param dataSourceConfigEntry 47 | * @param dataSinkConfigEntry 48 | * @param fields 49 | * @param nebulaFields 50 | * @param vertexField 51 | * @param vertexPolicy 52 | * @param batch 53 | * @param partition 54 | * @param checkPointPath 55 | */ 56 | case class TagConfigEntry(override val name: String, 57 | override val dataSourceConfigEntry: DataSourceConfigEntry, 58 | override val dataSinkConfigEntry: DataSinkConfigEntry, 59 | override val fields: List[String], 60 | override val nebulaFields: List[String], 61 | override val writeMode: WriteMode.Mode, 62 | vertexField: String, 63 | vertexPolicy: Option[KeyPolicy.Value], 64 | vertexPrefix: String, 65 | override val batch: Int, 66 | override val partition: Int, 67 | override val checkPointPath: Option[String], 68 | repartitionWithNebula: Boolean = true, 69 | enableTagless: Boolean = false, 70 | ignoreIndex: Boolean = false, 71 | deleteEdge: Boolean = false, 72 | vertexUdf: Option[UdfConfigEntry] = None, 73 | filterConfig: Option[FilterConfigEntry] = None) 74 | extends SchemaConfigEntry { 75 | require(name.trim.nonEmpty, "tag name cannot be empty") 76 | require(vertexField.trim.nonEmpty, "tag vertex id cannot be empty") 77 | require(batch > 0, "batch config must be larger than 0") 78 | require(fields.size == nebulaFields.size, 79 | "fields and nebula.fields must have the same element number") 80 | 81 | override def toString: String = { 82 | s"Tag name: $name, " + 83 | s"source: $dataSourceConfigEntry, " + 84 | s"sink: $dataSinkConfigEntry, " + 85 | s"writeMode: $writeMode, " + 86 | s"vertex field: $vertexField, " + 87 | s"vertex policy: $vertexPolicy, " + 88 | s"batch: $batch, " + 89 | s"partition: $partition, " + 90 | s"repartitionWithNebula: $repartitionWithNebula, " + 91 | s"enableTagless: $enableTagless, " + 92 | s"ignoreIndex: $ignoreIndex, " + 93 | s"vertexUdf: $vertexUdf, " + 94 | s"filter: $filterConfig." 
95 | } 96 | } 97 | 98 | /** 99 | * 100 | * @param name 101 | * @param dataSourceConfigEntry 102 | * @param dataSinkConfigEntry 103 | * @param fields 104 | * @param nebulaFields 105 | * @param sourceField 106 | * @param sourcePolicy 107 | * @param rankingField 108 | * @param targetField 109 | * @param targetPolicy 110 | * @param isGeo 111 | * @param latitude 112 | * @param longitude 113 | * @param batch 114 | * @param partition 115 | * @param checkPointPath 116 | */ 117 | case class EdgeConfigEntry(override val name: String, 118 | override val dataSourceConfigEntry: DataSourceConfigEntry, 119 | override val dataSinkConfigEntry: DataSinkConfigEntry, 120 | override val fields: List[String], 121 | override val nebulaFields: List[String], 122 | override val writeMode: WriteMode.Mode, 123 | sourceField: String, 124 | sourcePolicy: Option[KeyPolicy.Value], 125 | sourcePrefix: String, 126 | rankingField: Option[String], 127 | targetField: String, 128 | targetPolicy: Option[KeyPolicy.Value], 129 | targetPrefix: String, 130 | isGeo: Boolean, 131 | latitude: Option[String], 132 | longitude: Option[String], 133 | override val batch: Int, 134 | override val partition: Int, 135 | override val checkPointPath: Option[String], 136 | repartitionWithNebula: Boolean = false, 137 | ignoreIndex: Boolean = false, 138 | srcVertexUdf: Option[UdfConfigEntry] = None, 139 | dstVertexUdf: Option[UdfConfigEntry] = None, 140 | filterConfig: Option[FilterConfigEntry] = None) 141 | extends SchemaConfigEntry { 142 | require(name.trim.nonEmpty, "edge name cannot be empty") 143 | require(sourceField.trim.nonEmpty, "edge source id cannot be empty") 144 | require(targetField.trim.nonEmpty, "edge target id cannot be empty") 145 | require(batch > 0, "batch config must be larger than 0") 146 | require(fields.size == nebulaFields.size, 147 | "fields and nebula.fields must have the same element number") 148 | 149 | override def toString: String = { 150 | if (isGeo) { 151 | s"Edge name: $name, " + 152 | s"source: $dataSourceConfigEntry, " + 153 | s"sink: $dataSinkConfigEntry, " + 154 | s"writeMode: $writeMode, " + 155 | s"latitude: $latitude, " + 156 | s"longitude: $longitude, " + 157 | s"source field: $sourceField, " + 158 | s"source policy: $sourcePolicy, " + 159 | s"ranking: $rankingField, " + 160 | s"target field: $targetField, " + 161 | s"target policy: $targetPolicy, " + 162 | s"batch: $batch, " + 163 | s"partition: $partition, " + 164 | s"ignoreIndex: $ignoreIndex, " + 165 | s"srcVertexUdf: $srcVertexUdf" + 166 | s"dstVertexUdf: $dstVertexUdf." 167 | } else { 168 | s"Edge name: $name, " + 169 | s"source: $dataSourceConfigEntry, " + 170 | s"sink: $dataSinkConfigEntry, " + 171 | s"writeMode: $writeMode, " + 172 | s"source field: $sourceField, " + 173 | s"source policy: $sourcePolicy, " + 174 | s"ranking: $rankingField, " + 175 | s"target field: $targetField, " + 176 | s"target policy: $targetPolicy, " + 177 | s"batch: $batch, " + 178 | s"partition: $partition, " + 179 | s"ignoreIndex: $ignoreIndex, " + 180 | s"srcVertexUdf: $srcVertexUdf" + 181 | s"dstVertexUdf: $dstVertexUdf." 182 | } 183 | } 184 | } 185 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/config/SinkConfigs.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
4 | */ 5 | 6 | package com.vesoft.exchange.common.config 7 | 8 | /** 9 | * SinkCategory is used to expression the writer's type. 10 | */ 11 | object SinkCategory extends Enumeration { 12 | type Type = Value 13 | 14 | val CLIENT = Value("CLIENT") 15 | val SST = Value("SST") 16 | } 17 | 18 | class SinkCategory 19 | 20 | /** 21 | * DataSinkConfigEntry 22 | */ 23 | sealed trait DataSinkConfigEntry { 24 | def category: SinkCategory.Value 25 | } 26 | 27 | /** 28 | * FileBaseSinkConfigEntry 29 | */ 30 | case class FileBaseSinkConfigEntry(override val category: SinkCategory.Value, 31 | localPath: String, 32 | remotePath: String, 33 | fsName: Option[String]) 34 | extends DataSinkConfigEntry { 35 | 36 | override def toString: String = { 37 | val fullRemotePath = 38 | if (fsName.isDefined) s"${fsName.get}$remotePath" 39 | else remotePath 40 | s"File sink: from ${localPath} to $fullRemotePath" 41 | } 42 | } 43 | 44 | /** 45 | * NebulaSinkConfigEntry use to specified the nebula service's address. 46 | */ 47 | case class NebulaSinkConfigEntry(override val category: SinkCategory.Value, addresses: List[String]) 48 | extends DataSinkConfigEntry { 49 | override def toString: String = { 50 | s"Nebula sink addresses: ${addresses.mkString("[", ", ", "]")}" 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/processor/ReloadProcessor.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange.common.processor 7 | 8 | import com.vesoft.exchange.common.{ErrorHandler, GraphProvider} 9 | import com.vesoft.exchange.common.GraphProvider 10 | import com.vesoft.exchange.common.config.Configs 11 | import com.vesoft.exchange.common.writer.NebulaGraphClientWriter 12 | import org.apache.log4j.Logger 13 | import org.apache.spark.{SparkEnv, TaskContext} 14 | import org.apache.spark.sql.{DataFrame, Row} 15 | import org.apache.spark.util.LongAccumulator 16 | 17 | import java.util.regex.Pattern 18 | import scala.collection.mutable.ArrayBuffer 19 | 20 | class ReloadProcessor(data: DataFrame, 21 | config: Configs, 22 | batchSuccess: LongAccumulator, 23 | batchFailure: LongAccumulator, 24 | recordSuccess: LongAccumulator) 25 | extends Processor { 26 | @transient 27 | private[this] lazy val LOG = Logger.getLogger(this.getClass) 28 | 29 | override def process(): Unit = { 30 | data.foreachPartition((rows: Iterator[Row]) => processEachPartition(rows)) 31 | } 32 | 33 | private def processEachPartition(iterator: Iterator[Row]): Unit = { 34 | val graphProvider = 35 | new GraphProvider(config.databaseConfig.getGraphAddress, 36 | config.connectionConfig.timeout, 37 | config.sslConfig) 38 | 39 | val writer = new NebulaGraphClientWriter(config.databaseConfig, 40 | config.userConfig, 41 | config.rateConfig, 42 | null, 43 | graphProvider, 44 | config.executionConfig) 45 | 46 | val errorBuffer = ArrayBuffer[String]() 47 | 48 | writer.prepare() 49 | // batch write 50 | val startTime = System.currentTimeMillis 51 | iterator.foreach { row => 52 | val ngql = row.getString(0) 53 | val failStatement = writer.writeNgql(ngql) 54 | if (failStatement == null) { 55 | batchSuccess.add(1) 56 | recordSuccess.add(1) 57 | } else { 58 | errorBuffer.append(failStatement) 59 | batchFailure.add(1) 60 | } 61 | } 62 | if (errorBuffer.nonEmpty) { 63 | 
ErrorHandler.save( 64 | errorBuffer, 65 | s"${config.errorConfig.errorPath}/${SparkEnv.get.blockManager.conf.getAppId}/reload.${TaskContext 66 | .getPartitionId()}") 67 | errorBuffer.clear() 68 | } 69 | LOG.info(s">>>>> data reload in partition ${TaskContext 70 | .getPartitionId()} cost ${System.currentTimeMillis() - startTime}ms") 71 | writer.close() 72 | graphProvider.close() 73 | } 74 | 75 | /** 76 | * compute the record amount of ngql 77 | * @param ngql nebula insert ngql 78 | */ 79 | private def computeRecordNumber(ngql: String): Int = { 80 | val substring = ": (" 81 | var count = 0 82 | var index = 0 83 | while (index != -1) { 84 | count += 1 85 | index = ngql.indexOf(substring, index) 86 | if (index != (-1)) { 87 | index += substring.length 88 | } 89 | } 90 | count 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/utils/ConfigTemplateUtils.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2023 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange.common.utils 7 | 8 | import com.vesoft.exchange.common.FileMigrate 9 | import com.vesoft.exchange.common.config.SourceCategory 10 | 11 | import java.io.{BufferedInputStream, BufferedOutputStream, File, FileOutputStream, InputStream} 12 | 13 | object ConfigTemplateUtils { 14 | 15 | def getConfigTemplate(source: String, path: String): Unit = { 16 | val sourceCategory = SourceCategory.withName(source.trim.toUpperCase) 17 | 18 | val fileMigrate = new FileMigrate 19 | sourceCategory match { 20 | case SourceCategory.CSV => 21 | fileMigrate.saveConfig("config_template/csv.conf", path + "/csv.conf") 22 | case SourceCategory.JSON => 23 | fileMigrate.saveConfig("config_template/json.conf", path + "/json.conf") 24 | case SourceCategory.ORC => 25 | fileMigrate.saveConfig("config_template/orc.conf", path + "/orc.conf") 26 | case SourceCategory.PARQUET => 27 | fileMigrate.saveConfig("config_template/parquet.conf", path + "/parquet.conf") 28 | case SourceCategory.HIVE => 29 | fileMigrate.saveConfig("config_template/hive.conf", path + "/hive.conf") 30 | case SourceCategory.HBASE=> 31 | fileMigrate.saveConfig("config_template/hbase.conf", path + "/hbase.conf") 32 | case SourceCategory.JDBC | SourceCategory.MYSQL | SourceCategory.CLICKHOUSE | 33 | SourceCategory.MAXCOMPUTE | SourceCategory.ORC | SourceCategory.POSTGRESQL => 34 | fileMigrate.saveConfig("config_template/jdbc.conf", path + "/jdbc.conf") 35 | case SourceCategory.NEO4J => 36 | fileMigrate.saveConfig("config_template/neo4j.conf", path + "/neo4j.conf") 37 | case SourceCategory.KAFKA | SourceCategory.PULSAR => 38 | fileMigrate.saveConfig("config_template/kafka.conf", path + "/kafka.conf") 39 | case _ => throw new IllegalArgumentException(s"does not support datasource $sourceCategory") 40 | } 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/utils/HDFSUtils.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
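A usage sketch for the template generator above (the output directory is hypothetical):

    // writes the bundled config_template/csv.conf resource to /tmp/exchange-templates/csv.conf
    ConfigTemplateUtils.getConfigTemplate("csv", "/tmp/exchange-templates")
    // JDBC-style sources (mysql, clickhouse, maxcompute, postgresql) all resolve to jdbc.conf
    ConfigTemplateUtils.getConfigTemplate("mysql", "/tmp/exchange-templates")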
4 | */ 5 | 6 | package com.vesoft.exchange.common.utils 7 | 8 | import java.io.File 9 | import java.nio.charset.Charset 10 | 11 | import org.apache.hadoop.conf.Configuration 12 | import org.apache.hadoop.fs.{FileSystem, Path} 13 | import org.apache.log4j.Logger 14 | 15 | import scala.io.Source 16 | 17 | object HDFSUtils { 18 | private[this] val LOG = Logger.getLogger(this.getClass) 19 | 20 | def getFileSystem(namenode: String = null): FileSystem = { 21 | val conf = new Configuration() 22 | if (namenode != null) { 23 | conf.set("fs.default.name", namenode) 24 | conf.set("fs.defaultFS", namenode) 25 | } 26 | FileSystem.get(conf) 27 | } 28 | 29 | def list(path: String): List[String] = { 30 | val system = getFileSystem() 31 | system.listStatus(new Path(path)).map(_.getPath.getName).toList 32 | } 33 | 34 | def exists(path: String): Boolean = { 35 | val system = getFileSystem() 36 | system.exists(new Path(path)) 37 | } 38 | 39 | def getContent(path: String): String = { 40 | val system = getFileSystem() 41 | val inputStream = system.open(new Path(path)) 42 | Source.fromInputStream(inputStream).mkString 43 | } 44 | 45 | def saveContent(path: String, 46 | content: String, 47 | charset: Charset = Charset.defaultCharset()): Unit = { 48 | val system = getFileSystem() 49 | val outputStream = system.create(new Path(path)) 50 | try { 51 | outputStream.write(content.getBytes(charset)) 52 | } finally { 53 | outputStream.close() 54 | } 55 | } 56 | 57 | def upload(localPath: String, remotePath: String, namenode: String = null): Unit = { 58 | try { 59 | val localFile = new File(localPath) 60 | if (!localFile.exists() || localFile.length() <= 0) { 61 | return 62 | } 63 | } catch { 64 | case e: Throwable => 65 | LOG.warn("check for empty local file error, but you can ignore this check error. " + 66 | "If there is empty sst file in your hdfs, please delete it manually", 67 | e) 68 | } 69 | val system = getFileSystem(namenode) 70 | system.copyFromLocalFile(new Path(localPath), new Path(remotePath)) 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/utils/NebulaPartitioner.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange.common.utils 7 | 8 | import java.nio.{ByteBuffer, ByteOrder} 9 | import org.apache.spark.Partitioner 10 | 11 | class NebulaPartitioner(partitions: Int) extends Partitioner { 12 | require(partitions >= 0, s"Number of partitions ($partitions) cannot be negative.") 13 | 14 | override def numPartitions: Int = partitions 15 | 16 | override def getPartition(key: Any): Int = { 17 | var part = ByteBuffer 18 | .wrap(key.asInstanceOf[Array[Byte]], 0, 4) 19 | .order(ByteOrder.nativeOrder) 20 | .getInt >> 8 21 | if (part <= 0) { 22 | part = part + partitions 23 | } 24 | part - 1 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/utils/NebulaUtils.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
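A brief usage sketch for HDFSUtils above (the paths and namenode address are hypothetical):

    HDFSUtils.saveContent("/tmp/checkpoint/demo.0", "12345")   // write one checkpoint offset
    val offset = HDFSUtils.getContent("/tmp/checkpoint/demo.0").trim.toLong
    HDFSUtils.upload("/tmp/sst/1-0-1.sst", "/sst/1/1-0-1.sst", "hdfs://namenode:9000")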
4 | */ 5 | 6 | package com.vesoft.exchange.common.utils 7 | 8 | import com.google.common.base.{CharMatcher, Strings} 9 | 10 | import java.nio.charset.Charset 11 | import java.nio.ByteBuffer 12 | import java.nio.ByteOrder 13 | import com.google.common.primitives.UnsignedLong 14 | import com.vesoft.exchange.common.MetaProvider 15 | import com.vesoft.exchange.common.VidType 16 | import com.vesoft.exchange.common.config.{SchemaConfigEntry, Type} 17 | import com.vesoft.nebula.client.graph.data.HostAddress 18 | import org.apache.commons.codec.digest.MurmurHash2 19 | import org.apache.log4j.Logger 20 | 21 | import scala.collection.JavaConversions.seqAsJavaList 22 | import scala.collection.mutable 23 | import scala.collection.mutable.ListBuffer 24 | 25 | object NebulaUtils { 26 | val DEFAULT_EMPTY_VALUE: String = "_NEBULA_EMPTY" 27 | 28 | private[this] val LOG = Logger.getLogger(this.getClass) 29 | 30 | def getDataSourceFieldType(sourceConfig: SchemaConfigEntry, 31 | space: String, 32 | metaProvider: MetaProvider): Map[String, Int] = { 33 | val nebulaFields = sourceConfig.nebulaFields 34 | val sourceFields = sourceConfig.fields 35 | val label = sourceConfig.name 36 | 37 | var nebulaSchemaMap: Map[String, Integer] = null 38 | val dataType: Type.Value = metaProvider.getLabelType(space, label) 39 | if (dataType == null) { 40 | throw new IllegalArgumentException(s"label $label does not exist.") 41 | } 42 | if (dataType == Type.VERTEX) { 43 | nebulaSchemaMap = metaProvider.getTagSchema(space, label) 44 | } else { 45 | nebulaSchemaMap = metaProvider.getEdgeSchema(space, label) 46 | } 47 | 48 | val sourceSchemaMap: mutable.Map[String, Int] = mutable.HashMap[String, Int]() 49 | for (i <- nebulaFields.indices) { 50 | val nebulaField = nebulaFields.get(i) 51 | if (!nebulaSchemaMap.contains(nebulaField)) { 52 | throw new IllegalArgumentException( 53 | s"property name $nebulaField is not defined in NebulaGraph") 54 | } 55 | sourceSchemaMap.put(sourceFields.get(i), nebulaSchemaMap(nebulaField)) 56 | } 57 | sourceSchemaMap.toMap 58 | } 59 | 60 | def isNumic(str: String): Boolean = { 61 | val newStr: String = if (str.startsWith("-")) { 62 | str.substring(1) 63 | } else { str } 64 | 65 | for (char <- newStr.toCharArray) { 66 | if (!Character.isDigit(char)) return false 67 | } 68 | true 69 | } 70 | 71 | def escapeUtil(str: String): String = { 72 | var s = str 73 | if (s.contains("\\")) { 74 | s = s.replaceAll("\\\\", "\\\\\\\\") 75 | } 76 | if (s.contains("\t")) { 77 | s = s.replaceAll("\t", "\\\\t") 78 | } 79 | if (s.contains("\n")) { 80 | s = s.replaceAll("\n", "\\\\n") 81 | } 82 | if (s.contains("\"")) { 83 | s = s.replaceAll("\"", "\\\\\"") 84 | } 85 | if (s.contains("\'")) { 86 | s = s.replaceAll("\'", "\\\\'") 87 | } 88 | if (s.contains("\r")) { 89 | s = s.replaceAll("\r", "\\\\r") 90 | } 91 | if (s.contains("\b")) { 92 | s = s.replaceAll("\b", "\\\\b") 93 | } 94 | s 95 | } 96 | 97 | def getPartitionId(id: String, partitionSize: Int, vidType: VidType.Value): Int = { 98 | val hashValue: Long = if (vidType == VidType.STRING) { 99 | // todo charset must be the same with Nebula Space 100 | val byteId = id.getBytes(Charset.forName("UTF-8")) 101 | if (byteId.length == 8) { 102 | //byte array to long, need to take care of endianess 103 | ByteBuffer.wrap(byteId).order(ByteOrder.nativeOrder).getLong 104 | } else { 105 | MurmurHash2.hash64(byteId, byteId.length, 0xc70f6907) 106 | } 107 | } else { 108 | id.toLong 109 | } 110 | val unsignedValue = UnsignedLong.fromLongBits(hashValue) 111 | val partSize = 
UnsignedLong.fromLongBits(partitionSize) 112 | unsignedValue.mod(partSize).intValue + 1 113 | } 114 | 115 | def escapePropName(nebulaFields: List[String]): List[String] = { 116 | val propNames: ListBuffer[String] = new ListBuffer[String] 117 | for (key <- nebulaFields) { 118 | val sb = new StringBuilder() 119 | sb.append("`") 120 | sb.append(key) 121 | sb.append("`") 122 | propNames.append(sb.toString()) 123 | } 124 | propNames.toList 125 | } 126 | 127 | def getAddressFromString(addr: String): HostAddress = { 128 | if (addr == null) { 129 | throw new IllegalArgumentException("wrong address format.") 130 | } 131 | var host: String = null 132 | var portString: String = null 133 | 134 | if (addr.startsWith("[")) { 135 | val hostAndPort = getHostAndPortFromBracketedHost(addr) 136 | host = hostAndPort._1 137 | portString = hostAndPort._2 138 | } else { 139 | val colonPos = addr.indexOf(":") 140 | if (colonPos >= 0 && addr.indexOf(":", colonPos + 1) == -1) { 141 | host = addr.substring(0, colonPos) 142 | portString = addr.substring(colonPos + 1) 143 | } else { 144 | host = addr 145 | } 146 | } 147 | 148 | var port = -1; 149 | if (!Strings.isNullOrEmpty(portString)) { 150 | for (c <- portString.toCharArray) { 151 | if (!Character.isDigit(c)) { 152 | throw new IllegalArgumentException(s"Port must be numeric: $addr") 153 | } 154 | } 155 | port = Integer.parseInt(portString) 156 | if (port < 0 || port > 65535) { 157 | throw new IllegalArgumentException(s"Port number out of range: $addr") 158 | } 159 | } 160 | new HostAddress(host, port) 161 | } 162 | 163 | def getHostAndPortFromBracketedHost(addr: String): (String, String) = { 164 | val colonIndex = addr.indexOf(":") 165 | val closeBracketIndex = addr.lastIndexOf("]") 166 | if (colonIndex < 0 || closeBracketIndex < colonIndex) { 167 | throw new IllegalArgumentException(s"invalid bracketed host/port: $addr") 168 | } 169 | val host: String = addr.substring(1, closeBracketIndex) 170 | if (closeBracketIndex + 1 == addr.length) { 171 | return (host, "") 172 | } else { 173 | if (addr.charAt(closeBracketIndex + 1) != ':') { 174 | throw new IllegalArgumentException(s"only a colon may follow a close bracket: $addr") 175 | } 176 | for (i <- closeBracketIndex + 2 until addr.length) { 177 | if (!Character.isDigit(addr.charAt(i))) { 178 | throw new IllegalArgumentException(s"Port must be numeric: $addr") 179 | } 180 | } 181 | } 182 | (host, addr.substring(closeBracketIndex + 2)) 183 | } 184 | } 185 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/utils/SparkValidate.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange.common.utils 7 | 8 | object SparkValidate { 9 | def validate(sparkVersion: String, supportedVersions: String*): Unit = { 10 | if (sparkVersion != "UNKNOWN" && !supportedVersions.exists(sparkVersion.matches)) { 11 | throw new RuntimeException( 12 | s"""Your current spark version ${sparkVersion} is not supported by the current NebulaGraph Exchange. 13 | | please visit https://github.com/vesoft-inc/nebula-exchange#version-match to know which Exchange you need. 
14 | | """.stripMargin) 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/writer/FileBaseWriter.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange.common.writer 7 | 8 | import java.nio.{ByteBuffer, ByteOrder} 9 | import java.nio.file.{Files, Paths} 10 | 11 | import com.vesoft.exchange.common.config.FileBaseSinkConfigEntry 12 | import com.vesoft.exchange.common.utils.HDFSUtils 13 | import org.apache.spark.TaskContext 14 | import org.apache.spark.sql.Row 15 | import org.apache.spark.util.LongAccumulator 16 | import org.rocksdb.{EnvOptions, Options, RocksDB, SstFileWriter} 17 | import org.slf4j.LoggerFactory 18 | 19 | /** 20 | * NebulaSSTWriter 21 | */ 22 | class NebulaSSTWriter(path: String) extends Writer { 23 | var isOpen = false 24 | 25 | private val LOG = LoggerFactory.getLogger(getClass) 26 | 27 | try { 28 | RocksDB.loadLibrary() 29 | LOG.info(">>>>> Loading RocksDB successfully") 30 | } catch { 31 | case _: Exception => 32 | LOG.error(">>>>> Can't load RocksDB library!") 33 | } 34 | 35 | // TODO More Config ... 36 | val options = new Options() 37 | .setCreateIfMissing(true) 38 | 39 | val env = new EnvOptions() 40 | var writer: SstFileWriter = _ 41 | 42 | override def prepare(): Unit = { 43 | writer = new SstFileWriter(env, options) 44 | writer.open(path) 45 | isOpen = true 46 | } 47 | 48 | def write(key: Array[Byte], value: Array[Byte]): Unit = { 49 | writer.put(key, value) 50 | } 51 | 52 | override def close(): Unit = { 53 | if (isOpen) { 54 | writer.finish() 55 | writer.close() 56 | } 57 | options.close() 58 | env.close() 59 | } 60 | 61 | } 62 | 63 | class GenerateSstFile extends Serializable { 64 | private val LOG = LoggerFactory.getLogger(getClass) 65 | 66 | def writeSstFiles(iterator: Iterator[Row], 67 | fileBaseConfig: FileBaseSinkConfigEntry, 68 | partitionNum: Int, 69 | namenode: String, 70 | batchFailure: LongAccumulator): Unit = { 71 | val taskID = TaskContext.get().taskAttemptId() 72 | var writer: NebulaSSTWriter = null 73 | var currentPart = -1 74 | var currentPrefix = -1 75 | val localPath = fileBaseConfig.localPath 76 | val remotePath = fileBaseConfig.remotePath 77 | try { 78 | iterator.foreach { vertex => 79 | val key = vertex.getAs[Array[Byte]](0) 80 | val value = vertex.getAs[Array[Byte]](1) 81 | var part = ByteBuffer 82 | .wrap(key, 0, 4) 83 | .order(ByteOrder.nativeOrder) 84 | .getInt >> 8 85 | if (part <= 0) { 86 | part = part + partitionNum 87 | } 88 | // extract the prefix value for vertex key, there's two values 89 | // 1: vertex key with tag, 7: vertex key without tag 90 | val prefix: Int = ByteBuffer.wrap(key, 0, 1).get 91 | 92 | if (part != currentPart || prefix != currentPrefix) { 93 | if (writer != null) { 94 | writer.close() 95 | val localFile = s"$localPath/$currentPart-$taskID-$currentPrefix.sst" 96 | HDFSUtils.upload(localFile, 97 | s"$remotePath/${currentPart}/$currentPart-$taskID-$currentPrefix.sst", 98 | namenode) 99 | Files.delete(Paths.get(localFile)) 100 | } 101 | currentPart = part 102 | currentPrefix = prefix 103 | val tmp = s"$localPath/$currentPart-$taskID-$currentPrefix.sst" 104 | writer = new NebulaSSTWriter(tmp) 105 | writer.prepare() 106 | } 107 | writer.write(key, value) 108 | } 109 | } catch { 110 | case e: Throwable => { 
111 | LOG.error(">>>>> sst file write error,", e) 112 | batchFailure.add(1) 113 | } 114 | } finally { 115 | if (writer != null) { 116 | writer.close() 117 | val localFile = s"$localPath/$currentPart-$taskID-$currentPrefix.sst" 118 | HDFSUtils.upload(localFile, 119 | s"$remotePath/${currentPart}/$currentPart-$taskID-$currentPrefix.sst", 120 | namenode) 121 | Files.delete(Paths.get(localFile)) 122 | } 123 | } 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /exchange-common/src/main/scala/com/vesoft/exchange/common/writer/Writer.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange.common.writer 7 | 8 | /** 9 | * 10 | */ 11 | trait Writer extends Serializable { 12 | 13 | def prepare(): Unit 14 | 15 | def close() 16 | } 17 | -------------------------------------------------------------------------------- /exchange-common/src/test/resources/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.4' 2 | services: 3 | metad0: 4 | image: vesoft/nebula-metad:nightly 5 | environment: 6 | USER: root 7 | TZ: "${TZ}" 8 | command: 9 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 10 | - --local_ip=172.28.1.1 11 | - --ws_ip=172.28.1.1 12 | - --port=9559 13 | - --data_path=/data/meta 14 | - --log_dir=/logs 15 | - --v=0 16 | - --minloglevel=0 17 | - --heartbeat_interval_secs=2 18 | healthcheck: 19 | test: ["CMD", "curl", "-f", "http://172.28.1.1:11000/status"] 20 | interval: 30s 21 | timeout: 10s 22 | retries: 3 23 | start_period: 20s 24 | ports: 25 | - "9559:9559" 26 | - 11000 27 | - 11002 28 | volumes: 29 | - ./data/meta0:/data/meta:Z 30 | - ./logs/meta0:/logs:Z 31 | networks: 32 | nebula-net: 33 | ipv4_address: 172.28.1.1 34 | restart: on-failure 35 | cap_add: 36 | - SYS_PTRACE 37 | 38 | metad1: 39 | image: vesoft/nebula-metad:nightly 40 | environment: 41 | USER: root 42 | TZ: "${TZ}" 43 | command: 44 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 45 | - --local_ip=172.28.1.2 46 | - --ws_ip=172.28.1.2 47 | - --port=9559 48 | - --data_path=/data/meta 49 | - --log_dir=/logs 50 | - --v=0 51 | - --minloglevel=0 52 | - --heartbeat_interval_secs=2 53 | healthcheck: 54 | test: ["CMD", "curl", "-f", "http://172.28.1.2:11000/status"] 55 | interval: 30s 56 | timeout: 10s 57 | retries: 3 58 | start_period: 20s 59 | ports: 60 | - "9560:9559" 61 | - 11000 62 | - 11002 63 | volumes: 64 | - ./data/meta1:/data/meta:Z 65 | - ./logs/meta1:/logs:Z 66 | networks: 67 | nebula-net: 68 | ipv4_address: 172.28.1.2 69 | restart: on-failure 70 | cap_add: 71 | - SYS_PTRACE 72 | 73 | metad2: 74 | image: vesoft/nebula-metad:nightly 75 | environment: 76 | USER: root 77 | TZ: "${TZ}" 78 | command: 79 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 80 | - --local_ip=172.28.1.3 81 | - --ws_ip=172.28.1.3 82 | - --port=9559 83 | - --data_path=/data/meta 84 | - --log_dir=/logs 85 | - --v=0 86 | - --minloglevel=0 87 | - --heartbeat_interval_secs=2 88 | healthcheck: 89 | test: ["CMD", "curl", "-f", "http://172.28.1.3:11000/status"] 90 | interval: 30s 91 | timeout: 10s 92 | retries: 3 93 | start_period: 20s 94 | ports: 95 | - "9561:9559" 96 | - 11000 97 | - 11002 98 | volumes: 99 | - ./data/meta2:/data/meta:Z 100 | - ./logs/meta2:/logs:Z 101 | networks: 102 
| nebula-net: 103 | ipv4_address: 172.28.1.3 104 | restart: on-failure 105 | cap_add: 106 | - SYS_PTRACE 107 | 108 | storaged0: 109 | image: vesoft/nebula-storaged:nightly 110 | environment: 111 | USER: root 112 | TZ: "${TZ}" 113 | command: 114 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 115 | - --local_ip=172.28.2.1 116 | - --ws_ip=172.28.2.1 117 | - --port=9779 118 | - --data_path=/data/storage 119 | - --log_dir=/logs 120 | - --v=0 121 | - --minloglevel=0 122 | - --heartbeat_interval_secs=2 123 | depends_on: 124 | - metad0 125 | - metad1 126 | - metad2 127 | healthcheck: 128 | test: ["CMD", "curl", "-f", "http://172.28.2.1:12000/status"] 129 | interval: 30s 130 | timeout: 10s 131 | retries: 3 132 | start_period: 20s 133 | ports: 134 | - "9779:9779" 135 | - 12000 136 | - 12002 137 | volumes: 138 | - ./data/storage0:/data/storage:Z 139 | - ./logs/storage0:/logs:Z 140 | networks: 141 | nebula-net: 142 | ipv4_address: 172.28.2.1 143 | restart: on-failure 144 | cap_add: 145 | - SYS_PTRACE 146 | 147 | storaged1: 148 | image: vesoft/nebula-storaged:nightly 149 | environment: 150 | USER: root 151 | TZ: "${TZ}" 152 | command: 153 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 154 | - --local_ip=172.28.2.2 155 | - --ws_ip=172.28.2.2 156 | - --port=9779 157 | - --data_path=/data/storage 158 | - --log_dir=/logs 159 | - --v=0 160 | - --minloglevel=0 161 | - --heartbeat_interval_secs=2 162 | depends_on: 163 | - metad0 164 | - metad1 165 | - metad2 166 | healthcheck: 167 | test: ["CMD", "curl", "-f", "http://172.28.2.2:12000/status"] 168 | interval: 30s 169 | timeout: 10s 170 | retries: 3 171 | start_period: 20s 172 | ports: 173 | - "9780:9779" 174 | - 12000 175 | - 12002 176 | volumes: 177 | - ./data/storage1:/data/storage:Z 178 | - ./logs/storage1:/logs:Z 179 | networks: 180 | nebula-net: 181 | ipv4_address: 172.28.2.2 182 | restart: on-failure 183 | cap_add: 184 | - SYS_PTRACE 185 | 186 | storaged2: 187 | image: vesoft/nebula-storaged:nightly 188 | environment: 189 | USER: root 190 | TZ: "${TZ}" 191 | command: 192 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 193 | - --local_ip=172.28.2.3 194 | - --ws_ip=172.28.2.3 195 | - --port=9779 196 | - --data_path=/data/storage 197 | - --log_dir=/logs 198 | - --v=0 199 | - --minloglevel=0 200 | - --heartbeat_interval_secs=2 201 | depends_on: 202 | - metad0 203 | - metad1 204 | - metad2 205 | healthcheck: 206 | test: ["CMD", "curl", "-f", "http://172.28.2.3:12000/status"] 207 | interval: 30s 208 | timeout: 10s 209 | retries: 3 210 | start_period: 20s 211 | ports: 212 | - "9781:9779" 213 | - 12000 214 | - 12002 215 | volumes: 216 | - ./data/storage2:/data/storage:Z 217 | - ./logs/storage2:/logs:Z 218 | networks: 219 | nebula-net: 220 | ipv4_address: 172.28.2.3 221 | restart: on-failure 222 | cap_add: 223 | - SYS_PTRACE 224 | 225 | graphd0: 226 | image: vesoft/nebula-graphd:nightly 227 | environment: 228 | USER: root 229 | TZ: "${TZ}" 230 | command: 231 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 232 | - --port=9669 233 | - --ws_ip=172.28.3.1 234 | - --log_dir=/logs 235 | - --v=0 236 | - --minloglevel=0 237 | - --heartbeat_interval_secs=2 238 | depends_on: 239 | - metad0 240 | - metad1 241 | - metad2 242 | healthcheck: 243 | test: ["CMD", "curl", "-f", "http://172.28.3.1:13000/status"] 244 | interval: 30s 245 | timeout: 10s 246 | retries: 3 247 | start_period: 20s 248 | ports: 249 | - "9669:9669" 250 | - 13000 251 | - 13002 252 | volumes: 253 | - 
./logs/graph0:/logs:Z 254 | networks: 255 | nebula-net: 256 | ipv4_address: 172.28.3.1 257 | restart: on-failure 258 | cap_add: 259 | - SYS_PTRACE 260 | 261 | graphd1: 262 | image: vesoft/nebula-graphd:nightly 263 | environment: 264 | USER: root 265 | TZ: "${TZ}" 266 | command: 267 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 268 | - --port=9669 269 | - --ws_ip=172.28.3.2 270 | - --log_dir=/logs 271 | - --v=0 272 | - --minloglevel=0 273 | - --heartbeat_interval_secs=2 274 | depends_on: 275 | - metad0 276 | - metad1 277 | - metad2 278 | healthcheck: 279 | test: ["CMD", "curl", "-f", "http://172.28.3.2:13000/status"] 280 | interval: 30s 281 | timeout: 10s 282 | retries: 3 283 | start_period: 20s 284 | ports: 285 | - "9670:9669" 286 | - 13000 287 | - 13002 288 | volumes: 289 | - ./logs/graph1:/logs:Z 290 | networks: 291 | nebula-net: 292 | ipv4_address: 172.28.3.2 293 | restart: on-failure 294 | cap_add: 295 | - SYS_PTRACE 296 | 297 | graphd2: 298 | image: vesoft/nebula-graphd:nightly 299 | environment: 300 | USER: root 301 | TZ: "${TZ}" 302 | command: 303 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 304 | - --port=9669 305 | - --ws_ip=172.28.3.3 306 | - --log_dir=/logs 307 | - --v=0 308 | - --minloglevel=0 309 | - --heartbeat_interval_secs=2 310 | depends_on: 311 | - metad0 312 | - metad1 313 | - metad2 314 | healthcheck: 315 | test: ["CMD", "curl", "-f", "http://172.28.3.3:13000/status"] 316 | interval: 30s 317 | timeout: 10s 318 | retries: 3 319 | start_period: 20s 320 | ports: 321 | - "9671:9669" 322 | - 13000 323 | - 13002 324 | volumes: 325 | - ./logs/graph2:/logs:Z 326 | networks: 327 | nebula-net: 328 | ipv4_address: 172.28.3.3 329 | restart: on-failure 330 | cap_add: 331 | - SYS_PTRACE 332 | 333 | console: 334 | image: vesoft/nebula-console:nightly 335 | entrypoint: "" 336 | command: 337 | - sh 338 | - -c 339 | - | 340 | sleep 3 && 341 | nebula-console -addr graphd0 -port 9669 -u root -p nebula -e 'ADD HOSTS "172.28.2.1":9779,"172.28.2.2":9779,"172.28.2.3":9779' && 342 | sleep 36000 343 | depends_on: 344 | - graphd0 345 | networks: 346 | - nebula-net 347 | 348 | networks: 349 | nebula-net: 350 | ipam: 351 | driver: default 352 | config: 353 | - subnet: 172.28.0.0/16 354 | -------------------------------------------------------------------------------- /exchange-common/src/test/resources/edge.csv: -------------------------------------------------------------------------------- 1 | src,dst,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14 2 | 101,102,Tom,tom,10,20,30,40,2021-01-27,2021-01-01T12:10:10,43535232,true,1.0,2.0,10:10:10,POINT(1 2) 3 | 102,103,Jina,Jina,11,21,31,41,2021-01-28,2021-01-02T12:10:10,43535232,false,1.1,2.1,11:10:10,POINT(3 4) 4 | 103,101,Tim,Tim,12,22,32,42,2021-01-29,2021-01-03T12:10:10,43535232,false,1.2,2.2,12:10:10,POINT(5 6) 5 | 104,106,张三,张三,13,23,33,43,2021-01-30,2021-01-04T12:10:10,43535232,true,1.3,2.3,13:10:10,POINT(6 7) 6 | 105,107,李四,李四,14,24,34,44,2021-02-01,2021-01-05T12:10:10,43535232,false,1.4,2.4,14:10:10,POINT(1 5) 7 | 106,108,王五,王五,15,25,35,45,2021-02-02,2021-01-06T12:10:10,0,false,1.5,2.5,15:10:10,"LINESTRING(1 3, 4.7 73.23)" 8 | 107,101,Jina,Jina,16,26,36,46,2021-02-03,2021-01-07T12:10:10,43535232,true,1.6,2.6,16:10:10,"LINESTRING(1 3, 4.7 73.23)" 9 | 108,109,Jina,Jina,17,27,37,47,2021-02-04,2021-01-08T12:10:10,43535232,false,1.7,2.7,17:10:10,"LINESTRING(1 3, 4.7 73.23)" 10 | 
109,110,Jina,Jina,18,28,38,48,2021-02-05,2021-01-09T12:10:10,43535232,true,1.8,2.8,18:10:10,"LINESTRING(1 3, 4.7 73.23)" 11 | 110,-101,Jina,Jina,19,29,39,49,2021-02-06,2021-01-10T12:10:10,43535232,false,1.9,2.9,19:10:10,"LINESTRING(1 3, 4.7 73.23)" 12 | -101,102,Jina,Jina,20,30,40,50,2021-02-07,2021-02-11T12:10:10,43535232,false,2.0,3.0,20:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 13 | -102,-103,Jina,Jina,21,31,41,51,2021-02-08,2021-03-12T12:10:10,43535232,false,2.1,3.1,21:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 14 | -103,-101,Jina,Jina,22,32,42,52,2021-02-09,2021-04-13T12:10:10,43535232,false,2.2,3.2,22:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 15 | -------------------------------------------------------------------------------- /exchange-common/src/test/resources/process_application.conf: -------------------------------------------------------------------------------- 1 | { 2 | # Spark relation com.vesoft.exchange.common.config 3 | spark: { 4 | app: { 5 | name: Nebula Exchange 2.0 6 | } 7 | 8 | master:local 9 | 10 | driver: { 11 | cores: 1 12 | maxResultSize: 1G 13 | } 14 | 15 | executor: { 16 | memory:1G 17 | } 18 | 19 | cores:{ 20 | max: 16 21 | } 22 | } 23 | 24 | # if the hive is hive-on-spark with derby mode, you can ignore this hive configure 25 | # get the com.vesoft.exchange.common.config values from file $HIVE_HOME/conf/hive-site.xml or hive-default.xml 26 | 27 | hive: { 28 | warehouse: "hdfs://NAMENODE_IP:9000/apps/svr/hive-xxx/warehouse/" 29 | connectionURL: "jdbc:mysql://your_ip:3306/hive_spark?characterEncoding=UTF-8" 30 | connectionDriverName: "com.mysql.jdbc.Driver" 31 | connectionUserName: "user" 32 | connectionPassword: "password" 33 | } 34 | 35 | # Nebula Graph relation com.vesoft.exchange.common.config 36 | nebula: { 37 | address:{ 38 | graph:["127.0.0.1:9669", "127.0.0.1:9670", "127.0.0.1:9671"] 39 | meta:["127.0.0.1:9559", "127.0.0.1:9560", "127.0.0.1:9561"] 40 | } 41 | user: root 42 | pswd: nebula 43 | space: test_string 44 | 45 | # parameters for SST import, not required 46 | path:{ 47 | local:"/tmp" 48 | remote:"/sst" 49 | hdfs.namenode: "hdfs://name_node:9000" 50 | } 51 | 52 | # nebula client connection parameters 53 | connection { 54 | timeout: 3000 55 | retry: 3 56 | } 57 | 58 | # nebula client execution parameters 59 | execution { 60 | retry: 3 61 | } 62 | 63 | error: { 64 | # max number of failures, if the number of failures is bigger than max, then exit the application. 65 | max: 32 66 | # failed import job will be recorded in output path 67 | output: /tmp/errors 68 | } 69 | 70 | # use google's RateLimiter to limit the requests send to NebulaGraph 71 | rate: { 72 | # the stable throughput of RateLimiter 73 | limit: 1024 74 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 75 | # if it can't be obtained within the specified timeout, then give up the request. 76 | timeout: 1000 77 | } 78 | } 79 | 80 | # Processing tags 81 | # There are tag com.vesoft.exchange.common.config examples for different dataSources. 82 | tags: [ 83 | { 84 | name: person 85 | type: { 86 | source: csv 87 | sink: client 88 | } 89 | path: "file://src/test/resources/data.csv" 90 | fields: [col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14] 91 | nebula.fields: [col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14] 92 | vertex: { 93 | field:id 94 | #policy:hash 95 | } 96 | header:true 97 | batch: 2 98 | partition: 5 99 | } 100 | ] 101 | 102 | # There are tag com.vesoft.exchange.common.config examples for different dataSources. 
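# (Annotation added for clarity; not present in the original resource file.)
# Each entry below maps one CSV source to a NebulaGraph edge type: source.field and
# target.field name the CSV columns holding the source and target vertex ids, batch is
# the number of edges written per statement, and partition is the number of Spark
# partitions used for the import.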
103 | edges: [ 104 | { 105 | name: friend 106 | type: { 107 | source: csv 108 | sink: client 109 | } 110 | path: "file://src/test/resources/data.csv" 111 | fields: [col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14] 112 | nebula.fields: [col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14] 113 | source: { 114 | field:src 115 | #policy:hash 116 | } 117 | target: { 118 | field:dst 119 | #policy:hash 120 | } 121 | header:true 122 | batch: 2 123 | partition: 5 124 | } 125 | ] 126 | } 127 | -------------------------------------------------------------------------------- /exchange-common/src/test/resources/vertex.csv: -------------------------------------------------------------------------------- 1 | id,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14 2 | 101,Tom,tom,10,20,30,40,2021-01-27,2021-01-01T12:10:10,43535232,true,1.0,2.0,10:10:10,POINT(1 2) 3 | 102,Jina,Jina,11,21,31,41,2021-01-28,2021-01-02T12:10:10,43535232,false,1.1,2.1,11:10:10,POINT(3 4) 4 | 103,Tim,Tim,12,22,32,42,2021-01-29,2021-01-03T12:10:10,43535232,false,1.2,2.2,12:10:10,POINT(5 6) 5 | 104,张三,张三,13,23,33,43,2021-01-30,2021-01-04T12:10:10,43535232,true,1.3,2.3,13:10:10,POINT(6 7) 6 | 105,李四,李四,14,24,34,44,2021-02-01,2021-01-05T12:10:10,43535232,false,1.4,2.4,14:10:10,POINT(1 5) 7 | 106,王五,王五,15,25,35,45,2021-02-02,2021-01-06T12:10:10,0,false,1.5,2.5,15:10:10,"LINESTRING(1 3, 4.7 73.23)" 8 | 107,Jina,Jina,16,26,36,46,2021-02-03,2021-01-07T12:10:10,43535232,true,1.6,2.6,16:10:10,"LINESTRING(1 3, 4.7 73.23)" 9 | 108,Jina,Jina,17,27,37,47,2021-02-04,2021-01-08T12:10:10,43535232,false,1.7,2.7,17:10:10,"LINESTRING(1 3, 4.7 73.23)" 10 | 109,Jina,Jina,18,28,38,48,2021-02-05,2021-01-09T12:10:10,43535232,true,1.8,2.8,18:10:10,"LINESTRING(1 3, 4.7 73.23)" 11 | 1010,Jina,Jina,19,29,39,49,2021-02-06,2021-01-10T12:10:10,43535232,false,1.9,2.9,19:10:10,"LINESTRING(1 3, 4.7 73.23)" 12 | -101,Jina,Jina,20,30,40,50,2021-02-07,2021-02-11T12:10:10,43535232,false,2.0,3.0,20:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 13 | -102,Jina,Jina,21,31,41,51,2021-02-08,2021-03-12T12:10:10,43535232,false,2.1,3.1,21:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 14 | -103,Jina,Jina,22,32,42,52,2021-02-09,2021-04-13T12:10:10,43535232,false,2.2,3.2,22:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 15 | -------------------------------------------------------------------------------- /exchange-common/src/test/scala/com/vesoft/exchange/common/GraphProviderSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
4 | */ 5 | 6 | package com.vesoft.exchange.common 7 | 8 | import com.google.common.net.HostAndPort 9 | import com.vesoft.exchange.common.config.{SslConfigEntry, SslType, UserConfigEntry} 10 | import com.vesoft.nebula.client.graph.data.HostAddress 11 | import com.vesoft.nebula.client.graph.exception.AuthFailedException 12 | import com.vesoft.nebula.client.graph.net.Session 13 | import org.junit.{After, Before, Test} 14 | import org.scalatest.Assertions.assertThrows 15 | 16 | class GraphProviderSuite { 17 | var graphProvider: GraphProvider = _ 18 | var session: Session = _ 19 | val userConfig = UserConfigEntry("root", "nebula") 20 | 21 | @Before 22 | def setUp(): Unit = { 23 | val mockData = new NebulaGraphMock 24 | mockData.mockStringIdGraph() 25 | mockData.mockIntIdGraph() 26 | mockData.close() 27 | 28 | val sslConfig = SslConfigEntry(false, false, SslType.CA, null, null) 29 | graphProvider = 30 | new GraphProvider(List(new HostAddress("127.0.0.1", 9669)), 5000, sslConfig) 31 | } 32 | 33 | @After 34 | def tearDown(): Unit = { 35 | graphProvider.close() 36 | } 37 | 38 | @Test 39 | def switchSpaceSuite(): Unit = { 40 | session = graphProvider.getGraphClient(userConfig) 41 | assert(graphProvider.switchSpace(session, "test_string")._2.isSucceeded) 42 | assert(graphProvider.switchSpace(session, "test_int")._2.isSucceeded) 43 | graphProvider.releaseGraphClient(session) 44 | } 45 | 46 | @Test 47 | def submitSuite(): Unit = { 48 | session = graphProvider.getGraphClient(userConfig) 49 | assert(graphProvider.submit(session, "show hosts")._2.isSucceeded) 50 | graphProvider.releaseGraphClient(session) 51 | } 52 | 53 | @Test 54 | def switchSpaceWithoutPermissionSuite(): Unit = { 55 | val wrongUserConfig = UserConfigEntry("user", "12345") 56 | assertThrows[AuthFailedException](graphProvider.getGraphClient(wrongUserConfig)) 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /exchange-common/src/test/scala/com/vesoft/exchange/common/MetaProviderSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
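A condensed usage sketch of GraphProvider outside JUnit, using the same local endpoint and credentials as the suite above (it assumes a reachable NebulaGraph instance):

    val ssl      = SslConfigEntry(false, false, SslType.CA, null, null)
    val provider = new GraphProvider(List(new HostAddress("127.0.0.1", 9669)), 5000, ssl)
    val session  = provider.getGraphClient(UserConfigEntry("root", "nebula"))
    assert(provider.switchSpace(session, "test_string")._2.isSucceeded)
    assert(provider.submit(session, "SHOW HOSTS")._2.isSucceeded)
    provider.releaseGraphClient(session)
    provider.close()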
4 | */ 5 | 6 | package com.vesoft.exchange.common 7 | 8 | import com.google.common.net.HostAndPort 9 | import com.vesoft.exchange.common.config.{SslConfigEntry, SslType, Type} 10 | import com.vesoft.nebula.client.graph.data.HostAddress 11 | import com.vesoft.nebula.client.meta.exception.ExecuteFailedException 12 | import org.junit.{After, Before, Test} 13 | import org.scalatest.Assertions.assertThrows 14 | 15 | class MetaProviderSuite { 16 | 17 | var metaProvider: MetaProvider = _ 18 | @Before 19 | def setUp(): Unit = { 20 | val mockData = new NebulaGraphMock 21 | mockData.mockStringIdGraph() 22 | mockData.mockIntIdGraph() 23 | mockData.close() 24 | 25 | val sslConfig = SslConfigEntry(false, false, SslType.CA, null, null) 26 | metaProvider = new MetaProvider(List(new HostAddress("127.0.0.1", 9559)), 5000, 1, sslConfig) 27 | } 28 | 29 | @After 30 | def tearDown(): Unit = { 31 | if (metaProvider != null) 32 | metaProvider.close() 33 | } 34 | 35 | @Test 36 | def getPartNumberSuite(): Unit = { 37 | assert(metaProvider.getPartNumber("test_string") == 10) 38 | assert(metaProvider.getPartNumber("test_int") == 10) 39 | } 40 | 41 | @Test 42 | def getVidTypeSuite(): Unit = { 43 | assert(metaProvider.getVidType("test_string") == VidType.STRING) 44 | assert(metaProvider.getVidType("test_int") == VidType.INT) 45 | } 46 | 47 | @Test 48 | def getTagSchemaSuite(): Unit = { 49 | val tagSchema = metaProvider.getTagSchema("test_string", "person") 50 | assert(tagSchema.size == 14) 51 | } 52 | 53 | @Test 54 | def getEdgeSchemaSuite(): Unit = { 55 | val edgeSchema = metaProvider.getEdgeSchema("test_string", "friend") 56 | assert(edgeSchema.size == 14) 57 | } 58 | 59 | @Test 60 | def getLabelTypeSuite(): Unit = { 61 | assert(metaProvider.getLabelType("test_string", "person") == Type.VERTEX) 62 | assert(metaProvider.getLabelType("test_string", "friend") == Type.EDGE) 63 | assert(metaProvider.getLabelType("test_int", "person") == Type.VERTEX) 64 | assert(metaProvider.getLabelType("test_int", "friend") == Type.EDGE) 65 | } 66 | 67 | @Test 68 | def getSpaceVidLenSuite(): Unit = { 69 | assert(metaProvider.getSpaceVidLen("test_string") == 8) 70 | assert(metaProvider.getSpaceVidLen("test_int") == 8) 71 | assertThrows[ExecuteFailedException](metaProvider.getSpaceVidLen("not_exist_space")) 72 | } 73 | 74 | @Test 75 | def getTagItemSuite(): Unit = { 76 | val tagItem = metaProvider.getTagItem("test_string", "person") 77 | assert(new String(tagItem.tag_name).equals("person")) 78 | } 79 | 80 | @Test 81 | def getNoExistTagSuite(): Unit = { 82 | assertThrows[IllegalArgumentException](metaProvider.getTagItem("test_string", "no_exist_tag")) 83 | } 84 | 85 | @Test 86 | def getEdgeItemSuite(): Unit = { 87 | val edgeItem = metaProvider.getEdgeItem("test_string", "friend") 88 | assert(new String(edgeItem.edge_name).equals("friend")) 89 | } 90 | 91 | @Test 92 | def getNoExistEdgeSuite(): Unit = { 93 | assertThrows[IllegalArgumentException](metaProvider.getEdgeItem("test_string", "no_exist_edge")) 94 | } 95 | 96 | } 97 | -------------------------------------------------------------------------------- /exchange-common/src/test/scala/com/vesoft/exchange/common/utils/SparkValidateSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
4 | */ 5 | 6 | package com.vesoft.exchange.common.utils 7 | 8 | import org.junit.Test 9 | import org.scalatest.Assertions.assertThrows 10 | 11 | class SparkValidateSuite { 12 | 13 | @Test 14 | def validateSuite(): Unit = { 15 | SparkValidate.validate("2.2.0", "2.2.*") 16 | SparkValidate.validate("2.4.4", "2.4.*") 17 | SparkValidate.validate("3.0.0", "3.0.*", "3.1.*", "3.2.*", "3.3.*") 18 | assertThrows[RuntimeException](SparkValidate.validate("2.4.0", "2.2.*")) 19 | assertThrows[RuntimeException](SparkValidate.validate("2.2.0", "2.4.*")) 20 | assertThrows[RuntimeException]( 21 | SparkValidate.validate("2.4.0", "3.0.*", "3.1.*", "3.2.*", "3.3.*")) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /exchange-common/src/test/scala/com/vesoft/exchange/common/writer/FileBaseWriterSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.exchange.common.writer 7 | 8 | import com.vesoft.exchange.common.config.{FileBaseSinkConfigEntry, SinkCategory} 9 | import org.apache.spark.sql.{Dataset, Encoders, Row, SparkSession} 10 | import org.junit.Test 11 | 12 | class FileBaseWriterSuite { 13 | 14 | @Test 15 | def writeSstFilesSuite(): Unit = { 16 | val spark = SparkSession.builder().master("local").getOrCreate() 17 | import spark.implicits._ 18 | // generate byte[] key using encoder's getVertexKey, space:"test", tag: "person" 19 | val key1 = "01a40200310000000000000000000000000000000000000002000000" // id: "1" 20 | val key2 = "01170000320000000000000000000000000000000000000002000000" // id: "2" 21 | val key3 = "01fe0000330000000000000000000000000000000000000002000000" // id: "3" 22 | val key4 = "01a90300340000000000000000000000000000000000000002000000" // id: "4" 23 | val key5 = "01220200350000000000000000000000000000000000000002000000" // id: "5" 24 | val value = "abc" 25 | // construct test dataset 26 | val data: Dataset[(Array[Byte], Array[Byte])] = spark.sparkContext 27 | .parallelize( 28 | List(key1.getBytes(), key2.getBytes(), key3.getBytes(), key4.getBytes(), key5.getBytes())) 29 | .map(line => (line, value.getBytes())) 30 | .toDF("key", "value") 31 | .map { row => 32 | (row.getAs[Array[Byte]](0), row.getAs[Array[Byte]](1)) 33 | }(Encoders.tuple(Encoders.BINARY, Encoders.BINARY)) 34 | 35 | val generateSstFile = new GenerateSstFile 36 | 37 | val fileBaseConfig = 38 | FileBaseSinkConfigEntry(SinkCategory.SST, "/tmp", "/tmp/remote", None) 39 | val batchFailure = spark.sparkContext.longAccumulator(s"batchFailure.test}") 40 | 41 | data 42 | .toDF("key", "value") 43 | .sortWithinPartitions("key") 44 | .foreachPartition { iterator: Iterator[Row] => 45 | generateSstFile.writeSstFiles(iterator, fileBaseConfig, 10, null, batchFailure) 46 | } 47 | assert(batchFailure.value == 0) 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /exchange-common/src/test/scala/com/vesoft/exchange/common/writer/ServerBaseWriterSuite.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
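A note on the matching rule exercised above: the supported versions are regular expressions checked with String.matches, so validation is a pattern match rather than an exact comparison (values here are illustrative):

    SparkValidate.validate("2.4.8", "2.4.*")    // passes, because "2.4.8".matches("2.4.*") is true
    // SparkValidate.validate("3.3.0", "2.4.*") would throw a RuntimeException, as the suite shows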
4 | */ 5 | 6 | package com.vesoft.exchange.common.writer 7 | 8 | import com.vesoft.exchange.common 9 | import com.vesoft.exchange.common.{Edge, Edges, Vertex, Vertices} 10 | import org.junit.Test 11 | 12 | import scala.collection.mutable.ListBuffer 13 | 14 | class ServerBaseWriterSuite extends ServerBaseWriter { 15 | 16 | @Test 17 | def toExecuteSentenceSuiteForVertex(): Unit = { 18 | val vertices: ListBuffer[Vertex] = new ListBuffer[Vertex] 19 | val tagName = "person" 20 | val propNames = List("name", "age", "gender", "high", "weight") 21 | 22 | val props1 = List("\"Tom\"", 10, 0, 172.5, 55) 23 | val props2 = List("\"Jena\"", 12, 1, 165.5, 45) 24 | vertices.append(Vertex("\"vid1\"", props1)) 25 | vertices.append(Vertex("\"vid2\"", props2)) 26 | val nebulaVertices = Vertices(propNames, vertices.toList) 27 | 28 | val sentence = toExecuteSentence(tagName, nebulaVertices, false) 29 | val expectSentence = 30 | "INSERT VERTEX `person`(`name`,`age`,`gender`,`high`,`weight`) VALUES " + 31 | "\"vid1\": (\"Tom\", 10, 0, 172.5, 55), " + 32 | "\"vid2\": (\"Jena\", 12, 1, 165.5, 45)" 33 | assert(sentence.equals(expectSentence)) 34 | } 35 | 36 | @Test 37 | def toDeleteExecuteSentenceSuiteForVertex(): Unit = { 38 | val vertices: ListBuffer[Vertex] = new ListBuffer[Vertex] 39 | val propNames = List("name", "age", "gender", "high", "weight") 40 | 41 | val props1 = List("\"Tom\"", 10, 0, 172.5, 55) 42 | val props2 = List("\"Jena\"", 12, 1, 165.5, 45) 43 | vertices.append(Vertex("\"vid1\"", props1)) 44 | vertices.append(Vertex("\"vid2\"", props2)) 45 | val nebulaVertices = Vertices(propNames, vertices.toList) 46 | 47 | val sentence = toDeleteExecuteSentence(nebulaVertices, false) 48 | val expectSentence = 49 | "DELETE VERTEX \"vid1\", \"vid2\"" 50 | assert(sentence.equals(expectSentence)) 51 | } 52 | 53 | @Test 54 | def toUpdateExecuteSentenceSuiteForVertex(): Unit = { 55 | val vertices: ListBuffer[Vertex] = new ListBuffer[Vertex] 56 | val propNames = List("col_string", 57 | "col_fixed_string", 58 | "col_bool", 59 | "col_int", 60 | "col_int64", 61 | "col_double", 62 | "col_date", 63 | "col_geo") 64 | 65 | val props1 = 66 | List("\"name\"", "\"name\"", true, 10, 100L, 1.0, "2021-11-12", "LINESTRING(1 2, 3 4)") 67 | val props2 = List("\"name2\"", 68 | "\"name2\"", 69 | false, 70 | 11, 71 | 101L, 72 | 2.0, 73 | "2021-11-13", 74 | "POLYGON((0 1, 1 2, 2 3, 0 1))") 75 | 76 | vertices.append(Vertex("\"vid1\"", props1)) 77 | vertices.append(Vertex("\"vid2\"", props2)) 78 | val nebulaVertices = Vertices(propNames, vertices.toList, None) 79 | 80 | val sentence = toUpdateExecuteSentence("person", nebulaVertices) 81 | val expectSentence = 82 | "UPDATE VERTEX ON `person` \"vid1\" SET `col_string`=\"name\",`col_fixed_string`=\"name\"," + 83 | "`col_bool`=true,`col_int`=10,`col_int64`=100,`col_double`=1.0,`col_date`=2021-11-12," + 84 | "`col_geo`=LINESTRING(1 2, 3 4);UPDATE VERTEX ON `person` \"vid2\" SET " + 85 | "`col_string`=\"name2\",`col_fixed_string`=\"name2\",`col_bool`=false,`col_int`=11," + 86 | "`col_int64`=101,`col_double`=2.0,`col_date`=2021-11-13," + 87 | "`col_geo`=POLYGON((0 1, 1 2, 2 3, 0 1))" 88 | assert(expectSentence.equals(sentence)) 89 | } 90 | 91 | @Test 92 | def toExecuteSentenceSuiteForVertexWithSymbol(): Unit = { 93 | val vertices: ListBuffer[Vertex] = new ListBuffer[Vertex] 94 | val tagName = "person,test_with^symbol#" 95 | val propNames = List("name_1", "age-1", "gender&1", "high%1", "weight,1") 96 | 97 | val props1 = List("\"Tom\"", 10, 0, 172.5, 55) 98 | val props2 = List("\"Jena\"", 12, 1, 
165.5, 45) 99 | vertices.append(Vertex("\"vid_1\"", props1)) 100 | vertices.append(Vertex("\"vid,2\"", props2)) 101 | val nebulaVertices = Vertices(propNames, vertices.toList) 102 | 103 | val sentence = toExecuteSentence(tagName, nebulaVertices, false) 104 | val expectSentence = 105 | "INSERT VERTEX `person,test_with^symbol#`(`name_1`,`age-1`,`gender&1`,`high%1`,`weight,1`) VALUES " + 106 | "\"vid_1\": (\"Tom\", 10, 0, 172.5, 55), " + 107 | "\"vid,2\": (\"Jena\", 12, 1, 165.5, 45)" 108 | assert(sentence.equals(expectSentence)) 109 | } 110 | 111 | @Test 112 | def toExecuteSentenceSuiteForEdge(): Unit = { 113 | val edges: ListBuffer[Edge] = new ListBuffer[Edge] 114 | val edgeType = "friend" 115 | val propNames = List("src_name", "dst_name", "time", "address", "relation") 116 | 117 | val props1 = List("\"Tom\"", "\"Jena\"", "2022-08-25", "hangzhou", "friend") 118 | val props2 = List("\"Jena\"", "\"Bob\"", "2022-08-25", "shanghai", "friend") 119 | edges.append(Edge("\"vid1\"", "\"vid2\"", Some(0L), props1)) 120 | edges.append(Edge("\"vid2\"", "\"vid3\"", Some(1L), props2)) 121 | val nebulaEdges = Edges(propNames, edges.toList) 122 | val sentence = toExecuteSentence(edgeType, nebulaEdges, false) 123 | val expectSentence = "INSERT EDGE `friend`(`src_name`,`dst_name`,`time`,`address`,`relation`) VALUES" + 124 | " \"vid1\"->\"vid2\"@0: (\"Tom\", \"Jena\", 2022-08-25, hangzhou, friend), " + 125 | "\"vid2\"->\"vid3\"@1: (\"Jena\", \"Bob\", 2022-08-25, shanghai, friend)" 126 | assert(sentence.equals(expectSentence)) 127 | } 128 | 129 | @Test 130 | def toDeleteExecuteSentenceSuiteForEdge(): Unit = { 131 | val edges: ListBuffer[Edge] = new ListBuffer[Edge] 132 | val edgeType = "friend" 133 | val propNames = List("src_name", "dst_name", "time", "address", "relation") 134 | 135 | val props1 = List("\"Tom\"", "\"Jena\"", "2022-08-25", "hangzhou", "friend") 136 | val props2 = List("\"Jena\"", "\"Bob\"", "2022-08-25", "shanghai", "friend") 137 | edges.append(Edge("\"vid1\"", "\"vid2\"", Some(0L), props1)) 138 | edges.append(Edge("\"vid2\"", "\"vid3\"", Some(1L), props2)) 139 | val nebulaEdges = Edges(propNames, edges.toList) 140 | val sentence = toDeleteExecuteSentence(edgeType, nebulaEdges) 141 | val expectSentence = "DELETE EDGE `friend` " + 142 | "\"vid1\"->\"vid2\"@0, " + 143 | "\"vid2\"->\"vid3\"@1" 144 | println(sentence) 145 | println(expectSentence) 146 | assert(sentence.equals(expectSentence)) 147 | } 148 | 149 | @Test 150 | def toUpdateExecuteSuiteForEdge(): Unit = { 151 | val edges: ListBuffer[Edge] = new ListBuffer[Edge] 152 | val propNames = List("col_string", 153 | "col_fixed_string", 154 | "col_bool", 155 | "col_int", 156 | "col_int64", 157 | "col_double", 158 | "col_date", 159 | "col_geo") 160 | val props1 = List("\"Tom\"", "\"Tom\"", true, 10, 100L, 1.0, "2021-11-12", "POINT(1 2)") 161 | val props2 = List("\"Bob\"", "\"Bob\"", false, 20, 200L, 2.0, "2021-05-01", "POINT(2 3)") 162 | edges.append(Edge("\"vid1\"", "\"vid2\"", Some(1L), props1)) 163 | edges.append(Edge("\"vid2\"", "\"vid1\"", Some(2L), props2)) 164 | 165 | val nebulaEdges = Edges(propNames, edges.toList, None, None) 166 | val sentence = toUpdateExecuteSentence("friend", nebulaEdges) 167 | val expectSentence = 168 | "UPDATE EDGE ON `friend` \"vid1\"->\"vid2\"@1 SET `col_string`=\"Tom\"," + 169 | "`col_fixed_string`=\"Tom\",`col_bool`=true,`col_int`=10,`col_int64`=100," + 170 | "`col_double`=1.0,`col_date`=2021-11-12,`col_geo`=POINT(1 2);" + 171 | "UPDATE EDGE ON `friend` \"vid2\"->\"vid1\"@2 SET `col_string`=\"Bob\"," + 172 | 
"`col_fixed_string`=\"Bob\",`col_bool`=false,`col_int`=20,`col_int64`=200," + 173 | "`col_double`=2.0,`col_date`=2021-05-01,`col_geo`=POINT(2 3)" 174 | assert(expectSentence.equals(sentence)) 175 | } 176 | 177 | @Test 178 | def toExecuteSentenceSuiteForEdgeWithSymbol(): Unit = { 179 | val edges: ListBuffer[Edge] = new ListBuffer[Edge] 180 | val edgeType = "friend" 181 | val propNames = List("src_name", "dst_name", "time", "address", "relation") 182 | 183 | val props1 = List("\"Tom\"", "\"Jena\"", "2022-08-25", "hangzhou", "friend") 184 | val props2 = List("\"Jena\"", "\"Bob\"", "2022-08-25", "shanghai", "friend") 185 | edges.append(Edge("\"vid_1\"", "\"vid_2\"", Some(0L), props1)) 186 | edges.append(Edge("\"vid_2,test-1\"", "\"vid&3^test*a\"", Some(1L), props2)) 187 | val nebulaEdges = Edges(propNames, edges.toList) 188 | val sentence = toExecuteSentence(edgeType, nebulaEdges, false) 189 | val expectSentence = "INSERT EDGE `friend`(`src_name`,`dst_name`,`time`,`address`,`relation`) VALUES " + 190 | "\"vid_1\"->\"vid_2\"@0: (\"Tom\", \"Jena\", 2022-08-25, hangzhou, friend), " + 191 | "\"vid_2,test-1\"->\"vid&3^test*a\"@1: (\"Jena\", \"Bob\", 2022-08-25, shanghai, friend)" 192 | assert(sentence.equals(expectSentence)) 193 | } 194 | 195 | 196 | override def writeVertices(vertices: Vertices, ignoreIndex: Boolean): List[String] = ??? 197 | 198 | override def writeEdges(edges: common.Edges, ignoreIndex: Boolean): List[String] = ??? 199 | 200 | override def writeNgql(ngql: String): String = ??? 201 | 202 | override def prepare(): Unit = ??? 203 | 204 | override def close(): Unit = ??? 205 | } 206 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.2/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Global logging configuration 2 | log4j.rootLogger=INFO, stdout 3 | # Console output... 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.stdout.layout.ConversionPattern=%5p [%t] - %m%n 7 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/reader/FileBaseReader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.nebula.exchange.reader 7 | 8 | import com.vesoft.exchange.common.config.FileBaseSourceConfigEntry 9 | import com.vesoft.exchange.common.utils.NebulaUtils.DEFAULT_EMPTY_VALUE 10 | import org.apache.spark.sql.catalyst.encoders.RowEncoder 11 | import org.apache.spark.sql.types.StructType 12 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 13 | 14 | /** 15 | * The FileBaseReader is the abstract class for HDFS file reader. 16 | * 17 | * @param session 18 | * @param path 19 | */ 20 | abstract class FileBaseReader(val session: SparkSession, val path: String) extends Reader { 21 | 22 | require(path.trim.nonEmpty) 23 | 24 | override def close(): Unit = { 25 | session.close() 26 | } 27 | } 28 | 29 | /** 30 | * The ParquetReader extend the FileBaseReader and support read parquet file from HDFS. 
31 | * 32 | * @param session 33 | * @param parquetConfig 34 | */ 35 | class ParquetReader(override val session: SparkSession, parquetConfig: FileBaseSourceConfigEntry) 36 | extends FileBaseReader(session, parquetConfig.path) { 37 | 38 | override def read(): DataFrame = { 39 | session.read.parquet(path) 40 | } 41 | } 42 | 43 | /** 44 | * The ORCReader extend the FileBaseReader and support read orc file from HDFS. 45 | * 46 | * @param session 47 | * @param orcConfig 48 | */ 49 | class ORCReader(override val session: SparkSession, orcConfig: FileBaseSourceConfigEntry) 50 | extends FileBaseReader(session, orcConfig.path) { 51 | 52 | override def read(): DataFrame = { 53 | session.read.orc(path) 54 | } 55 | } 56 | 57 | /** 58 | * The JSONReader extend the FileBaseReader and support read json file from HDFS. 59 | * 60 | * @param session 61 | * @param jsonConfig 62 | */ 63 | class JSONReader(override val session: SparkSession, jsonConfig: FileBaseSourceConfigEntry) 64 | extends FileBaseReader(session, jsonConfig.path) { 65 | 66 | override def read(): DataFrame = { 67 | session.read.json(path) 68 | } 69 | } 70 | 71 | /** 72 | * The CSVReader extend the FileBaseReader and support read csv file from HDFS. 73 | * All types of the structure are StringType. 74 | * 75 | * @param session 76 | * @param csvConfig 77 | */ 78 | class CSVReader(override val session: SparkSession, csvConfig: FileBaseSourceConfigEntry) 79 | extends FileBaseReader(session, csvConfig.path) { 80 | 81 | override def read(): DataFrame = { 82 | session.read 83 | .option("delimiter", csvConfig.separator.get) 84 | .option("header", csvConfig.header.get) 85 | .option("emptyValue", DEFAULT_EMPTY_VALUE) 86 | .csv(path) 87 | } 88 | } 89 | 90 | /** 91 | * The CustomReader extend the FileBaseReader and support read text file from HDFS. 92 | * Transformation is a function convert a line into Row. 93 | * The structure of the row should be specified. 94 | * 95 | * @param session 96 | * @param customConfig 97 | * @param transformation 98 | * @param structType 99 | */ 100 | abstract class CustomReader(override val session: SparkSession, 101 | customConfig: FileBaseSourceConfigEntry, 102 | transformation: String => Row, 103 | filter: Row => Boolean, 104 | structType: StructType) 105 | extends FileBaseReader(session, customConfig.path) { 106 | 107 | override def read(): DataFrame = { 108 | val encoder = RowEncoder.apply(structType) 109 | session.read 110 | .text(path) 111 | .filter(!_.getString(0).isEmpty) 112 | .map(row => transformation(row.getString(0)))(encoder) 113 | .filter(filter) 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/reader/Reader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.nebula.exchange.reader 7 | 8 | import com.vesoft.exchange.common.Offset 9 | import com.vesoft.exchange.common.utils.HDFSUtils 10 | import org.apache.spark.sql.{DataFrame, SparkSession} 11 | 12 | /** 13 | * The Reader is used to create a DataFrame from the source, such as Hive or HDFS. 
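A minimal usage sketch for the file readers above (csvConfig stands for an already-parsed FileBaseSourceConfigEntry describing a CSV source; it is not constructed here):

    val spark = SparkSession.builder().master("local").getOrCreate()
    val df    = new CSVReader(spark, csvConfig).read()
    // every column arrives as StringType and empty cells are read as "_NEBULA_EMPTY"
    df.show(5)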
14 | */ 15 | trait Reader extends Serializable { 16 | def session: SparkSession 17 | 18 | def read(): DataFrame 19 | 20 | def close(): Unit 21 | } 22 | 23 | trait CheckPointSupport extends Serializable { 24 | 25 | def getOffsets(totalCount: Long, 26 | parallel: Int, 27 | checkPointPath: Option[String], 28 | checkPointNamePrefix: String): List[Offset] = { 29 | if (totalCount <= 0) 30 | throw new RuntimeException(s"${checkPointNamePrefix}: return data count<=0") 31 | 32 | val batchSizes = List.fill((totalCount % parallel).toInt)(totalCount / parallel + 1) ::: List 33 | .fill((parallel - totalCount % parallel).toInt)(totalCount / parallel) 34 | 35 | val startOffsets = batchSizes.scanLeft(0L)(_ + _).init 36 | 37 | val checkPointOffsets = checkPointPath match { 38 | case Some(path) => 39 | val files = Range(0, parallel).map(i => s"${path}/${checkPointNamePrefix}.${i}").toList 40 | if (files.forall(HDFSUtils.exists)) 41 | files.map(HDFSUtils.getContent(_).trim.toLong).sorted 42 | else startOffsets 43 | case _ => startOffsets 44 | } 45 | 46 | if (checkPointOffsets.zip(startOffsets).exists(x => x._1 < x._2)) 47 | throw new RuntimeException( 48 | s"Check Point file maybe previous. Please delete ${checkPointPath}/${checkPointNamePrefix}.* file") 49 | 50 | val eachPartitionLimit = { 51 | batchSizes 52 | .zip(startOffsets.zip(checkPointOffsets)) 53 | .map(x => { 54 | x._1 - (x._2._2 - x._2._1) 55 | }) 56 | } 57 | val offsets = checkPointOffsets.zip(eachPartitionLimit).map(x => Offset(x._1, x._2)) 58 | if (offsets.exists(_.size < 0L)) 59 | throw new RuntimeException( 60 | s"Check point file maybe broken. Please delete ${checkPointPath}/${checkPointNamePrefix}.* file") 61 | offsets 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/reader/StreamingBaseReader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
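A worked example of the checkpoint arithmetic above: with totalCount = 10 and parallel = 3, batchSizes is List(4, 3, 3) and startOffsets is List(0, 4, 7), so without checkpoint files getOffsets returns Offset(0, 4), Offset(4, 3), Offset(7, 3), each pairing a start offset with the number of rows that partition should read. If checkpoint files record offsets 2, 4 and 7, the first partition's remaining size shrinks to 2 and the result becomes Offset(2, 2), Offset(4, 3), Offset(7, 3).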
4 | */ 5 | 6 | package com.vesoft.nebula.exchange.reader 7 | 8 | import com.vesoft.exchange.common.config.{KafkaSourceConfigEntry, PulsarSourceConfigEntry} 9 | import org.apache.spark.sql.types.StringType 10 | import org.apache.spark.sql.{DataFrame, SparkSession} 11 | 12 | /** 13 | * Spark Streaming 14 | * 15 | * @param session 16 | */ 17 | abstract class StreamingBaseReader(override val session: SparkSession) extends Reader { 18 | 19 | override def close(): Unit = { 20 | session.close() 21 | } 22 | } 23 | 24 | /** 25 | * 26 | * @param session 27 | * @param kafkaConfig 28 | * @param targetFields 29 | */ 30 | class KafkaReader(override val session: SparkSession, 31 | kafkaConfig: KafkaSourceConfigEntry, 32 | targetFields: List[String]) 33 | extends StreamingBaseReader(session) { 34 | 35 | require( 36 | kafkaConfig.server.trim.nonEmpty && kafkaConfig.topic.trim.nonEmpty && targetFields.nonEmpty) 37 | 38 | override def read(): DataFrame = { 39 | import org.apache.spark.sql.functions._ 40 | import session.implicits._ 41 | val fields = targetFields.distinct 42 | val reader = 43 | session.readStream 44 | .format("kafka") 45 | .option("kafka.bootstrap.servers", kafkaConfig.server) 46 | .option("subscribe", kafkaConfig.topic) 47 | .option("startingOffsets", kafkaConfig.startingOffsets) 48 | 49 | if (kafkaConfig.securityProtocol.isDefined) { 50 | reader.option("kafka.security.protocol", kafkaConfig.securityProtocol.get) 51 | reader.option("kafka.sasl.mechanism", kafkaConfig.mechanism.get) 52 | } 53 | if (kafkaConfig.kerberos) { 54 | reader.option("kafka.sasl.kerberos.service.name", kafkaConfig.kerberosServiceName) 55 | } 56 | 57 | val maxOffsetsPerTrigger = kafkaConfig.maxOffsetsPerTrigger 58 | if (maxOffsetsPerTrigger.isDefined) 59 | reader.option("maxOffsetsPerTrigger", maxOffsetsPerTrigger.get) 60 | 61 | reader 62 | .load() 63 | .select($"value".cast(StringType)) 64 | .select(json_tuple($"value", fields: _*)) 65 | .toDF(fields: _*) 66 | 67 | } 68 | } 69 | 70 | /** 71 | * 72 | * @param session 73 | * @param pulsarConfig 74 | */ 75 | class PulsarReader(override val session: SparkSession, pulsarConfig: PulsarSourceConfigEntry) 76 | extends StreamingBaseReader(session) { 77 | 78 | override def read(): DataFrame = { 79 | session.readStream 80 | .format("pulsar") 81 | .option("service.url", pulsarConfig.serviceUrl) 82 | .option("admin.url", pulsarConfig.adminUrl) 83 | .options(pulsarConfig.options) 84 | .load() 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.4/.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 23 | hs_err_pid* 24 | 25 | # build target 26 | target/ 27 | 28 | # IDE 29 | .idea/ 30 | .eclipse/ 31 | *.iml 32 | 33 | spark-importer.ipr 34 | spark-importer.iws 35 | 36 | .DS_Store 37 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.4/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Global logging configuration 2 | log4j.rootLogger=INFO, stdout 3 | # Console output... 
4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.stdout.layout.ConversionPattern=%5p [%t] - %m%n 7 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/reader/FileBaseReader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.nebula.exchange.reader 7 | 8 | import com.vesoft.exchange.common.config.FileBaseSourceConfigEntry 9 | import com.vesoft.exchange.common.utils.NebulaUtils.DEFAULT_EMPTY_VALUE 10 | import org.apache.spark.sql.catalyst.encoders.RowEncoder 11 | import org.apache.spark.sql.types.StructType 12 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 13 | 14 | /** 15 | * The FileBaseReader is the abstract class for HDFS file reader. 16 | * 17 | * @param session 18 | * @param path 19 | */ 20 | abstract class FileBaseReader(val session: SparkSession, val path: String) extends Reader { 21 | 22 | require(path.trim.nonEmpty) 23 | 24 | override def close(): Unit = { 25 | session.close() 26 | } 27 | } 28 | 29 | /** 30 | * The ParquetReader extends the FileBaseReader and supports reading Parquet files from HDFS. 31 | * 32 | * @param session 33 | * @param parquetConfig 34 | */ 35 | class ParquetReader(override val session: SparkSession, parquetConfig: FileBaseSourceConfigEntry) 36 | extends FileBaseReader(session, parquetConfig.path) { 37 | 38 | override def read(): DataFrame = { 39 | session.read.parquet(path) 40 | } 41 | } 42 | 43 | /** 44 | * The ORCReader extends the FileBaseReader and supports reading ORC files from HDFS. 45 | * 46 | * @param session 47 | * @param orcConfig 48 | */ 49 | class ORCReader(override val session: SparkSession, orcConfig: FileBaseSourceConfigEntry) 50 | extends FileBaseReader(session, orcConfig.path) { 51 | 52 | override def read(): DataFrame = { 53 | session.read.orc(path) 54 | } 55 | } 56 | 57 | /** 58 | * The JSONReader extends the FileBaseReader and supports reading JSON files from HDFS. 59 | * 60 | * @param session 61 | * @param jsonConfig 62 | */ 63 | class JSONReader(override val session: SparkSession, jsonConfig: FileBaseSourceConfigEntry) 64 | extends FileBaseReader(session, jsonConfig.path) { 65 | 66 | override def read(): DataFrame = { 67 | session.read.json(path) 68 | } 69 | } 70 | 71 | /** 72 | * The CSVReader extends the FileBaseReader and supports reading CSV files from HDFS. 73 | * All fields of the resulting structure are StringType. 74 | * 75 | * @param session 76 | * @param csvConfig 77 | */ 78 | class CSVReader(override val session: SparkSession, csvConfig: FileBaseSourceConfigEntry) 79 | extends FileBaseReader(session, csvConfig.path) { 80 | 81 | override def read(): DataFrame = { 82 | session.read 83 | .option("delimiter", csvConfig.separator.get) 84 | .option("header", csvConfig.header.get) 85 | .option("emptyValue", DEFAULT_EMPTY_VALUE) 86 | .csv(path) 87 | } 88 | } 89 | 90 | /** 91 | * The CustomReader extends the FileBaseReader and supports reading text files from HDFS. 92 | * Transformation is a function that converts a line into a Row. 93 | * The structure of the Row should be specified.
94 | * 95 | * @param session 96 | * @param customConfig 97 | * @param transformation 98 | * @param structType 99 | */ 100 | abstract class CustomReader(override val session: SparkSession, 101 | customConfig: FileBaseSourceConfigEntry, 102 | transformation: String => Row, 103 | filter: Row => Boolean, 104 | structType: StructType) 105 | extends FileBaseReader(session, customConfig.path) { 106 | 107 | override def read(): DataFrame = { 108 | val encoder = RowEncoder.apply(structType) 109 | session.read 110 | .text(path) 111 | .filter(!_.getString(0).isEmpty) 112 | .map(row => transformation(row.getString(0)))(encoder) 113 | .filter(filter) 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/reader/Reader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.nebula.exchange.reader 7 | 8 | import com.vesoft.exchange.common.Offset 9 | import com.vesoft.exchange.common.utils.HDFSUtils 10 | import org.apache.spark.sql.{DataFrame, SparkSession} 11 | 12 | /** 13 | * The Reader is used to create a DataFrame from the source, such as Hive or HDFS. 14 | */ 15 | trait Reader extends Serializable { 16 | def session: SparkSession 17 | 18 | def read(): DataFrame 19 | 20 | def close(): Unit 21 | } 22 | 23 | trait CheckPointSupport extends Serializable { 24 | 25 | def getOffsets(totalCount: Long, 26 | parallel: Int, 27 | checkPointPath: Option[String], 28 | checkPointNamePrefix: String): List[Offset] = { 29 | if (totalCount <= 0) 30 | throw new RuntimeException(s"${checkPointNamePrefix}: the source returned data count <= 0") 31 | 32 | val batchSizes = List.fill((totalCount % parallel).toInt)(totalCount / parallel + 1) ::: List 33 | .fill((parallel - totalCount % parallel).toInt)(totalCount / parallel) 34 | 35 | val startOffsets = batchSizes.scanLeft(0L)(_ + _).init 36 | 37 | val checkPointOffsets = checkPointPath match { 38 | case Some(path) => 39 | val files = Range(0, parallel).map(i => s"${path}/${checkPointNamePrefix}.${i}").toList 40 | if (files.forall(HDFSUtils.exists)) 41 | files.map(HDFSUtils.getContent(_).trim.toLong).sorted 42 | else startOffsets 43 | case _ => startOffsets 44 | } 45 | 46 | if (checkPointOffsets.zip(startOffsets).exists(x => x._1 < x._2)) 47 | throw new RuntimeException( 48 | s"The checkpoint files may be left over from a previous run. Please delete the ${checkPointPath}/${checkPointNamePrefix}.* files") 49 | 50 | val eachPartitionLimit = { 51 | batchSizes 52 | .zip(startOffsets.zip(checkPointOffsets)) 53 | .map(x => { 54 | x._1 - (x._2._2 - x._2._1) 55 | }) 56 | } 57 | val offsets = checkPointOffsets.zip(eachPartitionLimit).map(x => Offset(x._1, x._2)) 58 | if (offsets.exists(_.size < 0L)) 59 | throw new RuntimeException( 60 | s"The checkpoint files may be broken. Please delete the ${checkPointPath}/${checkPointNamePrefix}.* files") 61 | offsets 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/reader/StreamingBaseReader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License.
4 | */ 5 | 6 | package com.vesoft.nebula.exchange.reader 7 | 8 | import com.vesoft.exchange.common.config.{KafkaSourceConfigEntry, PulsarSourceConfigEntry} 9 | import org.apache.spark.sql.types.StringType 10 | import org.apache.spark.sql.{DataFrame, SparkSession} 11 | 12 | import scala.collection.mutable 13 | 14 | /** 15 | * Spark Streaming 16 | * 17 | * @param session 18 | */ 19 | abstract class StreamingBaseReader(override val session: SparkSession) extends Reader { 20 | 21 | override def close(): Unit = { 22 | session.close() 23 | } 24 | } 25 | 26 | /** 27 | * 28 | * @param session 29 | * @param kafkaConfig 30 | * @param targetFields 31 | */ 32 | class KafkaReader(override val session: SparkSession, 33 | kafkaConfig: KafkaSourceConfigEntry, 34 | targetFields: List[String]) 35 | extends StreamingBaseReader(session) { 36 | 37 | require( 38 | kafkaConfig.server.trim.nonEmpty && kafkaConfig.topic.trim.nonEmpty && targetFields.nonEmpty) 39 | 40 | override def read(): DataFrame = { 41 | import org.apache.spark.sql.functions._ 42 | import session.implicits._ 43 | val fields = targetFields.distinct 44 | val reader = 45 | session.readStream 46 | .format("kafka") 47 | .option("kafka.bootstrap.servers", kafkaConfig.server) 48 | .option("subscribe", kafkaConfig.topic) 49 | .option("startingOffsets", kafkaConfig.startingOffsets) 50 | 51 | if(kafkaConfig.securityProtocol.isDefined){ 52 | reader.option("kafka.security.protocol", kafkaConfig.securityProtocol.get) 53 | reader.option("kafka.sasl.mechanism", kafkaConfig.mechanism.get) 54 | } 55 | if(kafkaConfig.kerberos){ 56 | reader.option("kafka.sasl.kerberos.service.name", kafkaConfig.kerberosServiceName) 57 | } 58 | val maxOffsetsPerTrigger = kafkaConfig.maxOffsetsPerTrigger 59 | if (maxOffsetsPerTrigger.isDefined) 60 | reader.option("maxOffsetsPerTrigger", maxOffsetsPerTrigger.get) 61 | 62 | reader 63 | .load() 64 | .select($"value".cast(StringType)) 65 | .select(json_tuple($"value", fields: _*)) 66 | .toDF(fields: _*) 67 | 68 | } 69 | } 70 | 71 | /** 72 | * 73 | * @param session 74 | * @param pulsarConfig 75 | */ 76 | class PulsarReader(override val session: SparkSession, pulsarConfig: PulsarSourceConfigEntry) 77 | extends StreamingBaseReader(session) { 78 | 79 | override def read(): DataFrame = { 80 | session.readStream 81 | .format("pulsar") 82 | .option("service.url", pulsarConfig.serviceUrl) 83 | .option("admin.url", pulsarConfig.adminUrl) 84 | .options(pulsarConfig.options) 85 | .load() 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/utils/Neo4jUtils.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
4 | */ 5 | 6 | package com.vesoft.nebula.exchange.utils 7 | 8 | import org.neo4j.driver.Value 9 | 10 | object Neo4jUtils { 11 | 12 | def convertNeo4jData(value: Value): String = { 13 | value.`type`().name() match { 14 | case "NULL" => { 15 | null 16 | } 17 | case "STRING" => { 18 | value.asString() 19 | } 20 | case "INTEGER" => { 21 | value.asLong().toString 22 | } 23 | case "FLOAT" | "DOUBLE" => { 24 | value.asDouble().toString 25 | } 26 | case "BOOLEAN" => { 27 | value.asBoolean().toString 28 | } 29 | case "DATE" | "LOCAL_DATE" => { 30 | value.asLocalDate().toString 31 | } 32 | case "DATE_TIME" | "LOCAL_DATE_TIME" => { 33 | value.asLocalDateTime().toString 34 | } 35 | case "TIME" | "LOCAL_TIME" => { 36 | value.asLocalTime().toString 37 | } 38 | case "BYTES" => { 39 | new String(value.asByteArray()) 40 | } 41 | case "LIST" => { 42 | value.asList().toString 43 | } 44 | case "MAP" => { 45 | value.asMap().toString 46 | } 47 | case _ => { 48 | value.toString 49 | } 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.4/src/test/resources/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.4' 2 | services: 3 | metad0: 4 | image: vesoft/nebula-metad:nightly 5 | environment: 6 | USER: root 7 | TZ: "${TZ}" 8 | command: 9 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 10 | - --local_ip=172.28.1.1 11 | - --ws_ip=172.28.1.1 12 | - --port=9559 13 | - --data_path=/data/meta 14 | - --log_dir=/logs 15 | - --v=0 16 | - --minloglevel=0 17 | - --heartbeat_interval_secs=2 18 | healthcheck: 19 | test: ["CMD", "curl", "-f", "http://172.28.1.1:11000/status"] 20 | interval: 30s 21 | timeout: 10s 22 | retries: 3 23 | start_period: 20s 24 | ports: 25 | - "9559:9559" 26 | - 11000 27 | - 11002 28 | volumes: 29 | - ./data/meta0:/data/meta:Z 30 | - ./logs/meta0:/logs:Z 31 | networks: 32 | nebula-net: 33 | ipv4_address: 172.28.1.1 34 | restart: on-failure 35 | cap_add: 36 | - SYS_PTRACE 37 | 38 | metad1: 39 | image: vesoft/nebula-metad:nightly 40 | environment: 41 | USER: root 42 | TZ: "${TZ}" 43 | command: 44 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 45 | - --local_ip=172.28.1.2 46 | - --ws_ip=172.28.1.2 47 | - --port=9559 48 | - --data_path=/data/meta 49 | - --log_dir=/logs 50 | - --v=0 51 | - --minloglevel=0 52 | - --heartbeat_interval_secs=2 53 | healthcheck: 54 | test: ["CMD", "curl", "-f", "http://172.28.1.2:11000/status"] 55 | interval: 30s 56 | timeout: 10s 57 | retries: 3 58 | start_period: 20s 59 | ports: 60 | - "9560:9559" 61 | - 11000 62 | - 11002 63 | volumes: 64 | - ./data/meta1:/data/meta:Z 65 | - ./logs/meta1:/logs:Z 66 | networks: 67 | nebula-net: 68 | ipv4_address: 172.28.1.2 69 | restart: on-failure 70 | cap_add: 71 | - SYS_PTRACE 72 | 73 | metad2: 74 | image: vesoft/nebula-metad:nightly 75 | environment: 76 | USER: root 77 | TZ: "${TZ}" 78 | command: 79 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 80 | - --local_ip=172.28.1.3 81 | - --ws_ip=172.28.1.3 82 | - --port=9559 83 | - --data_path=/data/meta 84 | - --log_dir=/logs 85 | - --v=0 86 | - --minloglevel=0 87 | - --heartbeat_interval_secs=2 88 | healthcheck: 89 | test: ["CMD", "curl", "-f", "http://172.28.1.3:11000/status"] 90 | interval: 30s 91 | timeout: 10s 92 | retries: 3 93 | start_period: 20s 94 | ports: 95 | - "9561:9559" 96 | - 11000 97 | - 11002 98 | volumes: 99 | - ./data/meta2:/data/meta:Z 100 | - 
./logs/meta2:/logs:Z 101 | networks: 102 | nebula-net: 103 | ipv4_address: 172.28.1.3 104 | restart: on-failure 105 | cap_add: 106 | - SYS_PTRACE 107 | 108 | storaged0: 109 | image: vesoft/nebula-storaged:nightly 110 | environment: 111 | USER: root 112 | TZ: "${TZ}" 113 | command: 114 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 115 | - --local_ip=172.28.2.1 116 | - --ws_ip=172.28.2.1 117 | - --port=9779 118 | - --data_path=/data/storage 119 | - --log_dir=/logs 120 | - --v=0 121 | - --minloglevel=0 122 | - --heartbeat_interval_secs=2 123 | depends_on: 124 | - metad0 125 | - metad1 126 | - metad2 127 | healthcheck: 128 | test: ["CMD", "curl", "-f", "http://172.28.2.1:12000/status"] 129 | interval: 30s 130 | timeout: 10s 131 | retries: 3 132 | start_period: 20s 133 | ports: 134 | - "9779:9779" 135 | - 12000 136 | - 12002 137 | volumes: 138 | - ./data/storage0:/data/storage:Z 139 | - ./logs/storage0:/logs:Z 140 | networks: 141 | nebula-net: 142 | ipv4_address: 172.28.2.1 143 | restart: on-failure 144 | cap_add: 145 | - SYS_PTRACE 146 | 147 | storaged1: 148 | image: vesoft/nebula-storaged:nightly 149 | environment: 150 | USER: root 151 | TZ: "${TZ}" 152 | command: 153 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 154 | - --local_ip=172.28.2.2 155 | - --ws_ip=172.28.2.2 156 | - --port=9779 157 | - --data_path=/data/storage 158 | - --log_dir=/logs 159 | - --v=0 160 | - --minloglevel=0 161 | - --heartbeat_interval_secs=2 162 | depends_on: 163 | - metad0 164 | - metad1 165 | - metad2 166 | healthcheck: 167 | test: ["CMD", "curl", "-f", "http://172.28.2.2:12000/status"] 168 | interval: 30s 169 | timeout: 10s 170 | retries: 3 171 | start_period: 20s 172 | ports: 173 | - "9780:9779" 174 | - 12000 175 | - 12002 176 | volumes: 177 | - ./data/storage1:/data/storage:Z 178 | - ./logs/storage1:/logs:Z 179 | networks: 180 | nebula-net: 181 | ipv4_address: 172.28.2.2 182 | restart: on-failure 183 | cap_add: 184 | - SYS_PTRACE 185 | 186 | storaged2: 187 | image: vesoft/nebula-storaged:nightly 188 | environment: 189 | USER: root 190 | TZ: "${TZ}" 191 | command: 192 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 193 | - --local_ip=172.28.2.3 194 | - --ws_ip=172.28.2.3 195 | - --port=9779 196 | - --data_path=/data/storage 197 | - --log_dir=/logs 198 | - --v=0 199 | - --minloglevel=0 200 | - --heartbeat_interval_secs=2 201 | depends_on: 202 | - metad0 203 | - metad1 204 | - metad2 205 | healthcheck: 206 | test: ["CMD", "curl", "-f", "http://172.28.2.3:12000/status"] 207 | interval: 30s 208 | timeout: 10s 209 | retries: 3 210 | start_period: 20s 211 | ports: 212 | - "9781:9779" 213 | - 12000 214 | - 12002 215 | volumes: 216 | - ./data/storage2:/data/storage:Z 217 | - ./logs/storage2:/logs:Z 218 | networks: 219 | nebula-net: 220 | ipv4_address: 172.28.2.3 221 | restart: on-failure 222 | cap_add: 223 | - SYS_PTRACE 224 | 225 | graphd0: 226 | image: vesoft/nebula-graphd:nightly 227 | environment: 228 | USER: root 229 | TZ: "${TZ}" 230 | command: 231 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 232 | - --port=9669 233 | - --ws_ip=172.28.3.1 234 | - --log_dir=/logs 235 | - --v=0 236 | - --minloglevel=0 237 | - --heartbeat_interval_secs=2 238 | depends_on: 239 | - metad0 240 | - metad1 241 | - metad2 242 | healthcheck: 243 | test: ["CMD", "curl", "-f", "http://172.28.3.1:13000/status"] 244 | interval: 30s 245 | timeout: 10s 246 | retries: 3 247 | start_period: 20s 248 | ports: 249 | - "9669:9669" 250 | - 13000 251 | 
- 13002 252 | volumes: 253 | - ./logs/graph0:/logs:Z 254 | networks: 255 | nebula-net: 256 | ipv4_address: 172.28.3.1 257 | restart: on-failure 258 | cap_add: 259 | - SYS_PTRACE 260 | 261 | graphd1: 262 | image: vesoft/nebula-graphd:nightly 263 | environment: 264 | USER: root 265 | TZ: "${TZ}" 266 | command: 267 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 268 | - --port=9669 269 | - --ws_ip=172.28.3.2 270 | - --log_dir=/logs 271 | - --v=0 272 | - --minloglevel=0 273 | - --heartbeat_interval_secs=2 274 | depends_on: 275 | - metad0 276 | - metad1 277 | - metad2 278 | healthcheck: 279 | test: ["CMD", "curl", "-f", "http://172.28.3.2:13000/status"] 280 | interval: 30s 281 | timeout: 10s 282 | retries: 3 283 | start_period: 20s 284 | ports: 285 | - "9670:9669" 286 | - 13000 287 | - 13002 288 | volumes: 289 | - ./logs/graph1:/logs:Z 290 | networks: 291 | nebula-net: 292 | ipv4_address: 172.28.3.2 293 | restart: on-failure 294 | cap_add: 295 | - SYS_PTRACE 296 | 297 | graphd2: 298 | image: vesoft/nebula-graphd:nightly 299 | environment: 300 | USER: root 301 | TZ: "${TZ}" 302 | command: 303 | - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 304 | - --port=9669 305 | - --ws_ip=172.28.3.3 306 | - --log_dir=/logs 307 | - --v=0 308 | - --minloglevel=0 309 | - --heartbeat_interval_secs=2 310 | depends_on: 311 | - metad0 312 | - metad1 313 | - metad2 314 | healthcheck: 315 | test: ["CMD", "curl", "-f", "http://172.28.3.3:13000/status"] 316 | interval: 30s 317 | timeout: 10s 318 | retries: 3 319 | start_period: 20s 320 | ports: 321 | - "9671:9669" 322 | - 13000 323 | - 13002 324 | volumes: 325 | - ./logs/graph2:/logs:Z 326 | networks: 327 | nebula-net: 328 | ipv4_address: 172.28.3.3 329 | restart: on-failure 330 | cap_add: 331 | - SYS_PTRACE 332 | 333 | console: 334 | image: vesoft/nebula-console:nightly 335 | entrypoint: "" 336 | command: 337 | - sh 338 | - -c 339 | - | 340 | sleep 3 && 341 | nebula-console -addr graphd0 -port 9669 -u root -p nebula -e 'ADD HOSTS "172.28.2.1":9779,"172.28.2.2":9779,"172.28.2.3":9779' && 342 | sleep 36000 343 | depends_on: 344 | - graphd0 345 | networks: 346 | - nebula-net 347 | 348 | networks: 349 | nebula-net: 350 | ipam: 351 | driver: default 352 | config: 353 | - subnet: 172.28.0.0/16 354 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.4/src/test/resources/edge.csv: -------------------------------------------------------------------------------- 1 | src,dst,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14 2 | 101,102,Tom,tom,10,20,30,40,2021-01-27,2021-01-01T12:10:10,43535232,true,1.0,2.0,10:10:10,POINT(1 2) 3 | 102,103,Jina,Jina,11,21,31,41,2021-01-28,2021-01-02T12:10:10,43535232,false,1.1,2.1,11:10:10,POINT(3 4) 4 | 103,101,Tim,Tim,12,22,32,42,2021-01-29,2021-01-03T12:10:10,43535232,false,1.2,2.2,12:10:10,POINT(5 6) 5 | 104,106,张三,张三,13,23,33,43,2021-01-30,2021-01-04T12:10:10,43535232,true,1.3,2.3,13:10:10,POINT(6 7) 6 | 105,107,李四,李四,14,24,34,44,2021-02-01,2021-01-05T12:10:10,43535232,false,1.4,2.4,14:10:10,POINT(1 5) 7 | 106,108,王五,王五,15,25,35,45,2021-02-02,2021-01-06T12:10:10,0,false,1.5,2.5,15:10:10,"LINESTRING(1 3, 4.7 73.23)" 8 | 107,101,Jina,Jina,16,26,36,46,2021-02-03,2021-01-07T12:10:10,43535232,true,1.6,2.6,16:10:10,"LINESTRING(1 3, 4.7 73.23)" 9 | 108,109,Jina,Jina,17,27,37,47,2021-02-04,2021-01-08T12:10:10,43535232,false,1.7,2.7,17:10:10,"LINESTRING(1 3, 4.7 73.23)" 10 | 
109,110,Jina,Jina,18,28,38,48,2021-02-05,2021-01-09T12:10:10,43535232,true,1.8,2.8,18:10:10,"LINESTRING(1 3, 4.7 73.23)" 11 | 110,-101,Jina,Jina,19,29,39,49,2021-02-06,2021-01-10T12:10:10,43535232,false,1.9,2.9,19:10:10,"LINESTRING(1 3, 4.7 73.23)" 12 | -101,102,Jina,Jina,20,30,40,50,2021-02-07,2021-02-11T12:10:10,43535232,false,2.0,3.0,20:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 13 | -102,-103,Jina,Jina,21,31,41,51,2021-02-08,2021-03-12T12:10:10,43535232,false,2.1,3.1,21:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 14 | -103,-101,Jina,Jina,22,32,42,52,2021-02-09,2021-04-13T12:10:10,43535232,false,2.2,3.2,22:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 15 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.4/src/test/resources/process_application.conf: -------------------------------------------------------------------------------- 1 | { 2 | # Spark related config 3 | spark: { 4 | app: { 5 | name: Nebula Exchange 2.0 6 | } 7 | 8 | master:local 9 | 10 | driver: { 11 | cores: 1 12 | maxResultSize: 1G 13 | } 14 | 15 | executor: { 16 | memory:1G 17 | } 18 | 19 | cores:{ 20 | max: 16 21 | } 22 | } 23 | 24 | # if the hive is hive-on-spark with derby mode, you can ignore this hive config 25 | # get the config values from the file $HIVE_HOME/conf/hive-site.xml or hive-default.xml 26 | 27 | hive: { 28 | warehouse: "hdfs://NAMENODE_IP:9000/apps/svr/hive-xxx/warehouse/" 29 | connectionURL: "jdbc:mysql://your_ip:3306/hive_spark?characterEncoding=UTF-8" 30 | connectionDriverName: "com.mysql.jdbc.Driver" 31 | connectionUserName: "user" 32 | connectionPassword: "password" 33 | } 34 | 35 | # NebulaGraph related config 36 | nebula: { 37 | address:{ 38 | graph:["127.0.0.1:9669", "127.0.0.1:9670", "127.0.0.1:9671"] 39 | meta:["127.0.0.1:9559", "127.0.0.1:9560", "127.0.0.1:9561"] 40 | } 41 | user: root 42 | pswd: nebula 43 | space: test_string 44 | 45 | # parameters for SST import, not required 46 | path:{ 47 | local:"/tmp" 48 | remote:"/sst" 49 | hdfs.namenode: "hdfs://name_node:9000" 50 | } 51 | 52 | # nebula client connection parameters 53 | connection { 54 | timeout: 3000 55 | retry: 3 56 | } 57 | 58 | # nebula client execution parameters 59 | execution { 60 | retry: 3 61 | } 62 | 63 | error: { 64 | # max number of failures; if the number of failures is bigger than max, the application exits. 65 | max: 32 66 | # failed import jobs will be recorded in the output path 67 | output: /tmp/errors 68 | } 69 | 70 | # use google's RateLimiter to limit the requests sent to NebulaGraph 71 | rate: { 72 | # the stable throughput of RateLimiter 73 | limit: 1024 74 | # Acquires a permit from RateLimiter, unit: MILLISECONDS 75 | # if it can't be obtained within the specified timeout, then give up the request. 76 | timeout: 1000 77 | } 78 | } 79 | 80 | # Processing tags 81 | # These are tag config examples for different data sources. 82 | tags: [ 83 | { 84 | name: person 85 | type: { 86 | source: csv 87 | sink: client 88 | } 89 | path: "file://src/test/resources/data.csv" 90 | fields: [col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14] 91 | nebula.fields: [col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14] 92 | vertex: { 93 | field:id 94 | #policy:hash 95 | } 96 | header:true 97 | batch: 2 98 | partition: 5 99 | } 100 | ] 101 | 102 | # These are edge config examples for different data sources.
103 | edges: [ 104 | { 105 | name: friend 106 | type: { 107 | source: csv 108 | sink: client 109 | } 110 | path: "file://src/test/resources/data.csv" 111 | fields: [col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14] 112 | nebula.fields: [col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14] 113 | source: { 114 | field:src 115 | #policy:hash 116 | } 117 | target: { 118 | field:dst 119 | #policy:hash 120 | } 121 | header:true 122 | batch: 2 123 | partition: 5 124 | } 125 | ] 126 | } 127 | -------------------------------------------------------------------------------- /nebula-exchange_spark_2.4/src/test/resources/vertex.csv: -------------------------------------------------------------------------------- 1 | id,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14 2 | 101,Tom,tom,10,20,30,40,2021-01-27,2021-01-01T12:10:10,43535232,true,1.0,2.0,10:10:10,POINT(1 2) 3 | 102,Jina,Jina,11,21,31,41,2021-01-28,2021-01-02T12:10:10,43535232,false,1.1,2.1,11:10:10,POINT(3 4) 4 | 103,Tim,Tim,12,22,32,42,2021-01-29,2021-01-03T12:10:10,43535232,false,1.2,2.2,12:10:10,POINT(5 6) 5 | 104,张三,张三,13,23,33,43,2021-01-30,2021-01-04T12:10:10,43535232,true,1.3,2.3,13:10:10,POINT(6 7) 6 | 105,李四,李四,14,24,34,44,2021-02-01,2021-01-05T12:10:10,43535232,false,1.4,2.4,14:10:10,POINT(1 5) 7 | 106,王五,王五,15,25,35,45,2021-02-02,2021-01-06T12:10:10,0,false,1.5,2.5,15:10:10,"LINESTRING(1 3, 4.7 73.23)" 8 | 107,Jina,Jina,16,26,36,46,2021-02-03,2021-01-07T12:10:10,43535232,true,1.6,2.6,16:10:10,"LINESTRING(1 3, 4.7 73.23)" 9 | 108,Jina,Jina,17,27,37,47,2021-02-04,2021-01-08T12:10:10,43535232,false,1.7,2.7,17:10:10,"LINESTRING(1 3, 4.7 73.23)" 10 | 109,Jina,Jina,18,28,38,48,2021-02-05,2021-01-09T12:10:10,43535232,true,1.8,2.8,18:10:10,"LINESTRING(1 3, 4.7 73.23)" 11 | 1010,Jina,Jina,19,29,39,49,2021-02-06,2021-01-10T12:10:10,43535232,false,1.9,2.9,19:10:10,"LINESTRING(1 3, 4.7 73.23)" 12 | -101,Jina,Jina,20,30,40,50,2021-02-07,2021-02-11T12:10:10,43535232,false,2.0,3.0,20:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 13 | -102,Jina,Jina,21,31,41,51,2021-02-08,2021-03-12T12:10:10,43535232,false,2.1,3.1,21:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 14 | -103,Jina,Jina,22,32,42,52,2021-02-09,2021-04-13T12:10:10,43535232,false,2.2,3.2,22:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" 15 | -------------------------------------------------------------------------------- /nebula-exchange_spark_3.0/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Global logging configuration 2 | log4j.rootLogger=INFO, stdout 3 | # Console output... 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.stdout.layout.ConversionPattern=%5p [%t] - %m%n 7 | -------------------------------------------------------------------------------- /nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/reader/FileBaseReader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 
4 | */ 5 | 6 | package com.vesoft.nebula.exchange.reader 7 | 8 | import com.vesoft.exchange.common.config.FileBaseSourceConfigEntry 9 | import com.vesoft.exchange.common.utils.NebulaUtils.DEFAULT_EMPTY_VALUE 10 | import org.apache.spark.sql.catalyst.encoders.RowEncoder 11 | import org.apache.spark.sql.types.StructType 12 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 13 | 14 | /** 15 | * The FileBaseReader is the abstract class for HDFS file reader. 16 | * 17 | * @param session 18 | * @param path 19 | */ 20 | abstract class FileBaseReader(val session: SparkSession, val path: String) extends Reader { 21 | 22 | require(path.trim.nonEmpty) 23 | 24 | override def close(): Unit = { 25 | session.close() 26 | } 27 | } 28 | 29 | /** 30 | * The ParquetReader extends the FileBaseReader and supports reading Parquet files from HDFS. 31 | * 32 | * @param session 33 | * @param parquetConfig 34 | */ 35 | class ParquetReader(override val session: SparkSession, parquetConfig: FileBaseSourceConfigEntry) 36 | extends FileBaseReader(session, parquetConfig.path) { 37 | 38 | override def read(): DataFrame = { 39 | session.read.parquet(path) 40 | } 41 | } 42 | 43 | /** 44 | * The ORCReader extends the FileBaseReader and supports reading ORC files from HDFS. 45 | * 46 | * @param session 47 | * @param orcConfig 48 | */ 49 | class ORCReader(override val session: SparkSession, orcConfig: FileBaseSourceConfigEntry) 50 | extends FileBaseReader(session, orcConfig.path) { 51 | 52 | override def read(): DataFrame = { 53 | session.read.orc(path) 54 | } 55 | } 56 | 57 | /** 58 | * The JSONReader extends the FileBaseReader and supports reading JSON files from HDFS. 59 | * 60 | * @param session 61 | * @param jsonConfig 62 | */ 63 | class JSONReader(override val session: SparkSession, jsonConfig: FileBaseSourceConfigEntry) 64 | extends FileBaseReader(session, jsonConfig.path) { 65 | 66 | override def read(): DataFrame = { 67 | session.read.json(path) 68 | } 69 | } 70 | 71 | /** 72 | * The CSVReader extends the FileBaseReader and supports reading CSV files from HDFS. 73 | * All fields of the resulting structure are StringType. 74 | * 75 | * @param session 76 | * @param csvConfig 77 | */ 78 | class CSVReader(override val session: SparkSession, csvConfig: FileBaseSourceConfigEntry) 79 | extends FileBaseReader(session, csvConfig.path) { 80 | 81 | override def read(): DataFrame = { 82 | session.read 83 | .option("delimiter", csvConfig.separator.get) 84 | .option("header", csvConfig.header.get) 85 | .option("emptyValue", DEFAULT_EMPTY_VALUE) 86 | .csv(path) 87 | } 88 | } 89 | 90 | /** 91 | * The CustomReader extends the FileBaseReader and supports reading text files from HDFS. 92 | * Transformation is a function that converts a line into a Row. 93 | * The structure of the Row should be specified.
94 | * 95 | * @param session 96 | * @param customConfig 97 | * @param transformation 98 | * @param structType 99 | */ 100 | abstract class CustomReader(override val session: SparkSession, 101 | customConfig: FileBaseSourceConfigEntry, 102 | transformation: String => Row, 103 | filter: Row => Boolean, 104 | structType: StructType) 105 | extends FileBaseReader(session, customConfig.path) { 106 | 107 | override def read(): DataFrame = { 108 | val encoder = RowEncoder.apply(structType) 109 | session.read 110 | .text(path) 111 | .filter(!_.getString(0).isEmpty) 112 | .map(row => transformation(row.getString(0)))(encoder) 113 | .filter(filter) 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/reader/Reader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License. 4 | */ 5 | 6 | package com.vesoft.nebula.exchange.reader 7 | 8 | import com.vesoft.exchange.common.Offset 9 | import com.vesoft.exchange.common.utils.HDFSUtils 10 | import org.apache.spark.sql.{DataFrame, SparkSession} 11 | 12 | /** 13 | * The Reader is used to create a DataFrame from the source, such as Hive or HDFS. 14 | */ 15 | trait Reader extends Serializable { 16 | def session: SparkSession 17 | 18 | def read(): DataFrame 19 | 20 | def close(): Unit 21 | } 22 | 23 | trait CheckPointSupport extends Serializable { 24 | 25 | def getOffsets(totalCount: Long, 26 | parallel: Int, 27 | checkPointPath: Option[String], 28 | checkPointNamePrefix: String): List[Offset] = { 29 | if (totalCount <= 0) 30 | throw new RuntimeException(s"${checkPointNamePrefix}: the source returned data count <= 0") 31 | 32 | val batchSizes = List.fill((totalCount % parallel).toInt)(totalCount / parallel + 1) ::: List 33 | .fill((parallel - totalCount % parallel).toInt)(totalCount / parallel) 34 | 35 | val startOffsets = batchSizes.scanLeft(0L)(_ + _).init 36 | 37 | val checkPointOffsets = checkPointPath match { 38 | case Some(path) => 39 | val files = Range(0, parallel).map(i => s"${path}/${checkPointNamePrefix}.${i}").toList 40 | if (files.forall(HDFSUtils.exists)) 41 | files.map(HDFSUtils.getContent(_).trim.toLong).sorted 42 | else startOffsets 43 | case _ => startOffsets 44 | } 45 | 46 | if (checkPointOffsets.zip(startOffsets).exists(x => x._1 < x._2)) 47 | throw new RuntimeException( 48 | s"The checkpoint files may be left over from a previous run. Please delete the ${checkPointPath}/${checkPointNamePrefix}.* files") 49 | 50 | val eachPartitionLimit = { 51 | batchSizes 52 | .zip(startOffsets.zip(checkPointOffsets)) 53 | .map(x => { 54 | x._1 - (x._2._2 - x._2._1) 55 | }) 56 | } 57 | val offsets = checkPointOffsets.zip(eachPartitionLimit).map(x => Offset(x._1, x._2)) 58 | if (offsets.exists(_.size < 0L)) 59 | throw new RuntimeException( 60 | s"The checkpoint files may be broken. Please delete the ${checkPointPath}/${checkPointNamePrefix}.* files") 61 | offsets 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/reader/StreamingBaseReader.scala: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2020 vesoft inc. All rights reserved. 2 | * 3 | * This source code is licensed under Apache 2.0 License.
4 | */ 5 | 6 | package com.vesoft.nebula.exchange.reader 7 | 8 | import com.vesoft.exchange.common.config.{KafkaSourceConfigEntry, PulsarSourceConfigEntry} 9 | import org.apache.spark.sql.types.StringType 10 | import org.apache.spark.sql.{DataFrame, SparkSession} 11 | 12 | /** 13 | * Spark Streaming 14 | * 15 | * @param session 16 | */ 17 | abstract class StreamingBaseReader(override val session: SparkSession) extends Reader { 18 | 19 | override def close(): Unit = { 20 | session.close() 21 | } 22 | } 23 | 24 | /** 25 | * 26 | * @param session 27 | * @param kafkaConfig 28 | * @param targetFields 29 | */ 30 | class KafkaReader(override val session: SparkSession, 31 | kafkaConfig: KafkaSourceConfigEntry, 32 | targetFields: List[String]) 33 | extends StreamingBaseReader(session) { 34 | 35 | require( 36 | kafkaConfig.server.trim.nonEmpty && kafkaConfig.topic.trim.nonEmpty && targetFields.nonEmpty) 37 | 38 | override def read(): DataFrame = { 39 | import org.apache.spark.sql.functions._ 40 | import session.implicits._ 41 | val fields = targetFields.distinct 42 | val reader = 43 | session.readStream 44 | .format("kafka") 45 | .option("kafka.bootstrap.servers", kafkaConfig.server) 46 | .option("subscribe", kafkaConfig.topic) 47 | .option("startingOffsets", kafkaConfig.startingOffsets) 48 | 49 | if (kafkaConfig.securityProtocol.isDefined) { 50 | reader.option("kafka.security.protocol", kafkaConfig.securityProtocol.get) 51 | reader.option("kafka.sasl.mechanism", kafkaConfig.mechanism.get) 52 | } 53 | if (kafkaConfig.kerberos) { 54 | reader.option("kafka.sasl.kerberos.service.name", kafkaConfig.kerberosServiceName) 55 | } 56 | 57 | val maxOffsetsPerTrigger = kafkaConfig.maxOffsetsPerTrigger 58 | if (maxOffsetsPerTrigger.isDefined) 59 | reader.option("maxOffsetsPerTrigger", maxOffsetsPerTrigger.get) 60 | 61 | reader 62 | .load() 63 | .select($"value".cast(StringType)) 64 | .select(json_tuple($"value", fields: _*)) 65 | .toDF(fields: _*) 66 | 67 | } 68 | } 69 | 70 | /** 71 | * 72 | * @param session 73 | * @param pulsarConfig 74 | */ 75 | class PulsarReader(override val session: SparkSession, pulsarConfig: PulsarSourceConfigEntry) 76 | extends StreamingBaseReader(session) { 77 | 78 | override def read(): DataFrame = { 79 | session.readStream 80 | .format("pulsar") 81 | .option("service.url", pulsarConfig.serviceUrl) 82 | .option("admin.url", pulsarConfig.adminUrl) 83 | .options(pulsarConfig.options) 84 | .load() 85 | } 86 | } 87 | --------------------------------------------------------------------------------
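The offset arithmetic in CheckPointSupport.getOffsets above is easy to misread: the first (totalCount % parallel) partitions each take one extra row, and the running sum of the batch sizes gives the start offsets. The standalone Scala sketch below re-derives only that arithmetic for a toy input (totalCount = 10, parallel = 3, no checkpoint files); the object name and the sample values are assumptions made for this illustration and are not part of the repository.

// Illustrative sketch of the partitioning arithmetic used by CheckPointSupport.getOffsets.
// It reads no checkpoint files; all values are invented for the example.
object OffsetArithmeticSketch {
  def main(args: Array[String]): Unit = {
    val totalCount = 10L // assumed total number of rows in the source
    val parallel   = 3   // assumed number of partitions

    // Same formula as getOffsets: the first (totalCount % parallel) partitions get one extra row.
    val batchSizes = List.fill((totalCount % parallel).toInt)(totalCount / parallel + 1) :::
      List.fill((parallel - totalCount % parallel).toInt)(totalCount / parallel)
    // batchSizes == List(4, 3, 3)

    // The running sum of the batch sizes yields each partition's start offset.
    val startOffsets = batchSizes.scanLeft(0L)(_ + _).init
    // startOffsets == List(0, 4, 7)

    startOffsets.zip(batchSizes).foreach { case (start, size) =>
      println(s"partition starts at offset $start and reads $size rows")
    }
  }
}

When no checkpoint files exist, getOffsets returns exactly these pairs wrapped in Offset(start, size); the checkpoint branch only replaces the start offsets with the values read back from HDFS and shrinks the per-partition sizes accordingly.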
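KafkaReader.read() only assembles a streaming DataFrame; starting and draining the query is left to the caller. As a rough illustration of how such a frame could be consumed, the sketch below builds an equivalent Kafka-backed frame and writes it to the console sink. The broker address, topic, field list and checkpoint path are assumptions invented for this sketch, it needs the spark-sql-kafka connector on the classpath, and it is not how Exchange itself wires the readers.

// Illustrative only: drains a Kafka-backed streaming DataFrame, shaped like the one
// returned by KafkaReader.read(), to the console. Broker, topic, fields and paths are assumed.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.json_tuple
import org.apache.spark.sql.types.StringType

object KafkaStreamSketch {
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder()
      .appName("kafka-stream-sketch")
      .master("local[*]")
      .getOrCreate()
    import session.implicits._

    val fields = List("id", "name") // assumed JSON fields inside the Kafka message value

    val frame = session.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "127.0.0.1:9092") // assumed broker
      .option("subscribe", "person")                       // assumed topic
      .option("startingOffsets", "latest")
      .load()
      .select($"value".cast(StringType))
      .select(json_tuple($"value", fields: _*))
      .toDF(fields: _*)

    val query = frame.writeStream
      .outputMode("append")
      .format("console")
      .option("checkpointLocation", "/tmp/kafka-sketch-checkpoint") // assumed path
      .start()

    query.awaitTermination()
  }
}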