├── .github ├── CODEOWNERS └── workflows │ ├── ci.yml │ └── python-ci.yml ├── .gitignore ├── README.md ├── doc ├── images │ ├── cli_help.png │ ├── databricks_job_config.png │ ├── databricks_job_home.png │ ├── databricks_job_list.png │ ├── databricks_menu.png │ └── output-rel.png └── notebooks │ └── Neo4j DWH Connector quickstart.html ├── maven-release.sh ├── pom.xml ├── python ├── MANIFEST.in ├── neo4j_dwh_connector │ ├── __init__.py │ ├── _dto.py │ ├── _utils.py │ └── connector.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── spark-package-deps.txt └── test │ ├── __init__.py │ ├── _util_test.py │ └── connector_test.py └── src ├── main └── scala │ └── org │ └── neo4j │ └── dwh │ └── connector │ ├── Neo4jDWHConnector.scala │ ├── domain │ └── JobConfig.scala │ ├── generator │ └── JobConfigGenerator.scala │ └── utils │ ├── CliUtils.scala │ ├── DatasourceOptions.scala │ ├── JSONUtils.scala │ ├── JobConfigUtils.scala │ └── Utils.scala └── test ├── resources ├── JobConfig.json ├── neo4j-dwh-connector.properties ├── persons.csv ├── query.cyp └── snowflake.to.neo4j.stub.json └── scala └── org └── neo4j └── dwh └── connector ├── Neo4jDWHConnectorIT.scala ├── domain └── JobConfigTest.scala ├── generator └── JobConfigGeneratorTest.scala └── utils └── UtilsTest.scala /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @neo4j-contrib/team-connectors 2 | 3 | /.github/ @ali-ince @fbiville @venikkin -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: JVM CI with Maven 2 | 3 | on: 4 | push: 5 | branches: 6 | - 'main' 7 | pull_request: 8 | branches: 9 | - 'main' 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | java-version: [8, 11, 17] 18 | scala-version: [2.12, 2.13] 19 | spark-version: ["3"] 20 | neo4j-version: ["4.4", "5"] 21 | name: Build with Scala ${{ matrix.scala-version }}, Spark ${{ matrix.spark-version }} and Neo4j ${{ matrix.neo4j-version }} 22 | steps: 23 | - uses: actions/checkout@v2 24 | - name: Set up JDK {{ matrix.java-version }} 25 | uses: actions/setup-java@v3 26 | with: 27 | java-version: ${{ matrix.java-version }} 28 | distribution: 'temurin' 29 | - name: Cache Maven packages 30 | uses: actions/cache@v1 31 | with: 32 | path: ~/.m2 33 | key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} 34 | restore-keys: ${{ runner.os }}-m2 35 | - name: Build with Maven 36 | env: 37 | CI: true 38 | MY_ENV: "MY_ENV_value" 39 | run: mvn clean verify -Pscala-${{ matrix.scala-version }} -Pspark-${{ matrix.spark-version }} -Pneo4j-${{ matrix.neo4j-version }} --no-transfer-progress 40 | -------------------------------------------------------------------------------- /.github/workflows/python-ci.yml: -------------------------------------------------------------------------------- 1 | name: Python CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - 'main' 7 | pull_request: 8 | branches: 9 | - 'main' 10 | jobs: 11 | test: 12 | runs-on: ubuntu-latest 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | python-version: [ "3.8", "3.9", "3.10", "3.11" ] 17 | neo4j-version: [ "4.4", "5" ] 18 | spark-version: 19 | - {short: "3", ext: "3.1.3", scala: "2.12"} 20 | - {short: "3", ext: "3.2.4", scala: "2.12"} 21 | - {short: "3", ext: "3.2.4", scala: "2.13"} 22 | - {short: "3", ext: "3.3.2", scala: "2.12"} 23 | - {short: "3", ext: "3.3.2", scala: "2.13"} 24 
| - {short: "3", ext: "3.4.1", scala: "2.12"} 25 | - {short: "3", ext: "3.4.1", scala: "2.13"} 26 | steps: 27 | - uses: actions/checkout@v2 28 | - name: Set up Python ${{ matrix.python-version }} 29 | uses: actions/setup-python@v2 30 | with: 31 | python-version: ${{ matrix.python-version }} 32 | - name: Set up JDK 8 33 | uses: actions/setup-java@v1 34 | with: 35 | java-version: 1.8 36 | - uses: avides/actions-project-version-check@v1.2.0 37 | id: version 38 | with: 39 | token: ${{ secrets.GITHUB_TOKEN }} 40 | file-to-check: pom.xml 41 | only-return-version: true 42 | - name: Cache Maven packages 43 | uses: actions/cache@v1 44 | with: 45 | path: ~/.m2 46 | key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} 47 | restore-keys: ${{ runner.os }}-m2 48 | - name: Install dependencies 49 | run: | 50 | python -m pip install --upgrade pip 51 | pip install pypandoc six tzlocal==2.1 52 | pip install pyspark==${{ matrix.spark-version.ext }} "testcontainers[neo4j]" 53 | - name: Build artifact 54 | env: 55 | CI: true 56 | run: | 57 | mvn clean package -Pspark-${{ matrix.spark-version.short }} -Pscala-${{ matrix.spark-version.scala }} -DskipTests --no-transfer-progress 58 | - name: Run tests for Spark ${{ matrix.spark-version.ext }} and Neo4j ${{ matrix.neo4j-version }} 59 | if: ${{ !(matrix.spark-version.short == 2.4 && matrix.python-version == 3.8) && !(matrix.spark-version.ext == '3.2.0' && matrix.python-version == 3.5) }} 60 | run: | 61 | cd ./python 62 | export PYTHONPATH=$(pwd) 63 | python3 ./test/_util_test.py 64 | python3 ./test/connector_test.py "${{ steps.version.outputs.version }}" "${{ matrix.neo4j-version }}" 65 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | neo4j-home 2 | .gradle 3 | gradle/ 4 | build/ 5 | *~ 6 | \#* 7 | target 8 | out 9 | .project 10 | .classpath 11 | .settings 12 | .externalToolBuilders/ 13 | .scala_dependencies 14 | .factorypath 15 | .cache 16 | .cache-main 17 | .cache-tests 18 | *.iws 19 | *.ipr 20 | *.iml 21 | .idea 22 | .DS_Store 23 | .shell_history 24 | .mailmap 25 | .java-version 26 | .cache-main 27 | .cache-tests 28 | Thumbs.db 29 | .cache-main 30 | .cache-tests 31 | docs/guides 32 | doc/node 33 | doc/node_modules 34 | doc/package-lock.json 35 | scripts/python/local 36 | python/neo4j_dwh_connector/__pycache__/ 37 | python/neo4j_dwh_connector/.pytest_cache/ 38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Neo4j DWH Connector 2 | 3 | This repository contains the Neo4j Data Warehouse Connector for Apache Spark. 4 | 5 | # Goal 6 | 7 | The goal of the Neo4j DWH Connector is to simplify the interoperability between Neo4j Spark and other data sources like Snowflake, Redshift and so on. 8 | 9 | In order to do that we created this connector that via a simple JSON file creates Spark’s jobs through Spark Submit. 10 | 11 | **Nota bene** 12 | 13 | The examples that we’re providing here are for a Job that moves data from Snowflake to Neo4j but the DWH connector works also in the other way around. 14 | 15 | 16 | # How does it work? 
The Neo4j DWH Connector provides an easy way to move data between Neo4j and popular Data Warehouses like:

* Snowflake
* BigQuery
* Redshift
* Azure Synapse

You can use it in two ways:

* As a Spark Submit job, by providing a JSON configuration that describes a Spark job which moves data from one data source to another
* As a Scala/Python API that simplifies writing a Spark job which moves data from one database to another


# What does the JSON configuration look like?

Below is a very simple JSON configuration that moves data from Snowflake to Neo4j.


## Nodes

```json
{
  "name" : "Ingest Customer table as nodes into Neo4j",
  "conf" : { },
  "hadoopConfiguration" : { },
  "source" : {
    "format" : "snowflake",
    "options" : {
      "sfSchema" : "TPCH_SF1",
      "sfPassword" : "",
      "sfUser" : "conker84",
      "dbtable" : "CUSTOMER",
      "sfDatabase" : "SNOWFLAKE_SAMPLE_DATA",
      "sfURL" : "https://.snowflakecomputing.com"
    },
    "columns": [
      { "name": "CAST(C_ACCTBAL AS DOUBLE)", "alias": "C_ACCTBAL" },
      { "name": "C_ADDRESS" },
      { "name": "C_COMMENT" },
      { "name": "CAST(C_CUSTKEY AS LONG)", "alias": "C_CUSTKEY" },
      { "name": "C_MKTSEGMENT" },
      { "name": "C_NAME" },
      { "name": "CAST(C_NATIONKEY AS LONG)", "alias": "C_NATIONKEY" },
      { "name": "C_PHONE" }
    ],
    "where" : "C_CUSTKEY <= 10",
    "printSchema" : false,
    "partition" : {}
  },
  "target" : {
    "format" : "org.neo4j.spark.DataSource",
    "options" : {
      "url" : "neo4j+s://.databases.neo4j.io",
      "authentication.basic.username" : "neo4j",
      "labels" : ":Customer",
      "authentication.basic.password" : ""
    },
    "mode" : "Append"
  }
}
```

This job moves data from a Snowflake instance (the **CUSTOMER** table in the **TPCH_SF1** schema of the **SNOWFLAKE_SAMPLE_DATA** database) into Neo4j as nodes with the label **Customer**.
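For reference, here is a minimal sketch (in Scala, with placeholder URLs and credentials that you must replace) of the plain Spark job that the JSON above abstracts; it mirrors what the connector does internally: read from the source with the given options, apply the projection and the filter, then write to the target.

```scala
import org.apache.spark.sql.SparkSession

// Minimal sketch of the "Nodes" job above, written as a plain Spark job.
// All URLs and credentials are placeholders.
val spark = SparkSession.builder()
  .appName("Ingest Customer table as nodes into Neo4j")
  .getOrCreate()

// Read the CUSTOMER table from Snowflake, filter it and project/cast the columns.
val customers = spark.read
  .format("snowflake")
  .options(Map(
    "sfURL"      -> "https://<account>.snowflakecomputing.com",
    "sfUser"     -> "<snowflake_user>",
    "sfPassword" -> "<snowflake_password>",
    "sfDatabase" -> "SNOWFLAKE_SAMPLE_DATA",
    "sfSchema"   -> "TPCH_SF1",
    "dbtable"    -> "CUSTOMER"
  ))
  .load()
  .where("C_CUSTKEY <= 10")
  .selectExpr(
    "CAST(C_ACCTBAL AS DOUBLE) AS C_ACCTBAL", "C_ADDRESS", "C_COMMENT",
    "CAST(C_CUSTKEY AS LONG) AS C_CUSTKEY", "C_MKTSEGMENT", "C_NAME",
    "CAST(C_NATIONKEY AS LONG) AS C_NATIONKEY", "C_PHONE")

// Write the projected rows as :Customer nodes into Neo4j.
customers.write
  .format("org.neo4j.spark.DataSource")
  .options(Map(
    "url" -> "neo4j+s://<instance>.databases.neo4j.io",
    "authentication.basic.username" -> "neo4j",
    "authentication.basic.password" -> "<neo4j_password>",
    "labels" -> ":Customer"
  ))
  .mode("Append")
  .save()
```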
87 | 88 | 89 | ## Relationships 90 | 91 | 92 | ```json 93 | { 94 | "name" : "Ingest Order table as relationships into Neo4j", 95 | "conf" : { }, 96 | "hadoopConfiguration" : { }, 97 | "source" : { 98 | "format" : "snowflake", 99 | "options" : { 100 | "sfSchema" : "TPCH_SF1", 101 | "sfPassword" : "", 102 | "sfUser" : "conker84", 103 | "dbtable" : "ORDERS", 104 | "sfDatabase" : "SNOWFLAKE_SAMPLE_DATA", 105 | "sfURL" : "https://.snowflakecomputing.com" 106 | }, 107 | "columns": [ 108 | { "name": "CAST(O_CUSTKEY AS DOUBLE)", "alias": "O_CUSTKEY" }, 109 | { "name": "O_ORDERDATE" }, 110 | { "name": "O_COMMENT" }, 111 | { "name": "CAST(O_ORDERKEY AS LONG)", "alias": "O_ORDERKEY" } 112 | ], 113 | "where" : "O_CUSTKEY <= 10", 114 | "printSchema" : false, 115 | "partition" : {} 116 | }, 117 | "target" : { 118 | "format" : "org.neo4j.spark.DataSource", 119 | "options" : { 120 | "url" : "neo4j+s://.databases.neo4j.io", 121 | "authentication.basic.username" : "neo4j", 122 | "authentication.basic.password" : "", 123 | "relationship" : "HAS_ORDER", 124 | "relationship.save.strategy" : "keys", 125 | "relationship.source.save.mode" : "Overwrite", 126 | "relationship.source.labels" : ":Customer", 127 | "relationship.source.node.keys" : "O_CUSTKEY", 128 | "relationship.target.save.mode" : "Overwrite", 129 | "relationship.target.labels" : ":Order", 130 | "relationship.target.node.keys" : "O_ORDERKEY" 131 | }, 132 | "mode" : "Overwrite" 133 | } 134 | } 135 | ``` 136 | 137 | 138 | 139 | 140 | Output: 141 | 142 | ![](doc/images/output-rel.png) 143 | 144 | 145 | ## Query 146 | 147 | ```json 148 | { 149 | "name" : "Ingest Order table as relationships into Neo4j", 150 | "conf" : { }, 151 | "hadoopConfiguration" : { }, 152 | "source" : { 153 | "format" : "snowflake", 154 | "options" : { 155 | "sfSchema" : "TPCH_SF1", 156 | "sfPassword" : "", 157 | "sfUser" : "conker84", 158 | "dbtable" : "ORDERS", 159 | "sfDatabase" : "SNOWFLAKE_SAMPLE_DATA", 160 | "sfURL" : "https://.snowflakecomputing.com" 161 | }, 162 | "columns": [ 163 | { "name": "CAST(O_CUSTKEY AS DOUBLE)", "alias": "O_CUSTKEY" }, 164 | { "name": "O_ORDERDATE" }, 165 | { "name": "O_COMMENT" }, 166 | { "name": "CAST(O_ORDERKEY AS LONG)", "alias": "O_ORDERKEY" } 167 | ], 168 | "where" : "O_CUSTKEY <= 10", 169 | "printSchema" : false, 170 | "partition" : {} 171 | }, 172 | "target" : { 173 | "format" : "org.neo4j.spark.DataSource", 174 | "options" : { 175 | "url" : "neo4j+s://.databases.neo4j.io", 176 | "authentication.basic.username" : "neo4j", 177 | "authentication.basic.password" : "", 178 | "query" : "MERGE (s:Person:Customer{id: event.O_CUSTKEY}) MERGE(t:Order{id: event.O_ORDERKEY}) MERGE (s)-[:HAS_ORDER{date: event.O_ORDERDATE}]->(t)" 179 | }, 180 | "mode" : "Overwrite" 181 | } 182 | } 183 | ``` 184 | 185 | This Job moves data from a Snowflake instance, and in particular from **SNOWFLAKE_SAMPLE_DATA** database, **TPCH_SF1** 186 | schema and table **ORDERS** into Neo4j database as custom graph composed by the pattern 187 | `(:Person:Customer)-[:HAS_ORDER]->(:Order)`. 188 | 189 | 190 | ## Fields Description 191 | 192 | Following a detailed description of each field in the JSON configuration file: 193 | 194 | 195 | 196 | * **name**: the job name. _It’s Optional_ 197 | * **master**: the spark master url, used only for internal testing purposes; 198 | * **conf**: a set of key/value string pairs which will be applied to SparkConf. _It’s Optional_ 199 | * **hadoopConfiguration**: a set of key/value string pairs that will be applied to HadoopConfiguration. 
_It’s Optional_
* **source**: contains the information about the source database (_it’s **Mandatory**_):
    * **format**: the format of the source database (e.g. Snowflake, Neo4j, Redshift…). _It’s **Mandatory**_
    * **options**: a set of key/value pairs that contains the required configuration parameters for the selected format. Each option set is specific to the selected source **format**; for each one the generated configuration stub provides a set of links so that you can easily retrieve the correct configuration. _It’s **Mandatory**_
    * **columns**: the set of columns that you want to project; useful to minimize the data movement from the source database. Each column is composed of two fields (_It’s Optional_):
        * **name**: the name of the column. This field supports Spark SQL notation, so you can manipulate the field by casting its type, applying UDFs to it and so on
        * **alias**: the new name of the column, if you want to rename it
    * **where**: a Spark SQL `where` condition that filters the data retrieved from the source database. _It’s Optional_
    * **limit**: limits the number of rows returned from the source data; useful for testing purposes. _It’s Optional_
    * **printSchema**: a boolean that prints the schema of the source data, useful for debugging. Defaults to `false`
    * **partition**: composed of two fields (_It’s Optional_):
        * **number**: the number of partitions
        * **by**: an optional parameter that defines the partition field
* **target**: contains the information about the target database (_it’s **Mandatory**_):
    * **format**: the format of the target database (e.g. Snowflake, Neo4j, Redshift…). _It’s **Mandatory**_
    * **options**: a set of key/value pairs that contains the required configuration parameters for the selected format. Each option set is specific to the selected target **format**; for each one the generated configuration stub provides a set of links so that you can easily retrieve the correct configuration. _It’s **Mandatory**_
    * **mode**: the Spark save mode; it is specific to the selected target **format**. _It’s Optional_

## Special values

In the JSON you can use special values that are replaced at runtime with an actual value.
Currently we support two special values:

* `${env:<ENV_VARIABLE_NAME>}`, which is replaced with the value of the referenced environment variable
* `${file:<FILE_PATH>}`, which is replaced with the text content of the referenced file

This lets you keep the configuration file as clean as possible.

Consider this JSON:

```json
{
  "name" : "Ingest Order table as relationships into Neo4j",
  "conf" : { },
  "hadoopConfiguration" : { },
  "source" : {
    "format" : "snowflake",
    "options" : {
      "sfSchema" : "TPCH_SF1",
      "sfPassword" : "${env:SNOWFLAKE_PASSWORD}",
      "sfUser" : "${env:SNOWFLAKE_USER}",
      "dbtable" : "ORDERS",
      "sfDatabase" : "SNOWFLAKE_SAMPLE_DATA",
      "sfURL" : "${env:SNOWFLAKE_URL}"
    },
    "columns": [
      { "name": "CAST(O_CUSTKEY AS DOUBLE)", "alias": "O_CUSTKEY" },
      { "name": "O_ORDERDATE" },
      { "name": "O_COMMENT" },
      { "name": "CAST(O_ORDERKEY AS LONG)", "alias": "O_ORDERKEY" }
    ],
    "where" : "O_CUSTKEY <= 10",
    "printSchema" : false,
    "partition" : {}
  },
  "target" : {
    "format" : "org.neo4j.spark.DataSource",
    "options" : {
      "url" : "${env:NEO4J_URL}",
      "authentication.basic.username" : "${env:NEO4J_USER}",
      "authentication.basic.password" : "${env:NEO4J_PASSWORD}",
      "query" : "${file:/tmp/my_cypher_query.cyp}"
    },
    "mode" : "Overwrite"
  }
}
```

Now assume that you have the following environment variables:

* `export SNOWFLAKE_USER=snowflake_foo`
* `export SNOWFLAKE_PASSWORD=snowflake_bar`
* `export SNOWFLAKE_URL=https://foo_bar.snowflakecomputing.com`
* `export NEO4J_USER=neo4j_foo`
* `export NEO4J_PASSWORD=neo4j_bar`
* `export NEO4J_URL=neo4j+s://foo_bar.databases.neo4j.io`

And the content of `/tmp/my_cypher_query.cyp` is:

`CREATE (p:Person{id: event.id, fullName: event.full_name})`

The connector will replace these values as if you had passed the following JSON:

```json
{
  "name" : "Ingest Order table as relationships into Neo4j",
  "conf" : { },
  "hadoopConfiguration" : { },
  "source" : {
    "format" : "snowflake",
    "options" : {
      "sfSchema" : "TPCH_SF1",
      "sfPassword" : "snowflake_bar",
      "sfUser" : "snowflake_foo",
      "dbtable" : "ORDERS",
      "sfDatabase" : "SNOWFLAKE_SAMPLE_DATA",
      "sfURL" : "https://foo_bar.snowflakecomputing.com"
    },
    "columns": [
      { "name": "CAST(O_CUSTKEY AS DOUBLE)", "alias": "O_CUSTKEY" },
      { "name": "O_ORDERDATE" },
      { "name": "O_COMMENT" },
      { "name": "CAST(O_ORDERKEY AS LONG)", "alias": "O_ORDERKEY" }
    ],
    "where" : "O_CUSTKEY <= 10",
    "printSchema" : false,
    "partition" : {}
  },
  "target" : {
    "format" : "org.neo4j.spark.DataSource",
    "options" : {
      "url" : "neo4j+s://foo_bar.databases.neo4j.io",
      "authentication.basic.username" : "neo4j_foo",
      "authentication.basic.password" : "neo4j_bar",
      "query" : "CREATE (p:Person{id: event.id, fullName: event.full_name})"
    },
    "mode" : "Overwrite"
  }
}
```


# Use the DWH Connector as a Spark Submit Job

The jar can be used in two ways:

1. To generate the configuration stub for the selected source/target databases
2. To launch the Spark job via spark-submit

To move data from a selected source to a defined target you need to perform the following steps:

1. Prepare the JSON file with all the required configuration for the source and the target database
2. Copy the JSON file(s) and neo4j-dwh-connector-<version>.jar to the server or Docker container running Spark
3. Run the spark-submit command to start the Spark job

If you want to get the full list of available options you can use the following command:

```bash
java -jar neo4j-dwh-connector-<version>.jar -h
```

![](doc/images/cli_help.png)


## Generate the configuration stub

You can generate the configuration file in a very simple way by running the following command:

```bash
java -jar neo4j-dwh-connector-<version>.jar -c -s <source_database> -t <target_database> -p <json_config_path>
```

For example, to generate a stub for a job that moves data from **Snowflake** to **Neo4j** and put the configuration into **/tmp/dwh_job_config.json**:

```bash
java -jar neo4j-dwh-connector-1.0.0.jar -c -s Snowflake -t Neo4j -p /tmp/dwh_job_config.json
```


## Launch the Spark Job

Once you have configured the JSON file properly, you only need to launch the Spark job from a client.

**_Nota bene_**

Each selected source/target requires external dependencies in order to work, so please make sure to add them.


### Launch the Spark Job from CLI via Spark Submit

In order to launch the Spark job you need to be in the Spark directory and run the following command:

```bash
./bin/spark-submit \
  --class org.neo4j.dwh.connector.Neo4jDWHConnector \
  --packages <comma_separated_dependencies> \
  --master spark://<IP>:<PORT> \
  --deploy-mode cluster \
  --supervise \
  --executor-memory 20G \
  --total-executor-cores 100 \
  /path/to/neo4j-dwh-connector-<version>.jar \
  -p /path/to/dwh_job_config.json
```

Example command to launch a Spark job that moves data from Snowflake to Neo4j:

```bash
./bin/spark-submit \
  --class org.neo4j.dwh.connector.Neo4jDWHConnector \
  --packages org.neo4j:neo4j-connector-apache-spark_2.12:4.1.0_for_spark_3,net.snowflake:spark-snowflake_2.12:2.10.0-spark_3.2,net.snowflake:snowflake-jdbc:3.13.15 \
  --master local \
  /path/to/neo4j-dwh-connector-1.0.0-SNAPSHOT.jar \
  -p /path/to/dwh_job_config.json
```


### Launch the Spark Job from Databricks Cloud

You can also run the Spark job from Databricks Cloud; this requires importing the DWH connector jar and the JSON config [into DBFS](https://docs.databricks.com/data/data.html#:~:text=Import%20data,-If%20you%20have&text=There%20are%20two%20ways%20to,box%20on%20the%20landing%20page.)
414 | 415 | In order to create a Spark Job into Databricks cloud you have to click into the **Jobs** menu section 416 | 417 | ![](doc/images/databricks_menu.png) 418 | 419 | and then on **Create Job** 420 | 421 | ![](doc/images/databricks_job_home.png) 422 | 423 | This will open a the Job creation section: 424 | 425 | ![](doc/images/databricks_job_config.png) 426 | 427 | where you need to define: 428 | 429 | * the task name 430 | * the type of the task, please select **Spark Submit** 431 | * the cluster 432 | * the parameters which are related to the required dependencies and the DWH job configuration 433 | 434 | In particular for our Spark Job that moves data from Snowflake into Neo4j you need to apply these parameters: 435 | 436 | 437 | ```json 438 | ["--packages", "org.neo4j:neo4j-connector-apache-spark_2.12:4.1.2_for_spark_3,net.snowflake:spark-snowflake_2.12:2.10.0-spark_3.2,net.snowflake:snowflake-jdbc:3.13.15", "--class", "org.neo4j.dwh.connector.Neo4jDWHConnector", "dbfs:/FileStore/dwh-connector/neo4j_dwh_connector_1_0_SNAPSHOT_jar_with_dependencies.jar", "-p", "/dbfs/FileStore/dwh-connector/job_config.json"] 439 | ``` 440 | 441 | 442 | Once create you’ll have the find it in your Job list as shown here: 443 | 444 | ![](doc/images/databricks_job_list.png) 445 | 446 | In order to start it you just need to press the play button in the Actions sections. 447 | 448 | 449 | # Use the DWH Connector via Scala APIs 450 | 451 | You can also use the connector in a notebook for instance leveraging the Scala APIs in very convenient way like this: 452 | 453 | ```scala 454 | 455 | import org.neo4j.dwh.connector.Neo4jDWHConnector 456 | import org.neo4j.dwh.connector.domain._ 457 | 458 | 459 | // the source database configuration 460 | val source = Source( 461 | format = "snowflake", // the source database (mandatory) 462 | // the configuration options it will change for every source database (mandatory) 463 | options = Map( 464 | "sfSchema" -> "TPCH_SF1", 465 | "sfPassword" -> "<**>", 466 | "sfUser" -> "<**>", 467 | "dbtable" -> "CUSTOMER", 468 | "sfDatabase" -> "SNOWFLAKE_SAMPLE_DATA", 469 | "sfURL" -> "https://<**>.eu-central-1.snowflakecomputing.com" 470 | ), 471 | // a list of selected projected columns, it can be useful in order to eventually cast data, 472 | // apply Spark's UDFs and minimize the data movement from the source database (optional) 473 | columns = Seq( 474 | Column(name = "CAST(C_ACCTBAL AS DOUBLE)", alias = "C_ACCTBAL"), 475 | Column(name = "C_ADDRESS"), 476 | Column(name = "C_COMMENT"), 477 | Column(name = "CAST(C_CUSTKEY AS LONG)", alias = "C_CUSTKEY"), 478 | Column(name = "C_MKTSEGMENT"), 479 | Column(name = "C_NAME"), 480 | Column(name = "CAST(C_NATIONKEY AS LONG)", alias = "C_NATIONKEY"), 481 | Column(name = "C_PHONE") 482 | ), 483 | where = "", // a filter for the source dataset (optional) 484 | printSchema = true, // if you want to print the schema, useful for debug purposes (optional) 485 | show = 5, // if you want show the source database, useful for debug purposes (optional) 486 | limit = 10, // the amount of rows that you want to have from the source dataset (optional) 487 | // a dataframe partition configuration (optional) 488 | partition = Partition( 489 | number = -1, // the number of partions mandatory if you want to define partitions 490 | by = "" // the field to partition (optional) 491 | ) 492 | ) 493 | 494 | // the target database configuration 495 | val target = Target( 496 | format = "org.neo4j.spark.DataSource", // the target database (mandatory) 497 | // 
the configuration options it will change for every source database (mandatory) 498 | options = Map( 499 | "labels" -> ":Person:Customer", 500 | "url" -> "neo4j+s://<**>.databases.neo4j.io", 501 | "authentication.basic.username" -> "neo4j", 502 | "authentication.basic.password" -> "<**>", 503 | "node.keys" -> "C_CUSTKEY" 504 | ), 505 | mode = "Overwrite" 506 | ) 507 | 508 | val config = JobConfig( 509 | name = "The name of the Spark Job", 510 | conf = Map.empty, // a configuration map, every k/v binding will be insert as Spark Configuration 511 | hadoopConfiguration = Map.empty, // a configuration map, every k/v binding will be insert as Hadoop Configuration 512 | source = source, 513 | target = target 514 | ) 515 | 516 | val connector = new Neo4jDWHConnector(sparkSession, config) 517 | 518 | // this will ingest the data from source to target database 519 | connector.run() 520 | ``` 521 | 522 | # Use DWH Connector via Python APIs 523 | 524 | You can also use the connector in a notebook for instance leveraging the Python APIs in very convenient way like this: 525 | 526 | ```python 527 | from neo4j_dwh_connector import * 528 | 529 | source = Source( 530 | format="snowflake", # the source database (mandatory) 531 | # the configuration options it will change for every source database (mandatory) 532 | options={ 533 | "sfSchema": "TPCH_SF1", 534 | "sfPassword": "****", 535 | "sfUser": "****", 536 | "dbtable": "CUSTOMER", 537 | "sfDatabase": "SNOWFLAKE_SAMPLE_DATA", 538 | "sfURL": "https://****.eu-central-1.snowflakecomputing.com" 539 | }, 540 | # a list of selected projected columns, it can be useful in order to eventually cast data, 541 | # apply Spark's UDFs and minimize the data movement from the source database (optional) 542 | columns=[ 543 | Column(name="CAST(C_ACCTBAL AS DOUBLE)", alias="C_ACCTBAL"), 544 | Column(name="C_ADDRESS"), 545 | Column(name="C_COMMENT"), 546 | Column(name="CAST(C_CUSTKEY AS LONG)", alias="C_CUSTKEY"), 547 | Column(name="C_MKTSEGMENT"), 548 | Column(name="C_NAME"), 549 | Column(name="CAST(C_NATIONKEY AS LONG)", alias="C_NATIONKEY"), 550 | Column(name="C_PHONE") 551 | ], 552 | where="", # a filter for the source dataset (optional) 553 | printSchema=True, # if you want to print the schema, useful for debug purposes (optional) 554 | show=5, # if you want show the source database, useful for debug purposes (optional) 555 | limit=10, # the amount of rows that you want to have from the source dataset (optional) 556 | # a dataframe partition configuration (optional) 557 | partition=Partition( 558 | number=-1, # the number of partions mandatory if you want to define partitions 559 | by="" # the field to partition (optional) 560 | ) 561 | ) 562 | # the target database configuration 563 | target = Target( 564 | format="org.neo4j.spark.DataSource", # the target database (mandatory) 565 | # the configuration options it will change for every source database (mandatory) 566 | options={ 567 | "labels": ":PersonNew1:CustomerNew1", 568 | "url": "neo4j+s://****.databases.neo4j.io", 569 | "authentication.basic.username": "neo4j", 570 | "authentication.basic.password": "****", 571 | "node.keys": "C_CUSTKEY" 572 | }, 573 | mode="Overwrite" 574 | ) 575 | 576 | config = JobConfig( 577 | name="The name of the Spark Job", 578 | conf={}, # a configuration dict, every k/v binding will be insert as Spark Configuration 579 | hadoopConfiguration={}, 580 | # a configuration dict, every k/v binding will be insert as Hadoop Configuration 581 | source=source, 582 | target=target 583 | ) 584 | 585 
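# `sparkSession` below is assumed to be an already-created pyspark.sql.SparkSession,
# e.g. sparkSession = SparkSession.builder.getOrCreate()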
connector = Neo4jDWHConnector(sparkSession, config)

# this will ingest the data from source to target database
connector.run()
```

# Supported Spark versions

We support:

* Spark 2.4 with Scala 2.11 and 2.12
* Spark 3.x with Scala 2.12 and 2.13

# Maven resolution

It depends on the Spark and Scala version:

* For Spark 2.4 with Scala 2.11 and 2.12 the Maven coordinates are `org.neo4j:neo4j-dwh-connector_<scala_version>:<version>_for_spark_2.4`
* For Spark 3.x with Scala 2.12 and 2.13 the Maven coordinates are `org.neo4j:neo4j-dwh-connector_<scala_version>:<version>_for_spark_3`


# Build it locally

In order to build it locally you can use the following commands:

For Scala 2.11 and Spark 2.4: `./maven-release.sh package 2.11 2.4`

For Scala 2.12 and Spark 2.4: `./maven-release.sh package 2.12 2.4`

For Scala 2.12 and Spark 3: `./maven-release.sh package 2.12 3`

For Scala 2.13 and Spark 3: `./maven-release.sh package 2.13 3`

# Blog

More details are available in the introductory blog post, which shows how the connector makes it easier than ever to move data between your data warehouse and Neo4j:

https://neo4j.com/developer-blog/introducing-neo4j-data-warehouse-connector/

# Quickstart

If you want to see how you can leverage the Scala and Python APIs you can download:

* a [Databricks notebook](doc/notebooks/Neo4j%20DWH%20Connector%20quickstart.html) ready to be imported into the Databricks cloud environment

--------------------------------------------------------------------------------
/doc/images/cli_help.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neo4j-contrib/neo4j-dwh-connector/eb9feddb06c32d1905eb98e634031679ac04841d/doc/images/cli_help.png
--------------------------------------------------------------------------------
/doc/images/databricks_job_config.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neo4j-contrib/neo4j-dwh-connector/eb9feddb06c32d1905eb98e634031679ac04841d/doc/images/databricks_job_config.png
--------------------------------------------------------------------------------
/doc/images/databricks_job_home.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neo4j-contrib/neo4j-dwh-connector/eb9feddb06c32d1905eb98e634031679ac04841d/doc/images/databricks_job_home.png
--------------------------------------------------------------------------------
/doc/images/databricks_job_list.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neo4j-contrib/neo4j-dwh-connector/eb9feddb06c32d1905eb98e634031679ac04841d/doc/images/databricks_job_list.png
--------------------------------------------------------------------------------
/doc/images/databricks_menu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neo4j-contrib/neo4j-dwh-connector/eb9feddb06c32d1905eb98e634031679ac04841d/doc/images/databricks_menu.png
--------------------------------------------------------------------------------
/doc/images/output-rel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neo4j-contrib/neo4j-dwh-connector/eb9feddb06c32d1905eb98e634031679ac04841d/doc/images/output-rel.png -------------------------------------------------------------------------------- /maven-release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $# -lt 3 ]] ; then 4 | echo "Usage ./maven-release.sh []" 5 | exit 1 6 | fi 7 | 8 | JAVA_VER=$(java -version 2>&1 | grep -i version) 9 | 10 | if [[ ! $JAVA_VER =~ 1.8 ]] ; then 11 | echo "You must use Java 8" 12 | exit 1 13 | fi 14 | 15 | exit_script() { 16 | echo "Process terminated cleaning up resources" 17 | mv -f pom.xml.bak pom.xml 18 | rm -f pom.xml.versionsBackup 19 | trap - SIGINT SIGTERM # clear the trap 20 | kill -- -$$ # Sends SIGTERM to child/sub processes 21 | } 22 | 23 | trap exit_script SIGINT SIGTERM 24 | 25 | DEPLOY_INSTALL=$1 26 | SCALA_VERSION=$2 27 | SPARK_VERSION=$3 28 | # echo "command is: mvn -Pscala-$SCALA_VERSION -Pspark-$SPARK_VERSION -q -Dexec.executable=echo -Dexec.args='\${project.version}' --non-recursive exec:exec" 29 | CURRENT_VERSION=$(mvn -Pscala-$SCALA_VERSION -Pspark-$SPARK_VERSION -q -Dexec.executable=echo -Dexec.args='${project.version}' --non-recursive exec:exec) 30 | #TARGET_DIR=spark-$SPARK_VERSION 31 | if [[ $# -eq 4 ]] ; then 32 | ALT_DEPLOYMENT_REPOSITORY="-DaltDeploymentRepository=$4" 33 | else 34 | ALT_DEPLOYMENT_REPOSITORY="" 35 | fi 36 | 37 | case $(sed --help 2>&1) in 38 | *GNU*) sed_i () { sed -i "$@"; };; 39 | *) sed_i () { sed -i '' "$@"; };; 40 | esac 41 | 42 | # backup files 43 | cp pom.xml pom.xml.bak 44 | 45 | # replace pom files with target scala version 46 | sed_i "s/neo4j-dwh-connector<\/artifactId>/neo4j-dwh-connector_$SCALA_VERSION<\/artifactId>/" pom.xml 47 | sed_i "s//$SCALA_VERSION<\/scala.binary.version>/" pom.xml 48 | 49 | # setting version 50 | NEW_VERSION="${CURRENT_VERSION}_for_spark_${SPARK_VERSION}" 51 | # echo "New version is $NEW_VERSION" 52 | mvn -Pscala-$SCALA_VERSION -Pspark-$SPARK_VERSION versions:set -DnewVersion=$NEW_VERSION 53 | # build 54 | # echo "command is: mvn clean $DEPLOY_INSTALL -Pscala-$SCALA_VERSION -Pspark-$SPARK_VERSION -DskipTests $ALT_DEPLOYMENT_REPOSITORY" 55 | mvn clean $DEPLOY_INSTALL -Pscala-$SCALA_VERSION -Pspark-$SPARK_VERSION -DskipTests $ALT_DEPLOYMENT_REPOSITORY 56 | 57 | exit_script 58 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | org.neo4j 4 | neo4j-dwh-connector 5 | 1.0.1 6 | ${project.artifactId} 7 | This is the Neo4j DWH Connector supposed to work in combination with Apache Spark 8 | 2022 9 | 10 | 11 | Neo4j, Inc. 12 | http://neo4j.com/ 13 | 14 | 15 | 16 | conker84 17 | Andrea Santurbano 18 | 19 | 20 | 21 | https://github.com/neo4j-contrib/neo4j-dwh-connector 22 | 23 | https://github.com/neo4j-contrib/neo4j-dwh-connector 24 | 25 | 26 | 27 | The Apache Software License, Version 2.0 28 | http://www.apache.org/licenses/LICENSE-2.0.txt 29 | 30 | Note that this license is for the project itself, 31 | and not for its dependencies. 
32 | 33 | repo 34 | 35 | 36 | 37 | 38 | 39 | jitpack.io 40 | https://jitpack.io 41 | 42 | 43 | 44 | 45 | 1.8 46 | 1.8 47 | UTF-8 48 | 4.2.0 49 | 2.17.2 50 | 4.13.2 51 | 4.10 52 | 1.9.0 53 | 54 | 2.0.9 55 | 1.18.3 56 | 0.32.0 57 | 2.10.0-spark_3.2 58 | 3.13.15 59 | 5.0.4 60 | 2.1.0.5 61 | 1.12.178 62 | 3.3.1 63 | 1.1.0 64 | spark-mssql-connector_${scala.binary.version} 65 | 66 | 67 | 68 | 69 | 70 | 71 | spark-bigquery-with-dependencies_${scala.binary.version} 72 | 73 | neo4j-connector-apache-spark_${scala.binary.version} 74 | 75 | 76 | 77 | 78 | 79 | org.scala-lang 80 | scala-library 81 | ${scala.version} 82 | 83 | 84 | 85 | org.apache.spark 86 | spark-core_${scala.binary.version} 87 | ${spark.version} 88 | provided 89 | 90 | 91 | com.fasterxml.jackson.core 92 | jackson-databind 93 | 94 | 95 | com.fasterxml.jackson.module 96 | jackson-module-scala_${scala.binary.version} 97 | 98 | 99 | org.apache.xbean 100 | xbean-asm6-shaded 101 | 102 | 103 | 104 | 105 | 106 | org.apache.spark 107 | spark-sql_${scala.binary.version} 108 | ${spark.version} 109 | provided 110 | 111 | 112 | 113 | org.apache.xbean 114 | xbean-asm6-shaded 115 | ${xbean.version} 116 | provided 117 | 118 | 119 | 120 | com.fasterxml.jackson.core 121 | jackson-databind 122 | ${jackson-databind.version} 123 | 124 | 125 | 126 | com.fasterxml.jackson.module 127 | jackson-module-scala_${scala.binary.version} 128 | ${jackson-databind.version} 129 | 130 | 131 | 132 | commons-cli 133 | commons-cli 134 | ${common-cli.version} 135 | 136 | 137 | 138 | 139 | junit 140 | junit 141 | ${junit.version} 142 | test 143 | 144 | 145 | 146 | com.microsoft.azure 147 | ${spark-mssql-connector-artifactId} 148 | ${spark-mssql-connector_version} 149 | test 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | com.github.conker84 161 | spark-redshift 162 | 5.0.4.1-SNAPSHOT 163 | test 164 | 165 | 166 | org.apache.spark 167 | spark-avro_${scala.binary.version} 168 | ${spark.version} 169 | test 170 | 171 | 172 | com.amazon.redshift 173 | redshift-jdbc42 174 | ${redshift.jdbc.version} 175 | test 176 | 177 | 178 | com.amazonaws 179 | aws-java-sdk 180 | ${aws-java-sdk.version} 181 | test 182 | 183 | 184 | org.apache.hadoop 185 | hadoop-aws 186 | ${hadoop-aws.version} 187 | test 188 | 189 | 190 | 191 | org.neo4j 192 | ${neo4j-connector-apache-spark-artifactId} 193 | ${neo4j.spark.version} 194 | test 195 | 196 | 197 | 198 | com.google.cloud.spark 199 | ${spark-bigquery-with-dependencies-artifactId} 200 | ${bigquery.spark.version} 201 | test 202 | 203 | 204 | 205 | net.snowflake 206 | spark-snowflake_${scala.binary.version} 207 | ${snowflake.spark.version} 208 | test 209 | 210 | 211 | 212 | net.snowflake 213 | snowflake-jdbc 214 | ${snowflake.jdbc.version} 215 | test 216 | 217 | 218 | 219 | org.testcontainers 220 | testcontainers 221 | ${testcontainers.version} 222 | test 223 | 224 | 225 | 226 | org.testcontainers 227 | neo4j 228 | ${testcontainers.version} 229 | test 230 | 231 | 232 | 233 | 234 | 235 | 236 | scala-2.12 237 | 238 | 239 | 240 | 2.12.18 241 | 2.12 242 | 243 | 244 | 245 | scala-2.13 246 | 247 | true 248 | 249 | 250 | 2.13.12 251 | 2.13 252 | 256 | spark-mssql-connector_2.12 257 | 258 | 259 | 260 | 261 | 262 | spark-3 263 | 264 | true 265 | 266 | 267 | 3.3.2 268 | 5.0.3_for_spark_3 269 | 270 | 271 | 272 | 273 | 274 | neo4j-4.4 275 | 276 | true 277 | 278 | 279 | 4.4 280 | 281 | 282 | 283 | neo4j-5 284 | 285 | 5 286 | 287 | 288 | 289 | 290 | 291 | 292 | src/main/scala 293 | src/test/scala 294 | 295 | 296 | python 297 | 298 | **/test/** 299 | 
300 | 301 | 302 | 303 | 304 | src/test/resources 305 | true 306 | 307 | 308 | 309 | 310 | org.apache.maven.plugins 311 | maven-assembly-plugin 312 | 3.3.0 313 | 314 | 315 | jar-with-dependencies 316 | 317 | 318 | 319 | org.neo4j.dwh.connector.Neo4jDWHConnector 320 | 321 | 322 | ${project.artifactId}-${project.version} 323 | false 324 | 325 | 326 | 327 | package 328 | 329 | single 330 | 331 | 332 | 333 | 334 | 335 | net.alchim31.maven 336 | scala-maven-plugin 337 | 4.5.4 338 | 339 | 340 | 341 | add-source 342 | compile 343 | testCompile 344 | doc-jar 345 | 346 | 347 | 348 | -dependencyfile 349 | ${project.build.directory}/.scala_dependencies 350 | 351 | 352 | 353 | process-resources 354 | 355 | 356 | 357 | 358 | org.apache.maven.plugins 359 | maven-source-plugin 360 | 2.2.1 361 | 362 | 363 | attach-sources 364 | 365 | jar 366 | 367 | 368 | 369 | 370 | 371 | org.apache.maven.plugins 372 | maven-surefire-plugin 373 | 2.21.0 374 | 375 | 1 376 | false 377 | 378 | **/*Test.* 379 | **/*IT.* 380 | 381 | false 382 | 383 | 384 | -XX:+IgnoreUnrecognizedVMOptions 385 | --add-opens=java.base/java.lang=ALL-UNNAMED 386 | --add-opens=java.base/java.lang.invoke=ALL-UNNAMED 387 | --add-opens=java.base/java.lang.reflect=ALL-UNNAMED 388 | --add-opens=java.base/java.io=ALL-UNNAMED 389 | --add-opens=java.base/java.net=ALL-UNNAMED 390 | --add-opens=java.base/java.nio=ALL-UNNAMED 391 | --add-opens=java.base/java.util=ALL-UNNAMED 392 | --add-opens=java.base/java.util.concurrent=ALL-UNNAMED 393 | --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED 394 | --add-opens=java.base/sun.nio.ch=ALL-UNNAMED 395 | --add-opens=java.base/sun.nio.cs=ALL-UNNAMED 396 | --add-opens=java.base/sun.security.action=ALL-UNNAMED 397 | --add-opens=java.base/sun.util.calendar=ALL-UNNAMED 398 | --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED 399 | 400 | 401 | 402 | 403 | 404 | 405 | -------------------------------------------------------------------------------- /python/MANIFEST.in: -------------------------------------------------------------------------------- 1 | # An example MANIFEST file can be found at: 2 | # https://github.com/pypa/sampleproject/blob/master/MANIFEST.in 3 | # For more details about the MANIFEST file, you may read the docs at 4 | # https://docs.python.org/2/distutils/sourcedist.html#the-manifest-in-template 5 | -------------------------------------------------------------------------------- /python/neo4j_dwh_connector/__init__.py: -------------------------------------------------------------------------------- 1 | from neo4j_dwh_connector._dto import JobConfig, Column, Partition, Source, Target 2 | from neo4j_dwh_connector.connector import Neo4jDWHConnector 3 | 4 | __all__ = ['Neo4jDWHConnector', 'JobConfig', 'Column', 'Partition', 'Source', 'Target'] 5 | -------------------------------------------------------------------------------- /python/neo4j_dwh_connector/_dto.py: -------------------------------------------------------------------------------- 1 | class Column: 2 | def __init__(self, name, alias=""): 3 | self.name = name 4 | self.alias = alias 5 | 6 | 7 | class Partition: 8 | def __init__(self, number=0, by=""): 9 | self.number = number 10 | self.by = by 11 | 12 | 13 | class Source: 14 | def __init__(self, format, options={}, columns=[], where="", printSchema=False, limit=-1, show=-1, 15 | partition=Partition()): 16 | self.format = format 17 | self.options = options 18 | self.columns = columns 19 | self.where = where 20 | self.printSchema = printSchema 21 | self.limit = limit 22 | 
self.show = show 23 | self.partition = partition 24 | 25 | 26 | class Target: 27 | def __init__(self, format, options, mode): 28 | self.format = format 29 | self.options = options 30 | self.mode = mode 31 | 32 | 33 | class JobConfig: 34 | def __init__(self, name, conf, hadoopConfiguration, source, target, master=""): 35 | self.name = name 36 | self.conf = conf 37 | self.hadoopConfiguration = hadoopConfiguration 38 | self.source = source 39 | self.target = target 40 | self.master = master -------------------------------------------------------------------------------- /python/neo4j_dwh_connector/_utils.py: -------------------------------------------------------------------------------- 1 | def _to_java_value(obj, sparkContext): 2 | if is_class(obj): 3 | obj = vars(obj) 4 | if type(obj) is dict: 5 | hashMap = sparkContext._jvm.java.util.HashMap() 6 | for key in obj: 7 | hashMap[key] = _to_java_value(obj[key], sparkContext) 8 | obj = hashMap 9 | if type(obj) is list: 10 | arrayList = sparkContext._jvm.java.util.ArrayList(len(obj)) 11 | for key, val in enumerate(obj): 12 | arrayList.add(_to_java_value(obj[key], sparkContext)) 13 | obj = arrayList 14 | return obj 15 | 16 | 17 | # todo there should be a better way to do this 18 | def is_class(obj): 19 | return ' object at ' in str(obj) and type(obj) is not list and type(obj) is not dict 20 | -------------------------------------------------------------------------------- /python/neo4j_dwh_connector/connector.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | if sys.version > '3': 4 | basestring = str 5 | 6 | from pyspark.sql import SparkSession 7 | 8 | from neo4j_dwh_connector._dto import JobConfig 9 | from neo4j_dwh_connector._utils import _to_java_value 10 | 11 | 12 | class Neo4jDWHConnector: 13 | 14 | def __init__(self, session: SparkSession, jobConfig: JobConfig): 15 | java_map = _to_java_value(jobConfig, session.sparkContext) 16 | self._jvm_connector = session.sparkContext._jvm.org.neo4j.dwh.connector.Neo4jDWHConnector( 17 | session._jsparkSession, java_map) 18 | 19 | def run(self, closeSession=False): 20 | self._jvm_connector.run(closeSession) 21 | -------------------------------------------------------------------------------- /python/requirements.txt: -------------------------------------------------------------------------------- 1 | # This file should list any python package dependencies. 2 | -------------------------------------------------------------------------------- /python/setup.cfg: -------------------------------------------------------------------------------- 1 | # This file contains the default option values to be used during setup. An 2 | # example can be found at https://github.com/pypa/sampleproject/blob/master/setup.cfg 3 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | # Your python setup file. An example can be found at: 2 | # https://github.com/pypa/sampleproject/blob/master/setup.py 3 | -------------------------------------------------------------------------------- /python/spark-package-deps.txt: -------------------------------------------------------------------------------- 1 | # This file should list any spark package dependencies as: 2 | # :package_name==:version e.g. 
databricks/spark-csv==0.1 3 | -------------------------------------------------------------------------------- /python/test/__init__.py: -------------------------------------------------------------------------------- 1 | from ._util_test import UtilTest 2 | from .connector_test import ConnectorTest 3 | 4 | __all__ = ['UtilTest', 'ConnectorTest'] 5 | -------------------------------------------------------------------------------- /python/test/_util_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from py4j.java_collections import JavaMap, JavaList 3 | from pyspark.sql import SparkSession 4 | 5 | from neo4j_dwh_connector._dto import * 6 | from neo4j_dwh_connector._utils import _to_java_value 7 | 8 | 9 | class UtilTest(unittest.TestCase): 10 | spark = None 11 | 12 | def setUp(self): 13 | self.spark = (SparkSession.builder 14 | .appName("Neo4jConnectorTests") 15 | .master('local[*]') 16 | # .config('spark.jars.packages', 'org.neo4j:neo4j-connector-apache-spark_2.12:4.1.2_for_spark_3') 17 | .config("spark.driver.host", "127.0.0.1") 18 | .getOrCreate()) 19 | 20 | def test__to_java_value(self): 21 | source = Source( 22 | format="snowflake", # the source database (mandatory) 23 | # the configuration options it will change for every source database (mandatory) 24 | options={ 25 | "sfSchema": "TPCH_SF1", 26 | "sfPassword": "****", 27 | "sfUser": "****", 28 | "dbtable": "CUSTOMER", 29 | "sfDatabase": "SNOWFLAKE_SAMPLE_DATA", 30 | "sfURL": "https://****.eu-central-1.snowflakecomputing.com" 31 | }, 32 | # a list of selected projected columns, it can be usefull in order to eventually cast data, 33 | # apply Spark's UDFs and minimize the data movement from the source database (optional) 34 | columns=[ 35 | Column(name="CAST(C_ACCTBAL AS DOUBLE)", alias="C_ACCTBAL"), 36 | Column(name="C_ADDRESS"), 37 | Column(name="C_COMMENT"), 38 | Column(name="CAST(C_CUSTKEY AS LONG)", alias="C_CUSTKEY"), 39 | Column(name="C_MKTSEGMENT"), 40 | Column(name="C_NAME"), 41 | Column(name="CAST(C_NATIONKEY AS LONG)", alias="C_NATIONKEY"), 42 | Column(name="C_PHONE") 43 | ], 44 | where="", # a filter for the source dataset (optional) 45 | printSchema=True, # if you want to print the schema, useful for debug purposes (optional) 46 | show=5, # if you want show the source database, useful for debug purposes (optional) 47 | limit=10, # the amount of rows that you want to have from the source dataset (optional) 48 | # a dataframe partition configuration (optional) 49 | partition=Partition( 50 | number=-1, # the number of partions mandatory if you want to define partitions 51 | by="" # the field to partition (optional) 52 | ) 53 | ) 54 | # the target database configuration 55 | target = Target( 56 | format="org.neo4j.spark.DataSource", # the target database (mandatory) 57 | # the configuration options it will change for every source database (mandatory) 58 | options={ 59 | "labels": ":PersonNew1:CustomerNew1", 60 | "url": "neo4j+s://****.databases.neo4j.io", 61 | "authentication.basic.username": "neo4j", 62 | "authentication.basic.password": "****", 63 | "node.keys": "C_CUSTKEY" 64 | }, 65 | mode="Overwrite" 66 | ) 67 | 68 | config = JobConfig( 69 | name="The name of the Spark Job", 70 | conf={}, # a configuration map, every k/v binding will be insert as Spark Configuration 71 | hadoopConfiguration={}, 72 | # a configuration map, every k/v binding will be insert as Hadoop Configuration 73 | source=source, 74 | target=target 75 | ) 76 | 77 | converted = 
_to_java_value(config, self.spark) 78 | assert type(converted) is JavaMap 79 | assert type(converted["source"]) is JavaMap 80 | assert type(converted["target"]) is JavaMap 81 | assert type(converted["source"]["columns"]) is JavaList 82 | for id, value in enumerate(converted["source"]["columns"]): 83 | assert type(converted["source"]["columns"][id]) is JavaMap 84 | 85 | 86 | if __name__ == "__main__": 87 | unittest.main() 88 | -------------------------------------------------------------------------------- /python/test/connector_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from unittest import SkipTest 4 | 5 | from pyspark.sql import SparkSession 6 | from testcontainers.neo4j import Neo4jContainer 7 | from tzlocal import get_localzone 8 | 9 | from neo4j_dwh_connector import * 10 | 11 | import pathlib 12 | 13 | connector_version = "1.0-SNAPSHOT" 14 | neo4j_version = "4.4-enterprise" 15 | current_time_zone = get_localzone().zone 16 | 17 | 18 | def parse_arguments(length=4): 19 | print(sys.argv) 20 | global connector_version 21 | global neo4j_version 22 | global current_time_zone 23 | if len(sys.argv) >= length - 1: 24 | if length - 1 in sys.argv: 25 | current_time_zone = sys.argv.pop() # str(sys.argv[++start_index]) 26 | neo4j_version = sys.argv.pop() # str(sys.argv[++start_index]) 27 | connector_version = sys.argv.pop() # str(sys.argv[start_index]) 28 | print("Running tests for Connector %s, Neo4j %s, TimeZone %s" 29 | % (connector_version, neo4j_version, current_time_zone)) 30 | 31 | 32 | class ConnectorTest(unittest.TestCase): 33 | neo4j_container = None 34 | spark = None 35 | 36 | @classmethod 37 | def setUpClass(cls): 38 | jar_path = "target/neo4j-dwh-connector-{version}-jar-with-dependencies.jar".format( 39 | version=connector_version) 40 | jar_file = (pathlib.Path(__file__) 41 | .absolute() 42 | .parent 43 | .parent 44 | .parent 45 | .joinpath(jar_path)) 46 | if not jar_file.exists(): 47 | path_format_error = 'Connector JAR not found under $PROJECT_HOME/{path}'.format(path=jar_path) 48 | print(path_format_error) 49 | raise SkipTest(path_format_error) 50 | cls.neo4j_container = (Neo4jContainer('neo4j:' + neo4j_version) 51 | .with_env("NEO4J_db_temporal_timezone", current_time_zone) 52 | .with_env("NEO4J_ACCEPT_LICENSE_AGREEMENT", "yes")) 53 | cls.neo4j_container.start() 54 | cls.spark = (SparkSession.builder 55 | .appName("Neo4jConnectorTests") 56 | .master('local[*]') 57 | .config('spark.jars.packages', 'org.neo4j:neo4j-connector-apache-spark_2.12:4.1.2_for_spark_3') 58 | .config("spark.jars", str(jar_file)) 59 | .config("spark.driver.host", "127.0.0.1") 60 | .getOrCreate()) 61 | 62 | @classmethod 63 | def tearDownClass(cls): 64 | cls.neo4j_container.stop() 65 | cls.spark.stop() 66 | 67 | def test_ingest_csv(self): 68 | csv_path = (pathlib.Path(__file__) 69 | .absolute() 70 | .parent 71 | .parent 72 | .parent 73 | .joinpath("src/test/resources/persons.csv")) 74 | assert csv_path.exists() 75 | source = Source( 76 | format="csv", # the source database (mandatory) 77 | # the configuration options it will change for every source database (mandatory) 78 | options={ 79 | "header": "true", 80 | "path": str(csv_path) 81 | }, 82 | # a list of selected projected columns, it can be useful in order to eventually cast data, 83 | # apply Spark's UDFs and minimize the data movement from the source database (optional) 84 | columns=[ 85 | Column(name="person_id", alias="id"), 86 | Column(name="person_name", 
alias="name") 87 | ], 88 | where="", # a filter for the source dataset (optional) 89 | printSchema=True, # if you want to print the schema, useful for debug purposes (optional) 90 | show=5, # if you want show the source database, useful for debug purposes (optional) 91 | limit=10, # the amount of rows that you want to have from the source dataset (optional) 92 | # a dataframe partition configuration (optional) 93 | partition=Partition( 94 | number=-1, # the number of partions mandatory if you want to define partitions 95 | by="" # the field to partition (optional) 96 | ) 97 | ) 98 | # the target database configuration 99 | target = Target( 100 | format="org.neo4j.spark.DataSource", # the target database (mandatory) 101 | # the configuration options it will change for every source database (mandatory) 102 | options={ 103 | "labels": ":Person:Customer", 104 | "url": self.neo4j_container.get_connection_url(), 105 | "authentication.basic.username": Neo4jContainer.NEO4J_USER, 106 | "authentication.basic.password": Neo4jContainer.NEO4J_ADMIN_PASSWORD, 107 | "node.keys": "id" 108 | }, 109 | mode="Overwrite" 110 | ) 111 | 112 | config = JobConfig( 113 | name="Create Persons from CSV to Neo4j", 114 | conf={}, # a configuration map, every k/v binding will be insert as Spark Configuration 115 | hadoopConfiguration={}, 116 | # a configuration map, every k/v binding will be insert as Hadoop Configuration 117 | source=source, 118 | target=target, 119 | master="local" 120 | ) 121 | connector = Neo4jDWHConnector(self.spark, config) 122 | 123 | # this will ingest the data from source to target database 124 | connector.run() 125 | 126 | with self.neo4j_container.get_driver() as neo4j_driver: 127 | with neo4j_driver.session() as neo4j_session: 128 | result = neo4j_session.run("MATCH (n:Person:Customer) RETURN count(n) AS count").peek() 129 | assert result["count"] == 3 130 | 131 | 132 | if __name__ == "__main__": 133 | parse_arguments() 134 | unittest.main() 135 | -------------------------------------------------------------------------------- /src/main/scala/org/neo4j/dwh/connector/Neo4jDWHConnector.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector 2 | 3 | import org.apache.commons.cli.HelpFormatter 4 | import org.apache.commons.lang3.StringUtils 5 | import org.apache.spark.sql 6 | import org.apache.spark.sql.{DataFrame, SparkSession} 7 | import org.neo4j.dwh.connector.domain.{JobConfig, Source, Target} 8 | import org.neo4j.dwh.connector.generator.JobConfigGenerator 9 | import org.neo4j.dwh.connector.utils.CliUtils.JsonType 10 | import org.neo4j.dwh.connector.utils.{CliUtils, JobConfigUtils, Utils} 11 | 12 | import java.net.{MalformedURLException, URL} 13 | import java.util 14 | import java.util.Locale 15 | 16 | /** 17 | * @author Andrea Santurbano 18 | */ 19 | object Neo4jDWHConnector { 20 | 21 | def main(args: Array[String]) { 22 | val cli = CliUtils.parseArgs(args) 23 | if (CliUtils.hasHelp(cli)) { 24 | val fmt = new HelpFormatter() 25 | fmt.printHelp(CliUtils.helpText, CliUtils.options()) 26 | return 27 | } 28 | CliUtils.validateCli(cli) 29 | val pathAsString = cli.getOptionValue("p") 30 | val filePath = try { 31 | new URL(pathAsString) 32 | } catch { 33 | case mue: MalformedURLException if mue.getMessage.contains("no protocol") => new URL(s"file:$pathAsString") 34 | case t: Throwable => throw t 35 | } 36 | val isGenerateConfig = cli.hasOption("c") 37 | if (isGenerateConfig) { 38 | new JobConfigGenerator(cli).generate() 39 | } 
else { 40 | val jobs = JsonType.withName(cli 41 | .getOptionValue("ft", CliUtils.JsonType.SINGLE.toString.toUpperCase(Locale.ENGLISH)) 42 | .toUpperCase(Locale.ENGLISH)) match { 43 | case JsonType.ARRAY => JobConfig.fromSeq(filePath) 44 | case JsonType.SINGLE => Seq(JobConfig.from(filePath)) 45 | } 46 | jobs.foreach(job => new Neo4jDWHConnector(job).run(true)) 47 | } 48 | } 49 | } 50 | 51 | class Neo4jDWHConnector(session: SparkSession, job: JobConfig) { 52 | 53 | def this(jobConfig: JobConfig) = this(JobConfigUtils.toSparkSession(jobConfig), jobConfig) 54 | def this(session: SparkSession, jobConfigMap: util.Map[String, AnyRef]) = this(session, JobConfig.from(jobConfigMap)) 55 | 56 | def run(closeSession: Boolean = false): Unit = try { 57 | val dataFrame = read(job.source, session) 58 | write(job.target, dataFrame) 59 | } finally { 60 | if (closeSession) { 61 | session.close() 62 | } 63 | } 64 | 65 | private def read(source: Source, spark: SparkSession): DataFrame = { 66 | var dataFrame = spark.read.format(source.format) 67 | .options(Utils.enrichMap(source.options)) 68 | .load() 69 | if (StringUtils.isNotBlank(source.where)) { 70 | dataFrame = dataFrame.where(source.where) 71 | } 72 | if (!source.columns.isEmpty) { 73 | dataFrame = dataFrame.selectExpr(source.columns.map(_.toString).toArray : _*) 74 | } 75 | 76 | if (source.printSchema) { 77 | dataFrame.printSchema() 78 | } 79 | 80 | if (source.limit > 0) { 81 | dataFrame = dataFrame.limit(source.limit) 82 | } 83 | 84 | if (source.show > 0) { 85 | dataFrame.show(source.show) 86 | } 87 | 88 | if (source.partition.number > 0) { 89 | dataFrame = if (StringUtils.isBlank(source.partition.by)) { 90 | dataFrame.repartition(source.partition.number) 91 | } else { 92 | dataFrame.repartition(source.partition.number, new sql.Column(source.partition.by)) 93 | } 94 | } 95 | dataFrame 96 | } 97 | 98 | private def write(target: Target, df: DataFrame): Unit = { 99 | var dfWriter = df.write.format(target.format) 100 | .options(Utils.enrichMap(target.options)) 101 | if (StringUtils.isNotBlank(target.mode)) { 102 | dfWriter = dfWriter.mode(target.mode) 103 | } 104 | dfWriter.save() 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/main/scala/org/neo4j/dwh/connector/domain/JobConfig.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector.domain 2 | 3 | import com.fasterxml.jackson.core.`type`.TypeReference 4 | import org.apache.commons.lang3.StringUtils 5 | import org.neo4j.dwh.connector.utils.JSONUtils 6 | 7 | import java.io.File 8 | import java.net.{URI, URL} 9 | import java.util 10 | 11 | case class Column(name: String, alias: String = "") { 12 | override def toString: String = if (StringUtils.isBlank(alias)) name else s"$name AS $alias" 13 | } 14 | 15 | case class Partition(number: Int = 0, by: String = "") 16 | 17 | case class Source(format: String, 18 | options: Map[String, String], 19 | columns: Seq[Column] = Seq.empty, 20 | where: String = "", 21 | printSchema: Boolean, 22 | limit: Int = -1, 23 | show: Int = -1, 24 | partition: Partition = Partition()) 25 | 26 | case class Target(format: String, 27 | options: Map[String, String], 28 | mode: String) 29 | 30 | case class JobConfig(name: String = "Neo4j DWH Connector Job", 31 | master: String = "", 32 | conf: Map[String, String] = Map.empty, 33 | hadoopConfiguration: Map[String, String] = Map.empty, 34 | source: Source, 35 | target: Target) 36 | 37 | object JobConfig { 38 | 39 | 
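  // Builds a single JobConfig from a JSON payload. Accepted inputs: a JSON String, a File,
  // a URI or URL pointing to the JSON document, or a java.util.Map (the form used by the
  // Python bindings, which pass the configuration through Py4J).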
def from(data: AnyRef): JobConfig = data match { 40 | case json: String => JSONUtils.mapper.readValue(json, classOf[JobConfig]) 41 | case file: File => JSONUtils.mapper.readValue(file, classOf[JobConfig]) 42 | case uri: URI => JSONUtils.mapper.readValue(uri.toURL, classOf[JobConfig]) 43 | case url: URL => JSONUtils.mapper.readValue(url, classOf[JobConfig]) 44 | case map: util.Map[_, _] => JSONUtils.mapper.convertValue(map, classOf[JobConfig]) 45 | case _ => throw new IllegalArgumentException("Supported input types are String, File, URI, URL and java.util.Map") 46 | } 47 | 48 | def fromSeq(data: AnyRef): Seq[JobConfig] = data match { 49 | case json: String => JSONUtils.mapper.readValue(json, new TypeReference[Seq[JobConfig]] {}) 50 | case file: File => JSONUtils.mapper.readValue(file, new TypeReference[Seq[JobConfig]] {}) 51 | case uri: URI => JSONUtils.mapper.readValue(uri.toURL, new TypeReference[Seq[JobConfig]] {}) 52 | case url: URL => JSONUtils.mapper.readValue(url, new TypeReference[Seq[JobConfig]] {}) 53 | case list: util.List[_] => JSONUtils.mapper.convertValue(list, new TypeReference[Seq[JobConfig]] {}) 54 | case _ => throw new IllegalArgumentException("Supported input types are String, File, URI, URL and java.util.List") 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/org/neo4j/dwh/connector/generator/JobConfigGenerator.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector.generator 2 | 3 | import org.apache.commons.cli.CommandLine 4 | import org.neo4j.dwh.connector.domain.JobConfig 5 | import org.neo4j.dwh.connector.utils.{DatasourceOptions, JSONUtils} 6 | 7 | import java.io.File 8 | 9 | class JobConfigGenerator(private val cli: CommandLine) { 10 | 11 | def generate(): Unit = { 12 | val source = DatasourceOptions 13 | .withNameIgnoreCase(cli.getOptionValue("s")) 14 | 15 | val target = DatasourceOptions 16 | .withNameIgnoreCase(cli.getOptionValue("t")) 17 | 18 | val deps = (source.deps ++ target.deps) 19 | .map(dep => s" - `$dep`") 20 | .mkString("\n") 21 | 22 | val jobConfig = JobConfig( 23 | name = 24 | s""" 25 | | 26 | |In order to work the following dependencies are required: 27 | |$deps 28 | |""".stripMargin, 29 | source = source.toSource(), 30 | target = target.toTarget(), 31 | conf = (source.conf ++ target.conf), 32 | hadoopConfiguration = (source.hadoopConf ++ target.hadoopConf) 33 | ) 34 | 35 | JSONUtils.mapper 36 | .writerWithDefaultPrettyPrinter 37 | .writeValue(new File(cli.getOptionValue("p")), jobConfig) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/org/neo4j/dwh/connector/utils/CliUtils.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector.utils 2 | 3 | import org.apache.commons.cli.{BasicParser, CommandLine, Options} 4 | 5 | object CliUtils { 6 | object JsonType extends Enumeration { 7 | val SINGLE, ARRAY = Value 8 | } 9 | 10 | val helpText = """In case you're using it to generate the configuration stub: 11 | |java -jar neo4j-dwh-connector-.jar -c -s -t -p 12 | |In case you're using it with Spark Submit (from $SPARK_HOME): 13 | |./bin/spark-submit \ 14 | | --class org.neo4j.dwh.connector.Neo4jDWHConnector \ 15 | | --packages \ 16 | | --master spark://: \ 17 | | --deploy-mode cluster \ 18 | | --supervise \ 19 | | --executor-memory 20G \ 20 | | --total-executor-cores 100 \ 21 | | /path/to/neo4j-dwh-connector-.jar \ 22 | | -p
/path/to/dwh_job_config.json 23 | |""".stripMargin 24 | 25 | // In a Databricks Job environment DefaultParser doesn't work, 26 | // presumably because an older version of Apache Commons CLI is on the classpath, 27 | // so we're using BasicParser in order to make it work in both environments 28 | def parseArgs(args: Array[String]): CommandLine = new BasicParser().parse(options(), args) 29 | 30 | def hasHelp(cli: CommandLine): Boolean = cli.hasOption("h") 31 | 32 | def options(): Options = { 33 | val supportedDataSource = DatasourceOptions 34 | .values 35 | .map(_.toString) 36 | .map(name => s" - `$name`") 37 | .mkString("\n") 38 | new Options() 39 | .addOption("p", "path", true, """If used in combination with the `c` option it is the path 40 | |where the configuration file will be saved, otherwise it is the path 41 | |from which the configuration is read in order to start the Spark job. 42 | |""".stripMargin) 43 | .addOption("c", "config", false, """Generates a configuration stub that can be used with the DWH connector. 44 | |You need to define -s and -t options in order to 45 | |specify which are the source and target data sources. 46 | |""".stripMargin) 47 | .addOption("s", "source", true, s"""In combination with -c, it generates a stub configuration with the selected source database. 48 | |Supported Data sources are: 49 | |$supportedDataSource 50 | |""".stripMargin) 51 | .addOption("t", "target", true, s"""In combination with -c, it generates a stub configuration with the selected target database. 52 | |Supported Data sources are: 53 | |$supportedDataSource 54 | |""".stripMargin) 55 | .addOption("ft", "file_type", true, s"""The config file type: 56 | | - `${JsonType.SINGLE}` (default) means a single JSON document 57 | | - `${JsonType.ARRAY}` means that you're passing an array of JSON documents 58 | |""".stripMargin) 59 | .addOption("h", "help", false, "Prints the help") 60 | } 61 | 62 | def validateCli(cli: CommandLine): Unit = { 63 | val hasPath = cli.hasOption("p") 64 | if (!hasPath) { 65 | throw new IllegalArgumentException("Option -p is required") 66 | } 67 | val isGenerateConfig = cli.hasOption("c") 68 | if (isGenerateConfig) { 69 | val hasGenerateTarget = cli.hasOption("t") 70 | val hasGenerateSource = cli.hasOption("s") 71 | if (!hasGenerateSource || !hasGenerateTarget) { 72 | throw new IllegalArgumentException("You must define both the `-t` and `-s` options in combination with `-c`") 73 | } 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/org/neo4j/dwh/connector/utils/DatasourceOptions.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector.utils 2 | 3 | import org.neo4j.dwh.connector.domain.{Column, Partition, Source, Target} 4 | 5 | object DatasourceOptions extends Enumeration { 6 | case class DatasourceOptionsValue(options: Map[String, Any], deps: Array[String], 7 | conf: Map[String, String] = Map.empty, 8 | hadoopConf: Map[String, String] = Map.empty) extends super.Val { 9 | def toSource(): Source = JSONUtils.mapper.convertValue(options, classOf[Source]) 10 | def toTarget(): Target = JSONUtils.mapper.convertValue(options, classOf[Target]) 11 | } 12 | 13 | private val cols = Array(Column(""". 14 | |N.b.
`columns` field will be ignored in case 15 | |you're using it in the `target` field 16 | |""".stripMargin, "Alias to column, not mandatory")) 17 | 18 | private val partition = Partition(-1) 19 | 20 | def withNameIgnoreCase(name: String): DatasourceOptionsValue = this.values 21 | .filter(_.toString.equalsIgnoreCase(name)) 22 | .headOption 23 | .getOrElse(throw new NoSuchElementException(s"No value for $name")) 24 | .asInstanceOf[DatasourceOptionsValue] 25 | 26 | val Snowflake = DatasourceOptionsValue(Map( 27 | "format" -> "snowflake", 28 | "columns" -> cols, 29 | "where" -> " It will be ignored in case you're using it in the `target` field.", 30 | "mode" -> 31 | """ 32 | |N.b. It'll be ignored if you're reading data from Snowflake. 33 | |Please check supported save modes here: https://docs.snowflake.com/en/user-guide/spark-connector-use.html#moving-data-from-spark-to-snowflake 34 | |""".stripMargin, 35 | "options" -> Map( 36 | "_comment" -> 37 | """ 38 | |You can find the full list of Snowflake configuration properties here: 39 | |https://docs.snowflake.com/en/user-guide/spark-connector-use.html#setting-configuration-options-for-the-connector 40 | |""".stripMargin, 41 | "sfURL" -> ".snowflakecomputing.com", 42 | "sfUser" -> "", 43 | "sfPassword" -> "", 44 | "sfDatabase" -> "", 45 | "sfSchema" -> "", 46 | "sfWarehouse" -> "", 47 | "dbtable" -> ""), 48 | "partition" -> partition 49 | ), Array("net.snowflake:spark-snowflake_:", "net.snowflake:snowflake-jdbc:")) 50 | val Neo4j = DatasourceOptionsValue(Map( 51 | "format" -> "org.neo4j.spark.DataSource", 52 | "columns" -> cols, 53 | "where" -> ">Spark SQL filter to the Dataframe> It will be ignored in case you're using it in the `target` field.", 54 | "mode" -> 55 | """ 56 | |N.b. It'll be ignored if you're reading data from Neo4j. 57 | |Please check supported save modes here: https://neo4j.com/docs/spark/current/writing/#save-mode 58 | |""".stripMargin, 59 | "options" -> Map( 60 | "_comment" -> 61 | """ 62 | |You can find the full list of Neo4j configuration properties here: 63 | |https://neo4j.com/docs/spark/current/ 64 | |""".stripMargin, 65 | "labels" -> 66 | """. 67 | |In case of writing into Neo4j please see https://neo4j.com/docs/spark/current/writing/#write-node 68 | |In case of reading from Neo4j please see https://neo4j.com/docs/spark/current/reading/#read-node 69 | |""".stripMargin, 70 | "relationship" -> 71 | """ 72 | |N.b.
this field requires extra configuration please see 73 | | - In case of writing: https://neo4j.com/docs/spark/current/writing/#write-rel 74 | | - In case of reading: https://neo4j.com/docs/spark/current/reading/#read-rel 75 | |""".stripMargin, 76 | "query" -> 77 | """ 78 | |In case of writing into Neo4j please see https://neo4j.com/docs/spark/current/writing/#write-query 79 | |In case of reading from Neo4j please see https://neo4j.com/docs/spark/current/reading/#read-query 80 | |""".stripMargin, 81 | "url" -> "", 82 | "authentication.type" -> " Please see: https://neo4j.com/docs/spark/current/configuration/", 83 | "authentication.basic.username" -> "", 84 | "authentication.basic.password" -> ""), 85 | "partition" -> partition 86 | ), Array("org.neo4j:neo4j-connector-apache-spark_:")) 87 | val BigQuery = DatasourceOptionsValue(Map( 88 | "format" -> "bigquery", 89 | "columns" -> cols, 90 | "where" -> ">Spark SQL filter to the Dataframe> It will be ignored in case you're using it in the `target` field.", 91 | "mode" -> "", 92 | "options" -> Map( 93 | "_comment" -> 94 | """ 95 | |You can find the full list of BigQuery configuration properties here: 96 | |https://github.com/GoogleCloudDataproc/spark-bigquery-connector#properties 97 | |""".stripMargin, 98 | "path" -> "The BigQuery table in the format [[project:]dataset.]table", 99 | "credentials" -> "", 100 | "dataset" -> "The dataset containing the table. This option should be used with standard table and views, but not when loading query results." 101 | ), 102 | "partition" -> partition 103 | ), Array("com.google.cloud.spark:spark-bigquery-with-dependencies_:")) 104 | val RedShift_Community = DatasourceOptionsValue(Map( 105 | "format" -> "io.github.spark_redshift_community.spark.redshift", 106 | "columns" -> cols, 107 | "where" -> ">Spark SQL filter to the Dataframe> It will be ignored in case you're using it in the `target` field.", 108 | "mode" -> "", 109 | "options" -> Map( 110 | "_comment" -> 111 | """ 112 | |You can find the full list of RedShift configuration properties here: 113 | |https://github.com/spark-redshift-community/spark-redshift#parameters 114 | |""".stripMargin, 115 | "url" -> 116 | """A JDBC URL, of the format, jdbc:subprotocol://host:port/database?user=username&password=password 117 | |subprotocol can be postgresql or redshift, depending on which JDBC driver you have loaded. Note however that one Redshift-compatible driver must be on the classpath and match this URL. 118 | |host and port should point to the Redshift master node, so security groups and/or VPC will need to be configured to allow access from your driver application. 119 | |database identifies a Redshift database name 120 | |user and password are credentials to access the database, which must be embedded in this URL for JDBC, and your user account should have necessary privileges for the table being referenced. 121 | |""".stripMargin, 122 | "query" -> "The query to read from in Redshift (unless `dbtable` is specified)", 123 | "dbtable" -> "The table to create or read from in Redshift. This parameter is required when saving data back to Redshift.", 124 | "tempdir" -> "A writeable location in Amazon S3, to be used for unloaded data when reading and Avro data to be loaded into Redshift when writing. If you're using Redshift data source for Spark as part of a regular ETL pipeline, it can be useful to set a Lifecycle Policy on a bucket and use that as a temp location for this data." 
125 | ), 126 | "partition" -> partition 127 | ), 128 | Array("com.amazonaws:aws-java-sdk:", "com.amazon.redshift:redshift-jdbc42:", "org.apache.spark:spark-avro_:", "io.github.spark-redshift-community:spark-redshift_:"), 129 | Map.empty, 130 | Map("fs.s3.awsAccessKeyId" -> "YOUR_KEY_ID", "fs.s3.awsSecretAccessKey" -> "YOUR_SECRET_ACCESS_KEY") 131 | ) 132 | val RedShift_Databricks = DatasourceOptionsValue(Map( 133 | "format" -> "com.databricks.spark.redshift", 134 | "columns" -> cols, 135 | "where" -> ">Spark SQL filter to the Dataframe> It will be ignored in case you're using it in the `target` field.", 136 | "mode" -> "", 137 | "options" -> Map( 138 | "_comment" -> 139 | """ 140 | |You can find the full list of RedShift configuration properties here: 141 | |https://github.com/spark-redshift-community/spark-redshift#parameters 142 | |""".stripMargin, 143 | "url" -> 144 | """A JDBC URL, of the format, jdbc:subprotocol://host:port/database?user=username&password=password 145 | |subprotocol can be postgresql or redshift, depending on which JDBC driver you have loaded. Note however that one Redshift-compatible driver must be on the classpath and match this URL. 146 | |host and port should point to the Redshift master node, so security groups and/or VPC will need to be configured to allow access from your driver application. 147 | |database identifies a Redshift database name 148 | |user and password are credentials to access the database, which must be embedded in this URL for JDBC, and your user account should have necessary privileges for the table being referenced. 149 | |""".stripMargin, 150 | "query" -> "The query to read from in Redshift (unless `dbtable` is specified)", 151 | "dbtable" -> "The table to create or read from in Redshift. This parameter is required when saving data back to Redshift.", 152 | "tempdir" -> "A writeable location in Amazon S3, to be used for unloaded data when reading and Avro data to be loaded into Redshift when writing. If you're using Redshift data source for Spark as part of a regular ETL pipeline, it can be useful to set a Lifecycle Policy on a bucket and use that as a temp location for this data." 153 | ), 154 | "partition" -> partition 155 | ), 156 | Array("com.amazonaws:aws-java-sdk:", "com.amazon.redshift:redshift-jdbc42:", "org.apache.spark:spark-avro_:"), 157 | Map.empty, 158 | Map("fs.s3.awsAccessKeyId" -> "YOUR_KEY_ID", "fs.s3.awsSecretAccessKey" -> "YOUR_SECRET_ACCESS_KEY") 159 | ) 160 | val RedShift_JDBC = DatasourceOptionsValue(Map( 161 | "format" -> "jdbc", 162 | "columns" -> cols, 163 | "where" -> ">Spark SQL filter to the Dataframe> It will be ignored in case you're using it in the `target` field.", 164 | "mode" -> "", 165 | "options" -> Map( 166 | "_comment" -> 167 | """ 168 | |You can connect to RedShift in a non Databricks env also via JDBC. 169 | |Please refer to this documentation page: 170 | |https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html 171 | |""".stripMargin, 172 | "url" -> 173 | """A JDBC URL, of the format, jdbc:subprotocol://host:port/database?user=username&password=password 174 | |subprotocol can be postgresql or redshift, depending on which JDBC driver you have loaded. Note however that one Redshift-compatible driver must be on the classpath and match this URL. 175 | |host and port should point to the Redshift master node, so security groups and/or VPC will need to be configured to allow access from your driver application. 
176 | |database identifies a Redshift database name 177 | |user and password are credentials to access the database, which must be embedded in this URL for JDBC, and your user account should have necessary privileges for the table being referenced. 178 | |""".stripMargin, 179 | "query" -> "The query to read from in Redshift (unless `dbtable` is specified)", 180 | "dbtable" -> "The table to create or read from in Redshift. This parameter is required when saving data back to Redshift." 181 | ), 182 | "partition" -> partition 183 | ), Array.empty) 184 | val Synapse_Databricks = DatasourceOptionsValue(Map( 185 | "format" -> "com.databricks.spark.sqldw", 186 | "columns" -> cols, 187 | "where" -> ">Spark SQL filter to the Dataframe> It will be ignored in case you're using it in the `target` field.", 188 | "mode" -> "", 189 | "options" -> Map( 190 | "_comment" -> 191 | """ 192 | |You can find the full list of Azure Synapse Analytics configuration properties here: 193 | |https://docs.microsoft.com/en-us/azure/databricks/data/data-sources/azure/synapse-analytics#parameters 194 | |""".stripMargin, 195 | "url" -> "A JDBC URL with sqlserver set as the subprotocol. It is recommended to use the connection string provided by Azure portal. Setting\nencrypt=true is strongly recommended, because it enables SSL encryption of the JDBC connection. If user and password are set separately, you do not need to include them in the URL.", 196 | "tempDir" -> "A wasbs URI. We recommend you use a dedicated Blob storage container for the Azure Synapse.", 197 | "forwardSparkAzureStorageCredentials" -> 198 | """If true, the library automatically discovers the credentials that Spark is using to connect to the Blob storage container and forwards those credentials to Azure Synapse over JDBC. These credentials are sent as part of the JDBC query. Therefore it is strongly recommended that you enable SSL encryption of the JDBC connection when you use this option. 199 | |The current version of Azure Synapse connector requires (exactly) one of forwardSparkAzureStorageCredentials, enableServicePrincipalAuth, or useAzureMSI to be explicitly set to true. 200 | |The previously supported forward_spark_azure_storage_credentials variant is deprecated and will be ignored in future releases. Use the “camel case” name instead. 201 | |""".stripMargin, 202 | "query" -> "The query to read from in Synapse (unless `dbtable` is specified)", 203 | "dbTable" -> 204 | """The table to create or read from in Azure Synapse. This parameter is required when saving data back to Azure Synapse. 205 | |You can also use {SCHEMA NAME}.{TABLE NAME} to access a table in a given schema. If schema name is not provided, the default schema associated with the JDBC user is used. 206 | |The previously supported dbtable variant is deprecated and will be ignored in future releases. Use the “camel case” name instead.""".stripMargin 207 | ), 208 | "partition" -> partition 209 | ), Array("com.microsoft.azure:spark-mssql-connector_:"), Map( 210 | "fs.azure.account.key..dfs.core.windows.net" -> "" 211 | )) 212 | val Synapse_JDBC = DatasourceOptionsValue(Map( 213 | "format" -> "jdbc", 214 | "columns" -> cols, 215 | "where" -> ">Spark SQL filter to the Dataframe> It will be ignored in case you're using it in the `target` field.", 216 | "mode" -> "", 217 | "options" -> Map( 218 | "_comment" -> 219 | """ 220 | |You can connect to Synapse in a non Databricks env only via JDBC driver. 
221 | |Please refer to this documentation page: 222 | |https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html 223 | |""".stripMargin, 224 | "url" -> "A JDBC URL with sqlserver set as the subprotocol. It is recommended to use the connection string provided by Azure portal. Setting\nencrypt=true is strongly recommended, because it enables SSL encryption of the JDBC connection. If user and password are set separately, you do not need to include them in the URL.", 225 | "query" -> "The query to read from in Synapse (unless `dbtable` is specified)", 226 | "dbtable" -> 227 | """The table to create or read from in Azure Synapse. This parameter is required when saving data back to Azure Synapse. 228 | |You can also use {SCHEMA NAME}.{TABLE NAME} to access a table in a given schema. If schema name is not provided, the default schema associated with the JDBC user is used.""".stripMargin 229 | ), 230 | "partition" -> partition 231 | ), Array.empty) 232 | } 233 | -------------------------------------------------------------------------------- /src/main/scala/org/neo4j/dwh/connector/utils/JSONUtils.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector.utils 2 | 3 | import com.fasterxml.jackson.databind.DeserializationFeature 4 | import com.fasterxml.jackson.databind.json.JsonMapper 5 | import com.fasterxml.jackson.module.scala.DefaultScalaModule 6 | 7 | object JSONUtils { 8 | 9 | val mapper = JsonMapper.builder() 10 | .addModule(DefaultScalaModule) 11 | .disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES) 12 | .build() 13 | 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/org/neo4j/dwh/connector/utils/JobConfigUtils.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector.utils 2 | 3 | import org.apache.commons.lang3.StringUtils 4 | import org.apache.spark.sql.SparkSession 5 | import org.neo4j.dwh.connector.domain.JobConfig 6 | 7 | object JobConfigUtils { 8 | 9 | def toSparkSession(jobConfig: JobConfig): SparkSession = { 10 | val sessionBuilder = SparkSession.builder 11 | .appName(jobConfig.name) 12 | 13 | if (StringUtils.isNotBlank(jobConfig.master)) { 14 | sessionBuilder.master(jobConfig.master) 15 | } 16 | 17 | jobConfig.conf.foreach(t => sessionBuilder.config(t._1, t._2)) 18 | 19 | val session = sessionBuilder.getOrCreate() 20 | jobConfig.hadoopConfiguration.foreach(t => session.sparkContext.hadoopConfiguration.set(t._1, t._2)) 21 | 22 | session 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/org/neo4j/dwh/connector/utils/Utils.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector.utils 2 | 3 | import org.apache.commons.io.FileUtils 4 | 5 | import java.io.File 6 | import java.nio.charset.Charset 7 | import scala.util.matching.Regex 8 | import scala.util.{Properties, Try} 9 | 10 | object Utils { 11 | 12 | private val envPattern = """\$\{env:(.*)\}""".r 13 | private val filePattern = """\$\{file:(.*)\}""".r 14 | 15 | def enrichMap(map: Map[String, String]): Map[String, String] = map 16 | .map(t => (t._1, Try(findAllInRegex(envPattern, t._2)) 17 | .map(Properties.envOrElse(_, t._2)) 18 | .orElse( 19 | Try(findAllInRegex(filePattern, t._2)) 20 | .map(path => FileUtils.readFileToString(new File(path), Charset.forName("UTF-8"))) 21 | ) 22 | .getOrElse(t._2))) 
23 | 24 | private def findAllInRegex(r: Regex, str: String): String = { 25 | // this is a workaround as the same regexp works in Scala 2.12 and 2.13 but not in 2.11 26 | if (Properties.versionString.startsWith("version 2.11") && str.matches(r.regex)) { 27 | val splits = r.regex.split("""\(\.\*\)""") 28 | str.replaceAll(splits(0), "") 29 | .replaceAll(splits(1), "") 30 | } else { 31 | (r findAllIn str).group(1) 32 | } 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/test/resources/JobConfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Create Persons from CSV to Neo4j", 3 | "master": "local", 4 | "_comment": "The `source` field is a general field that manages the source database. Is basically where we read the data", 5 | "source": { 6 | "_comment": "The `format` field manages the connector datasource", 7 | "format": "csv", 8 | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 9 | "options": { 10 | "path": "" 11 | }, 12 | "_comment": "The `columns` field manages projection of the dataframe, `name` is the column name, `alias` is the name that you want to give (not mandatory)", 13 | "columns": [ 14 | { 15 | "name": "person_id", 16 | "alias": "id" 17 | }, 18 | { 19 | "name": "person_name", 20 | "alias": "name" 21 | } 22 | ], 23 | "_comment": "The `where` field manages filters on the datasource (not mandatory)", 24 | "where": "person_surname = 'Santurbano'" 25 | }, 26 | "_comment": "The `target` field is a general field that manages the target database. Is basically where we write the data that has been read in the field `source`", 27 | "target": { 28 | "_comment": "The `format` field manages the connector datasource", 29 | "format": "org.neo4j.spark.DataSource", 30 | "_comment": "The `mode` is the save mode of the writing connector", 31 | "mode": "Overwrite", 32 | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 33 | "options": { 34 | "labels": ":Person:Customer", 35 | "node.keys": "id" 36 | } 37 | } 38 | } -------------------------------------------------------------------------------- /src/test/resources/neo4j-dwh-connector.properties: -------------------------------------------------------------------------------- 1 | neo4j.version=${neo4j.version} 2 | -------------------------------------------------------------------------------- /src/test/resources/persons.csv: -------------------------------------------------------------------------------- 1 | person_id,person_name,person_surname 2 | 1,Andrea,Santurbano 3 | 2,Federico,Santurbano 4 | 3,Mario,Draghi -------------------------------------------------------------------------------- /src/test/resources/query.cyp: -------------------------------------------------------------------------------- 1 | CREATE (p:Person {name: 'Andrea Santurbano'}) -------------------------------------------------------------------------------- /src/test/resources/snowflake.to.neo4j.stub.json: -------------------------------------------------------------------------------- 1 | { 2 | "name" : "\n\nIn order to work the following dependencies are required:\n - `net.snowflake:spark-snowflake_:`\n - `net.snowflake:snowflake-jdbc:`\n - `org.neo4j:neo4j-connector-apache-spark_:`\n", 3 | "master" : "", 4 | "conf" : { }, 5 | "hadoopConfiguration" : { }, 6 | "source" : { 7 | "format" : "snowflake", 8 | "options" : { 9 | 
"_comment" : "\nYou can find the full list of Snowflake configuration properties here:\nhttps://docs.snowflake.com/en/user-guide/spark-connector-use.html#setting-configuration-options-for-the-connector\n", 10 | "sfSchema" : "", 11 | "sfPassword" : "", 12 | "sfUser" : "", 13 | "sfWarehouse" : "", 14 | "dbtable" : "", 15 | "sfDatabase" : "", 16 | "sfURL" : ".snowflakecomputing.com" 17 | }, 18 | "columns" : [ { 19 | "name" : ".\nN.b. `columns` field will be ignored in case\nyou're using it in the `target` field\n", 20 | "alias" : "Alias to column, not mandatory" 21 | } ], 22 | "where" : " It will be ignored in case you're using it in the `target` field.", 23 | "printSchema" : false, 24 | "limit" : -1, 25 | "show" : -1, 26 | "partition" : { 27 | "number" : -1, 28 | "by" : "" 29 | } 30 | }, 31 | "target" : { 32 | "format" : "org.neo4j.spark.DataSource", 33 | "options" : { 34 | "_comment" : "\nYou can find the full list of Neo4j configuration properties here:\nhttps://neo4j.com/docs/spark/current/\n", 35 | "url" : "", 36 | "authentication.type" : " Please see: https://neo4j.com/docs/spark/current/configuration/", 37 | "authentication.basic.username" : "", 38 | "authentication.basic.password" : "", 39 | "query" : "\nIn case of writing into Neo4j please see https://neo4j.com/docs/spark/current/writing/#write-query\nIn case of reading from Neo4j please see https://neo4j.com/docs/spark/current/reading/#read-query\n", 40 | "relationship" : "\nN.b. this field requires extra configuration please see\n - In case of writing: https://neo4j.com/docs/spark/current/writing/#write-rel\n - In case of reading: https://neo4j.com/docs/spark/current/reading/#read-rel\n", 41 | "labels" : ".\nIn case of writing into Neo4j please see https://neo4j.com/docs/spark/current/writing/#write-node\nIn case of reading from Neo4j please see https://neo4j.com/docs/spark/current/reading/#read-node\n" 42 | }, 43 | "mode" : "\nN.b. 
It'll be ignored if you're reading data from Neo4j.\nPlease check supported save modes here: https://neo4j.com/docs/spark/current/writing/#save-mode\n" 44 | } 45 | } -------------------------------------------------------------------------------- /src/test/scala/org/neo4j/dwh/connector/Neo4jDWHConnectorIT.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector 2 | 3 | import org.apache.commons.io.FileUtils 4 | import org.apache.spark.sql.SparkSession 5 | import org.junit.{AfterClass, Assert, Assume, BeforeClass, Test} 6 | import org.neo4j.driver.GraphDatabase 7 | import org.neo4j.dwh.connector.Neo4jDWHConnectorIT.neo4jContainer 8 | import org.neo4j.dwh.connector.domain.JobConfig 9 | import org.testcontainers.containers.Neo4jContainer 10 | import org.testcontainers.utility.DockerImageName 11 | 12 | import java.io.File 13 | import java.nio.charset.Charset 14 | import scala.collection.JavaConverters._ 15 | import scala.util.Properties 16 | 17 | object Neo4jDWHConnectorIT { 18 | private val properties = new java.util.Properties() 19 | properties.load(Thread.currentThread().getContextClassLoader().getResourceAsStream("neo4j-dwh-connector.properties")) 20 | 21 | val neo4jContainer = new Neo4jContainer(DockerImageName.parse(s"neo4j:${properties.getProperty("neo4j.version")}")) 22 | .withNeo4jConfig("dbms.security.auth_enabled", "false") 23 | .asInstanceOf[Neo4jContainer[_]] 24 | 25 | @BeforeClass 26 | def setUpContainer(): Unit = { 27 | neo4jContainer.start() 28 | } 29 | 30 | @AfterClass 31 | def teardownContainer(): Unit = { 32 | neo4jContainer.stop() 33 | } 34 | } 35 | 36 | class Neo4jDWHConnectorIT { 37 | 38 | private def createPersons(numPersons: Int) = { 39 | val driver = GraphDatabase.driver(neo4jContainer.getBoltUrl) 40 | val neo4jSession = driver.session() 41 | try { 42 | neo4jSession.run( 43 | """UNWIND RANGE(1, $numPersons) AS ID 44 | |MERGE (p:Person:Customer{id: ID, name: 'Name ' + ID, surname: 'Surname ' + ID, age: 10 + ID}) 45 | |RETURN count(p) AS count 46 | |""".stripMargin, Map[String, AnyRef]("numPersons" -> numPersons.asInstanceOf[AnyRef]).asJava) 47 | .consume() 48 | } finally { 49 | neo4jSession.close() 50 | driver.close() 51 | } 52 | } 53 | 54 | @Test 55 | def shouldImportCSVIntoNeo4j(): Unit = { 56 | val jsonConfig = 57 | s""" 58 | |{ 59 | | "name": "Create Persons from CSV to Neo4j", 60 | | "master": "local", 61 | | "_comment": "The `source` field is a general field that manages the source database. 
Is basically where we read the data", 62 | | "source": { 63 | | "_comment": "The `format` field manages the connector datasource", 64 | | "format": "csv", 65 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 66 | | "options": { 67 | | "header": "true", 68 | | "path": "${Thread 69 | .currentThread 70 | .getContextClassLoader 71 | .getResource("persons.csv") 72 | .getPath}" 73 | | }, 74 | | "_comment": "The `columns` field manages projection of the dataframe, `name` is the column name, `alias` is the name that you want to give (not mandatory)", 75 | | "columns": [ 76 | | { 77 | | "name": "person_id", 78 | | "alias": "id" 79 | | }, 80 | | { 81 | | "name": "person_name", 82 | | "alias": "name" 83 | | } 84 | | ], 85 | | "_comment": "The `where` field manages filters on the datasource (not mandatory)", 86 | | "where": "person_surname = 'Santurbano'" 87 | | }, 88 | | "_comment": "The `target` field is a general field that manages the target database. Is basically where we write the data that has been read in the field `source`", 89 | | "target": { 90 | | "_comment": "The `format` field manages the connector datasource", 91 | | "format": "org.neo4j.spark.DataSource", 92 | | "_comment": "The `mode` is the save mode of the writing connector", 93 | | "mode": "Overwrite", 94 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 95 | | "options": { 96 | | "labels": ":Person:Customer", 97 | | "url": "${neo4jContainer.getBoltUrl}", 98 | | "node.keys": "id" 99 | | } 100 | | } 101 | |} 102 | |""".stripMargin 103 | 104 | runJob(jsonConfig) 105 | val driver = GraphDatabase.driver(neo4jContainer.getBoltUrl) 106 | val neo4jSession = driver.session() 107 | try { 108 | val count = neo4jSession.run( 109 | """ 110 | |MATCH (p:Person:Customer) 111 | |WHERE p.name IN ['Andrea', 'Federico'] 112 | |RETURN count(p) AS count 113 | |""".stripMargin) 114 | .single() 115 | .get(0) 116 | .asLong() 117 | Assert.assertEquals(2L, count) 118 | } finally { 119 | neo4jSession.close() 120 | driver.close() 121 | } 122 | } 123 | 124 | @Test 125 | def shouldWriteCSVFromNeo4j(): Unit = { 126 | val csvPath = Properties.propOrElse("java.io.tmpdir", "").concat("/from-neo4j") 127 | 128 | val driver = GraphDatabase.driver(neo4jContainer.getBoltUrl) 129 | val neo4jSession = driver.session() 130 | try { 131 | neo4jSession.run( 132 | """ 133 | |UNWIND range(1, 2) AS id 134 | |MERGE (p:Person:Customer {id: id, name: 'Name For Id ' + id}) 135 | |RETURN p 136 | |""".stripMargin) 137 | .consume() 138 | } finally { 139 | neo4jSession.close() 140 | driver.close() 141 | } 142 | 143 | val jsonConfig = 144 | s""" 145 | |{ 146 | | "name": "Create Persons from CSV to Neo4j", 147 | | "master": "local", 148 | | "_comment": "The `source` field is a general field that manages the source database. 
Is basically where we read the data", 149 | | "source": { 150 | | "_comment": "The `format` field manages the connector datasource", 151 | | "format": "org.neo4j.spark.DataSource", 152 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 153 | | "options": { 154 | | "labels": ":Person:Customer", 155 | | "url": "${neo4jContainer.getBoltUrl}" 156 | | }, 157 | | "_comment": "The `columns` field manages projection of the dataframe, `name` is the column name, `alias` is the name that you want to give (not mandatory)", 158 | | "columns": [ 159 | | { 160 | | "name": "id" 161 | | }, 162 | | { 163 | | "name": "name" 164 | | } 165 | | ] 166 | | }, 167 | | "_comment": "The `target` field is a general field that manages the target database. Is basically where we write the data that has been read in the field `source`", 168 | | "target": { 169 | | "_comment": "The `format` field manages the connector datasource", 170 | | "format": "csv", 171 | | "_comment": "The `mode` is the save mode of the writing connector", 172 | | "mode": "Overwrite", 173 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 174 | | "options": { 175 | | "header": "true", 176 | | "path": "$csvPath" 177 | | } 178 | | } 179 | |} 180 | |""".stripMargin 181 | 182 | runJob(jsonConfig) 183 | 184 | val csvFile = new File(csvPath) 185 | .listFiles() 186 | .filter(_.isFile) 187 | .filter(_.getName.endsWith("csv"))(0) 188 | 189 | val actual = FileUtils.readFileToString(csvFile, Charset.forName("UTF-8")) 190 | val expected = 191 | """id,name 192 | |1,Name For Id 1 193 | |2,Name For Id 2 194 | |""".stripMargin 195 | Assert.assertEquals(expected, actual) 196 | } 197 | 198 | @Test 199 | def shouldImportSnowflakeIntoNeo4j(): Unit = { 200 | val snowflakeschema = Properties.envOrNone("SNOWFLAKE_SCHEMA") 201 | Assume.assumeFalse(snowflakeschema.isEmpty) 202 | val snowflakeuser = Properties.envOrNone("SNOWFLAKE_USER") 203 | Assume.assumeFalse(snowflakeuser.isEmpty) 204 | val snowflakepassword = Properties.envOrNone("SNOWFLAKE_PASSWORD") 205 | Assume.assumeFalse(snowflakepassword.isEmpty) 206 | val snowflakedatabase = Properties.envOrNone("SNOWFLAKE_DATABASE") 207 | Assume.assumeFalse(snowflakedatabase.isEmpty) 208 | val snowflakeurl = Properties.envOrNone("SNOWFLAKE_URL") 209 | Assume.assumeFalse(snowflakeurl.isEmpty) 210 | val snowflaketable = Properties.envOrNone("SNOWFLAKE_TABLE") 211 | Assume.assumeFalse(snowflaketable.isEmpty) 212 | val jsonConfig = 213 | s""" 214 | |{ 215 | | "name": "Create Customers from Snowflake to Neo4j", 216 | | "master": "local", 217 | | "_comment": "The `source` field is a general field that manages the source database. 
Is basically where we read the data", 218 | | "source": { 219 | | "_comment": "The `format` field manages the connector datasource", 220 | | "format" : "snowflake", 221 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 222 | | "options" : { 223 | | "sfSchema" : "${snowflakeschema.get}", 224 | | "sfPassword" : "${snowflakepassword.get}", 225 | | "sfUser" : "${snowflakeuser.get}", 226 | | "dbtable" : "${snowflaketable.get}", 227 | | "sfDatabase" : "${snowflakedatabase.get}", 228 | | "sfURL" : "${snowflakeurl.get}" 229 | | }, 230 | | "_comment": "The `where` field manages filters on the datasource (not mandatory)", 231 | | "where": "C_CUSTKEY <= 10", 232 | | "_comments": "The `columns` field manages the projection of Dataframe columns", 233 | | "columns": [ 234 | | { "name": "CAST(C_ACCTBAL AS DOUBLE)", "alias": "C_ACCTBAL" }, 235 | | { "name": "C_ADDRESS" }, 236 | | { "name": "C_COMMENT" }, 237 | | { "name": "CAST(C_CUSTKEY AS LONG)", "alias": "C_CUSTKEY" }, 238 | | { "name": "C_MKTSEGMENT" }, 239 | | { "name": "C_NAME" }, 240 | | { "name": "CAST(C_NATIONKEY AS LONG)", "alias": "C_NATIONKEY" }, 241 | | { "name": "C_PHONE" } 242 | | ] 243 | | }, 244 | | "_comment": "The `target` field is a general field that manages the target database. Is basically where we write the data that has been read in the field `source`", 245 | | "target": { 246 | | "_comment": "The `format` field manages the connector datasource", 247 | | "format": "org.neo4j.spark.DataSource", 248 | | "_comment": "The `mode` is the save mode of the writing connector", 249 | | "mode": "Overwrite", 250 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 251 | | "options": { 252 | | "labels": ":Person:Customer", 253 | | "url": "${neo4jContainer.getBoltUrl}", 254 | | "node.keys": "C_CUSTKEY" 255 | | } 256 | | } 257 | |} 258 | |""".stripMargin 259 | 260 | runJob(jsonConfig) 261 | val driver = GraphDatabase.driver(neo4jContainer.getBoltUrl) 262 | val neo4jSession = driver.session() 263 | try { 264 | val count = neo4jSession.run( 265 | """ 266 | |MATCH (p:Person:Customer) 267 | |RETURN count(p) AS count 268 | |""".stripMargin) 269 | .single() 270 | .get(0) 271 | .asLong() 272 | Assert.assertEquals(10L, count) 273 | } finally { 274 | neo4jSession.close() 275 | driver.close() 276 | } 277 | } 278 | 279 | @Test 280 | def shouldImportNeo4jIntoSnowflake(): Unit = { 281 | val snowflakeschema = Properties.envOrNone("SNOWFLAKE_SCHEMA") 282 | Assume.assumeFalse(snowflakeschema.isEmpty) 283 | val snowflakeuser = Properties.envOrNone("SNOWFLAKE_USER") 284 | Assume.assumeFalse(snowflakeuser.isEmpty) 285 | val snowflakepassword = Properties.envOrNone("SNOWFLAKE_PASSWORD") 286 | Assume.assumeFalse(snowflakepassword.isEmpty) 287 | val snowflakedatabase = Properties.envOrNone("SNOWFLAKE_DATABASE") 288 | Assume.assumeFalse(snowflakedatabase.isEmpty) 289 | val snowflakeurl = Properties.envOrNone("SNOWFLAKE_URL") 290 | Assume.assumeFalse(snowflakeurl.isEmpty) 291 | val snowflaketable = Properties.envOrNone("SNOWFLAKE_TABLE") 292 | Assume.assumeFalse(snowflaketable.isEmpty) 293 | val numPersons = 10 294 | createPersons(numPersons) 295 | val jsonConfig = 296 | s""" 297 | |{ 298 | | "name": "Create Person from Neo4j to Snowflake", 299 | | "master": "local", 300 | | "_comment": "The `source` field is a general field that manages the source database. 
Is basically where we read the data", 301 | | "source": { 302 | | "_comment": "The `format` field manages the connector datasource", 303 | | "format": "org.neo4j.spark.DataSource", 304 | | "options": { 305 | | "labels": ":Person:Customer", 306 | | "url": "${neo4jContainer.getBoltUrl}" 307 | | }, 308 | | "columns": [ 309 | | { "name": "ID" }, 310 | | { "name": "NAME" }, 311 | | { "name": "SURNAME" }, 312 | | { "name": "AGE" } 313 | | ] 314 | | }, 315 | | "_comment": "The `target` field is a general field that manages the target database. Is basically where we write the data that has been read in the field `source`", 316 | | "target": { 317 | | "_comment": "The `format` field manages the connector datasource", 318 | | "format" : "snowflake", 319 | | "_comment": "The `mode` is the save mode of the writing connector", 320 | | "mode": "Append", 321 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 322 | | "options" : { 323 | | "sfSchema" : "${snowflakeschema.get}", 324 | | "sfPassword" : "${snowflakepassword.get}", 325 | | "sfUser" : "${snowflakeuser.get}", 326 | | "dbtable" : "${snowflaketable.get}", 327 | | "sfDatabase" : "${snowflakedatabase.get}", 328 | | "sfURL" : "${snowflakeurl.get}" 329 | | } 330 | | } 331 | |} 332 | |""".stripMargin 333 | 334 | runJob(jsonConfig) 335 | 336 | val count = SparkSession.builder() 337 | .master("local[*]") 338 | .getOrCreate() 339 | .read 340 | .format("snowflake") 341 | .option("sfSchema", snowflakeschema.get) 342 | .option("sfPassword", snowflakepassword.get) 343 | .option("sfUser", snowflakeuser.get) 344 | .option("dbtable", snowflaketable.get) 345 | .option("sfDatabase", snowflakedatabase.get) 346 | .option("sfURL", snowflakeurl.get) 347 | .load() 348 | .count() 349 | Assert.assertEquals(numPersons.toLong, count) 350 | } 351 | 352 | @Test 353 | def shouldImportBigQueryIntoNeo4j(): Unit = { 354 | val googleprojectid = Properties.envOrNone("GOOGLE_PROJECT_ID") 355 | Assume.assumeFalse(googleprojectid.isEmpty) 356 | val googlecredentialsjson = Properties.envOrNone("GOOGLE_CREDENTIALS_JSON") 357 | Assume.assumeFalse(googlecredentialsjson.isEmpty) 358 | val jsonConfig = 359 | s""" 360 | |{ 361 | | "name": "Create ingest BigQuery's Stackoverflow data into Neo4j", 362 | | "master": "local", 363 | | "_comment": "The `source` field is a general field that manages the source database. Is basically where we read the data", 364 | | "source": { 365 | | "_comment": "The `format` field manages the connector datasource", 366 | | "format" : "bigquery", 367 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 368 | | "options": { 369 | | "table": "bigquery-public-data.stackoverflow.posts_questions", 370 | | "parentProject": "${googleprojectid.get}", 371 | | "credentialsFile": "${googlecredentialsjson.get}" 372 | | }, 373 | | "_comment": "The `where` field manages filters on the datasource (not mandatory)", 374 | | "where": "id <= 10", 375 | | "_comments": "The `columns` field manages the projection of Dataframe columns", 376 | | "columns": [ 377 | | { "name": "ID" }, 378 | | { "name": "TITLE" }, 379 | | { "name": "BODY" } 380 | | ] 381 | | }, 382 | | "_comment": "The `target` field is a general field that manages the target database. 
Is basically where we write the data that has been read in the field `source`", 383 | | "target": { 384 | | "_comment": "The `format` field manages the connector datasource", 385 | | "format": "org.neo4j.spark.DataSource", 386 | | "_comment": "The `mode` is the save mode of the writing connector", 387 | | "mode": "Overwrite", 388 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 389 | | "options": { 390 | | "labels": ":Answer", 391 | | "url": "${neo4jContainer.getBoltUrl}", 392 | | "node.keys": "ID" 393 | | } 394 | | } 395 | |} 396 | |""".stripMargin 397 | 398 | runJob(jsonConfig) 399 | val driver = GraphDatabase.driver(neo4jContainer.getBoltUrl) 400 | val neo4jSession = driver.session() 401 | try { 402 | val count = neo4jSession.run( 403 | """ 404 | |MATCH (p:Answer) 405 | |RETURN count(p) AS count 406 | |""".stripMargin) 407 | .single() 408 | .get(0) 409 | .asLong() 410 | Assert.assertEquals(3L, count) 411 | } finally { 412 | neo4jSession.close() 413 | driver.close() 414 | } 415 | } 416 | 417 | @Test 418 | def shouldImportNeo4jIntoBigQuery(): Unit = { 419 | val googleprojectid = Properties.envOrNone("GOOGLE_PROJECT_ID") 420 | Assume.assumeFalse(googleprojectid.isEmpty) 421 | val googlecredentialsjson = Properties.envOrNone("GOOGLE_CREDENTIALS_JSON") 422 | Assume.assumeFalse(googlecredentialsjson.isEmpty) 423 | val googlebigquerytable = Properties.envOrNone("GOOGLE_BIGQUERY_TABLE") 424 | Assume.assumeFalse(googlebigquerytable.isEmpty) 425 | val numPersons = 10 426 | createPersons(numPersons) 427 | val jsonConfig = 428 | s""" 429 | |{ 430 | | "name": "Create ingest BigQuery's Stackoverflow data into Neo4j", 431 | | "master": "local", 432 | | "_comment": "The `source` field is a general field that manages the source database. Is basically where we read the data", 433 | | "source": { 434 | | "_comment": "The `format` field manages the connector datasource", 435 | | "format": "org.neo4j.spark.DataSource", 436 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 437 | | "options": { 438 | | "labels": ":Person", 439 | | "url": "${neo4jContainer.getBoltUrl}" 440 | | }, 441 | | "columns": [ 442 | | { "name": "ID" }, 443 | | { "name": "NAME" }, 444 | | { "name": "SURNAME" }, 445 | | { "name": "AGE" } 446 | | ] 447 | | }, 448 | | "_comment": "The `target` field is a general field that manages the target database. 
Is basically where we write the data that has been read in the field `source`", 449 | | "target": { 450 | | "_comment": "The `format` field manages the connector datasource", 451 | | "format" : "bigquery", 452 | | "_comment": "The `mode` is the save mode of the writing connector", 453 | | "mode": "Overwrite", 454 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 455 | | "options" : { 456 | | "table": "${googlebigquerytable.get}", 457 | | "parentProject": "${googleprojectid.get}", 458 | | "credentialsFile": "${googlecredentialsjson.get}" 459 | | } 460 | | } 461 | |} 462 | |""".stripMargin 463 | 464 | runJob(jsonConfig) 465 | 466 | val count = SparkSession.builder() 467 | .master("local[*]") 468 | .getOrCreate() 469 | .read 470 | .format("bigquery") 471 | .option("table", googlebigquerytable.get) 472 | .option("parentProject", googleprojectid.get) 473 | .option("credentialsFile", googlecredentialsjson.get) 474 | .load() 475 | .count() 476 | Assert.assertEquals(numPersons.toLong, count) 477 | } 478 | 479 | @Test 480 | def shouldImportRedShiftIntoNeo4j(): Unit = { 481 | val awsredshifturl = Properties.envOrNone("AWS_REDSHIFT_URL") 482 | Assume.assumeFalse(awsredshifturl.isEmpty) 483 | val awsredshifttable = Properties.envOrNone("AWS_REDSHIFT_TABLE") 484 | Assume.assumeFalse(awsredshifttable.isEmpty) 485 | val awsiamrole = Properties.envOrNone("AWS_IAM_ROLE") 486 | Assume.assumeFalse(awsiamrole.isEmpty) 487 | val awss3tmpdir = Properties.envOrNone("AWS_S3_TMPDIR") 488 | Assume.assumeFalse(awss3tmpdir.isEmpty) 489 | val awss3accessid = Properties.envOrNone("AWS_ACCESS_KEY") 490 | Assume.assumeFalse(awss3accessid.isEmpty) 491 | val awss3accessky = Properties.envOrNone("AWS_SECRET_ACCESS_KEY") 492 | Assume.assumeFalse(awss3accessky.isEmpty) 493 | val jsonConfig = 494 | s""" 495 | |{ 496 | | "name": "Create ingest RedShift data into Neo4j", 497 | | "master": "local", 498 | | "hadoopConfiguration": { 499 | | "fs.s3a.access.key": "${awss3accessid.get}", 500 | | "fs.s3a.secret.key": "${awss3accessky.get}" 501 | | }, 502 | | "_comment": "The `source` field is a general field that manages the source database. Is basically where we read the data", 503 | | "source": { 504 | | "_comment": "The `format` field manages the connector datasource", 505 | | "format" : "io.github.spark_redshift_community.spark.redshift", 506 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 507 | | "options": { 508 | | "url": "${awsredshifturl.get}", 509 | | "dbtable": "${awsredshifttable.get}", 510 | | "tempdir": "${awss3tmpdir.get}", 511 | | "forward_spark_s3_credentials": "true" 512 | | }, 513 | | "_comment": "The `where` field manages filters on the datasource (not mandatory)", 514 | | "where": "userid <= 10" 515 | | }, 516 | | "_comment": "The `target` field is a general field that manages the target database. 
Is basically where we write the data that has been read in the field `source`", 517 | | "target": { 518 | | "_comment": "The `format` field manages the connector datasource", 519 | | "format": "org.neo4j.spark.DataSource", 520 | | "_comment": "The `mode` is the save mode of the writing connector", 521 | | "mode": "Overwrite", 522 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 523 | | "options": { 524 | | "labels": ":Person", 525 | | "url": "${neo4jContainer.getBoltUrl}", 526 | | "node.keys": "userid" 527 | | } 528 | | } 529 | |} 530 | |""".stripMargin 531 | 532 | runJob(jsonConfig) 533 | val driver = GraphDatabase.driver(neo4jContainer.getBoltUrl) 534 | val neo4jSession = driver.session() 535 | try { 536 | val count = neo4jSession.run( 537 | """ 538 | |MATCH (p:Person) 539 | |RETURN count(p) AS count 540 | |""".stripMargin) 541 | .single() 542 | .get(0) 543 | .asLong() 544 | Assert.assertEquals(10L, count) 545 | } finally { 546 | neo4jSession.close() 547 | driver.close() 548 | } 549 | } 550 | 551 | @Test 552 | def shouldImportNeo4jIntoRedShift(): Unit = { 553 | val awsredshifturl = Properties.envOrNone("AWS_REDSHIFT_URL") 554 | Assume.assumeFalse(awsredshifturl.isEmpty) 555 | val awsredshifttable = Properties.envOrNone("AWS_REDSHIFT_TABLE") 556 | Assume.assumeFalse(awsredshifttable.isEmpty) 557 | val awsiamrole = Properties.envOrNone("AWS_IAM_ROLE") 558 | Assume.assumeFalse(awsiamrole.isEmpty) 559 | val awss3tmpdir = Properties.envOrNone("AWS_S3_TMPDIR") 560 | Assume.assumeFalse(awss3tmpdir.isEmpty) 561 | val awss3accessid = Properties.envOrNone("AWS_ACCESS_KEY") 562 | Assume.assumeFalse(awss3accessid.isEmpty) 563 | val awss3accessky = Properties.envOrNone("AWS_SECRET_ACCESS_KEY") 564 | Assume.assumeFalse(awss3accessky.isEmpty) 565 | val numPersons = 10 566 | createPersons(numPersons) 567 | val jsonConfig = 568 | s""" 569 | |{ 570 | | "name": "Create ingest RedShift data into Neo4j", 571 | | "master": "local", 572 | | "hadoopConfiguration": { 573 | | "fs.s3a.access.key": "${awss3accessid.get}", 574 | | "fs.s3a.secret.key": "${awss3accessky.get}" 575 | | }, 576 | | "_comment": "The `source` field is a general field that manages the source database. Is basically where we read the data", 577 | | "source": { 578 | | "_comment": "The `format` field manages the connector datasource", 579 | | "format": "org.neo4j.spark.DataSource", 580 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 581 | | "options": { 582 | | "labels": ":Person", 583 | | "url": "${neo4jContainer.getBoltUrl}" 584 | | }, 585 | | "columns": [ 586 | | { "name": "ID" }, 587 | | { "name": "NAME" }, 588 | | { "name": "SURNAME" }, 589 | | { "name": "AGE" } 590 | | ] 591 | | }, 592 | | "_comment": "The `target` field is a general field that manages the target database. 
Is basically where we write the data that has been read in the field `source`", 593 | | "target": { 594 | | "_comment": "The `format` field manages the connector datasource", 595 | | "format" : "io.github.spark_redshift_community.spark.redshift", 596 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 597 | | "options": { 598 | | "url": "${awsredshifturl.get}", 599 | | "dbtable": "${awsredshifttable.get}", 600 | | "tempdir": "${awss3tmpdir.get}", 601 | | "tempformat": "CSV", 602 | | "forward_spark_s3_credentials": "true" 603 | | } 604 | | } 605 | |} 606 | |""".stripMargin 607 | 608 | runJob(jsonConfig) 609 | 610 | val session = SparkSession.builder() 611 | .master("local[*]") 612 | .getOrCreate() 613 | session.sparkContext.hadoopConfiguration.set("fs.s3a.access.key", awss3accessid.get) 614 | session.sparkContext.hadoopConfiguration.set("fs.s3a.secret.key", awss3accessky.get) 615 | val count = session 616 | .read 617 | .format("io.github.spark_redshift_community.spark.redshift") 618 | .option("url", awsredshifturl.get) 619 | .option("dbtable", awsredshifttable.get) 620 | .option("tempdir", awss3tmpdir.get) 621 | .option("forward_spark_s3_credentials", "true") 622 | .load() 623 | .count() 624 | Assert.assertEquals(numPersons.toLong, count) 625 | } 626 | 627 | /** 628 | * The Synapse connector is available only in Databricks Cloud; 629 | * if you want to connect to it in a non-Databricks environment 630 | * you can use the `jdbc` Datasource with a job like this: 631 | * 632 | * Read: 633 | * spark.read 634 | * .format("jdbc") 635 | * .option("url", "jdbc:sqlserver://synapsesparkneo4j.sql.azuresynapse.net:1433;database=;user=;password=;encrypt=true;trustServerCertificate=false;hostNameInCertificate=*.sql.azuresynapse.net;loginTimeout=30;") 636 | * .option("dbtable", "dbo.Date") 637 | * .load() 638 | * 639 | * Write: 640 | * df.write 641 | * .format("jdbc") 642 | * .option("url", "jdbc:sqlserver://synapsesparkneo4j.sql.azuresynapse.net:1433;database=;user=;password=;encrypt=true;trustServerCertificate=false;hostNameInCertificate=*.sql.azuresynapse.net;loginTimeout=30;") 643 | * .option("dbtable", "dbo.Date") 644 | * .save() 645 | */ 646 | 647 | private def runJob(jsonConfig: String) = { 648 | val jobConfig = JobConfig.from(jsonConfig) 649 | new Neo4jDWHConnector(jobConfig).run() 650 | } 651 | } 652 | -------------------------------------------------------------------------------- /src/test/scala/org/neo4j/dwh/connector/domain/JobConfigTest.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector.domain 2 | 3 | import org.apache.spark.sql.SaveMode 4 | import org.junit.Assert.assertEquals 5 | import org.junit.Test 6 | 7 | class JobConfigTest { 8 | 9 | @Test 10 | def shouldParseTheJsonIntoJobConfig(): Unit = { 11 | val jsonConfig = 12 | """ 13 | |{ 14 | | "name": "Create Persons from Snowflake to Neo4j", 15 | | "master": "local", 16 | | "_comment": "The `conf` field will add configuration via spark.conf.set", 17 | | "conf": { 18 | | "": "" 19 | | }, 20 | | "_comment": "The `hadoopConfiguration` field will add configuration via spark.hadoopConfiguration().set", 21 | | "hadoopConfiguration": { 22 | | "": "" 23 | | }, 24 | | "_comment": "The `source` field is a general field that manages the source database.
Is basically where we read the data", 25 | | "source": { 26 | | "_comment": "The `format` field manages the connector datasource", 27 | | "format": "net.snowflake.spark.snowflake", 28 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 29 | | "options": { 30 | | "sfURL": ".snowflakecomputing.com", 31 | | "sfUser": "", 32 | | "sfPassword": "", 33 | | "sfDatabase": "", 34 | | "sfSchema": "", 35 | | "sfWarehouse": "", 36 | | "dbtable": "" 37 | | }, 38 | | "_comment": "The `columns` field manages projection of the dataframe, `name` is the column name, `alias` is the name that you want to give (not mandatory)", 39 | | "columns": [ 40 | | { 41 | | "name": "person_id", 42 | | "alias": "id" 43 | | }, 44 | | { 45 | | "name": "person_name", 46 | | "alias": "name" 47 | | } 48 | | ], 49 | | "_comment": "The `where` field manages filters on the datasource (not mandatory)", 50 | | "where": "person_surname = 'Santurbano'", 51 | | "_comment": "The `partition` field repartition the source dataframe this can be useful when you're ingesting relationships into Neo4j", 52 | | "partition": { 53 | | "number": 5, 54 | | "by": "foo" 55 | | } 56 | | }, 57 | | "_comment": "The `target` field is a general field that manages the target database. Is basically where we write the data that has been read in the field `source`", 58 | | "target": { 59 | | "_comment": "The `format` field manages the connector datasource", 60 | | "format": "org.neo4j.spark.DataSource", 61 | | "_comment": "The `mode` is the save mode of the writing connector", 62 | | "mode": "Overwrite", 63 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 64 | | "options": { 65 | | "labels": ":Person:Customer", 66 | | "node.keys": "id" 67 | | } 68 | | } 69 | |} 70 | |""".stripMargin 71 | 72 | val jobConfig = JobConfig.from(jsonConfig) 73 | 74 | assertEquals("Create Persons from Snowflake to Neo4j", jobConfig.name) 75 | assertEquals("local", jobConfig.master) 76 | assertEquals(Map("" -> ""), jobConfig.conf) 77 | assertEquals(Map("" -> ""), jobConfig.hadoopConfiguration) 78 | 79 | assertEquals("net.snowflake.spark.snowflake", jobConfig.source.format) 80 | assertEquals(Map("sfURL" -> ".snowflakecomputing.com", 81 | "sfUser" -> "", 82 | "sfPassword" -> "", 83 | "sfDatabase" -> "", 84 | "sfSchema" -> "", 85 | "sfWarehouse" -> "", 86 | "dbtable" -> ""), jobConfig.source.options) 87 | assertEquals(Seq(Column("person_id", "id"), Column("person_name", "name")), jobConfig.source.columns) 88 | assertEquals("person_surname = 'Santurbano'", jobConfig.source.where) 89 | assertEquals(Partition(5, "foo"), jobConfig.source.partition) 90 | 91 | assertEquals("org.neo4j.spark.DataSource", jobConfig.target.format) 92 | assertEquals(SaveMode.Overwrite.toString, jobConfig.target.mode) 93 | assertEquals(Map("labels" -> ":Person:Customer", "node.keys" -> "id"), jobConfig.target.options) 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/test/scala/org/neo4j/dwh/connector/generator/JobConfigGeneratorTest.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector.generator 2 | 3 | import org.junit.{Assert, Test} 4 | import org.neo4j.dwh.connector.domain.JobConfig 5 | import org.neo4j.dwh.connector.utils.{CliUtils, JSONUtils} 6 | 7 | import java.io.File 8 | import scala.util.Properties 9 | 10 | class JobConfigGeneratorTest { 11 | 12 | 
@Test 13 | def shouldCreateConfigStubFileFromSnowflakeToNeo4j(): Unit = { 14 | val filePath = Properties.propOrElse("java.io.tmpdir", "").concat("/config.stub.json") 15 | val cli = CliUtils.parseArgs(Array("-p", filePath, "-c", "-s", "Snowflake", "-t", "Neo4j")) 16 | new JobConfigGenerator(cli).generate() 17 | val actual = JSONUtils.mapper.readValue(new File(filePath), classOf[Map[String, Any]]) 18 | val expected = JSONUtils.mapper.readValue(Thread.currentThread() 19 | .getContextClassLoader 20 | .getResourceAsStream("snowflake.to.neo4j.stub.json"), classOf[Map[String, Any]]) 21 | Assert.assertEquals(expected, actual) 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/test/scala/org/neo4j/dwh/connector/utils/UtilsTest.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector.utils 2 | 3 | import org.apache.commons.lang3.StringUtils 4 | import org.junit.Assert.assertEquals 5 | import org.junit.{Assume, Test} 6 | 7 | import java.net.URL 8 | import scala.util.Properties 9 | 10 | class UtilsTest { 11 | 12 | private val queryUrl: URL = Thread 13 | .currentThread 14 | .getContextClassLoader 15 | .getResource("query.cyp") 16 | 17 | private val source = scala.io.Source 18 | .fromFile(queryUrl.toURI) 19 | private val queryFile: String = try { 20 | source 21 | .getLines() 22 | .mkString("\n") 23 | } finally { 24 | source.close() 25 | } 26 | 27 | @Test 28 | def shouldReturnMapWithEnvAndFileContent(): Unit = { 29 | val myenv = Properties.envOrElse("MY_ENV", "") 30 | Assume.assumeTrue(StringUtils.isNotBlank(myenv)) 31 | val sourceMap = Map("foo" -> "bar", 32 | "withEnv" -> "${env:MY_ENV}", 33 | "noEnv" -> "${env:NO_ENV}", 34 | "withFile" -> s"$${$queryUrl}", 35 | "noFile" -> "${file:/foo/bar.cyp}") 36 | val expected = Map("foo" -> "bar", 37 | "withEnv" -> myenv, 38 | "noEnv" -> "${env:NO_ENV}", 39 | "withFile" -> queryFile, 40 | "noFile" -> "${file:/foo/bar.cyp}") 41 | val actual = Utils.enrichMap(sourceMap) 42 | assertEquals(expected, actual) 43 | } 44 | } 45 | --------------------------------------------------------------------------------
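A minimal programmatic usage sketch, not part of the repository tree above: besides spark-submit, the classes shown in Neo4jDWHConnector.scala and JobConfig.scala can be driven directly from Scala by parsing a JSON job definition and calling run(). The CSV path, Neo4j URL and label/key names below are illustrative assumptions, not values taken from the project.

import org.neo4j.dwh.connector.Neo4jDWHConnector
import org.neo4j.dwh.connector.domain.JobConfig

object DWHConnectorUsageSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical job definition: the source path, Neo4j URL and node keys are placeholders.
    val json =
      """{
        |  "name": "Persons from CSV to Neo4j",
        |  "master": "local[*]",
        |  "source": {
        |    "format": "csv",
        |    "options": { "header": "true", "path": "/tmp/persons.csv" }
        |  },
        |  "target": {
        |    "format": "org.neo4j.spark.DataSource",
        |    "mode": "Overwrite",
        |    "options": { "url": "bolt://localhost:7687", "labels": ":Person", "node.keys": "id" }
        |  }
        |}""".stripMargin
    // JobConfig.from accepts a JSON string (as well as File, URI, URL and java.util.Map);
    // run(true) closes the SparkSession built from the config once the job finishes.
    new Neo4jDWHConnector(JobConfig.from(json)).run(true)
  }
}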