├── .github ├── CODEOWNERS └── workflows │ ├── ci.yml │ └── python-ci.yml ├── .gitignore ├── README.md ├── doc ├── images │ ├── cli_help.png │ ├── databricks_job_config.png │ ├── databricks_job_home.png │ ├── databricks_job_list.png │ ├── databricks_menu.png │ └── output-rel.png └── notebooks │ └── Neo4j DWH Connector quickstart.html ├── maven-release.sh ├── pom.xml ├── python ├── MANIFEST.in ├── neo4j_dwh_connector │ ├── __init__.py │ ├── _dto.py │ ├── _utils.py │ └── connector.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── spark-package-deps.txt └── test │ ├── __init__.py │ ├── _util_test.py │ └── connector_test.py └── src ├── main └── scala │ └── org │ └── neo4j │ └── dwh │ └── connector │ ├── Neo4jDWHConnector.scala │ ├── domain │ └── JobConfig.scala │ ├── generator │ └── JobConfigGenerator.scala │ └── utils │ ├── CliUtils.scala │ ├── DatasourceOptions.scala │ ├── JSONUtils.scala │ ├── JobConfigUtils.scala │ └── Utils.scala └── test ├── resources ├── JobConfig.json ├── neo4j-dwh-connector.properties ├── persons.csv ├── query.cyp └── snowflake.to.neo4j.stub.json └── scala └── org └── neo4j └── dwh └── connector ├── Neo4jDWHConnectorIT.scala ├── domain └── JobConfigTest.scala ├── generator └── JobConfigGeneratorTest.scala └── utils └── UtilsTest.scala /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @neo4j-contrib/team-connectors 2 | 3 | /.github/ @ali-ince @fbiville @venikkin -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: JVM CI with Maven 2 | 3 | on: 4 | push: 5 | branches: 6 | - 'main' 7 | pull_request: 8 | branches: 9 | - 'main' 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | java-version: [8, 11, 17] 18 | scala-version: [2.12, 2.13] 19 | spark-version: ["3"] 20 | neo4j-version: ["4.4", "5"] 21 | name: Build with Scala ${{ matrix.scala-version }}, Spark ${{ matrix.spark-version }} and Neo4j ${{ matrix.neo4j-version }} 22 | steps: 23 | - uses: actions/checkout@v2 24 | - name: Set up JDK {{ matrix.java-version }} 25 | uses: actions/setup-java@v3 26 | with: 27 | java-version: ${{ matrix.java-version }} 28 | distribution: 'temurin' 29 | - name: Cache Maven packages 30 | uses: actions/cache@v1 31 | with: 32 | path: ~/.m2 33 | key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} 34 | restore-keys: ${{ runner.os }}-m2 35 | - name: Build with Maven 36 | env: 37 | CI: true 38 | MY_ENV: "MY_ENV_value" 39 | run: mvn clean verify -Pscala-${{ matrix.scala-version }} -Pspark-${{ matrix.spark-version }} -Pneo4j-${{ matrix.neo4j-version }} --no-transfer-progress 40 | -------------------------------------------------------------------------------- /.github/workflows/python-ci.yml: -------------------------------------------------------------------------------- 1 | name: Python CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - 'main' 7 | pull_request: 8 | branches: 9 | - 'main' 10 | jobs: 11 | test: 12 | runs-on: ubuntu-latest 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | python-version: [ "3.8", "3.9", "3.10", "3.11" ] 17 | neo4j-version: [ "4.4", "5" ] 18 | spark-version: 19 | - {short: "3", ext: "3.1.3", scala: "2.12"} 20 | - {short: "3", ext: "3.2.4", scala: "2.12"} 21 | - {short: "3", ext: "3.2.4", scala: "2.13"} 22 | - {short: "3", ext: "3.3.2", scala: "2.12"} 23 | - {short: "3", ext: "3.3.2", scala: "2.13"} 24 
| - {short: "3", ext: "3.4.1", scala: "2.12"} 25 | - {short: "3", ext: "3.4.1", scala: "2.13"} 26 | steps: 27 | - uses: actions/checkout@v2 28 | - name: Set up Python ${{ matrix.python-version }} 29 | uses: actions/setup-python@v2 30 | with: 31 | python-version: ${{ matrix.python-version }} 32 | - name: Set up JDK 8 33 | uses: actions/setup-java@v1 34 | with: 35 | java-version: 1.8 36 | - uses: avides/actions-project-version-check@v1.2.0 37 | id: version 38 | with: 39 | token: ${{ secrets.GITHUB_TOKEN }} 40 | file-to-check: pom.xml 41 | only-return-version: true 42 | - name: Cache Maven packages 43 | uses: actions/cache@v1 44 | with: 45 | path: ~/.m2 46 | key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} 47 | restore-keys: ${{ runner.os }}-m2 48 | - name: Install dependencies 49 | run: | 50 | python -m pip install --upgrade pip 51 | pip install pypandoc six tzlocal==2.1 52 | pip install pyspark==${{ matrix.spark-version.ext }} "testcontainers[neo4j]" 53 | - name: Build artifact 54 | env: 55 | CI: true 56 | run: | 57 | mvn clean package -Pspark-${{ matrix.spark-version.short }} -Pscala-${{ matrix.spark-version.scala }} -DskipTests --no-transfer-progress 58 | - name: Run tests for Spark ${{ matrix.spark-version.ext }} and Neo4j ${{ matrix.neo4j-version }} 59 | if: ${{ !(matrix.spark-version.short == 2.4 && matrix.python-version == 3.8) && !(matrix.spark-version.ext == '3.2.0' && matrix.python-version == 3.5) }} 60 | run: | 61 | cd ./python 62 | export PYTHONPATH=$(pwd) 63 | python3 ./test/_util_test.py 64 | python3 ./test/connector_test.py "${{ steps.version.outputs.version }}" "${{ matrix.neo4j-version }}" 65 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | neo4j-home 2 | .gradle 3 | gradle/ 4 | build/ 5 | *~ 6 | \#* 7 | target 8 | out 9 | .project 10 | .classpath 11 | .settings 12 | .externalToolBuilders/ 13 | .scala_dependencies 14 | .factorypath 15 | .cache 16 | .cache-main 17 | .cache-tests 18 | *.iws 19 | *.ipr 20 | *.iml 21 | .idea 22 | .DS_Store 23 | .shell_history 24 | .mailmap 25 | .java-version 26 | .cache-main 27 | .cache-tests 28 | Thumbs.db 29 | .cache-main 30 | .cache-tests 31 | docs/guides 32 | doc/node 33 | doc/node_modules 34 | doc/package-lock.json 35 | scripts/python/local 36 | python/neo4j_dwh_connector/__pycache__/ 37 | python/neo4j_dwh_connector/.pytest_cache/ 38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Neo4j DWH Connector 2 | 3 | This repository contains the Neo4j Data Warehouse Connector for Apache Spark. 4 | 5 | # Goal 6 | 7 | The goal of the Neo4j DWH Connector is to simplify the interoperability between Neo4j Spark and other data sources like Snowflake, Redshift and so on. 8 | 9 | In order to do that we created this connector that via a simple JSON file creates Spark’s jobs through Spark Submit. 10 | 11 | **Nota bene** 12 | 13 | The examples that we’re providing here are for a Job that moves data from Snowflake to Neo4j but the DWH connector works also in the other way around. 14 | 15 | 16 | # How does it work? 
The Neo4j DWH Connector provides an easy way to move data between Neo4j and popular Data Warehouses like:

* Snowflake
* BigQuery
* Redshift
* Azure Synapse

You can use it in two ways:

* As a Spark Submit job, by providing a JSON configuration that describes a Spark job which moves data from one data source to another
* As a Scala/Python API that simplifies writing a Spark job which moves data from one database to another


# What does the JSON configuration look like?

Below is a very simple JSON configuration that moves data from Snowflake to Neo4j.


## Nodes

```json
{
  "name" : "Ingest Customer table as nodes into Neo4j",
  "conf" : { },
  "hadoopConfiguration" : { },
  "source" : {
    "format" : "snowflake",
    "options" : {
      "sfSchema" : "TPCH_SF1",
      "sfPassword" : "",
      "sfUser" : "conker84",
      "dbtable" : "CUSTOMER",
      "sfDatabase" : "SNOWFLAKE_SAMPLE_DATA",
      "sfURL" : "https://.snowflakecomputing.com"
    },
    "columns": [
      { "name": "CAST(C_ACCTBAL AS DOUBLE)", "alias": "C_ACCTBAL" },
      { "name": "C_ADDRESS" },
      { "name": "C_COMMENT" },
      { "name": "CAST(C_CUSTKEY AS LONG)", "alias": "C_CUSTKEY" },
      { "name": "C_MKTSEGMENT" },
      { "name": "C_NAME" },
      { "name": "CAST(C_NATIONKEY AS LONG)", "alias": "C_NATIONKEY" },
      { "name": "C_PHONE" }
    ],
    "where" : "C_CUSTKEY <= 10",
    "printSchema" : false,
    "partition" : {}
  },
  "target" : {
    "format" : "org.neo4j.spark.DataSource",
    "options" : {
      "url" : "neo4j+s://.databases.neo4j.io",
      "authentication.basic.username" : "neo4j",
      "labels" : ":Customer",
      "authentication.basic.password" : ""
    },
    "mode" : "Append"
  }
}
```

This job moves data from a Snowflake instance (the **CUSTOMER** table in the **TPCH_SF1** schema of the **SNOWFLAKE_SAMPLE_DATA** database) into Neo4j as nodes with the label **Customer**.
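For reference, here is a minimal sketch (in Scala, with placeholder URLs and credentials that you must replace) of the plain Spark job that the JSON above abstracts; it mirrors what the connector does internally: read from the source with the given options, apply the projection and the filter, then write to the target.

```scala
import org.apache.spark.sql.SparkSession

// Minimal sketch of the "Nodes" job above, written as a plain Spark job.
// All URLs and credentials are placeholders.
val spark = SparkSession.builder()
  .appName("Ingest Customer table as nodes into Neo4j")
  .getOrCreate()

// Read the CUSTOMER table from Snowflake, filter it and project/cast the columns.
val customers = spark.read
  .format("snowflake")
  .options(Map(
    "sfURL"      -> "https://<account>.snowflakecomputing.com",
    "sfUser"     -> "<snowflake_user>",
    "sfPassword" -> "<snowflake_password>",
    "sfDatabase" -> "SNOWFLAKE_SAMPLE_DATA",
    "sfSchema"   -> "TPCH_SF1",
    "dbtable"    -> "CUSTOMER"
  ))
  .load()
  .where("C_CUSTKEY <= 10")
  .selectExpr(
    "CAST(C_ACCTBAL AS DOUBLE) AS C_ACCTBAL", "C_ADDRESS", "C_COMMENT",
    "CAST(C_CUSTKEY AS LONG) AS C_CUSTKEY", "C_MKTSEGMENT", "C_NAME",
    "CAST(C_NATIONKEY AS LONG) AS C_NATIONKEY", "C_PHONE")

// Write the projected rows as :Customer nodes into Neo4j.
customers.write
  .format("org.neo4j.spark.DataSource")
  .options(Map(
    "url" -> "neo4j+s://<instance>.databases.neo4j.io",
    "authentication.basic.username" -> "neo4j",
    "authentication.basic.password" -> "<neo4j_password>",
    "labels" -> ":Customer"
  ))
  .mode("Append")
  .save()
```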
87 | 88 | 89 | ## Relationships 90 | 91 | 92 | ```json 93 | { 94 | "name" : "Ingest Order table as relationships into Neo4j", 95 | "conf" : { }, 96 | "hadoopConfiguration" : { }, 97 | "source" : { 98 | "format" : "snowflake", 99 | "options" : { 100 | "sfSchema" : "TPCH_SF1", 101 | "sfPassword" : "", 102 | "sfUser" : "conker84", 103 | "dbtable" : "ORDERS", 104 | "sfDatabase" : "SNOWFLAKE_SAMPLE_DATA", 105 | "sfURL" : "https://.snowflakecomputing.com" 106 | }, 107 | "columns": [ 108 | { "name": "CAST(O_CUSTKEY AS DOUBLE)", "alias": "O_CUSTKEY" }, 109 | { "name": "O_ORDERDATE" }, 110 | { "name": "O_COMMENT" }, 111 | { "name": "CAST(O_ORDERKEY AS LONG)", "alias": "O_ORDERKEY" } 112 | ], 113 | "where" : "O_CUSTKEY <= 10", 114 | "printSchema" : false, 115 | "partition" : {} 116 | }, 117 | "target" : { 118 | "format" : "org.neo4j.spark.DataSource", 119 | "options" : { 120 | "url" : "neo4j+s://.databases.neo4j.io", 121 | "authentication.basic.username" : "neo4j", 122 | "authentication.basic.password" : "", 123 | "relationship" : "HAS_ORDER", 124 | "relationship.save.strategy" : "keys", 125 | "relationship.source.save.mode" : "Overwrite", 126 | "relationship.source.labels" : ":Customer", 127 | "relationship.source.node.keys" : "O_CUSTKEY", 128 | "relationship.target.save.mode" : "Overwrite", 129 | "relationship.target.labels" : ":Order", 130 | "relationship.target.node.keys" : "O_ORDERKEY" 131 | }, 132 | "mode" : "Overwrite" 133 | } 134 | } 135 | ``` 136 | 137 | 138 | 139 | 140 | Output: 141 | 142 | ![](doc/images/output-rel.png) 143 | 144 | 145 | ## Query 146 | 147 | ```json 148 | { 149 | "name" : "Ingest Order table as relationships into Neo4j", 150 | "conf" : { }, 151 | "hadoopConfiguration" : { }, 152 | "source" : { 153 | "format" : "snowflake", 154 | "options" : { 155 | "sfSchema" : "TPCH_SF1", 156 | "sfPassword" : "", 157 | "sfUser" : "conker84", 158 | "dbtable" : "ORDERS", 159 | "sfDatabase" : "SNOWFLAKE_SAMPLE_DATA", 160 | "sfURL" : "https://.snowflakecomputing.com" 161 | }, 162 | "columns": [ 163 | { "name": "CAST(O_CUSTKEY AS DOUBLE)", "alias": "O_CUSTKEY" }, 164 | { "name": "O_ORDERDATE" }, 165 | { "name": "O_COMMENT" }, 166 | { "name": "CAST(O_ORDERKEY AS LONG)", "alias": "O_ORDERKEY" } 167 | ], 168 | "where" : "O_CUSTKEY <= 10", 169 | "printSchema" : false, 170 | "partition" : {} 171 | }, 172 | "target" : { 173 | "format" : "org.neo4j.spark.DataSource", 174 | "options" : { 175 | "url" : "neo4j+s://.databases.neo4j.io", 176 | "authentication.basic.username" : "neo4j", 177 | "authentication.basic.password" : "", 178 | "query" : "MERGE (s:Person:Customer{id: event.O_CUSTKEY}) MERGE(t:Order{id: event.O_ORDERKEY}) MERGE (s)-[:HAS_ORDER{date: event.O_ORDERDATE}]->(t)" 179 | }, 180 | "mode" : "Overwrite" 181 | } 182 | } 183 | ``` 184 | 185 | This Job moves data from a Snowflake instance, and in particular from **SNOWFLAKE_SAMPLE_DATA** database, **TPCH_SF1** 186 | schema and table **ORDERS** into Neo4j database as custom graph composed by the pattern 187 | `(:Person:Customer)-[:HAS_ORDER]->(:Order)`. 188 | 189 | 190 | ## Fields Description 191 | 192 | Following a detailed description of each field in the JSON configuration file: 193 | 194 | 195 | 196 | * **name**: the job name. _It’s Optional_ 197 | * **master**: the spark master url, used only for internal testing purposes; 198 | * **conf**: a set of key/value string pairs which will be applied to SparkConf. _It’s Optional_ 199 | * **hadoopConfiguration**: a set of key/value string pairs that will be applied to HadoopConfiguration. 
_It’s Optional_
* **source**: contains the information about the source database (_it’s **Mandatory**_):
    * **format**: the format of the source database (e.g. Snowflake, Neo4j, Redshift…). _It’s **Mandatory**_
    * **options**: a set of key/value pairs that contains the required configuration parameters for the selected format. Each option set is specific to the selected source **format**; for each one the generated configuration stub provides a set of links so that you can easily retrieve the correct configuration. _It’s **Mandatory**_
    * **columns**: the set of columns that you want to project; useful to minimize the data movement from the source database. Each column is composed of two fields (_It’s Optional_):
        * **name**: the name of the column. This field supports Spark SQL notation, so you can manipulate the field by casting its type, applying UDFs to it and so on
        * **alias**: the new name of the column, if you want to rename it
    * **where**: a Spark SQL `where` condition that filters the data retrieved from the source database. _It’s Optional_
    * **limit**: limits the number of rows returned from the source data; useful for testing purposes. _It’s Optional_
    * **printSchema**: a boolean that prints the schema of the source data, useful for debugging. Defaults to `false`
    * **partition**: composed of two fields (_It’s Optional_):
        * **number**: the number of partitions
        * **by**: an optional parameter that defines the partition field
* **target**: contains the information about the target database (_it’s **Mandatory**_):
    * **format**: the format of the target database (e.g. Snowflake, Neo4j, Redshift…). _It’s **Mandatory**_
    * **options**: a set of key/value pairs that contains the required configuration parameters for the selected format. Each option set is specific to the selected target **format**; for each one the generated configuration stub provides a set of links so that you can easily retrieve the correct configuration. _It’s **Mandatory**_
    * **mode**: the Spark save mode; it is specific to the selected target **format**. _It’s Optional_

## Special values

In the JSON you can use special values that are replaced at runtime with an actual value.
Currently we support two special values:

* `${env:<ENV_VARIABLE_NAME>}`, which is replaced with the value of the referenced environment variable
* `${file:<FILE_PATH>}`, which is replaced with the text content of the referenced file

This lets you keep the configuration file as clean as possible.

Consider this JSON:

```json
{
  "name" : "Ingest Order table as relationships into Neo4j",
  "conf" : { },
  "hadoopConfiguration" : { },
  "source" : {
    "format" : "snowflake",
    "options" : {
      "sfSchema" : "TPCH_SF1",
      "sfPassword" : "${env:SNOWFLAKE_PASSWORD}",
      "sfUser" : "${env:SNOWFLAKE_USER}",
      "dbtable" : "ORDERS",
      "sfDatabase" : "SNOWFLAKE_SAMPLE_DATA",
      "sfURL" : "${env:SNOWFLAKE_URL}"
    },
    "columns": [
      { "name": "CAST(O_CUSTKEY AS DOUBLE)", "alias": "O_CUSTKEY" },
      { "name": "O_ORDERDATE" },
      { "name": "O_COMMENT" },
      { "name": "CAST(O_ORDERKEY AS LONG)", "alias": "O_ORDERKEY" }
    ],
    "where" : "O_CUSTKEY <= 10",
    "printSchema" : false,
    "partition" : {}
  },
  "target" : {
    "format" : "org.neo4j.spark.DataSource",
    "options" : {
      "url" : "${env:NEO4J_URL}",
      "authentication.basic.username" : "${env:NEO4J_USER}",
      "authentication.basic.password" : "${env:NEO4J_PASSWORD}",
      "query" : "${file:/tmp/my_cypher_query.cyp}"
    },
    "mode" : "Overwrite"
  }
}
```

Now assume that you have the following environment variables:

* `export SNOWFLAKE_USER=snowflake_foo`
* `export SNOWFLAKE_PASSWORD=snowflake_bar`
* `export SNOWFLAKE_URL=https://foo_bar.snowflakecomputing.com`
* `export NEO4J_USER=neo4j_foo`
* `export NEO4J_PASSWORD=neo4j_bar`
* `export NEO4J_URL=neo4j+s://foo_bar.databases.neo4j.io`

And the content of `/tmp/my_cypher_query.cyp` is:

`CREATE (p:Person{id: event.id, fullName: event.full_name})`

The connector will replace these values as if you had passed the following JSON:

```json
{
  "name" : "Ingest Order table as relationships into Neo4j",
  "conf" : { },
  "hadoopConfiguration" : { },
  "source" : {
    "format" : "snowflake",
    "options" : {
      "sfSchema" : "TPCH_SF1",
      "sfPassword" : "snowflake_bar",
      "sfUser" : "snowflake_foo",
      "dbtable" : "ORDERS",
      "sfDatabase" : "SNOWFLAKE_SAMPLE_DATA",
      "sfURL" : "https://foo_bar.snowflakecomputing.com"
    },
    "columns": [
      { "name": "CAST(O_CUSTKEY AS DOUBLE)", "alias": "O_CUSTKEY" },
      { "name": "O_ORDERDATE" },
      { "name": "O_COMMENT" },
      { "name": "CAST(O_ORDERKEY AS LONG)", "alias": "O_ORDERKEY" }
    ],
    "where" : "O_CUSTKEY <= 10",
    "printSchema" : false,
    "partition" : {}
  },
  "target" : {
    "format" : "org.neo4j.spark.DataSource",
    "options" : {
      "url" : "neo4j+s://foo_bar.databases.neo4j.io",
      "authentication.basic.username" : "neo4j_foo",
      "authentication.basic.password" : "neo4j_bar",
      "query" : "CREATE (p:Person{id: event.id, fullName: event.full_name})"
    },
    "mode" : "Overwrite"
  }
}
```


# Use the DWH Connector as a Spark Submit Job

The jar can be used in two ways:

1. To generate the configuration stub for the selected source/target databases
2. To launch the Spark job via spark-submit

To move data from a selected source to a defined target you need to perform the following steps:

1. Prepare the JSON file with all the required configuration for the source and the target database
2. Copy the JSON file(s) and neo4j-dwh-connector-<version>.jar to the server or Docker container running Spark
3. Run the spark-submit command to start the Spark job

If you want to get the full list of available options you can use the following command:

```bash
java -jar neo4j-dwh-connector-<version>.jar -h
```

![](doc/images/cli_help.png)


## Generate the configuration stub

You can generate the configuration file in a very simple way by running the following command:

```bash
java -jar neo4j-dwh-connector-<version>.jar -c -s <source_database> -t <target_database> -p <json_config_path>
```

For example, to generate a stub for a job that moves data from **Snowflake** to **Neo4j** and put the configuration into **/tmp/dwh_job_config.json**:

```bash
java -jar neo4j-dwh-connector-1.0.0.jar -c -s Snowflake -t Neo4j -p /tmp/dwh_job_config.json
```


## Launch the Spark Job

Once you have configured the JSON file properly, you only need to launch the Spark job from a client.

**_Nota bene_**

Each selected source/target requires external dependencies in order to work, so please make sure to add them.


### Launch the Spark Job from CLI via Spark Submit

In order to launch the Spark job you need to be in the Spark directory and run the following command:

```bash
./bin/spark-submit \
  --class org.neo4j.dwh.connector.Neo4jDWHConnector \
  --packages <comma_separated_dependencies> \
  --master spark://<IP>:<PORT> \
  --deploy-mode cluster \
  --supervise \
  --executor-memory 20G \
  --total-executor-cores 100 \
  /path/to/neo4j-dwh-connector-<version>.jar \
  -p /path/to/dwh_job_config.json
```

Example command to launch a Spark job that moves data from Snowflake to Neo4j:

```bash
./bin/spark-submit \
  --class org.neo4j.dwh.connector.Neo4jDWHConnector \
  --packages org.neo4j:neo4j-connector-apache-spark_2.12:4.1.0_for_spark_3,net.snowflake:spark-snowflake_2.12:2.10.0-spark_3.2,net.snowflake:snowflake-jdbc:3.13.15 \
  --master local \
  /path/to/neo4j-dwh-connector-1.0.0-SNAPSHOT.jar \
  -p /path/to/dwh_job_config.json
```


### Launch the Spark Job from Databricks Cloud

You can also run the Spark job from Databricks Cloud; this requires importing the DWH connector jar and the JSON config [into DBFS](https://docs.databricks.com/data/data.html#:~:text=Import%20data,-If%20you%20have&text=There%20are%20two%20ways%20to,box%20on%20the%20landing%20page.)
414 | 415 | In order to create a Spark Job into Databricks cloud you have to click into the **Jobs** menu section 416 | 417 | ![](doc/images/databricks_menu.png) 418 | 419 | and then on **Create Job** 420 | 421 | ![](doc/images/databricks_job_home.png) 422 | 423 | This will open a the Job creation section: 424 | 425 | ![](doc/images/databricks_job_config.png) 426 | 427 | where you need to define: 428 | 429 | * the task name 430 | * the type of the task, please select **Spark Submit** 431 | * the cluster 432 | * the parameters which are related to the required dependencies and the DWH job configuration 433 | 434 | In particular for our Spark Job that moves data from Snowflake into Neo4j you need to apply these parameters: 435 | 436 | 437 | ```json 438 | ["--packages", "org.neo4j:neo4j-connector-apache-spark_2.12:4.1.2_for_spark_3,net.snowflake:spark-snowflake_2.12:2.10.0-spark_3.2,net.snowflake:snowflake-jdbc:3.13.15", "--class", "org.neo4j.dwh.connector.Neo4jDWHConnector", "dbfs:/FileStore/dwh-connector/neo4j_dwh_connector_1_0_SNAPSHOT_jar_with_dependencies.jar", "-p", "/dbfs/FileStore/dwh-connector/job_config.json"] 439 | ``` 440 | 441 | 442 | Once create you’ll have the find it in your Job list as shown here: 443 | 444 | ![](doc/images/databricks_job_list.png) 445 | 446 | In order to start it you just need to press the play button in the Actions sections. 447 | 448 | 449 | # Use the DWH Connector via Scala APIs 450 | 451 | You can also use the connector in a notebook for instance leveraging the Scala APIs in very convenient way like this: 452 | 453 | ```scala 454 | 455 | import org.neo4j.dwh.connector.Neo4jDWHConnector 456 | import org.neo4j.dwh.connector.domain._ 457 | 458 | 459 | // the source database configuration 460 | val source = Source( 461 | format = "snowflake", // the source database (mandatory) 462 | // the configuration options it will change for every source database (mandatory) 463 | options = Map( 464 | "sfSchema" -> "TPCH_SF1", 465 | "sfPassword" -> "<**>", 466 | "sfUser" -> "<**>", 467 | "dbtable" -> "CUSTOMER", 468 | "sfDatabase" -> "SNOWFLAKE_SAMPLE_DATA", 469 | "sfURL" -> "https://<**>.eu-central-1.snowflakecomputing.com" 470 | ), 471 | // a list of selected projected columns, it can be useful in order to eventually cast data, 472 | // apply Spark's UDFs and minimize the data movement from the source database (optional) 473 | columns = Seq( 474 | Column(name = "CAST(C_ACCTBAL AS DOUBLE)", alias = "C_ACCTBAL"), 475 | Column(name = "C_ADDRESS"), 476 | Column(name = "C_COMMENT"), 477 | Column(name = "CAST(C_CUSTKEY AS LONG)", alias = "C_CUSTKEY"), 478 | Column(name = "C_MKTSEGMENT"), 479 | Column(name = "C_NAME"), 480 | Column(name = "CAST(C_NATIONKEY AS LONG)", alias = "C_NATIONKEY"), 481 | Column(name = "C_PHONE") 482 | ), 483 | where = "", // a filter for the source dataset (optional) 484 | printSchema = true, // if you want to print the schema, useful for debug purposes (optional) 485 | show = 5, // if you want show the source database, useful for debug purposes (optional) 486 | limit = 10, // the amount of rows that you want to have from the source dataset (optional) 487 | // a dataframe partition configuration (optional) 488 | partition = Partition( 489 | number = -1, // the number of partions mandatory if you want to define partitions 490 | by = "" // the field to partition (optional) 491 | ) 492 | ) 493 | 494 | // the target database configuration 495 | val target = Target( 496 | format = "org.neo4j.spark.DataSource", // the target database (mandatory) 497 | // 
the configuration options it will change for every source database (mandatory) 498 | options = Map( 499 | "labels" -> ":Person:Customer", 500 | "url" -> "neo4j+s://<**>.databases.neo4j.io", 501 | "authentication.basic.username" -> "neo4j", 502 | "authentication.basic.password" -> "<**>", 503 | "node.keys" -> "C_CUSTKEY" 504 | ), 505 | mode = "Overwrite" 506 | ) 507 | 508 | val config = JobConfig( 509 | name = "The name of the Spark Job", 510 | conf = Map.empty, // a configuration map, every k/v binding will be insert as Spark Configuration 511 | hadoopConfiguration = Map.empty, // a configuration map, every k/v binding will be insert as Hadoop Configuration 512 | source = source, 513 | target = target 514 | ) 515 | 516 | val connector = new Neo4jDWHConnector(sparkSession, config) 517 | 518 | // this will ingest the data from source to target database 519 | connector.run() 520 | ``` 521 | 522 | # Use DWH Connector via Python APIs 523 | 524 | You can also use the connector in a notebook for instance leveraging the Python APIs in very convenient way like this: 525 | 526 | ```python 527 | from neo4j_dwh_connector import * 528 | 529 | source = Source( 530 | format="snowflake", # the source database (mandatory) 531 | # the configuration options it will change for every source database (mandatory) 532 | options={ 533 | "sfSchema": "TPCH_SF1", 534 | "sfPassword": "****", 535 | "sfUser": "****", 536 | "dbtable": "CUSTOMER", 537 | "sfDatabase": "SNOWFLAKE_SAMPLE_DATA", 538 | "sfURL": "https://****.eu-central-1.snowflakecomputing.com" 539 | }, 540 | # a list of selected projected columns, it can be useful in order to eventually cast data, 541 | # apply Spark's UDFs and minimize the data movement from the source database (optional) 542 | columns=[ 543 | Column(name="CAST(C_ACCTBAL AS DOUBLE)", alias="C_ACCTBAL"), 544 | Column(name="C_ADDRESS"), 545 | Column(name="C_COMMENT"), 546 | Column(name="CAST(C_CUSTKEY AS LONG)", alias="C_CUSTKEY"), 547 | Column(name="C_MKTSEGMENT"), 548 | Column(name="C_NAME"), 549 | Column(name="CAST(C_NATIONKEY AS LONG)", alias="C_NATIONKEY"), 550 | Column(name="C_PHONE") 551 | ], 552 | where="", # a filter for the source dataset (optional) 553 | printSchema=True, # if you want to print the schema, useful for debug purposes (optional) 554 | show=5, # if you want show the source database, useful for debug purposes (optional) 555 | limit=10, # the amount of rows that you want to have from the source dataset (optional) 556 | # a dataframe partition configuration (optional) 557 | partition=Partition( 558 | number=-1, # the number of partions mandatory if you want to define partitions 559 | by="" # the field to partition (optional) 560 | ) 561 | ) 562 | # the target database configuration 563 | target = Target( 564 | format="org.neo4j.spark.DataSource", # the target database (mandatory) 565 | # the configuration options it will change for every source database (mandatory) 566 | options={ 567 | "labels": ":PersonNew1:CustomerNew1", 568 | "url": "neo4j+s://****.databases.neo4j.io", 569 | "authentication.basic.username": "neo4j", 570 | "authentication.basic.password": "****", 571 | "node.keys": "C_CUSTKEY" 572 | }, 573 | mode="Overwrite" 574 | ) 575 | 576 | config = JobConfig( 577 | name="The name of the Spark Job", 578 | conf={}, # a configuration dict, every k/v binding will be insert as Spark Configuration 579 | hadoopConfiguration={}, 580 | # a configuration dict, every k/v binding will be insert as Hadoop Configuration 581 | source=source, 582 | target=target 583 | ) 584 | 585 
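# `sparkSession` below is assumed to be an already-created pyspark.sql.SparkSession,
# e.g. sparkSession = SparkSession.builder.getOrCreate()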
connector = Neo4jDWHConnector(sparkSession, config)

# this will ingest the data from source to target database
connector.run()
```

# Supported Spark versions

We support:

* Spark 2.4 with Scala 2.11 and 2.12
* Spark 3.x with Scala 2.12 and 2.13

# Maven resolution

It depends on the Spark and Scala version:

* For Spark 2.4 with Scala 2.11 and 2.12 the Maven coordinates are `org.neo4j:neo4j-dwh-connector_<scala_version>:<version>_for_spark_2.4`
* For Spark 3.x with Scala 2.12 and 2.13 the Maven coordinates are `org.neo4j:neo4j-dwh-connector_<scala_version>:<version>_for_spark_3`


# Build it locally

In order to build it locally you can use the following commands:

For Scala 2.11 and Spark 2.4: `./maven-release.sh package 2.11 2.4`

For Scala 2.12 and Spark 2.4: `./maven-release.sh package 2.12 2.4`

For Scala 2.12 and Spark 3: `./maven-release.sh package 2.12 3`

For Scala 2.13 and Spark 3: `./maven-release.sh package 2.13 3`

# Blog

More details are available in the introductory blog post, which shows how the connector makes it easier than ever to move data between your data warehouse and Neo4j:

https://neo4j.com/developer-blog/introducing-neo4j-data-warehouse-connector/

# Quickstart

If you want to see how you can leverage the Scala and Python APIs you can download:

* a [Databricks notebook](doc/notebooks/Neo4j%20DWH%20Connector%20quickstart.html) ready to be imported into the Databricks cloud environment

--------------------------------------------------------------------------------
/doc/images/cli_help.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neo4j-contrib/neo4j-dwh-connector/eb9feddb06c32d1905eb98e634031679ac04841d/doc/images/cli_help.png
--------------------------------------------------------------------------------
/doc/images/databricks_job_config.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neo4j-contrib/neo4j-dwh-connector/eb9feddb06c32d1905eb98e634031679ac04841d/doc/images/databricks_job_config.png
--------------------------------------------------------------------------------
/doc/images/databricks_job_home.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neo4j-contrib/neo4j-dwh-connector/eb9feddb06c32d1905eb98e634031679ac04841d/doc/images/databricks_job_home.png
--------------------------------------------------------------------------------
/doc/images/databricks_job_list.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neo4j-contrib/neo4j-dwh-connector/eb9feddb06c32d1905eb98e634031679ac04841d/doc/images/databricks_job_list.png
--------------------------------------------------------------------------------
/doc/images/databricks_menu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neo4j-contrib/neo4j-dwh-connector/eb9feddb06c32d1905eb98e634031679ac04841d/doc/images/databricks_menu.png
--------------------------------------------------------------------------------
/doc/images/output-rel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neo4j-contrib/neo4j-dwh-connector/eb9feddb06c32d1905eb98e634031679ac04841d/doc/images/output-rel.png -------------------------------------------------------------------------------- /maven-release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $# -lt 3 ]] ; then 4 | echo "Usage ./maven-release.sh []" 5 | exit 1 6 | fi 7 | 8 | JAVA_VER=$(java -version 2>&1 | grep -i version) 9 | 10 | if [[ ! $JAVA_VER =~ 1.8 ]] ; then 11 | echo "You must use Java 8" 12 | exit 1 13 | fi 14 | 15 | exit_script() { 16 | echo "Process terminated cleaning up resources" 17 | mv -f pom.xml.bak pom.xml 18 | rm -f pom.xml.versionsBackup 19 | trap - SIGINT SIGTERM # clear the trap 20 | kill -- -$$ # Sends SIGTERM to child/sub processes 21 | } 22 | 23 | trap exit_script SIGINT SIGTERM 24 | 25 | DEPLOY_INSTALL=$1 26 | SCALA_VERSION=$2 27 | SPARK_VERSION=$3 28 | # echo "command is: mvn -Pscala-$SCALA_VERSION -Pspark-$SPARK_VERSION -q -Dexec.executable=echo -Dexec.args='\${project.version}' --non-recursive exec:exec" 29 | CURRENT_VERSION=$(mvn -Pscala-$SCALA_VERSION -Pspark-$SPARK_VERSION -q -Dexec.executable=echo -Dexec.args='${project.version}' --non-recursive exec:exec) 30 | #TARGET_DIR=spark-$SPARK_VERSION 31 | if [[ $# -eq 4 ]] ; then 32 | ALT_DEPLOYMENT_REPOSITORY="-DaltDeploymentRepository=$4" 33 | else 34 | ALT_DEPLOYMENT_REPOSITORY="" 35 | fi 36 | 37 | case $(sed --help 2>&1) in 38 | *GNU*) sed_i () { sed -i "$@"; };; 39 | *) sed_i () { sed -i '' "$@"; };; 40 | esac 41 | 42 | # backup files 43 | cp pom.xml pom.xml.bak 44 | 45 | # replace pom files with target scala version 46 | sed_i "s/neo4j-dwh-connector<\/artifactId>/neo4j-dwh-connector_$SCALA_VERSION<\/artifactId>/" pom.xml 47 | sed_i "s//$SCALA_VERSION<\/scala.binary.version>/" pom.xml 48 | 49 | # setting version 50 | NEW_VERSION="${CURRENT_VERSION}_for_spark_${SPARK_VERSION}" 51 | # echo "New version is $NEW_VERSION" 52 | mvn -Pscala-$SCALA_VERSION -Pspark-$SPARK_VERSION versions:set -DnewVersion=$NEW_VERSION 53 | # build 54 | # echo "command is: mvn clean $DEPLOY_INSTALL -Pscala-$SCALA_VERSION -Pspark-$SPARK_VERSION -DskipTests $ALT_DEPLOYMENT_REPOSITORY" 55 | mvn clean $DEPLOY_INSTALL -Pscala-$SCALA_VERSION -Pspark-$SPARK_VERSION -DskipTests $ALT_DEPLOYMENT_REPOSITORY 56 | 57 | exit_script 58 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | org.neo4j 4 | neo4j-dwh-connector 5 | 1.0.1 6 | ${project.artifactId} 7 | This is the Neo4j DWH Connector supposed to work in combination with Apache Spark 8 | 2022 9 | 10 | 11 | Neo4j, Inc. 12 | http://neo4j.com/ 13 | 14 | 15 | 16 | conker84 17 | Andrea Santurbano 18 | 19 | 20 | 21 | https://github.com/neo4j-contrib/neo4j-dwh-connector 22 | 23 | https://github.com/neo4j-contrib/neo4j-dwh-connector 24 | 25 | 26 | 27 | The Apache Software License, Version 2.0 28 | http://www.apache.org/licenses/LICENSE-2.0.txt 29 | 30 | Note that this license is for the project itself, 31 | and not for its dependencies. 
32 | 33 | repo 34 | 35 | 36 | 37 | 38 | 39 | jitpack.io 40 | https://jitpack.io 41 | 42 | 43 | 44 | 45 | 1.8 46 | 1.8 47 | UTF-8 48 | 4.2.0 49 | 2.17.2 50 | 4.13.2 51 | 4.10 52 | 1.9.0 53 | 54 | 2.0.9 55 | 1.18.3 56 | 0.32.0 57 | 2.10.0-spark_3.2 58 | 3.13.15 59 | 5.0.4 60 | 2.1.0.5 61 | 1.12.178 62 | 3.3.1 63 | 1.1.0 64 | spark-mssql-connector_${scala.binary.version} 65 | 66 | 67 | 68 | 69 | 70 | 71 | spark-bigquery-with-dependencies_${scala.binary.version} 72 | 73 | neo4j-connector-apache-spark_${scala.binary.version} 74 | 75 | 76 | 77 | 78 | 79 | org.scala-lang 80 | scala-library 81 | ${scala.version} 82 | 83 | 84 | 85 | org.apache.spark 86 | spark-core_${scala.binary.version} 87 | ${spark.version} 88 | provided 89 | 90 | 91 | com.fasterxml.jackson.core 92 | jackson-databind 93 | 94 | 95 | com.fasterxml.jackson.module 96 | jackson-module-scala_${scala.binary.version} 97 | 98 | 99 | org.apache.xbean 100 | xbean-asm6-shaded 101 | 102 | 103 | 104 | 105 | 106 | org.apache.spark 107 | spark-sql_${scala.binary.version} 108 | ${spark.version} 109 | provided 110 | 111 | 112 | 113 | org.apache.xbean 114 | xbean-asm6-shaded 115 | ${xbean.version} 116 | provided 117 | 118 | 119 | 120 | com.fasterxml.jackson.core 121 | jackson-databind 122 | ${jackson-databind.version} 123 | 124 | 125 | 126 | com.fasterxml.jackson.module 127 | jackson-module-scala_${scala.binary.version} 128 | ${jackson-databind.version} 129 | 130 | 131 | 132 | commons-cli 133 | commons-cli 134 | ${common-cli.version} 135 | 136 | 137 | 138 | 139 | junit 140 | junit 141 | ${junit.version} 142 | test 143 | 144 | 145 | 146 | com.microsoft.azure 147 | ${spark-mssql-connector-artifactId} 148 | ${spark-mssql-connector_version} 149 | test 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | com.github.conker84 161 | spark-redshift 162 | 5.0.4.1-SNAPSHOT 163 | test 164 | 165 | 166 | org.apache.spark 167 | spark-avro_${scala.binary.version} 168 | ${spark.version} 169 | test 170 | 171 | 172 | com.amazon.redshift 173 | redshift-jdbc42 174 | ${redshift.jdbc.version} 175 | test 176 | 177 | 178 | com.amazonaws 179 | aws-java-sdk 180 | ${aws-java-sdk.version} 181 | test 182 | 183 | 184 | org.apache.hadoop 185 | hadoop-aws 186 | ${hadoop-aws.version} 187 | test 188 | 189 | 190 | 191 | org.neo4j 192 | ${neo4j-connector-apache-spark-artifactId} 193 | ${neo4j.spark.version} 194 | test 195 | 196 | 197 | 198 | com.google.cloud.spark 199 | ${spark-bigquery-with-dependencies-artifactId} 200 | ${bigquery.spark.version} 201 | test 202 | 203 | 204 | 205 | net.snowflake 206 | spark-snowflake_${scala.binary.version} 207 | ${snowflake.spark.version} 208 | test 209 | 210 | 211 | 212 | net.snowflake 213 | snowflake-jdbc 214 | ${snowflake.jdbc.version} 215 | test 216 | 217 | 218 | 219 | org.testcontainers 220 | testcontainers 221 | ${testcontainers.version} 222 | test 223 | 224 | 225 | 226 | org.testcontainers 227 | neo4j 228 | ${testcontainers.version} 229 | test 230 | 231 | 232 | 233 | 234 | 235 | 236 | scala-2.12 237 | 238 | 239 | 240 | 2.12.18 241 | 2.12 242 | 243 | 244 | 245 | scala-2.13 246 | 247 | true 248 | 249 | 250 | 2.13.12 251 | 2.13 252 | 256 | spark-mssql-connector_2.12 257 | 258 | 259 | 260 | 261 | 262 | spark-3 263 | 264 | true 265 | 266 | 267 | 3.3.2 268 | 5.0.3_for_spark_3 269 | 270 | 271 | 272 | 273 | 274 | neo4j-4.4 275 | 276 | true 277 | 278 | 279 | 4.4 280 | 281 | 282 | 283 | neo4j-5 284 | 285 | 5 286 | 287 | 288 | 289 | 290 | 291 | 292 | src/main/scala 293 | src/test/scala 294 | 295 | 296 | python 297 | 298 | **/test/** 299 | 
300 | 301 | 302 | 303 | 304 | src/test/resources 305 | true 306 | 307 | 308 | 309 | 310 | org.apache.maven.plugins 311 | maven-assembly-plugin 312 | 3.3.0 313 | 314 | 315 | jar-with-dependencies 316 | 317 | 318 | 319 | org.neo4j.dwh.connector.Neo4jDWHConnector 320 | 321 | 322 | ${project.artifactId}-${project.version} 323 | false 324 | 325 | 326 | 327 | package 328 | 329 | single 330 | 331 | 332 | 333 | 334 | 335 | net.alchim31.maven 336 | scala-maven-plugin 337 | 4.5.4 338 | 339 | 340 | 341 | add-source 342 | compile 343 | testCompile 344 | doc-jar 345 | 346 | 347 | 348 | -dependencyfile 349 | ${project.build.directory}/.scala_dependencies 350 | 351 | 352 | 353 | process-resources 354 | 355 | 356 | 357 | 358 | org.apache.maven.plugins 359 | maven-source-plugin 360 | 2.2.1 361 | 362 | 363 | attach-sources 364 | 365 | jar 366 | 367 | 368 | 369 | 370 | 371 | org.apache.maven.plugins 372 | maven-surefire-plugin 373 | 2.21.0 374 | 375 | 1 376 | false 377 | 378 | **/*Test.* 379 | **/*IT.* 380 | 381 | false 382 | 383 | 384 | -XX:+IgnoreUnrecognizedVMOptions 385 | --add-opens=java.base/java.lang=ALL-UNNAMED 386 | --add-opens=java.base/java.lang.invoke=ALL-UNNAMED 387 | --add-opens=java.base/java.lang.reflect=ALL-UNNAMED 388 | --add-opens=java.base/java.io=ALL-UNNAMED 389 | --add-opens=java.base/java.net=ALL-UNNAMED 390 | --add-opens=java.base/java.nio=ALL-UNNAMED 391 | --add-opens=java.base/java.util=ALL-UNNAMED 392 | --add-opens=java.base/java.util.concurrent=ALL-UNNAMED 393 | --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED 394 | --add-opens=java.base/sun.nio.ch=ALL-UNNAMED 395 | --add-opens=java.base/sun.nio.cs=ALL-UNNAMED 396 | --add-opens=java.base/sun.security.action=ALL-UNNAMED 397 | --add-opens=java.base/sun.util.calendar=ALL-UNNAMED 398 | --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED 399 | 400 | 401 | 402 | 403 | 404 | 405 | -------------------------------------------------------------------------------- /python/MANIFEST.in: -------------------------------------------------------------------------------- 1 | # An example MANIFEST file can be found at: 2 | # https://github.com/pypa/sampleproject/blob/master/MANIFEST.in 3 | # For more details about the MANIFEST file, you may read the docs at 4 | # https://docs.python.org/2/distutils/sourcedist.html#the-manifest-in-template 5 | -------------------------------------------------------------------------------- /python/neo4j_dwh_connector/__init__.py: -------------------------------------------------------------------------------- 1 | from neo4j_dwh_connector._dto import JobConfig, Column, Partition, Source, Target 2 | from neo4j_dwh_connector.connector import Neo4jDWHConnector 3 | 4 | __all__ = ['Neo4jDWHConnector', 'JobConfig', 'Column', 'Partition', 'Source', 'Target'] 5 | -------------------------------------------------------------------------------- /python/neo4j_dwh_connector/_dto.py: -------------------------------------------------------------------------------- 1 | class Column: 2 | def __init__(self, name, alias=""): 3 | self.name = name 4 | self.alias = alias 5 | 6 | 7 | class Partition: 8 | def __init__(self, number=0, by=""): 9 | self.number = number 10 | self.by = by 11 | 12 | 13 | class Source: 14 | def __init__(self, format, options={}, columns=[], where="", printSchema=False, limit=-1, show=-1, 15 | partition=Partition()): 16 | self.format = format 17 | self.options = options 18 | self.columns = columns 19 | self.where = where 20 | self.printSchema = printSchema 21 | self.limit = limit 22 | 
self.show = show 23 | self.partition = partition 24 | 25 | 26 | class Target: 27 | def __init__(self, format, options, mode): 28 | self.format = format 29 | self.options = options 30 | self.mode = mode 31 | 32 | 33 | class JobConfig: 34 | def __init__(self, name, conf, hadoopConfiguration, source, target, master=""): 35 | self.name = name 36 | self.conf = conf 37 | self.hadoopConfiguration = hadoopConfiguration 38 | self.source = source 39 | self.target = target 40 | self.master = master -------------------------------------------------------------------------------- /python/neo4j_dwh_connector/_utils.py: -------------------------------------------------------------------------------- 1 | def _to_java_value(obj, sparkContext): 2 | if is_class(obj): 3 | obj = vars(obj) 4 | if type(obj) is dict: 5 | hashMap = sparkContext._jvm.java.util.HashMap() 6 | for key in obj: 7 | hashMap[key] = _to_java_value(obj[key], sparkContext) 8 | obj = hashMap 9 | if type(obj) is list: 10 | arrayList = sparkContext._jvm.java.util.ArrayList(len(obj)) 11 | for key, val in enumerate(obj): 12 | arrayList.add(_to_java_value(obj[key], sparkContext)) 13 | obj = arrayList 14 | return obj 15 | 16 | 17 | # todo there should be a better way to do this 18 | def is_class(obj): 19 | return ' object at ' in str(obj) and type(obj) is not list and type(obj) is not dict 20 | -------------------------------------------------------------------------------- /python/neo4j_dwh_connector/connector.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | if sys.version > '3': 4 | basestring = str 5 | 6 | from pyspark.sql import SparkSession 7 | 8 | from neo4j_dwh_connector._dto import JobConfig 9 | from neo4j_dwh_connector._utils import _to_java_value 10 | 11 | 12 | class Neo4jDWHConnector: 13 | 14 | def __init__(self, session: SparkSession, jobConfig: JobConfig): 15 | java_map = _to_java_value(jobConfig, session.sparkContext) 16 | self._jvm_connector = session.sparkContext._jvm.org.neo4j.dwh.connector.Neo4jDWHConnector( 17 | session._jsparkSession, java_map) 18 | 19 | def run(self, closeSession=False): 20 | self._jvm_connector.run(closeSession) 21 | -------------------------------------------------------------------------------- /python/requirements.txt: -------------------------------------------------------------------------------- 1 | # This file should list any python package dependencies. 2 | -------------------------------------------------------------------------------- /python/setup.cfg: -------------------------------------------------------------------------------- 1 | # This file contains the default option values to be used during setup. An 2 | # example can be found at https://github.com/pypa/sampleproject/blob/master/setup.cfg 3 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | # Your python setup file. An example can be found at: 2 | # https://github.com/pypa/sampleproject/blob/master/setup.py 3 | -------------------------------------------------------------------------------- /python/spark-package-deps.txt: -------------------------------------------------------------------------------- 1 | # This file should list any spark package dependencies as: 2 | # :package_name==:version e.g. 
databricks/spark-csv==0.1 3 | -------------------------------------------------------------------------------- /python/test/__init__.py: -------------------------------------------------------------------------------- 1 | from ._util_test import UtilTest 2 | from .connector_test import ConnectorTest 3 | 4 | __all__ = ['UtilTest', 'ConnectorTest'] 5 | -------------------------------------------------------------------------------- /python/test/_util_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from py4j.java_collections import JavaMap, JavaList 3 | from pyspark.sql import SparkSession 4 | 5 | from neo4j_dwh_connector._dto import * 6 | from neo4j_dwh_connector._utils import _to_java_value 7 | 8 | 9 | class UtilTest(unittest.TestCase): 10 | spark = None 11 | 12 | def setUp(self): 13 | self.spark = (SparkSession.builder 14 | .appName("Neo4jConnectorTests") 15 | .master('local[*]') 16 | # .config('spark.jars.packages', 'org.neo4j:neo4j-connector-apache-spark_2.12:4.1.2_for_spark_3') 17 | .config("spark.driver.host", "127.0.0.1") 18 | .getOrCreate()) 19 | 20 | def test__to_java_value(self): 21 | source = Source( 22 | format="snowflake", # the source database (mandatory) 23 | # the configuration options it will change for every source database (mandatory) 24 | options={ 25 | "sfSchema": "TPCH_SF1", 26 | "sfPassword": "****", 27 | "sfUser": "****", 28 | "dbtable": "CUSTOMER", 29 | "sfDatabase": "SNOWFLAKE_SAMPLE_DATA", 30 | "sfURL": "https://****.eu-central-1.snowflakecomputing.com" 31 | }, 32 | # a list of selected projected columns, it can be usefull in order to eventually cast data, 33 | # apply Spark's UDFs and minimize the data movement from the source database (optional) 34 | columns=[ 35 | Column(name="CAST(C_ACCTBAL AS DOUBLE)", alias="C_ACCTBAL"), 36 | Column(name="C_ADDRESS"), 37 | Column(name="C_COMMENT"), 38 | Column(name="CAST(C_CUSTKEY AS LONG)", alias="C_CUSTKEY"), 39 | Column(name="C_MKTSEGMENT"), 40 | Column(name="C_NAME"), 41 | Column(name="CAST(C_NATIONKEY AS LONG)", alias="C_NATIONKEY"), 42 | Column(name="C_PHONE") 43 | ], 44 | where="", # a filter for the source dataset (optional) 45 | printSchema=True, # if you want to print the schema, useful for debug purposes (optional) 46 | show=5, # if you want show the source database, useful for debug purposes (optional) 47 | limit=10, # the amount of rows that you want to have from the source dataset (optional) 48 | # a dataframe partition configuration (optional) 49 | partition=Partition( 50 | number=-1, # the number of partions mandatory if you want to define partitions 51 | by="" # the field to partition (optional) 52 | ) 53 | ) 54 | # the target database configuration 55 | target = Target( 56 | format="org.neo4j.spark.DataSource", # the target database (mandatory) 57 | # the configuration options it will change for every source database (mandatory) 58 | options={ 59 | "labels": ":PersonNew1:CustomerNew1", 60 | "url": "neo4j+s://****.databases.neo4j.io", 61 | "authentication.basic.username": "neo4j", 62 | "authentication.basic.password": "****", 63 | "node.keys": "C_CUSTKEY" 64 | }, 65 | mode="Overwrite" 66 | ) 67 | 68 | config = JobConfig( 69 | name="The name of the Spark Job", 70 | conf={}, # a configuration map, every k/v binding will be insert as Spark Configuration 71 | hadoopConfiguration={}, 72 | # a configuration map, every k/v binding will be insert as Hadoop Configuration 73 | source=source, 74 | target=target 75 | ) 76 | 77 | converted = 
_to_java_value(config, self.spark) 78 | assert type(converted) is JavaMap 79 | assert type(converted["source"]) is JavaMap 80 | assert type(converted["target"]) is JavaMap 81 | assert type(converted["source"]["columns"]) is JavaList 82 | for id, value in enumerate(converted["source"]["columns"]): 83 | assert type(converted["source"]["columns"][id]) is JavaMap 84 | 85 | 86 | if __name__ == "__main__": 87 | unittest.main() 88 | -------------------------------------------------------------------------------- /python/test/connector_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from unittest import SkipTest 4 | 5 | from pyspark.sql import SparkSession 6 | from testcontainers.neo4j import Neo4jContainer 7 | from tzlocal import get_localzone 8 | 9 | from neo4j_dwh_connector import * 10 | 11 | import pathlib 12 | 13 | connector_version = "1.0-SNAPSHOT" 14 | neo4j_version = "4.4-enterprise" 15 | current_time_zone = get_localzone().zone 16 | 17 | 18 | def parse_arguments(length=4): 19 | print(sys.argv) 20 | global connector_version 21 | global neo4j_version 22 | global current_time_zone 23 | if len(sys.argv) >= length - 1: 24 | if length - 1 in sys.argv: 25 | current_time_zone = sys.argv.pop() # str(sys.argv[++start_index]) 26 | neo4j_version = sys.argv.pop() # str(sys.argv[++start_index]) 27 | connector_version = sys.argv.pop() # str(sys.argv[start_index]) 28 | print("Running tests for Connector %s, Neo4j %s, TimeZone %s" 29 | % (connector_version, neo4j_version, current_time_zone)) 30 | 31 | 32 | class ConnectorTest(unittest.TestCase): 33 | neo4j_container = None 34 | spark = None 35 | 36 | @classmethod 37 | def setUpClass(cls): 38 | jar_path = "target/neo4j-dwh-connector-{version}-jar-with-dependencies.jar".format( 39 | version=connector_version) 40 | jar_file = (pathlib.Path(__file__) 41 | .absolute() 42 | .parent 43 | .parent 44 | .parent 45 | .joinpath(jar_path)) 46 | if not jar_file.exists(): 47 | path_format_error = 'Connector JAR not found under $PROJECT_HOME/{path}'.format(path=jar_path) 48 | print(path_format_error) 49 | raise SkipTest(path_format_error) 50 | cls.neo4j_container = (Neo4jContainer('neo4j:' + neo4j_version) 51 | .with_env("NEO4J_db_temporal_timezone", current_time_zone) 52 | .with_env("NEO4J_ACCEPT_LICENSE_AGREEMENT", "yes")) 53 | cls.neo4j_container.start() 54 | cls.spark = (SparkSession.builder 55 | .appName("Neo4jConnectorTests") 56 | .master('local[*]') 57 | .config('spark.jars.packages', 'org.neo4j:neo4j-connector-apache-spark_2.12:4.1.2_for_spark_3') 58 | .config("spark.jars", str(jar_file)) 59 | .config("spark.driver.host", "127.0.0.1") 60 | .getOrCreate()) 61 | 62 | @classmethod 63 | def tearDownClass(cls): 64 | cls.neo4j_container.stop() 65 | cls.spark.stop() 66 | 67 | def test_ingest_csv(self): 68 | csv_path = (pathlib.Path(__file__) 69 | .absolute() 70 | .parent 71 | .parent 72 | .parent 73 | .joinpath("src/test/resources/persons.csv")) 74 | assert csv_path.exists() 75 | source = Source( 76 | format="csv", # the source database (mandatory) 77 | # the configuration options it will change for every source database (mandatory) 78 | options={ 79 | "header": "true", 80 | "path": str(csv_path) 81 | }, 82 | # a list of selected projected columns, it can be useful in order to eventually cast data, 83 | # apply Spark's UDFs and minimize the data movement from the source database (optional) 84 | columns=[ 85 | Column(name="person_id", alias="id"), 86 | Column(name="person_name", 
alias="name") 87 | ], 88 | where="", # a filter for the source dataset (optional) 89 | printSchema=True, # if you want to print the schema, useful for debug purposes (optional) 90 | show=5, # if you want show the source database, useful for debug purposes (optional) 91 | limit=10, # the amount of rows that you want to have from the source dataset (optional) 92 | # a dataframe partition configuration (optional) 93 | partition=Partition( 94 | number=-1, # the number of partions mandatory if you want to define partitions 95 | by="" # the field to partition (optional) 96 | ) 97 | ) 98 | # the target database configuration 99 | target = Target( 100 | format="org.neo4j.spark.DataSource", # the target database (mandatory) 101 | # the configuration options it will change for every source database (mandatory) 102 | options={ 103 | "labels": ":Person:Customer", 104 | "url": self.neo4j_container.get_connection_url(), 105 | "authentication.basic.username": Neo4jContainer.NEO4J_USER, 106 | "authentication.basic.password": Neo4jContainer.NEO4J_ADMIN_PASSWORD, 107 | "node.keys": "id" 108 | }, 109 | mode="Overwrite" 110 | ) 111 | 112 | config = JobConfig( 113 | name="Create Persons from CSV to Neo4j", 114 | conf={}, # a configuration map, every k/v binding will be insert as Spark Configuration 115 | hadoopConfiguration={}, 116 | # a configuration map, every k/v binding will be insert as Hadoop Configuration 117 | source=source, 118 | target=target, 119 | master="local" 120 | ) 121 | connector = Neo4jDWHConnector(self.spark, config) 122 | 123 | # this will ingest the data from source to target database 124 | connector.run() 125 | 126 | with self.neo4j_container.get_driver() as neo4j_driver: 127 | with neo4j_driver.session() as neo4j_session: 128 | result = neo4j_session.run("MATCH (n:Person:Customer) RETURN count(n) AS count").peek() 129 | assert result["count"] == 3 130 | 131 | 132 | if __name__ == "__main__": 133 | parse_arguments() 134 | unittest.main() 135 | -------------------------------------------------------------------------------- /src/main/scala/org/neo4j/dwh/connector/Neo4jDWHConnector.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector 2 | 3 | import org.apache.commons.cli.HelpFormatter 4 | import org.apache.commons.lang3.StringUtils 5 | import org.apache.spark.sql 6 | import org.apache.spark.sql.{DataFrame, SparkSession} 7 | import org.neo4j.dwh.connector.domain.{JobConfig, Source, Target} 8 | import org.neo4j.dwh.connector.generator.JobConfigGenerator 9 | import org.neo4j.dwh.connector.utils.CliUtils.JsonType 10 | import org.neo4j.dwh.connector.utils.{CliUtils, JobConfigUtils, Utils} 11 | 12 | import java.net.{MalformedURLException, URL} 13 | import java.util 14 | import java.util.Locale 15 | 16 | /** 17 | * @author Andrea Santurbano 18 | */ 19 | object Neo4jDWHConnector { 20 | 21 | def main(args: Array[String]) { 22 | val cli = CliUtils.parseArgs(args) 23 | if (CliUtils.hasHelp(cli)) { 24 | val fmt = new HelpFormatter() 25 | fmt.printHelp(CliUtils.helpText, CliUtils.options()) 26 | return 27 | } 28 | CliUtils.validateCli(cli) 29 | val pathAsString = cli.getOptionValue("p") 30 | val filePath = try { 31 | new URL(pathAsString) 32 | } catch { 33 | case mue: MalformedURLException if mue.getMessage.contains("no protocol") => new URL(s"file:$pathAsString") 34 | case t: Throwable => throw t 35 | } 36 | val isGenerateConfig = cli.hasOption("c") 37 | if (isGenerateConfig) { 38 | new JobConfigGenerator(cli).generate() 39 | } 
else { 40 | val jobs = JsonType.withName(cli 41 | .getOptionValue("ft", CliUtils.JsonType.SINGLE.toString.toUpperCase(Locale.ENGLISH)) 42 | .toUpperCase(Locale.ENGLISH)) match { 43 | case JsonType.ARRAY => JobConfig.fromSeq(filePath) 44 | case JsonType.SINGLE => Seq(JobConfig.from(filePath)) 45 | } 46 | jobs.foreach(job => new Neo4jDWHConnector(job).run(true)) 47 | } 48 | } 49 | } 50 | 51 | class Neo4jDWHConnector(session: SparkSession, job: JobConfig) { 52 | 53 | def this(jobConfig: JobConfig) = this(JobConfigUtils.toSparkSession(jobConfig), jobConfig) 54 | def this(session: SparkSession, jobConfigMap: util.Map[String, AnyRef]) = this(session, JobConfig.from(jobConfigMap)) 55 | 56 | def run(closeSession: Boolean = false): Unit = try { 57 | val dataFrame = read(job.source, session) 58 | write(job.target, dataFrame) 59 | } finally { 60 | if (closeSession) { 61 | session.close() 62 | } 63 | } 64 | 65 | private def read(source: Source, spark: SparkSession): DataFrame = { 66 | var dataFrame = spark.read.format(source.format) 67 | .options(Utils.enrichMap(source.options)) 68 | .load() 69 | if (StringUtils.isNotBlank(source.where)) { 70 | dataFrame = dataFrame.where(source.where) 71 | } 72 | if (!source.columns.isEmpty) { 73 | dataFrame = dataFrame.selectExpr(source.columns.map(_.toString).toArray : _*) 74 | } 75 | 76 | if (source.printSchema) { 77 | dataFrame.printSchema() 78 | } 79 | 80 | if (source.limit > 0) { 81 | dataFrame = dataFrame.limit(source.limit) 82 | } 83 | 84 | if (source.show > 0) { 85 | dataFrame.show(source.show) 86 | } 87 | 88 | if (source.partition.number > 0) { 89 | dataFrame = if (StringUtils.isBlank(source.partition.by)) { 90 | dataFrame.repartition(source.partition.number) 91 | } else { 92 | dataFrame.repartition(source.partition.number, new sql.Column(source.partition.by)) 93 | } 94 | } 95 | dataFrame 96 | } 97 | 98 | private def write(target: Target, df: DataFrame): Unit = { 99 | var dfWriter = df.write.format(target.format) 100 | .options(Utils.enrichMap(target.options)) 101 | if (StringUtils.isNotBlank(target.mode)) { 102 | dfWriter = dfWriter.mode(target.mode) 103 | } 104 | dfWriter.save() 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/main/scala/org/neo4j/dwh/connector/domain/JobConfig.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector.domain 2 | 3 | import com.fasterxml.jackson.core.`type`.TypeReference 4 | import org.apache.commons.lang3.StringUtils 5 | import org.neo4j.dwh.connector.utils.JSONUtils 6 | 7 | import java.io.File 8 | import java.net.{URI, URL} 9 | import java.util 10 | 11 | case class Column(name: String, alias: String = "") { 12 | override def toString: String = if (StringUtils.isBlank(alias)) name else s"$name AS $alias" 13 | } 14 | 15 | case class Partition(number: Int = 0, by: String = "") 16 | 17 | case class Source(format: String, 18 | options: Map[String, String], 19 | columns: Seq[Column] = Seq.empty, 20 | where: String = "", 21 | printSchema: Boolean, 22 | limit: Int = -1, 23 | show: Int = -1, 24 | partition: Partition = Partition()) 25 | 26 | case class Target(format: String, 27 | options: Map[String, String], 28 | mode: String) 29 | 30 | case class JobConfig(name: String = "Neo4j DWH Connector Job", 31 | master: String = "", 32 | conf: Map[String, String] = Map.empty, 33 | hadoopConfiguration: Map[String, String] = Map.empty, 34 | source: Source, 35 | target: Target) 36 | 37 | object JobConfig { 38 | 39 | 
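  // Builds a single JobConfig from a JSON payload. Accepted inputs: a JSON String, a File,
  // a URI or URL pointing to the JSON document, or a java.util.Map (the form used by the
  // Python bindings, which pass the configuration through Py4J).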
def from(data: AnyRef): JobConfig = data match { 40 | case json: String => JSONUtils.mapper.readValue(json, classOf[JobConfig]) 41 | case file: File => JSONUtils.mapper.readValue(file, classOf[JobConfig]) 42 | case uri: URI => JSONUtils.mapper.readValue(uri.toURL, classOf[JobConfig]) 43 | case url: URL => JSONUtils.mapper.readValue(url, classOf[JobConfig]) 44 | case map: util.Map[_, _] => JSONUtils.mapper.convertValue(map, classOf[JobConfig]) 45 | case _ => throw new IllegalArgumentException("Supported input types are String, File, URI, URL and java.util.Map") 46 | } 47 | 48 | def fromSeq(data: AnyRef): Seq[JobConfig] = data match { 49 | case json: String => JSONUtils.mapper.readValue(json, new TypeReference[Seq[JobConfig]] {}) 50 | case file: File => JSONUtils.mapper.readValue(file, new TypeReference[Seq[JobConfig]] {}) 51 | case uri: URI => JSONUtils.mapper.readValue(uri.toURL, new TypeReference[Seq[JobConfig]] {}) 52 | case url: URL => JSONUtils.mapper.readValue(url, new TypeReference[Seq[JobConfig]] {}) 53 | case list: util.List[_] => JSONUtils.mapper.convertValue(list, new TypeReference[Seq[JobConfig]] {}) 54 | case _ => throw new IllegalArgumentException("Supported input types are String, File, URI, URL and java.util.List") 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/org/neo4j/dwh/connector/generator/JobConfigGenerator.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector.generator 2 | 3 | import org.apache.commons.cli.CommandLine 4 | import org.neo4j.dwh.connector.domain.JobConfig 5 | import org.neo4j.dwh.connector.utils.{DatasourceOptions, JSONUtils} 6 | 7 | import java.io.File 8 | 9 | class JobConfigGenerator(private val cli: CommandLine) { 10 | 11 | def generate(): Unit = { 12 | val source = DatasourceOptions 13 | .withNameIgnoreCase(cli.getOptionValue("s")) 14 | 15 | val target = DatasourceOptions 16 | .withNameIgnoreCase(cli.getOptionValue("t")) 17 | 18 | val deps = (source.deps ++ target.deps) 19 | .map(dep => s" - `$dep`") 20 | .mkString("\n") 21 | 22 | val jobConfig = JobConfig( 23 | name = 24 | s""" 25 | | 26 | |In order to work the following dependencies are required: 27 | |$deps 28 | |""".stripMargin, 29 | source = source.toSource(), 30 | target = target.toTarget(), 31 | conf = (source.conf ++ target.conf), 32 | hadoopConfiguration = (source.hadoopConf ++ target.hadoopConf) 33 | ) 34 | 35 | JSONUtils.mapper 36 | .writerWithDefaultPrettyPrinter 37 | .writeValue(new File(cli.getOptionValue("p")), jobConfig) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/org/neo4j/dwh/connector/utils/CliUtils.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector.utils 2 | 3 | import org.apache.commons.cli.{BasicParser, CommandLine, Options} 4 | 5 | object CliUtils { 6 | object JsonType extends Enumeration { 7 | val SINGLE, ARRAY = Value 8 | } 9 | 10 | val helpText = """In case you're using it to generate the configuration stub: 11 | |java -jar neo4j-dwh-connector-.jar -c -s -t -p 12 | |In case you're using it with Spark Submit (from $SPARK_HOME): 13 | |./bin/spark-submit \ 14 | | --class org.neo4j.dwh.connector.Neo4jDWHConnector \ 15 | | --packages \ 16 | | --master spark://: \ 17 | | --deploy-mode cluster \ 18 | | --supervise \ 19 | | --executor-memory 20G \ 20 | | --total-executor-cores 100 \ 21 | | /path/to/neo4j-dwh-connector-.jar \ 22 | | -p
/path/to/dwh_job_config.json 23 | |""".stripMargin 24 | 25 | // In a Databricks Job environment DefaultParser doesn't work, 26 | // presumably because an older version of Apache Commons CLI is on the classpath, 27 | // so we're using BasicParser in order to make it work in both environments 28 | def parseArgs(args: Array[String]): CommandLine = new BasicParser().parse(options(), args) 29 | 30 | def hasHelp(cli: CommandLine): Boolean = cli.hasOption("h") 31 | 32 | def options(): Options = { 33 | val supportedDataSource = DatasourceOptions 34 | .values 35 | .map(_.toString) 36 | .map(name => s" - `$name`") 37 | .mkString("\n") 38 | new Options() 39 | .addOption("p", "path", true, """If used in combination with the `c` option it is the path 40 | |where the configuration file will be saved, otherwise it is the path 41 | |from which the configuration is read in order to start the Spark job. 42 | |""".stripMargin) 43 | .addOption("c", "config", false, """Generates a configuration stub that can be used with the DWH connector. 44 | |You need to define -s and -t options in order to 45 | |specify which are the source and target data sources. 46 | |""".stripMargin) 47 | .addOption("s", "source", true, s"""In combination with -c, it generates a stub configuration with the selected source database. 48 | |Supported Data sources are: 49 | |$supportedDataSource 50 | |""".stripMargin) 51 | .addOption("t", "target", true, s"""In combination with -c, it generates a stub configuration with the selected target database. 52 | |Supported Data sources are: 53 | |$supportedDataSource 54 | |""".stripMargin) 55 | .addOption("ft", "file_type", true, s"""The config file type: 56 | | - `${JsonType.SINGLE}` (default) means a single JSON document 57 | | - `${JsonType.ARRAY}` means that you're passing an array of JSON documents 58 | |""".stripMargin) 59 | .addOption("h", "help", false, "Prints the help") 60 | } 61 | 62 | def validateCli(cli: CommandLine): Unit = { 63 | val hasPath = cli.hasOption("p") 64 | if (!hasPath) { 65 | throw new IllegalArgumentException("Option -p is required") 66 | } 67 | val isGenerateConfig = cli.hasOption("c") 68 | if (isGenerateConfig) { 69 | val hasGenerateTarget = cli.hasOption("t") 70 | val hasGenerateSource = cli.hasOption("s") 71 | if (!hasGenerateSource || !hasGenerateTarget) { 72 | throw new IllegalArgumentException("You must define both the `-t` and `-s` options in combination with `-c`") 73 | } 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/org/neo4j/dwh/connector/utils/DatasourceOptions.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector.utils 2 | 3 | import org.neo4j.dwh.connector.domain.{Column, Partition, Source, Target} 4 | 5 | object DatasourceOptions extends Enumeration { 6 | case class DatasourceOptionsValue(options: Map[String, Any], deps: Array[String], 7 | conf: Map[String, String] = Map.empty, 8 | hadoopConf: Map[String, String] = Map.empty) extends super.Val { 9 | def toSource(): Source = JSONUtils.mapper.convertValue(options, classOf[Source]) 10 | def toTarget(): Target = JSONUtils.mapper.convertValue(options, classOf[Target]) 11 | } 12 | 13 | private val cols = Array(Column(""". 14 | |N.b.
`columns` field will be ignored in case 15 | |you're using it in the `target` field 16 | |""".stripMargin, "Alias to column, not mandatory")) 17 | 18 | private val partition = Partition(-1) 19 | 20 | def withNameIgnoreCase(name: String): DatasourceOptionsValue = this.values 21 | .filter(_.toString.equalsIgnoreCase(name)) 22 | .headOption 23 | .getOrElse(throw new NoSuchElementException(s"No value for $name")) 24 | .asInstanceOf[DatasourceOptionsValue] 25 | 26 | val Snowflake = DatasourceOptionsValue(Map( 27 | "format" -> "snowflake", 28 | "columns" -> cols, 29 | "where" -> " It will be ignored in case you're using it in the `target` field.", 30 | "mode" -> 31 | """ 32 | |N.b. It'll be ignored if you're reading data from Snowflake. 33 | |Please check supported save modes here: https://docs.snowflake.com/en/user-guide/spark-connector-use.html#moving-data-from-spark-to-snowflake 34 | |""".stripMargin, 35 | "options" -> Map( 36 | "_comment" -> 37 | """ 38 | |You can find the full list of Snowflake configuration properties here: 39 | |https://docs.snowflake.com/en/user-guide/spark-connector-use.html#setting-configuration-options-for-the-connector 40 | |""".stripMargin, 41 | "sfURL" -> ".snowflakecomputing.com", 42 | "sfUser" -> "", 43 | "sfPassword" -> "", 44 | "sfDatabase" -> "", 45 | "sfSchema" -> "", 46 | "sfWarehouse" -> "", 47 | "dbtable" -> ""), 48 | "partition" -> partition 49 | ), Array("net.snowflake:spark-snowflake_:", "net.snowflake:snowflake-jdbc:")) 50 | val Neo4j = DatasourceOptionsValue(Map( 51 | "format" -> "org.neo4j.spark.DataSource", 52 | "columns" -> cols, 53 | "where" -> ">Spark SQL filter to the Dataframe> It will be ignored in case you're using it in the `target` field.", 54 | "mode" -> 55 | """ 56 | |N.b. It'll be ignored if you're reading data from Neo4j. 57 | |Please check supported save modes here: https://neo4j.com/docs/spark/current/writing/#save-mode 58 | |""".stripMargin, 59 | "options" -> Map( 60 | "_comment" -> 61 | """ 62 | |You can find the full list of Neo4j configuration properties here: 63 | |https://neo4j.com/docs/spark/current/ 64 | |""".stripMargin, 65 | "labels" -> 66 | """. 67 | |In case of writing into Neo4j please see https://neo4j.com/docs/spark/current/writing/#write-node 68 | |In case of reading from Neo4j please see https://neo4j.com/docs/spark/current/reading/#read-node 69 | |""".stripMargin, 70 | "relationship" -> 71 | """ 72 | |N.b.
this field requires extra configuration please see 73 | | - In case of writing: https://neo4j.com/docs/spark/current/writing/#write-rel 74 | | - In case of reading: https://neo4j.com/docs/spark/current/reading/#read-rel 75 | |""".stripMargin, 76 | "query" -> 77 | """ 78 | |In case of writing into Neo4j please see https://neo4j.com/docs/spark/current/writing/#write-query 79 | |In case of reading from Neo4j please see https://neo4j.com/docs/spark/current/reading/#read-query 80 | |""".stripMargin, 81 | "url" -> "", 82 | "authentication.type" -> " Please see: https://neo4j.com/docs/spark/current/configuration/", 83 | "authentication.basic.username" -> "", 84 | "authentication.basic.password" -> ""), 85 | "partition" -> partition 86 | ), Array("org.neo4j:neo4j-connector-apache-spark_:")) 87 | val BigQuery = DatasourceOptionsValue(Map( 88 | "format" -> "bigquery", 89 | "columns" -> cols, 90 | "where" -> ">Spark SQL filter to the Dataframe> It will be ignored in case you're using it in the `target` field.", 91 | "mode" -> "", 92 | "options" -> Map( 93 | "_comment" -> 94 | """ 95 | |You can find the full list of BigQuery configuration properties here: 96 | |https://github.com/GoogleCloudDataproc/spark-bigquery-connector#properties 97 | |""".stripMargin, 98 | "path" -> "The BigQuery table in the format [[project:]dataset.]table", 99 | "credentials" -> "", 100 | "dataset" -> "The dataset containing the table. This option should be used with standard table and views, but not when loading query results." 101 | ), 102 | "partition" -> partition 103 | ), Array("com.google.cloud.spark:spark-bigquery-with-dependencies_:")) 104 | val RedShift_Community = DatasourceOptionsValue(Map( 105 | "format" -> "io.github.spark_redshift_community.spark.redshift", 106 | "columns" -> cols, 107 | "where" -> ">Spark SQL filter to the Dataframe> It will be ignored in case you're using it in the `target` field.", 108 | "mode" -> "", 109 | "options" -> Map( 110 | "_comment" -> 111 | """ 112 | |You can find the full list of RedShift configuration properties here: 113 | |https://github.com/spark-redshift-community/spark-redshift#parameters 114 | |""".stripMargin, 115 | "url" -> 116 | """A JDBC URL, of the format, jdbc:subprotocol://host:port/database?user=username&password=password 117 | |subprotocol can be postgresql or redshift, depending on which JDBC driver you have loaded. Note however that one Redshift-compatible driver must be on the classpath and match this URL. 118 | |host and port should point to the Redshift master node, so security groups and/or VPC will need to be configured to allow access from your driver application. 119 | |database identifies a Redshift database name 120 | |user and password are credentials to access the database, which must be embedded in this URL for JDBC, and your user account should have necessary privileges for the table being referenced. 121 | |""".stripMargin, 122 | "query" -> "The query to read from in Redshift (unless `dbtable` is specified)", 123 | "dbtable" -> "The table to create or read from in Redshift. This parameter is required when saving data back to Redshift.", 124 | "tempdir" -> "A writeable location in Amazon S3, to be used for unloaded data when reading and Avro data to be loaded into Redshift when writing. If you're using Redshift data source for Spark as part of a regular ETL pipeline, it can be useful to set a Lifecycle Policy on a bucket and use that as a temp location for this data." 
125 | ), 126 | "partition" -> partition 127 | ), 128 | Array("com.amazonaws:aws-java-sdk:", "com.amazon.redshift:redshift-jdbc42:", "org.apache.spark:spark-avro_:", "io.github.spark-redshift-community:spark-redshift_:"), 129 | Map.empty, 130 | Map("fs.s3.awsAccessKeyId" -> "YOUR_KEY_ID", "fs.s3.awsSecretAccessKey" -> "YOUR_SECRET_ACCESS_KEY") 131 | ) 132 | val RedShift_Databricks = DatasourceOptionsValue(Map( 133 | "format" -> "com.databricks.spark.redshift", 134 | "columns" -> cols, 135 | "where" -> ">Spark SQL filter to the Dataframe> It will be ignored in case you're using it in the `target` field.", 136 | "mode" -> "", 137 | "options" -> Map( 138 | "_comment" -> 139 | """ 140 | |You can find the full list of RedShift configuration properties here: 141 | |https://github.com/spark-redshift-community/spark-redshift#parameters 142 | |""".stripMargin, 143 | "url" -> 144 | """A JDBC URL, of the format, jdbc:subprotocol://host:port/database?user=username&password=password 145 | |subprotocol can be postgresql or redshift, depending on which JDBC driver you have loaded. Note however that one Redshift-compatible driver must be on the classpath and match this URL. 146 | |host and port should point to the Redshift master node, so security groups and/or VPC will need to be configured to allow access from your driver application. 147 | |database identifies a Redshift database name 148 | |user and password are credentials to access the database, which must be embedded in this URL for JDBC, and your user account should have necessary privileges for the table being referenced. 149 | |""".stripMargin, 150 | "query" -> "The query to read from in Redshift (unless `dbtable` is specified)", 151 | "dbtable" -> "The table to create or read from in Redshift. This parameter is required when saving data back to Redshift.", 152 | "tempdir" -> "A writeable location in Amazon S3, to be used for unloaded data when reading and Avro data to be loaded into Redshift when writing. If you're using Redshift data source for Spark as part of a regular ETL pipeline, it can be useful to set a Lifecycle Policy on a bucket and use that as a temp location for this data." 153 | ), 154 | "partition" -> partition 155 | ), 156 | Array("com.amazonaws:aws-java-sdk:", "com.amazon.redshift:redshift-jdbc42:", "org.apache.spark:spark-avro_:"), 157 | Map.empty, 158 | Map("fs.s3.awsAccessKeyId" -> "YOUR_KEY_ID", "fs.s3.awsSecretAccessKey" -> "YOUR_SECRET_ACCESS_KEY") 159 | ) 160 | val RedShift_JDBC = DatasourceOptionsValue(Map( 161 | "format" -> "jdbc", 162 | "columns" -> cols, 163 | "where" -> ">Spark SQL filter to the Dataframe> It will be ignored in case you're using it in the `target` field.", 164 | "mode" -> "", 165 | "options" -> Map( 166 | "_comment" -> 167 | """ 168 | |You can connect to RedShift in a non Databricks env also via JDBC. 169 | |Please refer to this documentation page: 170 | |https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html 171 | |""".stripMargin, 172 | "url" -> 173 | """A JDBC URL, of the format, jdbc:subprotocol://host:port/database?user=username&password=password 174 | |subprotocol can be postgresql or redshift, depending on which JDBC driver you have loaded. Note however that one Redshift-compatible driver must be on the classpath and match this URL. 175 | |host and port should point to the Redshift master node, so security groups and/or VPC will need to be configured to allow access from your driver application. 
176 | |database identifies a Redshift database name 177 | |user and password are credentials to access the database, which must be embedded in this URL for JDBC, and your user account should have necessary privileges for the table being referenced. 178 | |""".stripMargin, 179 | "query" -> "The query to read from in Redshift (unless `dbtable` is specified)", 180 | "dbtable" -> "The table to create or read from in Redshift. This parameter is required when saving data back to Redshift." 181 | ), 182 | "partition" -> partition 183 | ), Array.empty) 184 | val Synapse_Databricks = DatasourceOptionsValue(Map( 185 | "format" -> "com.databricks.spark.sqldw", 186 | "columns" -> cols, 187 | "where" -> ">Spark SQL filter to the Dataframe> It will be ignored in case you're using it in the `target` field.", 188 | "mode" -> "", 189 | "options" -> Map( 190 | "_comment" -> 191 | """ 192 | |You can find the full list of Azure Synapse Analytics configuration properties here: 193 | |https://docs.microsoft.com/en-us/azure/databricks/data/data-sources/azure/synapse-analytics#parameters 194 | |""".stripMargin, 195 | "url" -> "A JDBC URL with sqlserver set as the subprotocol. It is recommended to use the connection string provided by Azure portal. Setting\nencrypt=true is strongly recommended, because it enables SSL encryption of the JDBC connection. If user and password are set separately, you do not need to include them in the URL.", 196 | "tempDir" -> "A wasbs URI. We recommend you use a dedicated Blob storage container for the Azure Synapse.", 197 | "forwardSparkAzureStorageCredentials" -> 198 | """If true, the library automatically discovers the credentials that Spark is using to connect to the Blob storage container and forwards those credentials to Azure Synapse over JDBC. These credentials are sent as part of the JDBC query. Therefore it is strongly recommended that you enable SSL encryption of the JDBC connection when you use this option. 199 | |The current version of Azure Synapse connector requires (exactly) one of forwardSparkAzureStorageCredentials, enableServicePrincipalAuth, or useAzureMSI to be explicitly set to true. 200 | |The previously supported forward_spark_azure_storage_credentials variant is deprecated and will be ignored in future releases. Use the “camel case” name instead. 201 | |""".stripMargin, 202 | "query" -> "The query to read from in Synapse (unless `dbtable` is specified)", 203 | "dbTable" -> 204 | """The table to create or read from in Azure Synapse. This parameter is required when saving data back to Azure Synapse. 205 | |You can also use {SCHEMA NAME}.{TABLE NAME} to access a table in a given schema. If schema name is not provided, the default schema associated with the JDBC user is used. 206 | |The previously supported dbtable variant is deprecated and will be ignored in future releases. Use the “camel case” name instead.""".stripMargin 207 | ), 208 | "partition" -> partition 209 | ), Array("com.microsoft.azure:spark-mssql-connector_:"), Map( 210 | "fs.azure.account.key..dfs.core.windows.net" -> "" 211 | )) 212 | val Synapse_JDBC = DatasourceOptionsValue(Map( 213 | "format" -> "jdbc", 214 | "columns" -> cols, 215 | "where" -> ">Spark SQL filter to the Dataframe> It will be ignored in case you're using it in the `target` field.", 216 | "mode" -> "", 217 | "options" -> Map( 218 | "_comment" -> 219 | """ 220 | |You can connect to Synapse in a non Databricks env only via JDBC driver. 
221 | |Please refer to this documentation page: 222 | |https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html 223 | |""".stripMargin, 224 | "url" -> "A JDBC URL with sqlserver set as the subprotocol. It is recommended to use the connection string provided by Azure portal. Setting\nencrypt=true is strongly recommended, because it enables SSL encryption of the JDBC connection. If user and password are set separately, you do not need to include them in the URL.", 225 | "query" -> "The query to read from in Synapse (unless `dbtable` is specified)", 226 | "dbtable" -> 227 | """The table to create or read from in Azure Synapse. This parameter is required when saving data back to Azure Synapse. 228 | |You can also use {SCHEMA NAME}.{TABLE NAME} to access a table in a given schema. If schema name is not provided, the default schema associated with the JDBC user is used.""".stripMargin 229 | ), 230 | "partition" -> partition 231 | ), Array.empty) 232 | } 233 | -------------------------------------------------------------------------------- /src/main/scala/org/neo4j/dwh/connector/utils/JSONUtils.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector.utils 2 | 3 | import com.fasterxml.jackson.databind.DeserializationFeature 4 | import com.fasterxml.jackson.databind.json.JsonMapper 5 | import com.fasterxml.jackson.module.scala.DefaultScalaModule 6 | 7 | object JSONUtils { 8 | 9 | val mapper = JsonMapper.builder() 10 | .addModule(DefaultScalaModule) 11 | .disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES) 12 | .build() 13 | 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/org/neo4j/dwh/connector/utils/JobConfigUtils.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector.utils 2 | 3 | import org.apache.commons.lang3.StringUtils 4 | import org.apache.spark.sql.SparkSession 5 | import org.neo4j.dwh.connector.domain.JobConfig 6 | 7 | object JobConfigUtils { 8 | 9 | def toSparkSession(jobConfig: JobConfig): SparkSession = { 10 | val sessionBuilder = SparkSession.builder 11 | .appName(jobConfig.name) 12 | 13 | if (StringUtils.isNotBlank(jobConfig.master)) { 14 | sessionBuilder.master(jobConfig.master) 15 | } 16 | 17 | jobConfig.conf.foreach(t => sessionBuilder.config(t._1, t._2)) 18 | 19 | val session = sessionBuilder.getOrCreate() 20 | jobConfig.hadoopConfiguration.foreach(t => session.sparkContext.hadoopConfiguration.set(t._1, t._2)) 21 | 22 | session 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/org/neo4j/dwh/connector/utils/Utils.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector.utils 2 | 3 | import org.apache.commons.io.FileUtils 4 | 5 | import java.io.File 6 | import java.nio.charset.Charset 7 | import scala.util.matching.Regex 8 | import scala.util.{Properties, Try} 9 | 10 | object Utils { 11 | 12 | private val envPattern = """\$\{env:(.*)\}""".r 13 | private val filePattern = """\$\{file:(.*)\}""".r 14 | 15 | def enrichMap(map: Map[String, String]): Map[String, String] = map 16 | .map(t => (t._1, Try(findAllInRegex(envPattern, t._2)) 17 | .map(Properties.envOrElse(_, t._2)) 18 | .orElse( 19 | Try(findAllInRegex(filePattern, t._2)) 20 | .map(path => FileUtils.readFileToString(new File(path), Charset.forName("UTF-8"))) 21 | ) 22 | .getOrElse(t._2))) 
23 | 24 | private def findAllInRegex(r: Regex, str: String): String = { 25 | // this is a workaround as the same regexp works in Scala 2.12 and 2.13 but not in 2.11 26 | if (Properties.versionString.startsWith("version 2.11") && str.matches(r.regex)) { 27 | val splits = r.regex.split("""\(\.\*\)""") 28 | str.replaceAll(splits(0), "") 29 | .replaceAll(splits(1), "") 30 | } else { 31 | (r findAllIn str).group(1) 32 | } 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/test/resources/JobConfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Create Persons from CSV to Neo4j", 3 | "master": "local", 4 | "_comment": "The `source` field is a general field that manages the source database. Is basically where we read the data", 5 | "source": { 6 | "_comment": "The `format` field manages the connector datasource", 7 | "format": "csv", 8 | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 9 | "options": { 10 | "path": "" 11 | }, 12 | "_comment": "The `columns` field manages projection of the dataframe, `name` is the column name, `alias` is the name that you want to give (not mandatory)", 13 | "columns": [ 14 | { 15 | "name": "person_id", 16 | "alias": "id" 17 | }, 18 | { 19 | "name": "person_name", 20 | "alias": "name" 21 | } 22 | ], 23 | "_comment": "The `where` field manages filters on the datasource (not mandatory)", 24 | "where": "person_surname = 'Santurbano'" 25 | }, 26 | "_comment": "The `target` field is a general field that manages the target database. Is basically where we write the data that has been read in the field `source`", 27 | "target": { 28 | "_comment": "The `format` field manages the connector datasource", 29 | "format": "org.neo4j.spark.DataSource", 30 | "_comment": "The `mode` is the save mode of the writing connector", 31 | "mode": "Overwrite", 32 | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 33 | "options": { 34 | "labels": ":Person:Customer", 35 | "node.keys": "id" 36 | } 37 | } 38 | } -------------------------------------------------------------------------------- /src/test/resources/neo4j-dwh-connector.properties: -------------------------------------------------------------------------------- 1 | neo4j.version=${neo4j.version} 2 | -------------------------------------------------------------------------------- /src/test/resources/persons.csv: -------------------------------------------------------------------------------- 1 | person_id,person_name,person_surname 2 | 1,Andrea,Santurbano 3 | 2,Federico,Santurbano 4 | 3,Mario,Draghi -------------------------------------------------------------------------------- /src/test/resources/query.cyp: -------------------------------------------------------------------------------- 1 | CREATE (p:Person {name: 'Andrea Santurbano'}) -------------------------------------------------------------------------------- /src/test/resources/snowflake.to.neo4j.stub.json: -------------------------------------------------------------------------------- 1 | { 2 | "name" : "\n\nIn order to work the following dependencies are required:\n - `net.snowflake:spark-snowflake_:`\n - `net.snowflake:snowflake-jdbc:`\n - `org.neo4j:neo4j-connector-apache-spark_:`\n", 3 | "master" : "", 4 | "conf" : { }, 5 | "hadoopConfiguration" : { }, 6 | "source" : { 7 | "format" : "snowflake", 8 | "options" : { 9 | 
"_comment" : "\nYou can find the full list of Snowflake configuration properties here:\nhttps://docs.snowflake.com/en/user-guide/spark-connector-use.html#setting-configuration-options-for-the-connector\n", 10 | "sfSchema" : "", 11 | "sfPassword" : "", 12 | "sfUser" : "", 13 | "sfWarehouse" : "", 14 | "dbtable" : "", 15 | "sfDatabase" : "", 16 | "sfURL" : ".snowflakecomputing.com" 17 | }, 18 | "columns" : [ { 19 | "name" : ".\nN.b. `columns` field will be ignored in case\nyou're using it in the `target` field\n", 20 | "alias" : "Alias to column, not mandatory" 21 | } ], 22 | "where" : " It will be ignored in case you're using it in the `target` field.", 23 | "printSchema" : false, 24 | "limit" : -1, 25 | "show" : -1, 26 | "partition" : { 27 | "number" : -1, 28 | "by" : "" 29 | } 30 | }, 31 | "target" : { 32 | "format" : "org.neo4j.spark.DataSource", 33 | "options" : { 34 | "_comment" : "\nYou can find the full list of Neo4j configuration properties here:\nhttps://neo4j.com/docs/spark/current/\n", 35 | "url" : "", 36 | "authentication.type" : " Please see: https://neo4j.com/docs/spark/current/configuration/", 37 | "authentication.basic.username" : "", 38 | "authentication.basic.password" : "", 39 | "query" : "\nIn case of writing into Neo4j please see https://neo4j.com/docs/spark/current/writing/#write-query\nIn case of reading from Neo4j please see https://neo4j.com/docs/spark/current/reading/#read-query\n", 40 | "relationship" : "\nN.b. this field requires extra configuration please see\n - In case of writing: https://neo4j.com/docs/spark/current/writing/#write-rel\n - In case of reading: https://neo4j.com/docs/spark/current/reading/#read-rel\n", 41 | "labels" : ".\nIn case of writing into Neo4j please see https://neo4j.com/docs/spark/current/writing/#write-node\nIn case of reading from Neo4j please see https://neo4j.com/docs/spark/current/reading/#read-node\n" 42 | }, 43 | "mode" : "\nN.b. 
It'll be ignored if you're reading data from Neo4j.\nPlease check supported save modes here: https://neo4j.com/docs/spark/current/writing/#save-mode\n" 44 | } 45 | } -------------------------------------------------------------------------------- /src/test/scala/org/neo4j/dwh/connector/Neo4jDWHConnectorIT.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector 2 | 3 | import org.apache.commons.io.FileUtils 4 | import org.apache.spark.sql.SparkSession 5 | import org.junit.{AfterClass, Assert, Assume, BeforeClass, Test} 6 | import org.neo4j.driver.GraphDatabase 7 | import org.neo4j.dwh.connector.Neo4jDWHConnectorIT.neo4jContainer 8 | import org.neo4j.dwh.connector.domain.JobConfig 9 | import org.testcontainers.containers.Neo4jContainer 10 | import org.testcontainers.utility.DockerImageName 11 | 12 | import java.io.File 13 | import java.nio.charset.Charset 14 | import scala.collection.JavaConverters._ 15 | import scala.util.Properties 16 | 17 | object Neo4jDWHConnectorIT { 18 | private val properties = new java.util.Properties() 19 | properties.load(Thread.currentThread().getContextClassLoader().getResourceAsStream("neo4j-dwh-connector.properties")) 20 | 21 | val neo4jContainer = new Neo4jContainer(DockerImageName.parse(s"neo4j:${properties.getProperty("neo4j.version")}")) 22 | .withNeo4jConfig("dbms.security.auth_enabled", "false") 23 | .asInstanceOf[Neo4jContainer[_]] 24 | 25 | @BeforeClass 26 | def setUpContainer(): Unit = { 27 | neo4jContainer.start() 28 | } 29 | 30 | @AfterClass 31 | def teardownContainer(): Unit = { 32 | neo4jContainer.stop() 33 | } 34 | } 35 | 36 | class Neo4jDWHConnectorIT { 37 | 38 | private def createPersons(numPersons: Int) = { 39 | val driver = GraphDatabase.driver(neo4jContainer.getBoltUrl) 40 | val neo4jSession = driver.session() 41 | try { 42 | neo4jSession.run( 43 | """UNWIND RANGE(1, $numPersons) AS ID 44 | |MERGE (p:Person:Customer{id: ID, name: 'Name ' + ID, surname: 'Surname ' + ID, age: 10 + ID}) 45 | |RETURN count(p) AS count 46 | |""".stripMargin, Map[String, AnyRef]("numPersons" -> numPersons.asInstanceOf[AnyRef]).asJava) 47 | .consume() 48 | } finally { 49 | neo4jSession.close() 50 | driver.close() 51 | } 52 | } 53 | 54 | @Test 55 | def shouldImportCSVIntoNeo4j(): Unit = { 56 | val jsonConfig = 57 | s""" 58 | |{ 59 | | "name": "Create Persons from CSV to Neo4j", 60 | | "master": "local", 61 | | "_comment": "The `source` field is a general field that manages the source database. 
Is basically where we read the data", 62 | | "source": { 63 | | "_comment": "The `format` field manages the connector datasource", 64 | | "format": "csv", 65 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 66 | | "options": { 67 | | "header": "true", 68 | | "path": "${Thread 69 | .currentThread 70 | .getContextClassLoader 71 | .getResource("persons.csv") 72 | .getPath}" 73 | | }, 74 | | "_comment": "The `columns` field manages projection of the dataframe, `name` is the column name, `alias` is the name that you want to give (not mandatory)", 75 | | "columns": [ 76 | | { 77 | | "name": "person_id", 78 | | "alias": "id" 79 | | }, 80 | | { 81 | | "name": "person_name", 82 | | "alias": "name" 83 | | } 84 | | ], 85 | | "_comment": "The `where` field manages filters on the datasource (not mandatory)", 86 | | "where": "person_surname = 'Santurbano'" 87 | | }, 88 | | "_comment": "The `target` field is a general field that manages the target database. Is basically where we write the data that has been read in the field `source`", 89 | | "target": { 90 | | "_comment": "The `format` field manages the connector datasource", 91 | | "format": "org.neo4j.spark.DataSource", 92 | | "_comment": "The `mode` is the save mode of the writing connector", 93 | | "mode": "Overwrite", 94 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 95 | | "options": { 96 | | "labels": ":Person:Customer", 97 | | "url": "${neo4jContainer.getBoltUrl}", 98 | | "node.keys": "id" 99 | | } 100 | | } 101 | |} 102 | |""".stripMargin 103 | 104 | runJob(jsonConfig) 105 | val driver = GraphDatabase.driver(neo4jContainer.getBoltUrl) 106 | val neo4jSession = driver.session() 107 | try { 108 | val count = neo4jSession.run( 109 | """ 110 | |MATCH (p:Person:Customer) 111 | |WHERE p.name IN ['Andrea', 'Federico'] 112 | |RETURN count(p) AS count 113 | |""".stripMargin) 114 | .single() 115 | .get(0) 116 | .asLong() 117 | Assert.assertEquals(2L, count) 118 | } finally { 119 | neo4jSession.close() 120 | driver.close() 121 | } 122 | } 123 | 124 | @Test 125 | def shouldWriteCSVFromNeo4j(): Unit = { 126 | val csvPath = Properties.propOrElse("java.io.tmpdir", "").concat("/from-neo4j") 127 | 128 | val driver = GraphDatabase.driver(neo4jContainer.getBoltUrl) 129 | val neo4jSession = driver.session() 130 | try { 131 | neo4jSession.run( 132 | """ 133 | |UNWIND range(1, 2) AS id 134 | |MERGE (p:Person:Customer {id: id, name: 'Name For Id ' + id}) 135 | |RETURN p 136 | |""".stripMargin) 137 | .consume() 138 | } finally { 139 | neo4jSession.close() 140 | driver.close() 141 | } 142 | 143 | val jsonConfig = 144 | s""" 145 | |{ 146 | | "name": "Create Persons from CSV to Neo4j", 147 | | "master": "local", 148 | | "_comment": "The `source` field is a general field that manages the source database. 
Is basically where we read the data", 149 | | "source": { 150 | | "_comment": "The `format` field manages the connector datasource", 151 | | "format": "org.neo4j.spark.DataSource", 152 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 153 | | "options": { 154 | | "labels": ":Person:Customer", 155 | | "url": "${neo4jContainer.getBoltUrl}" 156 | | }, 157 | | "_comment": "The `columns` field manages projection of the dataframe, `name` is the column name, `alias` is the name that you want to give (not mandatory)", 158 | | "columns": [ 159 | | { 160 | | "name": "id" 161 | | }, 162 | | { 163 | | "name": "name" 164 | | } 165 | | ] 166 | | }, 167 | | "_comment": "The `target` field is a general field that manages the target database. Is basically where we write the data that has been read in the field `source`", 168 | | "target": { 169 | | "_comment": "The `format` field manages the connector datasource", 170 | | "format": "csv", 171 | | "_comment": "The `mode` is the save mode of the writing connector", 172 | | "mode": "Overwrite", 173 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 174 | | "options": { 175 | | "header": "true", 176 | | "path": "$csvPath" 177 | | } 178 | | } 179 | |} 180 | |""".stripMargin 181 | 182 | runJob(jsonConfig) 183 | 184 | val csvFile = new File(csvPath) 185 | .listFiles() 186 | .filter(_.isFile) 187 | .filter(_.getName.endsWith("csv"))(0) 188 | 189 | val actual = FileUtils.readFileToString(csvFile, Charset.forName("UTF-8")) 190 | val expected = 191 | """id,name 192 | |1,Name For Id 1 193 | |2,Name For Id 2 194 | |""".stripMargin 195 | Assert.assertEquals(expected, actual) 196 | } 197 | 198 | @Test 199 | def shouldImportSnowflakeIntoNeo4j(): Unit = { 200 | val snowflakeschema = Properties.envOrNone("SNOWFLAKE_SCHEMA") 201 | Assume.assumeFalse(snowflakeschema.isEmpty) 202 | val snowflakeuser = Properties.envOrNone("SNOWFLAKE_USER") 203 | Assume.assumeFalse(snowflakeuser.isEmpty) 204 | val snowflakepassword = Properties.envOrNone("SNOWFLAKE_PASSWORD") 205 | Assume.assumeFalse(snowflakepassword.isEmpty) 206 | val snowflakedatabase = Properties.envOrNone("SNOWFLAKE_DATABASE") 207 | Assume.assumeFalse(snowflakedatabase.isEmpty) 208 | val snowflakeurl = Properties.envOrNone("SNOWFLAKE_URL") 209 | Assume.assumeFalse(snowflakeurl.isEmpty) 210 | val snowflaketable = Properties.envOrNone("SNOWFLAKE_TABLE") 211 | Assume.assumeFalse(snowflaketable.isEmpty) 212 | val jsonConfig = 213 | s""" 214 | |{ 215 | | "name": "Create Customers from Snowflake to Neo4j", 216 | | "master": "local", 217 | | "_comment": "The `source` field is a general field that manages the source database. 
Is basically where we read the data", 218 | | "source": { 219 | | "_comment": "The `format` field manages the connector datasource", 220 | | "format" : "snowflake", 221 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 222 | | "options" : { 223 | | "sfSchema" : "${snowflakeschema.get}", 224 | | "sfPassword" : "${snowflakepassword.get}", 225 | | "sfUser" : "${snowflakeuser.get}", 226 | | "dbtable" : "${snowflaketable.get}", 227 | | "sfDatabase" : "${snowflakedatabase.get}", 228 | | "sfURL" : "${snowflakeurl.get}" 229 | | }, 230 | | "_comment": "The `where` field manages filters on the datasource (not mandatory)", 231 | | "where": "C_CUSTKEY <= 10", 232 | | "_comments": "The `columns` field manages the projection of Dataframe columns", 233 | | "columns": [ 234 | | { "name": "CAST(C_ACCTBAL AS DOUBLE)", "alias": "C_ACCTBAL" }, 235 | | { "name": "C_ADDRESS" }, 236 | | { "name": "C_COMMENT" }, 237 | | { "name": "CAST(C_CUSTKEY AS LONG)", "alias": "C_CUSTKEY" }, 238 | | { "name": "C_MKTSEGMENT" }, 239 | | { "name": "C_NAME" }, 240 | | { "name": "CAST(C_NATIONKEY AS LONG)", "alias": "C_NATIONKEY" }, 241 | | { "name": "C_PHONE" } 242 | | ] 243 | | }, 244 | | "_comment": "The `target` field is a general field that manages the target database. Is basically where we write the data that has been read in the field `source`", 245 | | "target": { 246 | | "_comment": "The `format` field manages the connector datasource", 247 | | "format": "org.neo4j.spark.DataSource", 248 | | "_comment": "The `mode` is the save mode of the writing connector", 249 | | "mode": "Overwrite", 250 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 251 | | "options": { 252 | | "labels": ":Person:Customer", 253 | | "url": "${neo4jContainer.getBoltUrl}", 254 | | "node.keys": "C_CUSTKEY" 255 | | } 256 | | } 257 | |} 258 | |""".stripMargin 259 | 260 | runJob(jsonConfig) 261 | val driver = GraphDatabase.driver(neo4jContainer.getBoltUrl) 262 | val neo4jSession = driver.session() 263 | try { 264 | val count = neo4jSession.run( 265 | """ 266 | |MATCH (p:Person:Customer) 267 | |RETURN count(p) AS count 268 | |""".stripMargin) 269 | .single() 270 | .get(0) 271 | .asLong() 272 | Assert.assertEquals(10L, count) 273 | } finally { 274 | neo4jSession.close() 275 | driver.close() 276 | } 277 | } 278 | 279 | @Test 280 | def shouldImportNeo4jIntoSnowflake(): Unit = { 281 | val snowflakeschema = Properties.envOrNone("SNOWFLAKE_SCHEMA") 282 | Assume.assumeFalse(snowflakeschema.isEmpty) 283 | val snowflakeuser = Properties.envOrNone("SNOWFLAKE_USER") 284 | Assume.assumeFalse(snowflakeuser.isEmpty) 285 | val snowflakepassword = Properties.envOrNone("SNOWFLAKE_PASSWORD") 286 | Assume.assumeFalse(snowflakepassword.isEmpty) 287 | val snowflakedatabase = Properties.envOrNone("SNOWFLAKE_DATABASE") 288 | Assume.assumeFalse(snowflakedatabase.isEmpty) 289 | val snowflakeurl = Properties.envOrNone("SNOWFLAKE_URL") 290 | Assume.assumeFalse(snowflakeurl.isEmpty) 291 | val snowflaketable = Properties.envOrNone("SNOWFLAKE_TABLE") 292 | Assume.assumeFalse(snowflaketable.isEmpty) 293 | val numPersons = 10 294 | createPersons(numPersons) 295 | val jsonConfig = 296 | s""" 297 | |{ 298 | | "name": "Create Person from Neo4j to Snowflake", 299 | | "master": "local", 300 | | "_comment": "The `source` field is a general field that manages the source database. 
Is basically where we read the data", 301 | | "source": { 302 | | "_comment": "The `format` field manages the connector datasource", 303 | | "format": "org.neo4j.spark.DataSource", 304 | | "options": { 305 | | "labels": ":Person:Customer", 306 | | "url": "${neo4jContainer.getBoltUrl}" 307 | | }, 308 | | "columns": [ 309 | | { "name": "ID" }, 310 | | { "name": "NAME" }, 311 | | { "name": "SURNAME" }, 312 | | { "name": "AGE" } 313 | | ] 314 | | }, 315 | | "_comment": "The `target` field is a general field that manages the target database. Is basically where we write the data that has been read in the field `source`", 316 | | "target": { 317 | | "_comment": "The `format` field manages the connector datasource", 318 | | "format" : "snowflake", 319 | | "_comment": "The `mode` is the save mode of the writing connector", 320 | | "mode": "Append", 321 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 322 | | "options" : { 323 | | "sfSchema" : "${snowflakeschema.get}", 324 | | "sfPassword" : "${snowflakepassword.get}", 325 | | "sfUser" : "${snowflakeuser.get}", 326 | | "dbtable" : "${snowflaketable.get}", 327 | | "sfDatabase" : "${snowflakedatabase.get}", 328 | | "sfURL" : "${snowflakeurl.get}" 329 | | } 330 | | } 331 | |} 332 | |""".stripMargin 333 | 334 | runJob(jsonConfig) 335 | 336 | val count = SparkSession.builder() 337 | .master("local[*]") 338 | .getOrCreate() 339 | .read 340 | .format("snowflake") 341 | .option("sfSchema", snowflakeschema.get) 342 | .option("sfPassword", snowflakepassword.get) 343 | .option("sfUser", snowflakeuser.get) 344 | .option("dbtable", snowflaketable.get) 345 | .option("sfDatabase", snowflakedatabase.get) 346 | .option("sfURL", snowflakeurl.get) 347 | .load() 348 | .count() 349 | Assert.assertEquals(numPersons.toLong, count) 350 | } 351 | 352 | @Test 353 | def shouldImportBigQueryIntoNeo4j(): Unit = { 354 | val googleprojectid = Properties.envOrNone("GOOGLE_PROJECT_ID") 355 | Assume.assumeFalse(googleprojectid.isEmpty) 356 | val googlecredentialsjson = Properties.envOrNone("GOOGLE_CREDENTIALS_JSON") 357 | Assume.assumeFalse(googlecredentialsjson.isEmpty) 358 | val jsonConfig = 359 | s""" 360 | |{ 361 | | "name": "Create ingest BigQuery's Stackoverflow data into Neo4j", 362 | | "master": "local", 363 | | "_comment": "The `source` field is a general field that manages the source database. Is basically where we read the data", 364 | | "source": { 365 | | "_comment": "The `format` field manages the connector datasource", 366 | | "format" : "bigquery", 367 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 368 | | "options": { 369 | | "table": "bigquery-public-data.stackoverflow.posts_questions", 370 | | "parentProject": "${googleprojectid.get}", 371 | | "credentialsFile": "${googlecredentialsjson.get}" 372 | | }, 373 | | "_comment": "The `where` field manages filters on the datasource (not mandatory)", 374 | | "where": "id <= 10", 375 | | "_comments": "The `columns` field manages the projection of Dataframe columns", 376 | | "columns": [ 377 | | { "name": "ID" }, 378 | | { "name": "TITLE" }, 379 | | { "name": "BODY" } 380 | | ] 381 | | }, 382 | | "_comment": "The `target` field is a general field that manages the target database. 
Is basically where we write the data that has been read in the field `source`", 383 | | "target": { 384 | | "_comment": "The `format` field manages the connector datasource", 385 | | "format": "org.neo4j.spark.DataSource", 386 | | "_comment": "The `mode` is the save mode of the writing connector", 387 | | "mode": "Overwrite", 388 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 389 | | "options": { 390 | | "labels": ":Answer", 391 | | "url": "${neo4jContainer.getBoltUrl}", 392 | | "node.keys": "ID" 393 | | } 394 | | } 395 | |} 396 | |""".stripMargin 397 | 398 | runJob(jsonConfig) 399 | val driver = GraphDatabase.driver(neo4jContainer.getBoltUrl) 400 | val neo4jSession = driver.session() 401 | try { 402 | val count = neo4jSession.run( 403 | """ 404 | |MATCH (p:Answer) 405 | |RETURN count(p) AS count 406 | |""".stripMargin) 407 | .single() 408 | .get(0) 409 | .asLong() 410 | Assert.assertEquals(3L, count) 411 | } finally { 412 | neo4jSession.close() 413 | driver.close() 414 | } 415 | } 416 | 417 | @Test 418 | def shouldImportNeo4jIntoBigQuery(): Unit = { 419 | val googleprojectid = Properties.envOrNone("GOOGLE_PROJECT_ID") 420 | Assume.assumeFalse(googleprojectid.isEmpty) 421 | val googlecredentialsjson = Properties.envOrNone("GOOGLE_CREDENTIALS_JSON") 422 | Assume.assumeFalse(googlecredentialsjson.isEmpty) 423 | val googlebigquerytable = Properties.envOrNone("GOOGLE_BIGQUERY_TABLE") 424 | Assume.assumeFalse(googlebigquerytable.isEmpty) 425 | val numPersons = 10 426 | createPersons(numPersons) 427 | val jsonConfig = 428 | s""" 429 | |{ 430 | | "name": "Create ingest BigQuery's Stackoverflow data into Neo4j", 431 | | "master": "local", 432 | | "_comment": "The `source` field is a general field that manages the source database. Is basically where we read the data", 433 | | "source": { 434 | | "_comment": "The `format` field manages the connector datasource", 435 | | "format": "org.neo4j.spark.DataSource", 436 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 437 | | "options": { 438 | | "labels": ":Person", 439 | | "url": "${neo4jContainer.getBoltUrl}" 440 | | }, 441 | | "columns": [ 442 | | { "name": "ID" }, 443 | | { "name": "NAME" }, 444 | | { "name": "SURNAME" }, 445 | | { "name": "AGE" } 446 | | ] 447 | | }, 448 | | "_comment": "The `target` field is a general field that manages the target database. 
Is basically where we write the data that has been read in the field `source`", 449 | | "target": { 450 | | "_comment": "The `format` field manages the connector datasource", 451 | | "format" : "bigquery", 452 | | "_comment": "The `mode` is the save mode of the writing connector", 453 | | "mode": "Overwrite", 454 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 455 | | "options" : { 456 | | "table": "${googlebigquerytable.get}", 457 | | "parentProject": "${googleprojectid.get}", 458 | | "credentialsFile": "${googlecredentialsjson.get}" 459 | | } 460 | | } 461 | |} 462 | |""".stripMargin 463 | 464 | runJob(jsonConfig) 465 | 466 | val count = SparkSession.builder() 467 | .master("local[*]") 468 | .getOrCreate() 469 | .read 470 | .format("bigquery") 471 | .option("table", googlebigquerytable.get) 472 | .option("parentProject", googleprojectid.get) 473 | .option("credentialsFile", googlecredentialsjson.get) 474 | .load() 475 | .count() 476 | Assert.assertEquals(numPersons.toLong, count) 477 | } 478 | 479 | @Test 480 | def shouldImportRedShiftIntoNeo4j(): Unit = { 481 | val awsredshifturl = Properties.envOrNone("AWS_REDSHIFT_URL") 482 | Assume.assumeFalse(awsredshifturl.isEmpty) 483 | val awsredshifttable = Properties.envOrNone("AWS_REDSHIFT_TABLE") 484 | Assume.assumeFalse(awsredshifttable.isEmpty) 485 | val awsiamrole = Properties.envOrNone("AWS_IAM_ROLE") 486 | Assume.assumeFalse(awsiamrole.isEmpty) 487 | val awss3tmpdir = Properties.envOrNone("AWS_S3_TMPDIR") 488 | Assume.assumeFalse(awss3tmpdir.isEmpty) 489 | val awss3accessid = Properties.envOrNone("AWS_ACCESS_KEY") 490 | Assume.assumeFalse(awss3accessid.isEmpty) 491 | val awss3accessky = Properties.envOrNone("AWS_SECRET_ACCESS_KEY") 492 | Assume.assumeFalse(awss3accessky.isEmpty) 493 | val jsonConfig = 494 | s""" 495 | |{ 496 | | "name": "Create ingest RedShift data into Neo4j", 497 | | "master": "local", 498 | | "hadoopConfiguration": { 499 | | "fs.s3a.access.key": "${awss3accessid.get}", 500 | | "fs.s3a.secret.key": "${awss3accessky.get}" 501 | | }, 502 | | "_comment": "The `source` field is a general field that manages the source database. Is basically where we read the data", 503 | | "source": { 504 | | "_comment": "The `format` field manages the connector datasource", 505 | | "format" : "io.github.spark_redshift_community.spark.redshift", 506 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 507 | | "options": { 508 | | "url": "${awsredshifturl.get}", 509 | | "dbtable": "${awsredshifttable.get}", 510 | | "tempdir": "${awss3tmpdir.get}", 511 | | "forward_spark_s3_credentials": "true" 512 | | }, 513 | | "_comment": "The `where` field manages filters on the datasource (not mandatory)", 514 | | "where": "userid <= 10" 515 | | }, 516 | | "_comment": "The `target` field is a general field that manages the target database. 
Is basically where we write the data that has been read in the field `source`", 517 | | "target": { 518 | | "_comment": "The `format` field manages the connector datasource", 519 | | "format": "org.neo4j.spark.DataSource", 520 | | "_comment": "The `mode` is the save mode of the writing connector", 521 | | "mode": "Overwrite", 522 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 523 | | "options": { 524 | | "labels": ":Person", 525 | | "url": "${neo4jContainer.getBoltUrl}", 526 | | "node.keys": "userid" 527 | | } 528 | | } 529 | |} 530 | |""".stripMargin 531 | 532 | runJob(jsonConfig) 533 | val driver = GraphDatabase.driver(neo4jContainer.getBoltUrl) 534 | val neo4jSession = driver.session() 535 | try { 536 | val count = neo4jSession.run( 537 | """ 538 | |MATCH (p:Person) 539 | |RETURN count(p) AS count 540 | |""".stripMargin) 541 | .single() 542 | .get(0) 543 | .asLong() 544 | Assert.assertEquals(10L, count) 545 | } finally { 546 | neo4jSession.close() 547 | driver.close() 548 | } 549 | } 550 | 551 | @Test 552 | def shouldImportNeo4jIntoRedShift(): Unit = { 553 | val awsredshifturl = Properties.envOrNone("AWS_REDSHIFT_URL") 554 | Assume.assumeFalse(awsredshifturl.isEmpty) 555 | val awsredshifttable = Properties.envOrNone("AWS_REDSHIFT_TABLE") 556 | Assume.assumeFalse(awsredshifttable.isEmpty) 557 | val awsiamrole = Properties.envOrNone("AWS_IAM_ROLE") 558 | Assume.assumeFalse(awsiamrole.isEmpty) 559 | val awss3tmpdir = Properties.envOrNone("AWS_S3_TMPDIR") 560 | Assume.assumeFalse(awss3tmpdir.isEmpty) 561 | val awss3accessid = Properties.envOrNone("AWS_ACCESS_KEY") 562 | Assume.assumeFalse(awss3accessid.isEmpty) 563 | val awss3accessky = Properties.envOrNone("AWS_SECRET_ACCESS_KEY") 564 | Assume.assumeFalse(awss3accessky.isEmpty) 565 | val numPersons = 10 566 | createPersons(numPersons) 567 | val jsonConfig = 568 | s""" 569 | |{ 570 | | "name": "Create ingest RedShift data into Neo4j", 571 | | "master": "local", 572 | | "hadoopConfiguration": { 573 | | "fs.s3a.access.key": "${awss3accessid.get}", 574 | | "fs.s3a.secret.key": "${awss3accessky.get}" 575 | | }, 576 | | "_comment": "The `source` field is a general field that manages the source database. Is basically where we read the data", 577 | | "source": { 578 | | "_comment": "The `format` field manages the connector datasource", 579 | | "format": "org.neo4j.spark.DataSource", 580 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 581 | | "options": { 582 | | "labels": ":Person", 583 | | "url": "${neo4jContainer.getBoltUrl}" 584 | | }, 585 | | "columns": [ 586 | | { "name": "ID" }, 587 | | { "name": "NAME" }, 588 | | { "name": "SURNAME" }, 589 | | { "name": "AGE" } 590 | | ] 591 | | }, 592 | | "_comment": "The `target` field is a general field that manages the target database. 
Is basically where we write the data that has been read in the field `source`", 593 | | "target": { 594 | | "_comment": "The `format` field manages the connector datasource", 595 | | "format" : "io.github.spark_redshift_community.spark.redshift", 596 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 597 | | "options": { 598 | | "url": "${awsredshifturl.get}", 599 | | "dbtable": "${awsredshifttable.get}", 600 | | "tempdir": "${awss3tmpdir.get}", 601 | | "tempformat": "CSV", 602 | | "forward_spark_s3_credentials": "true" 603 | | } 604 | | } 605 | |} 606 | |""".stripMargin 607 | 608 | runJob(jsonConfig) 609 | 610 | val session = SparkSession.builder() 611 | .master("local[*]") 612 | .getOrCreate() 613 | session.sparkContext.hadoopConfiguration.set("fs.s3a.access.key", awss3accessid.get) 614 | session.sparkContext.hadoopConfiguration.set("fs.s3a.secret.key", awss3accessky.get) 615 | val count = session 616 | .read 617 | .format("io.github.spark_redshift_community.spark.redshift") 618 | .option("url", awsredshifturl.get) 619 | .option("dbtable", awsredshifttable.get) 620 | .option("tempdir", awss3tmpdir.get) 621 | .option("forward_spark_s3_credentials", "true") 622 | .load() 623 | .count() 624 | Assert.assertEquals(numPersons.toLong, count) 625 | } 626 | 627 | /** 628 | * The Synapse connector is available only in Databricks Cloud; 629 | * if you want to connect to it in a non-Databricks environment 630 | * you can use the `jdbc` Datasource with a job like this: 631 | * 632 | * Read: 633 | * spark.read 634 | * .format("jdbc") 635 | * .option("url", "jdbc:sqlserver://synapsesparkneo4j.sql.azuresynapse.net:1433;database=;user=;password=;encrypt=true;trustServerCertificate=false;hostNameInCertificate=*.sql.azuresynapse.net;loginTimeout=30;") 636 | * .option("dbtable", "dbo.Date") 637 | * .load() 638 | * 639 | * Write: 640 | * df.write 641 | * .format("jdbc") 642 | * .option("url", "jdbc:sqlserver://synapsesparkneo4j.sql.azuresynapse.net:1433;database=;user=;password=;encrypt=true;trustServerCertificate=false;hostNameInCertificate=*.sql.azuresynapse.net;loginTimeout=30;") 643 | * .option("dbtable", "dbo.Date") 644 | * .save() 645 | */ 646 | 647 | private def runJob(jsonConfig: String) = { 648 | val jobConfig = JobConfig.from(jsonConfig) 649 | new Neo4jDWHConnector(jobConfig).run() 650 | } 651 | } 652 | -------------------------------------------------------------------------------- /src/test/scala/org/neo4j/dwh/connector/domain/JobConfigTest.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector.domain 2 | 3 | import org.apache.spark.sql.SaveMode 4 | import org.junit.Assert.assertEquals 5 | import org.junit.Test 6 | 7 | class JobConfigTest { 8 | 9 | @Test 10 | def shouldParseTheJsonIntoJobConfig(): Unit = { 11 | val jsonConfig = 12 | """ 13 | |{ 14 | | "name": "Create Persons from Snowflake to Neo4j", 15 | | "master": "local", 16 | | "_comment": "The `conf` field will add configuration via spark.conf.set", 17 | | "conf": { 18 | | "": "" 19 | | }, 20 | | "_comment": "The `hadoopConfiguration` field will add configuration via spark.hadoopConfiguration().set", 21 | | "hadoopConfiguration": { 22 | | "": "" 23 | | }, 24 | | "_comment": "The `source` field is a general field that manages the source database.
Is basically where we read the data", 25 | | "source": { 26 | | "_comment": "The `format` field manages the connector datasource", 27 | | "format": "net.snowflake.spark.snowflake", 28 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 29 | | "options": { 30 | | "sfURL": ".snowflakecomputing.com", 31 | | "sfUser": "", 32 | | "sfPassword": "", 33 | | "sfDatabase": "", 34 | | "sfSchema": "", 35 | | "sfWarehouse": "", 36 | | "dbtable": "" 37 | | }, 38 | | "_comment": "The `columns` field manages projection of the dataframe, `name` is the column name, `alias` is the name that you want to give (not mandatory)", 39 | | "columns": [ 40 | | { 41 | | "name": "person_id", 42 | | "alias": "id" 43 | | }, 44 | | { 45 | | "name": "person_name", 46 | | "alias": "name" 47 | | } 48 | | ], 49 | | "_comment": "The `where` field manages filters on the datasource (not mandatory)", 50 | | "where": "person_surname = 'Santurbano'", 51 | | "_comment": "The `partition` field repartition the source dataframe this can be useful when you're ingesting relationships into Neo4j", 52 | | "partition": { 53 | | "number": 5, 54 | | "by": "foo" 55 | | } 56 | | }, 57 | | "_comment": "The `target` field is a general field that manages the target database. Is basically where we write the data that has been read in the field `source`", 58 | | "target": { 59 | | "_comment": "The `format` field manages the connector datasource", 60 | | "format": "org.neo4j.spark.DataSource", 61 | | "_comment": "The `mode` is the save mode of the writing connector", 62 | | "mode": "Overwrite", 63 | | "_comment": "The `options` field manages the connector datasource options, which are specific for each datasource", 64 | | "options": { 65 | | "labels": ":Person:Customer", 66 | | "node.keys": "id" 67 | | } 68 | | } 69 | |} 70 | |""".stripMargin 71 | 72 | val jobConfig = JobConfig.from(jsonConfig) 73 | 74 | assertEquals("Create Persons from Snowflake to Neo4j", jobConfig.name) 75 | assertEquals("local", jobConfig.master) 76 | assertEquals(Map("" -> ""), jobConfig.conf) 77 | assertEquals(Map("" -> ""), jobConfig.hadoopConfiguration) 78 | 79 | assertEquals("net.snowflake.spark.snowflake", jobConfig.source.format) 80 | assertEquals(Map("sfURL" -> ".snowflakecomputing.com", 81 | "sfUser" -> "", 82 | "sfPassword" -> "", 83 | "sfDatabase" -> "", 84 | "sfSchema" -> "", 85 | "sfWarehouse" -> "", 86 | "dbtable" -> ""), jobConfig.source.options) 87 | assertEquals(Seq(Column("person_id", "id"), Column("person_name", "name")), jobConfig.source.columns) 88 | assertEquals("person_surname = 'Santurbano'", jobConfig.source.where) 89 | assertEquals(Partition(5, "foo"), jobConfig.source.partition) 90 | 91 | assertEquals("org.neo4j.spark.DataSource", jobConfig.target.format) 92 | assertEquals(SaveMode.Overwrite.toString, jobConfig.target.mode) 93 | assertEquals(Map("labels" -> ":Person:Customer", "node.keys" -> "id"), jobConfig.target.options) 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/test/scala/org/neo4j/dwh/connector/generator/JobConfigGeneratorTest.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector.generator 2 | 3 | import org.junit.{Assert, Test} 4 | import org.neo4j.dwh.connector.domain.JobConfig 5 | import org.neo4j.dwh.connector.utils.{CliUtils, JSONUtils} 6 | 7 | import java.io.File 8 | import scala.util.Properties 9 | 10 | class JobConfigGeneratorTest { 11 | 12 | 
@Test 13 | def shouldCreateConfigStubFileFromSnowflakeToNeo4j(): Unit = { 14 | val filePath = Properties.propOrElse("java.io.tmpdir", "").concat("/config.stub.json") 15 | val cli = CliUtils.parseArgs(Array("-p", filePath, "-c", "-s", "Snowflake", "-t", "Neo4j")) 16 | new JobConfigGenerator(cli).generate() 17 | val actual = JSONUtils.mapper.readValue(new File(filePath), classOf[Map[String, Any]]) 18 | val expected = JSONUtils.mapper.readValue(Thread.currentThread() 19 | .getContextClassLoader 20 | .getResourceAsStream("snowflake.to.neo4j.stub.json"), classOf[Map[String, Any]]) 21 | Assert.assertEquals(expected, actual) 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/test/scala/org/neo4j/dwh/connector/utils/UtilsTest.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.dwh.connector.utils 2 | 3 | import org.apache.commons.lang3.StringUtils 4 | import org.junit.Assert.assertEquals 5 | import org.junit.{Assume, Test} 6 | 7 | import java.net.URL 8 | import scala.util.Properties 9 | 10 | class UtilsTest { 11 | 12 | private val queryUrl: URL = Thread 13 | .currentThread 14 | .getContextClassLoader 15 | .getResource("query.cyp") 16 | 17 | private val source = scala.io.Source 18 | .fromFile(queryUrl.toURI) 19 | private val queryFile: String = try { 20 | source 21 | .getLines() 22 | .mkString("\n") 23 | } finally { 24 | source.close() 25 | } 26 | 27 | @Test 28 | def shouldReturnMapWithEnvAndFileContent(): Unit = { 29 | val myenv = Properties.envOrElse("MY_ENV", "") 30 | Assume.assumeTrue(StringUtils.isNotBlank(myenv)) 31 | val sourceMap = Map("foo" -> "bar", 32 | "withEnv" -> "${env:MY_ENV}", 33 | "noEnv" -> "${env:NO_ENV}", 34 | "withFile" -> s"$${$queryUrl}", 35 | "noFile" -> "${file:/foo/bar.cyp}") 36 | val expected = Map("foo" -> "bar", 37 | "withEnv" -> myenv, 38 | "noEnv" -> "${env:NO_ENV}", 39 | "withFile" -> queryFile, 40 | "noFile" -> "${file:/foo/bar.cyp}") 41 | val actual = Utils.enrichMap(sourceMap) 42 | assertEquals(expected, actual) 43 | } 44 | } 45 | --------------------------------------------------------------------------------
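A minimal programmatic usage sketch, not part of the repository tree above: besides spark-submit, the classes shown in Neo4jDWHConnector.scala and JobConfig.scala can be driven directly from Scala by parsing a JSON job definition and calling run(). The CSV path, Neo4j URL and label/key names below are illustrative assumptions, not values taken from the project.

import org.neo4j.dwh.connector.Neo4jDWHConnector
import org.neo4j.dwh.connector.domain.JobConfig

object DWHConnectorUsageSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical job definition: the source path, Neo4j URL and node keys are placeholders.
    val json =
      """{
        |  "name": "Persons from CSV to Neo4j",
        |  "master": "local[*]",
        |  "source": {
        |    "format": "csv",
        |    "options": { "header": "true", "path": "/tmp/persons.csv" }
        |  },
        |  "target": {
        |    "format": "org.neo4j.spark.DataSource",
        |    "mode": "Overwrite",
        |    "options": { "url": "bolt://localhost:7687", "labels": ":Person", "node.keys": "id" }
        |  }
        |}""".stripMargin
    // JobConfig.from accepts a JSON string (as well as File, URI, URL and java.util.Map);
    // run(true) closes the SparkSession built from the config once the job finishes.
    new Neo4jDWHConnector(JobConfig.from(json)).run(true)
  }
}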