├── .github
├── ISSUE_TEMPLATE
│ ├── BUG_REPORT.md
│ └── FEATURE_REQUEST.md
├── PULL_REQUEST_TEMPLATE.md
└── workflows
│ ├── README.md
│ ├── auto-triage.yml
│ ├── main.yml
│ ├── nightly.yml
│ ├── on-main-push.yml
│ ├── on-pull-request.yml
│ ├── remove-issue.yml
│ └── weekly.yml
├── .gitignore
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── codecov.yml
├── connector
├── .java-version
├── build.sbt
├── project
│ ├── build.properties
│ └── plugins.sbt
├── scalastyle-config.xml
└── src
│ ├── main
│ └── scala
│ │ └── com
│ │ └── vertica
│ │ └── spark
│ │ ├── config
│ │ ├── FileStoreConfig.scala
│ │ ├── JDBCConfig.scala
│ │ ├── LogProvider.scala
│ │ ├── ReadConfig.scala
│ │ ├── TableName.scala
│ │ ├── VerticaMetadata.scala
│ │ └── WriteConfig.scala
│ │ ├── datasource
│ │ ├── VerticaDatasourceV2.scala
│ │ ├── core
│ │ │ ├── DSConfigSetup.scala
│ │ │ ├── DSReader.scala
│ │ │ ├── DSWriter.scala
│ │ │ ├── SessionId.scala
│ │ │ ├── VerticaDistributedFilesystemReadPipe.scala
│ │ │ ├── VerticaDistributedFilesystemWritePipe.scala
│ │ │ ├── VerticaPipe.scala
│ │ │ └── factory
│ │ │ │ └── VerticaPipeFactory.scala
│ │ ├── fs
│ │ │ └── FileStoreLayerInterface.scala
│ │ ├── jdbc
│ │ │ └── VerticaJdbcLayer.scala
│ │ ├── json
│ │ │ ├── JsonBatchFactory.scala
│ │ │ └── VerticaJsonScan.scala
│ │ ├── partitions
│ │ │ ├── file
│ │ │ │ ├── PartitionedFileIdentity.scala
│ │ │ │ └── VerticaFilePartition.scala
│ │ │ ├── mixin
│ │ │ │ ├── Cleanup.scala
│ │ │ │ └── Identifiable.scala
│ │ │ └── parquet
│ │ │ │ ├── ParquetFileRange.scala
│ │ │ │ └── VerticaDistributedFilesystemPartition.scala
│ │ ├── v2
│ │ │ ├── VerticaDatasourceV2Catalog.scala
│ │ │ ├── VerticaDatasourceV2Read.scala
│ │ │ ├── VerticaDatasourceV2Table.scala
│ │ │ └── VerticaDatasourceV2Write.scala
│ │ └── wrappers
│ │ │ ├── PartitionReaderWrapper.scala
│ │ │ ├── PartitionReaderWrapperFactory.scala
│ │ │ ├── VerticaScanWrapper.scala
│ │ │ ├── VerticaScanWrapperBuilder.scala
│ │ │ └── json
│ │ │ └── VerticaJsonTableWrapper.scala
│ │ ├── parquet
│ │ ├── ParquetReadSupport.scala
│ │ ├── ParquetRecordMaterializer.scala
│ │ ├── ParquetRowConverter.scala
│ │ ├── ParquetSchemaConverter.scala
│ │ └── VerticaDataSourceUtils.scala
│ │ └── util
│ │ ├── Timer.scala
│ │ ├── cleanup
│ │ ├── CleanupUtils.scala
│ │ ├── DistributedFilesCleaner.scala
│ │ └── FileCleanupInfo.scala
│ │ ├── complex
│ │ └── ComplexTypeUtils.scala
│ │ ├── error
│ │ └── ErrorHandling.scala
│ │ ├── general
│ │ └── Utils.scala
│ │ ├── listeners
│ │ └── SparkListeners.scala
│ │ ├── pushdown
│ │ └── PushdownUtils.scala
│ │ ├── query
│ │ ├── ColumnsTable.scala
│ │ ├── ComplexTypesTable.scala
│ │ ├── StringParsingUtils.scala
│ │ ├── TypesTable.scala
│ │ └── VerticaTable.scala
│ │ ├── reflections
│ │ └── ReflectionTools.scala
│ │ ├── schema
│ │ ├── ComplexTypesSchemaTools.scala
│ │ └── SchemaTools.scala
│ │ ├── table
│ │ └── TableUtils.scala
│ │ └── version
│ │ ├── SparkVersionTools.scala
│ │ ├── Version.scala
│ │ └── VerticaVersionUtils.scala
│ └── test
│ └── scala
│ └── com
│ └── vertica
│ └── spark
│ ├── common
│ └── TestObjects.scala
│ ├── datasource
│ ├── core
│ │ ├── DSConfigSetupTest.scala
│ │ ├── DSConfigSetupUtilsTest.scala
│ │ ├── DSReaderTest.scala
│ │ ├── DSWriterTest.scala
│ │ ├── JDBCConfigParserTests.scala
│ │ ├── TableNameTest.scala
│ │ ├── VerticaDistributedFilesystemReadPipeTests.scala
│ │ └── VerticaDistributedFilesystemWritePipeTest.scala
│ ├── json
│ │ ├── JsonBatchFactoryTest.scala
│ │ └── VerticaJsonScanTest.scala
│ ├── partitions
│ │ └── parquet
│ │ │ └── ParquetFileRangeTest.scala
│ ├── v2
│ │ └── VerticaV2SourceTest.scala
│ └── wrappers
│ │ ├── PartitionReaderWrapperFactoryTest.scala
│ │ ├── PartitionReaderWrapperTest.scala
│ │ ├── VerticaScanWrapperBuilderTest.scala
│ │ ├── VerticaScanWrapperTest.scala
│ │ └── json
│ │ └── VerticaJsonTableWrapperTest.scala
│ └── util
│ ├── cleanup
│ └── CleanupUtilsTest.scala
│ ├── error
│ └── ErrorHandlingTest.scala
│ ├── pushdown
│ └── PushdownUtilsTest.scala
│ ├── query
│ ├── StringParsingUtilsTest.scala
│ └── VerticaTableTests.scala
│ ├── schema
│ ├── ComplexTypesSchemaToolsTest.scala
│ ├── SchemaToolsTest.scala
│ └── SchemaToolsV10Test.scala
│ ├── table
│ └── TableUtilsTest.scala
│ └── version
│ ├── SparkVersionToolsTests.scala
│ ├── VersionTest.scala
│ └── VerticaVersionUtilsTest.scala
├── docker
├── README.md
├── client-krb
│ ├── Dockerfile
│ ├── docker-entrypoint.sh
│ ├── jaas.config
│ └── vsql
├── client
│ └── Dockerfile
├── docker-compose-kerberos.yml
├── docker-compose.yml
├── hdfs-krb
│ ├── Dockerfile
│ └── docker-entrypoint.sh
├── hdfs
│ └── docker-entrypoint.sh
├── kdc
│ ├── Dockerfile
│ └── docker-entrypoint.sh
├── keytabs
│ └── .gitkeep
├── krb.env
├── vertica-hdfs-config
│ ├── hadoop-kerberized
│ │ ├── core-site.xml
│ │ ├── hdfs-site.xml
│ │ ├── keystore
│ │ ├── ssl-client.xml
│ │ └── ssl-server.xml
│ └── hadoop
│ │ ├── core-site.xml
│ │ └── hdfs-site.xml
├── vertica-krb
│ └── docker-entrypoint.sh
└── vertica
│ ├── docker-entrypoint-legacy.sh
│ └── docker-entrypoint.sh
├── docs
├── gcs-guide.md
├── hdfs-guide.md
├── kerberos-guide.md
├── s3-guide.md
├── tls-guide.md
└── troubleshooting-guide.md
├── examples
├── README.md
├── jupyter
│ ├── README.md
│ ├── basic-read-and-write-example.ipynb
│ ├── complex-array-example.ipynb
│ ├── data
│ │ ├── faithful.csv
│ │ ├── faithful_testing.csv
│ │ └── faithful_training.csv
│ ├── linear-regression-example-spark.ipynb
│ ├── linear-regression-example-vertica-direct.ipynb
│ └── linear-regression-example-verticapy.ipynb
├── pyspark
│ ├── README.md
│ ├── run-python-example.sh
│ └── sparkapp.py
├── scala
│ ├── README.md
│ ├── build.sbt
│ ├── project
│ │ └── plugins.sbt
│ ├── src
│ │ └── main
│ │ │ ├── resources
│ │ │ └── application.conf
│ │ │ └── scala
│ │ │ └── example
│ │ │ ├── Main.scala
│ │ │ ├── PrintUtils.scala
│ │ │ ├── TestUtils.scala
│ │ │ └── examples
│ │ │ ├── BasicReadWriteExamples.scala
│ │ │ ├── ComplexTypeExamples.scala
│ │ │ └── ConnectorOptionsExamples.scala
│ ├── submit-examples-debug.sh
│ ├── submit-examples-kerberos.sh
│ └── submit-examples.sh
└── sparklyr
│ ├── README.md
│ ├── run-r-example.sh
│ └── sparkapp.r
├── functional-tests
├── README.md
├── build.sbt
├── default-config.sh
├── pipeline-gcs-config.sh
├── pipeline-s3-config.sh
├── project
│ └── plugins.sbt
├── src
│ └── main
│ │ ├── resources
│ │ ├── 1600ColumnTable.csv
│ │ ├── 3.1.1
│ │ │ ├── _SUCCESS
│ │ │ ├── col1=1
│ │ │ │ └── part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
│ │ │ ├── col1=10
│ │ │ │ └── part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
│ │ │ ├── col1=11
│ │ │ │ └── part-00002-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
│ │ │ ├── col1=12
│ │ │ │ └── part-00002-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
│ │ │ ├── col1=13
│ │ │ │ └── part-00002-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
│ │ │ ├── col1=14
│ │ │ │ └── part-00002-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
│ │ │ ├── col1=15
│ │ │ │ └── part-00002-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
│ │ │ ├── col1=16
│ │ │ │ └── part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
│ │ │ ├── col1=17
│ │ │ │ └── part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
│ │ │ ├── col1=18
│ │ │ │ └── part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
│ │ │ ├── col1=19
│ │ │ │ └── part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
│ │ │ ├── col1=2
│ │ │ │ └── part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
│ │ │ ├── col1=20
│ │ │ │ └── part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
│ │ │ ├── col1=3
│ │ │ │ └── part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
│ │ │ ├── col1=4
│ │ │ │ └── part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
│ │ │ ├── col1=5
│ │ │ │ └── part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
│ │ │ ├── col1=6
│ │ │ │ └── part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
│ │ │ ├── col1=7
│ │ │ │ └── part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
│ │ │ ├── col1=8
│ │ │ │ └── part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
│ │ │ └── col1=9
│ │ │ │ └── part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
│ │ ├── AuditSVM.xml
│ │ ├── application.conf
│ │ ├── datafile-100cols-100rows.csv
│ │ ├── datafile-17_2_test
│ │ ├── datafile-String-Int.txt
│ │ ├── date_test_file.txt
│ │ ├── diffTypes.txt
│ │ ├── diffTypesORC.txt
│ │ └── sample_libsvm_data.txt
│ │ └── scala
│ │ ├── Main.scala
│ │ └── com
│ │ └── vertica
│ │ └── spark
│ │ └── functests
│ │ ├── CleanupUtilTests.scala
│ │ ├── HDFSTests.scala
│ │ ├── JDBCTests.scala
│ │ ├── LargeDataTests.scala
│ │ ├── TestUtils.scala
│ │ └── endtoend
│ │ ├── BasicJsonReadTests.scala
│ │ ├── ComplexTypeTests.scala
│ │ ├── ComplexTypeTestsV10.scala
│ │ ├── EndToEndTests.scala
│ │ ├── RemoteTests.scala
│ │ └── SparkConfig.scala
├── submit-functional-tests-debug.sh
└── submit-functional-tests.sh
├── img
├── CoreArchitecture.png
├── Overview.png
└── SparkInterfaces.png
├── performance-tests
├── README.md
├── build.sbt
├── project
│ └── plugins.sbt
└── src
│ └── main
│ ├── resources
│ └── application.conf
│ └── scala
│ ├── Main.scala
│ └── com
│ └── vertica
│ └── spark
│ └── perftests
│ ├── DataGenUtils.scala
│ └── PerformanceTestSuite.scala
└── version.properties
/.github/ISSUE_TEMPLATE/BUG_REPORT.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Report an issue for the Vertica Spark Connector
4 | title: "[BUG]"
5 | labels: 'bug'
6 | assignees: ''
7 |
8 | ---
9 |
10 | ## Environment
11 | - Spark version:
12 | - Hadoop version:
13 | - Vertica version:
14 | - Vertica Spark Connector version:
15 | - Java version:
16 | - Additional Environment Information:
17 |
18 | ---
19 |
20 | ## Problem Description
21 | - Describe the issue in as much detail as possible, so that it can be reproduced.
22 |
23 | 1. Steps to reproduce:
24 | 2. Expected behaviour:
25 | 3. Actual behaviour:
26 | 4. Error message/stack trace:
27 | 5. Code sample or example on how to reproduce the issue:
28 |
29 | ---
30 |
31 | ## Spark Connector Logs
32 | - Add related log entries here.
33 |
34 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/FEATURE_REQUEST.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for the Vertica Spark Connector
4 | title: '[FEATURE]'
5 | labels: 'enhancement'
6 | assignees: ''
7 |
8 | ---
9 |
10 | ## Is your feature request related to a problem? Please describe.
11 |
12 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
13 |
14 | ## Describe the solution you'd like
15 |
16 | A clear and concise description of what you want to happen.
17 |
18 | ## Describe alternatives you've considered
19 |
20 | A clear and concise description of any alternative solutions or features you've considered.
21 |
22 | ## Additional context
23 |
24 | Add any other context or screenshots about the feature request here.
25 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ### Summary
2 |
3 |
4 |
5 | ### Description
6 |
7 |
8 |
9 | ### Related Issue
10 |
11 |
12 |
13 | ### Additional Reviewers
14 |
15 |
16 |
--------------------------------------------------------------------------------
/.github/workflows/README.md:
--------------------------------------------------------------------------------
1 | # GitHub Workflows
2 |
3 | The following are descriptions of the workflows used in the repository.
4 |
5 | ## Main Tests
6 |
7 | The workflow `main.yml` is a reusable workflow that performs the repository's critical tests.
8 | 
9 | Currently, this includes:
10 | * Compile checks
11 | * Unit-tests checks
12 | * Test coverage checks:
13 | * Require at least 80% coverage when the PR is merged
14 | * [Patch coverage](https://docs.codecov.com/docs/commit-status#patch-status) of at least 80%. Patch coverage only measures the coverage of changes made in the PR
15 | * Scalastyle checks
16 | * Integration tests against the latest Vertica, using the default Spark and Hadoop versions from the functional tests (which should be the latest)
17 |
18 | It is used by `on-main-push.yml`, which executes when there is a push to the `main` branch (for example, when a PR is merged).
19 |
20 | ## On Pull Request
21 |
22 | Runs `main.yml` on pull requests to `main` (when a PR is created or has new commits pushed to it).
23 |
24 | ## Nightly Tests
25 |
26 | The workflow `nightly.yml` runs nightly, Monday to Friday at 9:18 AM GMT (2:18 AM Pacific Time), executing
27 | non-critical tests against the `main` branch. It currently performs regression testing on combinations of Spark 3.x, with
28 | the appropriate Hadoop HDFS, against Vertica 11.1.1-2 and 12.0.4-0. We also test against the latest Spark 3.x on a
29 | standalone Spark cluster.
30 |
31 | ## Weekly Tests
32 |
33 | `weekly.yml` runs weekly tests every Monday at 10:18 AM GMT (3:18 AM Pacific Time), executing the following:
34 | * Integration tests against different intermediary file-stores:
35 | * S3, using a MinIO object-store container to mimic S3
36 | * GCS, against an actual GCS bucket provided by Vertica, as we have not yet found a way to mock a GCS environment
37 | * Testing the `json` option against Spark 3.x
38 | * Tests against Vertica 10.1.1-0
39 |
40 | Unless otherwise specified, all tests use the latest Vertica Docker image so that we are notified of breaking changes.
41 |
42 | ## Auto Triage and Remove Issue
43 |
44 | When an issue is labeled with a priority, the `auto-triage.yml` workflow moves it to the backlog, into the respective
45 | priority column.
46 |
47 | The `remove-issue.yml` workflow triggers when an issue is closed, removing it from the backlog.
48 |
--------------------------------------------------------------------------------
/.github/workflows/auto-triage.yml:
--------------------------------------------------------------------------------
1 | name: Move labeled issue
2 | on:
3 | issues:
4 | types:
5 | - labeled
6 | workflow_dispatch:
7 | jobs:
8 | move-low-priority:
9 | if: github.event.label.name == 'Low Priority'
10 | runs-on: ubuntu-latest
11 | steps:
12 | - uses: alex-page/github-project-automation-plus@v0.8.1
13 | with:
14 | project: Backlog
15 | column: Low Priority
16 | repo-token: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
17 |
18 | move-normal-priority:
19 | if: github.event.label.name == 'Normal Priority'
20 | runs-on: ubuntu-latest
21 | steps:
22 | - uses: alex-page/github-project-automation-plus@v0.8.1
23 | with:
24 | project: Backlog
25 | column: Normal Priority
26 | repo-token: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
27 |
28 | move-high-priority:
29 | if: github.event.label.name == 'High Priority'
30 | runs-on: ubuntu-latest
31 | steps:
32 | - uses: alex-page/github-project-automation-plus@v0.8.1
33 | with:
34 | project: Backlog
35 | column: High Priority
36 | repo-token: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
37 |
--------------------------------------------------------------------------------
/.github/workflows/on-main-push.yml:
--------------------------------------------------------------------------------
1 | name: main
2 |
3 | on:
4 | push:
5 | branches: [main]
6 | workflow_dispatch:
7 |
8 | jobs:
9 | functional-tests:
10 | uses: ./.github/workflows/main.yml
--------------------------------------------------------------------------------
/.github/workflows/on-pull-request.yml:
--------------------------------------------------------------------------------
1 | name: On Pull Requests
2 |
3 | on:
4 | pull_request:
5 | branches: [ main ]
6 | workflow_dispatch:
7 |
8 | jobs:
9 | functional-tests:
10 | uses: ./.github/workflows/main.yml
--------------------------------------------------------------------------------
/.github/workflows/remove-issue.yml:
--------------------------------------------------------------------------------
1 | name: Remove Closed Issue
2 | on:
3 | issues:
4 | types:
5 | - closed
6 | workflow_dispatch:
7 | jobs:
8 | remove-low-priority:
9 | if: contains(github.event.issue.labels.*.name, 'Low Priority')
10 | runs-on: ubuntu-latest
11 | steps:
12 | - uses: alex-page/github-project-automation-plus@v0.8.1
13 | with:
14 | project: Backlog
15 | column: Low Priority
16 | repo-token: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
17 | action: delete
18 |
19 | remove-normal-priority:
20 | if: contains(github.event.issue.labels.*.name, 'Normal Priority')
21 | runs-on: ubuntu-latest
22 | steps:
23 | - uses: alex-page/github-project-automation-plus@v0.8.1
24 | with:
25 | project: Backlog
26 | column: Normal Priority
27 | repo-token: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
28 | action: delete
29 |
30 | remove-high-priority:
31 | if: contains(github.event.issue.labels.*.name, 'High Priority')
32 | runs-on: ubuntu-latest
33 | steps:
34 | - uses: alex-page/github-project-automation-plus@v0.8.1
35 | with:
36 | project: Backlog
37 | column: High Priority
38 | repo-token: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
39 | action: delete
40 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # sbt-specific
2 | .bsp/
3 | target/
4 |
5 | .bloop/
6 | .idea/
7 | .metals/
8 | .vscode/
9 | *.iml
10 |
11 | .DS_Store
12 | *.class
13 | *.log
14 | *.jar
15 |
16 | connector/project/project/
17 | connector/.scannerwork/
18 | functional-tests/lib/
19 | functional-tests/project/build.properties
20 | functional-tests/project/project/
21 | examples/*/lib/*
22 | examples/*/project/build.properties
23 | examples/*/project/project/
24 | examples/jupyter/.ipynb_checkpoints
25 |
26 | docker/.env
27 | docker/keytabs/*.keytab
28 | !docker/keytabs/.gitkeep
29 | docker/vertica-hdfs-config/hadoop-kerberized/*.cert
30 |
31 | # Scala .gitignore
32 | /lib/*.jar
33 | /test/files/codelib/*.jar
34 | /test/files/lib/*.jar
35 | /test/files/speclib/instrumented.jar
36 | /tools/*.jar
37 |
38 | # Developer specific properties
39 | /**/build.properties
40 | /buildcharacter.properties
41 |
42 | # might get generated when testing Jenkins scripts locally
43 | /jenkins.properties
44 |
45 | # target directory for build
46 | /build/
47 |
48 | # other
49 | /out/
50 | /bin/
51 | /sandbox/
52 |
53 | # intellij
54 | /src/intellij*/*.iml
55 | /src/intellij*/*.ipr
56 | /src/intellij*/*.iws
57 | **/.cache
58 | /.idea
59 | /.settings
60 |
61 | # vscode
62 | /.vscode
63 |
64 | # Standard symbolic link to build/quick/bin
65 | /qbin
66 |
67 | # sbt's target directories
68 | /target/
69 | /project/**/target/
70 | /test/macro-annot/target/
71 | /test/files/target/
72 | /test/target/
73 | /build-sbt/
74 | local.sbt
75 | jitwatch.out
76 |
77 | # Used by the restarr/restarrFull commands as target directories
78 | /build-restarr/
79 | /target-restarr/
80 |
81 | # metals
82 | .metals
83 | .bloop
84 | **/project/**/metals.sbt
85 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | coverage:
2 | status:
3 | project:
4 | default:
5 | # basic
6 | target: 80
7 | threshold: 1%
8 | patch:
9 | default:
10 | target: 80
11 |
--------------------------------------------------------------------------------
/connector/.java-version:
--------------------------------------------------------------------------------
1 | 1.8
2 |
--------------------------------------------------------------------------------
/connector/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version=1.5.5
2 |
--------------------------------------------------------------------------------
/connector/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | resolvers += "Artima Maven Repository" at "https://repo.artima.com/releases"
2 |
3 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0")
4 | addSbtPlugin("com.artima.supersafe" % "sbtplugin" % "1.1.12")
5 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.1")
6 | addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.2")
7 | addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.8.1")
8 | addSbtPlugin("com.github.mwz" % "sbt-sonar" % "2.2.0")
9 | addSbtPlugin("com.sksamuel.scapegoat" % "sbt-scapegoat" % "1.1.0")
10 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")
11 | addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.9.0")
12 | addDependencyTreePlugin
13 |
14 |
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/config/JDBCConfig.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.config
15 |
16 | import com.vertica.spark.datasource.core.TLSMode
17 |
18 | /**
19 | * Represents any config necessary for authenticating to JDBC.
20 | *
21 | * Abstract as there are multiple possible methods of authentication.
22 | */
23 | sealed trait JdbcAuth {
24 | def user: String
25 | }
26 |
27 | /**
28 | * Authentication to Vertica using username and password
29 | */
30 | case class BasicJdbcAuth(username: String, password: String) extends JdbcAuth {
31 | override def user: String = username
32 | }
33 |
34 | /**
35 | * Authentication using kerberos
36 | * @param kerberosServiceName the Kerberos service name, as specified when creating the service principal
37 | * @param kerberosHostname the Kerberos host name, as specified when creating the service principal
38 | * @param jaasConfigName the name of the JAAS configuration used for Kerberos authentication
39 | */
40 | case class KerberosAuth(username: String,
41 | kerberosServiceName: String,
42 | kerberosHostname: String,
43 | jaasConfigName: String) extends JdbcAuth {
44 | override def user: String = username
45 | }
46 |
47 | /**
48 | * Configuration for a JDBC connection to Vertica.
49 | *
50 | * @param host hostname for the JDBC connection
51 | * @param port port for the JDBC connection
52 | * @param db name of the Vertica database to connect to
53 | * @param auth the authentication details, varies depending on method used
54 | * @param tlsConfig the TLS configuration settings for the JDBC connection
55 | * @param backupServerNodes the comma-separated list of Vertica backup nodes. The host name or IP can optionally be
56 | * followed by a colon and a port number. If not supplied, defaults to the standard Vertica
57 | * port number (5433). To list multiple hosts, separate them by a comma.
58 | */
59 | final case class JDBCConfig(host: String,
60 | port: Int,
61 | db: String,
62 | auth: JdbcAuth,
63 | tlsConfig: JDBCTLSConfig,
64 | backupServerNodes: Option[String] = None)
65 |
66 | /**
67 | * TLS configuration settings for a JDBC connection to Vertica.
68 | *
69 | * @param tlsMode flag indicating whether to enable TLS for the connection or not
70 | * @param keyStorePath path to the key store
71 | * @param keyStorePassword password for the key store
72 | * @param trustStorePath path to the trust store
73 | * @param trustStorePassword password for the trust store
74 | */
75 | case class JDBCTLSConfig(tlsMode: TLSMode,
76 | keyStorePath: Option[String],
77 | keyStorePassword: Option[String],
78 | trustStorePath: Option[String],
79 | trustStorePassword: Option[String])
80 |
--------------------------------------------------------------------------------
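
For orientation, a minimal sketch of how the authentication variants defined in `JDBCConfig.scala` compose. The usernames, host name, and JAAS entry name are placeholder values; a full `JDBCConfig` additionally needs a `JDBCTLSConfig` with a `TLSMode`, which is not shown in this section.

```scala
import com.vertica.spark.config.{BasicJdbcAuth, JdbcAuth, KerberosAuth}

object JdbcAuthSketch {
  // Both auth variants expose the connecting user through JdbcAuth.user.
  def describe(auth: JdbcAuth): String = auth match {
    case BasicJdbcAuth(username, _) => s"password auth as $username"
    case KerberosAuth(username, service, host, jaas) =>
      s"kerberos auth as $username (service=$service, host=$host, jaas=$jaas)"
  }

  def main(args: Array[String]): Unit = {
    val basic = BasicJdbcAuth(username = "dbadmin", password = "example-password") // placeholder credentials
    val krb   = KerberosAuth("user1", "vertica", "vertica.example.com", "Client")  // placeholder principal details
    println(describe(basic))
    println(describe(krb))
  }
}
```
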
/connector/src/main/scala/com/vertica/spark/config/LogProvider.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.config
15 |
16 | import com.typesafe.scalalogging.Logger
17 |
18 | /**
19 | * Used to provide a logger for a given class or object.
20 | */
21 | case object LogProvider {
22 | def getLogger(c: Class[_]): Logger = Logger(c)
23 | def getLogger(obj: Object): Logger = Logger(obj.getClass)
24 | }
25 |
26 |
--------------------------------------------------------------------------------
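
A one-line usage sketch of the provider above, matching how it is used elsewhere in the connector (the class name here is a placeholder):

```scala
import com.typesafe.scalalogging.Logger
import com.vertica.spark.config.LogProvider

class ExampleComponent {
  // Resolves a scala-logging Logger named after the enclosing class.
  private val logger: Logger = LogProvider.getLogger(classOf[ExampleComponent])

  def run(): Unit = logger.info("Starting example component")
}
```
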
/connector/src/main/scala/com/vertica/spark/config/TableName.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.config
15 |
16 | object EscapeUtils {
17 | def sqlEscape(str: String, char: Char = '\"'): String = {
18 | val c = char.toString
19 | str.replace(c, c + c)
20 | }
21 |
22 | def sqlEscapeAndQuote(str: String): String = {
23 | "\"" + sqlEscape(str) + "\""
24 | }
25 | }
26 |
27 | /**
28 | * Parent trait representing a set of data being read from
29 | */
30 | trait TableSource {
31 | /**
32 | * Get a unique identifier for the operation.
33 | *
34 | * This value is used in a filepath.
35 | */
36 | def identifier : String
37 | }
38 |
39 | /**
40 | * Represents a fully qualified tablename in Vertica.
41 | *
42 | * @param name Name of the table
43 | * @param dbschema Optionally, the schema of the table. Public schema will be assumed if not specified.
44 | */
45 | final case class TableName(name: String, dbschema: Option[String]) extends TableSource {
46 |
47 | /**
48 | * Returns the full name of the table, escaped and surrounded with double quotes to prevent injection
49 | * and allow for special characters.
50 | */
51 | def getFullTableName : String = {
52 | dbschema match {
53 | case None => EscapeUtils.sqlEscapeAndQuote(name)
54 | case Some(schema) => EscapeUtils.sqlEscapeAndQuote(schema) + "." + EscapeUtils.sqlEscapeAndQuote(name)
55 | }
56 | }
57 |
58 | def getTableName : String = EscapeUtils.sqlEscapeAndQuote(name)
59 |
60 | def getDbSchema : String = {
61 | dbschema match {
62 | case None => ""
63 | case Some(schema) => EscapeUtils.sqlEscapeAndQuote(schema)
64 | }
65 | }
66 |
67 | /**
68 | * The table's name is used as an identifier for the operation.
69 | */
70 | override def identifier: String = name
71 | }
72 |
73 | final case class TableQuery(query: String, uniqueId: String, dbSchema: Option[String]) extends TableSource {
74 | override def identifier: String = uniqueId
75 | }
76 |
77 |
--------------------------------------------------------------------------------
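
A minimal usage sketch of the escaping rules implemented in `TableName.scala` (the table and schema names are placeholder values):

```scala
import com.vertica.spark.config.{EscapeUtils, TableName}

object TableNameSketch {
  def main(args: Array[String]): Unit = {
    // Embedded double quotes are doubled, then the whole identifier is quoted.
    println(EscapeUtils.sqlEscapeAndQuote("my\"table"))           // "my""table"
    // With a schema, both parts are escaped and joined with a dot.
    println(TableName("events", Some("store")).getFullTableName)  // "store"."events"
    // Without a schema, only the quoted table name is produced.
    println(TableName("events", None).getFullTableName)           // "events"
  }
}
```
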
/connector/src/main/scala/com/vertica/spark/config/VerticaMetadata.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.config
15 |
16 | import org.apache.spark.sql.types._
17 | import com.vertica.spark.util.version.Version
18 |
19 | /**
20 | * Abstract trait for passing metadata of a table retrieved from Vertica.
21 | */
22 | trait VerticaMetadata
23 |
24 | /**
25 | * Metadata for read operation.
26 | * @param schema Schema of the table being read in Vertica.
27 | * @param version Version of Vertica being used.
28 | */
29 | final case class VerticaReadMetadata(schema: StructType, version: Version) extends VerticaMetadata
30 |
31 | /**
32 | * Empty class; No metadata retrieval required for current write operation.
33 | */
34 | final case class VerticaWriteMetadata() extends VerticaMetadata
35 |
36 |
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/datasource/VerticaDatasourceV2.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.datasource
15 |
16 | import com.vertica.spark.datasource.v2._
17 | import com.vertica.spark.util.error.{ConnectorException, MissingSparkSessionError}
18 | import org.apache.spark.sql.connector.catalog._
19 | import org.apache.spark.sql.connector.expressions.Transform
20 | import org.apache.spark.sql.types._
21 | import org.apache.spark.sql.util.CaseInsensitiveStringMap
22 | import org.apache.spark.sql.SparkSession
23 |
24 | import java.util
25 | import scala.collection.JavaConverters._
26 |
27 | /**
28 | * Entry-Point for Spark V2 Datasource.
29 | *
30 | * Implements Spark V2 datasource class [[http://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/connector/catalog/TableProvider.html here]]
31 | *
32 | * This and the tree of classes returned by it are to be kept light, and hook into the core of the connector
33 | */
34 | class VerticaSource extends TableProvider with SupportsCatalogOptions {
35 |
36 | /**
37 | * Used for the read operation to get the schema of the table being read
38 | *
39 | * @param caseInsensitiveStringMap A string map of options passed in by the user to the datasource
40 | * @return The table's schema in spark StructType format
41 | */
42 | override def inferSchema(caseInsensitiveStringMap: CaseInsensitiveStringMap): StructType = {
43 | val table = getTable(schema = StructType(Nil), partitioning = Array.empty[Transform], properties = caseInsensitiveStringMap)
44 | table.schema()
45 | }
46 |
47 | /**
48 | * Gets the structure representing a Vertica table
49 | *
50 | * @param schema StructType representing table schema, used for write
51 | * @param partitioning specified partitioning for the table
52 | * @param properties A string map of options passed in by the user to the datasource
53 | * @return [[VerticaTable]]
54 | */
55 | override def getTable(schema: StructType,
56 | partitioning: Array[Transform],
57 | properties: util.Map[String, String]): Table = {
58 | new VerticaTable(new CaseInsensitiveStringMap(properties))
59 | }
60 |
61 |
62 | override def extractIdentifier(options: CaseInsensitiveStringMap): Identifier = {
63 | val name = options.asScala.toMap.getOrElse("table", "")
64 | Identifier.of(Array[String](), name)
65 | }
66 |
67 | private val CATALOG_NAME = VerticaDatasourceV2Catalog.NAME
68 | override def extractCatalog(options: CaseInsensitiveStringMap): String = {
69 | // Add all passed in options to spark catalog options
70 | VerticaDatasourceV2Catalog.setOptions(options)
71 |
72 | // Set the spark conf for catalog class
73 | SparkSession.getActiveSession match {
74 | case Some(session) => session.conf.set("spark.sql.catalog." + CATALOG_NAME, "com.vertica.spark.datasource.v2.VerticaDatasourceV2Catalog")
75 | case None => throw new ConnectorException(MissingSparkSessionError())
76 | }
77 |
78 | CATALOG_NAME
79 | }
80 |
81 | }
82 |
83 |
--------------------------------------------------------------------------------
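
As a rough illustration of how an application reaches this entry point, a hedged read sketch. The `table` option is consumed by `extractIdentifier` above; the other option names and all values are assumptions for illustration, so consult the connector documentation for the authoritative option list.

```scala
import org.apache.spark.sql.SparkSession

object VerticaSourceReadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("vertica-source-sketch")
      .master("local[*]")
      .getOrCreate()

    // The format string is the fully-qualified VerticaSource class defined above.
    val df = spark.read
      .format("com.vertica.spark.datasource.VerticaSource")
      .option("table", "my_table")     // consumed by extractIdentifier
      .option("host", "vertica-host")  // placeholder
      .option("db", "testdb")          // placeholder
      .option("user", "dbadmin")       // placeholder
      .option("password", "")          // placeholder
      .load()

    df.show()
    spark.stop()
  }
}
```
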
/connector/src/main/scala/com/vertica/spark/datasource/core/DSWriter.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.datasource.core
15 |
16 | import com.typesafe.scalalogging.Logger
17 | import com.vertica.spark.config._
18 | import com.vertica.spark.datasource.core.factory.{VerticaPipeFactory, VerticaPipeFactoryInterface}
19 | import com.vertica.spark.util.error.ErrorHandling.ConnectorResult
20 | import org.apache.spark.sql.catalyst.InternalRow
21 |
22 | /**
23 | * Interface responsible for writing to the Vertica source.
24 | *
25 | * This interface is initiated and called from each spark worker.
26 | */
27 | trait DSWriterInterface {
28 | /**
29 | * Called before writing to perform any needed setup with the given configuration.
30 | */
31 | def openWrite(): ConnectorResult[Unit]
32 |
33 | /**
34 | * Called to write an individual row to the datasource.
35 | */
36 | def writeRow(row: InternalRow): ConnectorResult[Unit]
37 |
38 | /**
39 | * Called from the executor to cleanup the individual write operation
40 | */
41 | def closeWrite(): ConnectorResult[Unit]
42 |
43 | /**
44 | * Called by the driver to commit all the write results
45 | */
46 | def commitRows(): ConnectorResult[Unit]
47 | }
48 |
49 | /**
50 | * Writer class, agnostic to the kind of pipe used for the operation (which VerticaPipe is used)
51 | *
52 | * @param config Configuration data defining the write operation.
53 | * @param uniqueId Unique identifier for this specific writer. The writer for each partition should have a different ID.
54 | * @param pipeFactory Factory returning the underlying implementation of a pipe between us and Vertica, to use for write.
55 | * @param isOnDriver true if the writer will be executed by a driver.
56 | */
57 | class DSWriter(config: WriteConfig, uniqueId: String, pipeFactory: VerticaPipeFactoryInterface = VerticaPipeFactory, isOnDriver: Boolean) extends DSWriterInterface {
58 | private val logger: Logger = LogProvider.getLogger(classOf[DSWriter])
59 | private val thread = Thread.currentThread().getName + ": "
60 | logger.debug(thread + "Initializing writer")
61 |
62 | private val pipe = pipeFactory.getWritePipe(config, isOnDriver)
63 | private var blockSize = 0L
64 |
65 | private var data = List[InternalRow]()
66 |
67 | def openWrite(): ConnectorResult[Unit] = {
68 | for {
69 | size <- pipe.getDataBlockSize
70 | _ <- pipe.startPartitionWrite(uniqueId)
71 | _ = this.blockSize = size
72 | } yield ()
73 | }
74 |
75 | def writeRow(row: InternalRow): ConnectorResult[Unit] = {
76 | data = data :+ row
77 | if(data.length >= blockSize) {
78 | pipe.writeData(DataBlock(data)) match {
79 | case Right(_) =>
80 | data = List[InternalRow]()
81 | Right(())
82 | case Left(errors) => Left(errors)
83 | }
84 | }
85 | else {
86 | Right(())
87 | }
88 | }
89 |
90 | def closeWrite(): ConnectorResult[Unit] = {
91 | if(data.nonEmpty) {
92 | val ret = pipe.writeData(DataBlock(data))
93 | pipe.endPartitionWrite()
94 | ret
95 | }
96 | else {
97 | pipe.endPartitionWrite()
98 | }
99 | }
100 |
101 | def commitRows(): ConnectorResult[Unit] = {
102 | val ret = pipe.commit()
103 | // Ensure all connections are closed, including read connections used by the write operation
104 | val _ = pipeFactory.closeJdbcLayers()
105 | ret
106 | }
107 | }
108 |
--------------------------------------------------------------------------------
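
To make the call sequence above concrete, a hedged sketch of how a caller might drive `DSWriterInterface`. `writePartition` and `commit` are illustrative helpers, not connector API; the sketch assumes `ConnectorResult` is the `Either`-based alias imported in `DSWriter.scala`.

```scala
import com.vertica.spark.datasource.core.DSWriterInterface
import com.vertica.spark.util.error.ErrorHandling.ConnectorResult
import org.apache.spark.sql.catalyst.InternalRow

object DSWriterLifecycleSketch {
  // Executor side: open, push rows (writeRow flushes in blocks of blockSize), then close.
  // flatMap short-circuits, so no further writes are attempted after the first error.
  def writePartition(writer: DSWriterInterface, rows: Iterator[InternalRow]): ConnectorResult[Unit] =
    for {
      _ <- writer.openWrite()
      _ <- rows.foldLeft[ConnectorResult[Unit]](Right(()))((acc, row) => acc.flatMap(_ => writer.writeRow(row)))
      _ <- writer.closeWrite()
    } yield ()

  // Driver side: commit once all partitions have reported success.
  def commit(writer: DSWriterInterface): ConnectorResult[Unit] = writer.commitRows()
}
```
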
/connector/src/main/scala/com/vertica/spark/datasource/core/SessionId.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.datasource.core
15 |
16 | /**
17 | * Interface for getting a unique session ID
18 | */
19 | trait SessionIdInterface {
20 | def getId : String
21 | }
22 |
23 | /**
24 | * Implementation generating unique session ID
25 | */
26 | object SessionId extends SessionIdInterface {
27 | def getId : String = java.util.UUID.randomUUID.toString.replace("-", "_")
28 | }
29 |
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/datasource/json/JsonBatchFactory.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.datasource.json
15 |
16 | import com.vertica.spark.config.ReadConfig
17 | import com.vertica.spark.datasource.wrappers.json.VerticaJsonTableWrapper
18 | import org.apache.spark.sql.SparkSession
19 | import org.apache.spark.sql.connector.read.Batch
20 | import org.apache.spark.sql.execution.datasources.json.JsonFileFormat
21 | import org.apache.spark.sql.execution.datasources.v2.json.JsonTable
22 | import org.apache.spark.sql.types.StructType
23 | import org.apache.spark.sql.util.CaseInsensitiveStringMap
24 |
25 | import scala.collection.JavaConverters.mapAsJavaMapConverter
26 |
27 | class JsonBatchFactory {
28 | def build(filePath: String, schema: Option[StructType], readConfig: ReadConfig, sparkSession: SparkSession): Batch = {
29 | val paths = List(filePath)
30 | val options = CaseInsensitiveStringMap.empty()
31 | val fallback = classOf[JsonFileFormat]
32 | val jsonTable = JsonTable("Vertica Table", sparkSession, options, paths, schema, fallback)
33 | val verticaJsonTable = new VerticaJsonTableWrapper(jsonTable, readConfig)
34 | val builderOpts = new CaseInsensitiveStringMap(Map[String, String]().asJava)
35 | verticaJsonTable.newScanBuilder(builderOpts).build().toBatch
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/datasource/json/VerticaJsonScan.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.datasource.json
15 |
16 | import com.vertica.spark.config.{DistributedFilesystemReadConfig, LogProvider, ReadConfig}
17 | import com.vertica.spark.datasource.core.{DSConfigSetupInterface, TableMetaInterface}
18 | import com.vertica.spark.datasource.fs.FileStoreLayerInterface
19 | import com.vertica.spark.datasource.v2.VerticaScan
20 | import com.vertica.spark.util.cleanup.CleanupUtils
21 | import com.vertica.spark.util.error.{ErrorHandling, InitialSetupPartitioningError}
22 | import org.apache.spark.sql.SparkSession
23 | import org.apache.spark.sql.connector.read.{Batch, InputPartition, PartitionReaderFactory, Scan}
24 | import org.apache.spark.sql.types.StructType
25 |
26 | /**
27 | * We support reading JSON files by re-using Spark's JSON support implemented in [[JsonTable]].
28 | * */
29 | class VerticaJsonScan(config: ReadConfig, readConfigSetup: DSConfigSetupInterface[ReadConfig] with TableMetaInterface[ReadConfig], batchFactory: JsonBatchFactory, fsLayer: FileStoreLayerInterface) extends Scan with Batch {
30 |
31 | private val logger = LogProvider.getLogger(classOf[VerticaScan])
32 |
33 | private val jsonReadConfig = config match {
34 | case cfg: DistributedFilesystemReadConfig =>
35 | val copied = cfg.copy(useJson = true)
36 | copied.setGroupBy(cfg.getGroupBy)
37 | copied.setPushdownAgg(cfg.isAggPushedDown)
38 | copied.setPushdownFilters(cfg.getPushdownFilters)
39 | copied.setRequiredSchema(cfg.getRequiredSchema)
40 | copied
41 | case _ => config
42 | }
43 |
44 | private lazy val batch: Batch = {
45 | // Export JSON before initializing Spark's JSON support.
46 | readConfigSetup.performInitialSetup(jsonReadConfig) match {
47 | case Left(err) => ErrorHandling.logAndThrowError(logger, err)
48 | case Right(opt) => opt match {
49 | case None => ErrorHandling.logAndThrowError(logger, InitialSetupPartitioningError())
50 | case Some(partitionInfo) =>
51 | val sparkSession = SparkSession.getActiveSession.getOrElse(ErrorHandling.logAndThrowError(logger, InitialSetupPartitioningError()))
52 | val batch = batchFactory.build(partitionInfo.rootPath, Some(readSchema()), jsonReadConfig, sparkSession)
53 |
54 | val files = fsLayer.getFileList(partitionInfo.rootPath).getOrElse(ErrorHandling.logAndThrowError(logger, InitialSetupPartitioningError()))
55 | if (files.isEmpty) {
56 | new CleanupUtils().cleanupAll(fsLayer, partitionInfo.rootPath)
57 | }
58 | batch
59 | }
60 | }
61 | }
62 |
63 | override def readSchema(): StructType = {
64 | (readConfigSetup.getTableMetadata(config), jsonReadConfig.getRequiredSchema) match {
65 | case (Right(metadata), requiredSchema) => if (requiredSchema.nonEmpty) { requiredSchema } else { metadata.schema }
66 | case (Left(err), _) => ErrorHandling.logAndThrowError(logger, err)
67 | }
68 | }
69 |
70 | override def planInputPartitions(): Array[InputPartition] = batch.planInputPartitions()
71 |
72 | override def createReaderFactory(): PartitionReaderFactory = batch.createReaderFactory()
73 |
74 | override def toBatch: Batch = this
75 |
76 | }
77 |
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/datasource/partitions/file/PartitionedFileIdentity.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.datasource.partitions.file
15 |
16 | import com.vertica.spark.datasource.partitions.mixin.Identifiable
17 |
18 | case class PartitionedFileIdentity(filename: String, index: Long) extends Identifiable
19 |
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/datasource/partitions/file/VerticaFilePartition.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.datasource.partitions.file
15 |
16 | import com.vertica.spark.datasource.partitions.mixin.{Cleanup, Identifiable}
17 | import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile}
18 |
19 | /**
20 | * Extends Spark's FilePartition to hold extra partitioning data.
21 | *
22 | * @param partitioningRecords A record of the partition count for all file partitions created, with the key being the
23 | * file path.
24 | * */
25 | class VerticaFilePartition(override val index: Int,
26 | override val files: Array[PartitionedFile],
27 | val filesIdentity: Array[PartitionedFileIdentity],
28 | val partitioningRecords: Map[String, Int])
29 | extends FilePartition(index, files) with Cleanup {
30 |
31 | override def getPortions: Seq[Identifiable] = this.filesIdentity
32 |
33 | override def getPartitioningRecord: Map[String, Int] = this.partitioningRecords
34 | }
35 |
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/datasource/partitions/mixin/Cleanup.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.datasource.partitions.mixin
15 |
16 | import org.apache.spark.sql.connector.read.InputPartition
17 |
18 | /**
19 | * Mixin trait for [[InputPartition]] that contains information for cleanup
20 | * */
21 | trait Cleanup {
22 |
23 | /**
24 | * @return the [[Identifiable]] portions of this partition
25 | * */
26 | def getPortions: Seq[Identifiable]
27 |
28 | /**
29 | * @return a mapping of filenames to their portion counts
30 | * */
31 | def getPartitioningRecord: Map[String, Int]
32 | }
33 |
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/datasource/partitions/mixin/Identifiable.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.datasource.partitions.mixin
15 |
16 | /**
17 | * Mixin trait for a data portion, containing information that identifies it amongst other portions.
18 | * */
19 | trait Identifiable {
20 |
21 | /**
22 | * @return the name of the file the portion belongs to
23 | * */
24 | def filename: String
25 |
26 | /**
27 | * @return the portion's index amongst the other portions of a file.
28 | * */
29 | def index: Long
30 | }
31 |
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/datasource/partitions/parquet/ParquetFileRange.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.datasource.partitions.parquet
15 |
16 | import com.vertica.spark.datasource.partitions.mixin.Identifiable
17 |
18 | /**
19 | * Represents a portion of a parquet file
20 | *
21 | * @param filename Full path with name of the parquet file
22 | * @param minRowGroup First row group to read from parquet file
23 | * @param maxRowGroup Last row group to read from parquet file
24 | * @param rangeIdx Range index for this file. Used to track access to this file / cleanup among different nodes.
25 | * If there are three ranges for a given file this will be a value between 0 and 2
26 | */
27 | final case class ParquetFileRange(filename: String, minRowGroup: Int, maxRowGroup: Int, rangeIdx: Int) extends Identifiable {
28 |
29 | override def index: Long = this.rangeIdx
30 | }
31 |
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/datasource/partitions/parquet/VerticaDistributedFilesystemPartition.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.datasource.partitions.parquet
15 |
16 | import com.vertica.spark.datasource.core.VerticaPartition
17 | import com.vertica.spark.datasource.partitions.mixin.{Cleanup, Identifiable}
18 |
19 | /**
20 | * Partition for distributed filesystem transport method using parquet files
21 | *
22 | * @param fileRanges List of files and ranges of row groups to read for those files
23 | * @param rangeCountMap Map representing how many file ranges exist for each file. Used for tracking and cleanup.
24 | */
25 | final case class VerticaDistributedFilesystemPartition(fileRanges: Seq[ParquetFileRange], rangeCountMap: Map[String, Int])
26 | extends VerticaPartition with Cleanup {
27 | override def getPortions: Seq[Identifiable] = this.fileRanges
28 |
29 | override def getPartitioningRecord: Map[String, Int] = this.rangeCountMap
30 | }
31 |
--------------------------------------------------------------------------------
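
A hedged sketch tying the partition pieces above together (file paths and row-group numbers are placeholders): it builds a partition from two `ParquetFileRange`s and shows what the `Cleanup` mixin exposes.

```scala
import com.vertica.spark.datasource.partitions.parquet.{ParquetFileRange, VerticaDistributedFilesystemPartition}

object PartitionCleanupSketch {
  def main(args: Array[String]): Unit = {
    // Two ranges of the same parquet file (placeholder path), out of three total ranges for that file.
    val ranges = Seq(
      ParquetFileRange("hdfs://example/export/part-0.parquet", minRowGroup = 0, maxRowGroup = 4, rangeIdx = 0),
      ParquetFileRange("hdfs://example/export/part-0.parquet", minRowGroup = 5, maxRowGroup = 9, rangeIdx = 1)
    )
    val partition = VerticaDistributedFilesystemPartition(
      fileRanges = ranges,
      rangeCountMap = Map("hdfs://example/export/part-0.parquet" -> 3)
    )
    // The Cleanup mixin exposes the portions and the per-file range counts used for cleanup tracking.
    println(partition.getPortions.map(p => s"${p.filename}#${p.index}"))
    println(partition.getPartitioningRecord)
  }
}
```
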
/connector/src/main/scala/com/vertica/spark/datasource/wrappers/PartitionReaderWrapper.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.datasource.wrappers
15 |
16 | import com.vertica.spark.config.LogProvider
17 | import com.vertica.spark.datasource.partitions.mixin.Cleanup
18 | import com.vertica.spark.util.cleanup.DistributedFilesCleaner
19 | import org.apache.spark.sql.catalyst.InternalRow
20 | import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader}
21 |
22 | /**
23 | * Wraps a [[PartitionReader]], allowing us to intercept its methods and add additional functionality.
24 | * */
25 | class PartitionReaderWrapper(val reader: PartitionReader[InternalRow],
26 | val partitions: Cleanup,
27 | val cleaner: DistributedFilesCleaner)
28 | extends PartitionReader[InternalRow] {
29 |
30 | private val logger = LogProvider.getLogger(classOf[PartitionReaderWrapper])
31 |
32 | override def next(): Boolean = reader.next()
33 |
34 | override def get(): InternalRow = reader.get()
35 |
36 | override def close(): Unit = {
37 | reader.close()
38 | cleaner.cleanupFiles(partitions)
39 | logger.info("Cleaning up")
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/datasource/wrappers/PartitionReaderWrapperFactory.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.datasource.wrappers
15 |
16 | import com.vertica.spark.config.{DistributedFilesystemReadConfig, ReadConfig}
17 | import com.vertica.spark.datasource.partitions.mixin.Cleanup
18 | import com.vertica.spark.util.cleanup.{CleanupUtils, DistributedFilesCleaner}
19 | import org.apache.spark.sql.catalyst.InternalRow
20 | import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory}
21 |
22 | /**
23 | * Wraps a [[PartitionReaderFactory]] so it will create a [[PartitionReaderWrapper]]
24 | *
25 | * createReader() also wires in a [[DistributedFilesCleaner]] so exported files are removed when the reader is closed.
26 | * */
27 | class PartitionReaderWrapperFactory(val readerFactory: PartitionReaderFactory, val config: ReadConfig)
28 | extends PartitionReaderFactory {
29 |
30 | override def createReader(inputPartition: InputPartition): PartitionReader[InternalRow] = {
31 | config match {
32 | case readConfig: DistributedFilesystemReadConfig =>
33 | val reader = readerFactory.createReader(inputPartition)
34 | val partition = inputPartition.asInstanceOf[Cleanup]
35 | val cleaner = new DistributedFilesCleaner(readConfig, new CleanupUtils)
36 | new PartitionReaderWrapper(reader, partition, cleaner)
37 | }
38 | }
39 | }
40 |
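For illustration, a hedged wiring sketch mirroring the unit test; `innerFactory`, `partition`, and `readConfig` are hypothetical stand-ins (the partition must mix in Cleanup and the config must be a DistributedFilesystemReadConfig):

    import com.vertica.spark.config.DistributedFilesystemReadConfig
    import com.vertica.spark.datasource.partitions.mixin.Cleanup
    import org.apache.spark.sql.connector.read.{InputPartition, PartitionReaderFactory}

    object PartitionReaderWrapperFactorySketch {
      def readAll(innerFactory: PartitionReaderFactory,
                  partition: InputPartition with Cleanup,
                  readConfig: DistributedFilesystemReadConfig): Unit = {
        val wrapped = new PartitionReaderWrapperFactory(innerFactory, readConfig)
        val reader = wrapped.createReader(partition) // a PartitionReaderWrapper
        while (reader.next()) {
          val row = reader.get() // consume rows as usual
        }
        reader.close() // also triggers file cleanup through DistributedFilesCleaner
      }
    }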
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/datasource/wrappers/VerticaScanWrapper.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.datasource.wrappers
15 |
16 | import com.vertica.spark.config.ReadConfig
17 | import com.vertica.spark.datasource.partitions.file.{PartitionedFileIdentity, VerticaFilePartition}
18 | import org.apache.spark.sql.connector.read.{Batch, InputPartition, PartitionReaderFactory, Scan}
19 | import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile}
20 | import org.apache.spark.sql.types.StructType
21 | import org.apache.spark.util.SerializableConfiguration
22 |
23 | /**
24 | * Wraps a [[Scan]] so that it will create a [[PartitionReaderWrapperFactory]]
25 | *
26 | * planInputPartitions() will also record partitioning information.
27 | * */
28 | class VerticaScanWrapper(val scan: Scan, val config: ReadConfig) extends Scan with Batch {
29 |
30 | override def readSchema(): StructType = scan.readSchema()
31 |
32 | /**
33 | * Calls the wrapped scan to plan input partitions, then processes them into [[VerticaFilePartition]]s with partitioning info
34 | * */
35 | override def planInputPartitions(): Array[InputPartition] = {
36 | val partitioningCounts = scala.collection.mutable.Map[String, Int]()
37 |
38 | def makeFilesIdentity(files: Array[PartitionedFile]): Array[PartitionedFileIdentity] = {
39 | // Record each file in the per-file count and create an identity for each
40 | files.map(file => {
41 | val key = file.filePath.toString
42 | val count = partitioningCounts.getOrElse(key, 0)
43 | partitioningCounts.put(key, count + 1)
44 | PartitionedFileIdentity(key, file.start)
45 | })
46 | }
47 |
48 | scan.toBatch.planInputPartitions()
49 | .map(partition => partition.asInstanceOf[FilePartition])
50 | .map(filePartition => (filePartition, makeFilesIdentity(filePartition.files)))
51 | .map(result => {
52 | val (filePartition, fileIdentities) = result
53 | new VerticaFilePartition(filePartition.index, filePartition.files, fileIdentities, partitioningCounts.toMap)
54 | })
55 | }
56 |
57 | override def createReaderFactory(): PartitionReaderFactory = {
58 | new PartitionReaderWrapperFactory(scan.toBatch.createReaderFactory(), config)
59 | }
60 |
61 | override def toBatch: Batch = this
62 | }
63 |
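For illustration, the per-file counting idiom used by planInputPartitions(), shown in isolation with hypothetical file paths; each file ends up mapped to the number of splits that reference it:

    object RangeCountingSketch {
      val partitioningCounts = scala.collection.mutable.Map[String, Int]()

      // Two splits of file a and one split of file b.
      val splitPaths = Seq("/tmp/export/a.parquet", "/tmp/export/a.parquet", "/tmp/export/b.parquet")

      splitPaths.foreach(key => {
        val count = partitioningCounts.getOrElse(key, 0)
        partitioningCounts.put(key, count + 1)
      })

      val record = partitioningCounts.toMap // Map("/tmp/export/a.parquet" -> 2, "/tmp/export/b.parquet" -> 1)
    }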
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/datasource/wrappers/VerticaScanWrapperBuilder.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.datasource.wrappers
15 |
16 | import com.vertica.spark.config.ReadConfig
17 | import org.apache.spark.sql.connector.read.{Scan, ScanBuilder}
18 |
19 | /**
20 | * Wraps a [[ScanBuilder]] to create a [[VerticaScanWrapper]]
21 | * */
22 | class VerticaScanWrapperBuilder(val builder: ScanBuilder, val config: ReadConfig) extends ScanBuilder {
23 | override def build(): Scan = new VerticaScanWrapper(builder.build(), config)
24 | }
25 |
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/datasource/wrappers/json/VerticaJsonTableWrapper.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.datasource.wrappers.json
15 |
16 | import com.vertica.spark.config.{DistributedFilesystemReadConfig, ReadConfig}
17 | import com.vertica.spark.datasource.wrappers.VerticaScanWrapperBuilder
18 | import org.apache.spark.sql.connector.catalog.{SupportsRead, Table, TableCapability}
19 | import org.apache.spark.sql.connector.read.ScanBuilder
20 | import org.apache.spark.sql.execution.datasources.v2.json.JsonTable
21 | import org.apache.spark.sql.types.StructType
22 | import org.apache.spark.sql.util.CaseInsensitiveStringMap
23 |
24 | import java.util
25 |
26 |
27 | /**
28 | * Wraps a [[JsonTable]] so that it will create a [[VerticaScanWrapperBuilder]].
29 | * */
30 | class VerticaJsonTableWrapper(val jsonTable: JsonTable, config: ReadConfig) extends Table with SupportsRead {
31 | override def name(): String = "Vertica" + jsonTable.name
32 |
33 | override def schema(): StructType = jsonTable.schema
34 |
35 | override def capabilities(): util.Set[TableCapability] = jsonTable.capabilities()
36 |
37 | override def newScanBuilder(caseInsensitiveStringMap: CaseInsensitiveStringMap): ScanBuilder =
38 | new VerticaScanWrapperBuilder(jsonTable.newScanBuilder(caseInsensitiveStringMap), config)
39 | }
40 |
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/parquet/ParquetRecordMaterializer.scala:
--------------------------------------------------------------------------------
1 | // scalastyle:off
2 | /*
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | package org.apache.spark.sql.execution.datasources.parquet.vertica
20 |
21 | import java.time.ZoneId
22 |
23 | import org.apache.parquet.io.api.{GroupConverter, RecordMaterializer}
24 | import org.apache.parquet.schema.MessageType
25 |
26 | import org.apache.spark.sql.catalyst.InternalRow
27 | import org.apache.spark.sql.internal.LegacyBehaviorPolicy
28 | import org.apache.spark.sql.types.StructType
29 |
30 | /**
31 | * A [[RecordMaterializer]] for Catalyst rows.
32 | *
33 | * @param parquetSchema Parquet schema of the records to be read
34 | * @param catalystSchema Catalyst schema of the rows to be constructed
35 | * @param schemaConverter A Parquet-Catalyst schema converter that helps initializing row converters
36 | * @param convertTz the optional time zone to convert to int96 data
37 | * @param datetimeRebaseMode the mode of rebasing date/timestamp from Julian to Proleptic Gregorian
38 | * calendar
39 | * @param int96RebaseMode the mode of rebasing INT96 timestamp from Julian to Proleptic Gregorian
40 | * calendar
41 | */
42 | private[parquet] class ParquetRecordMaterializer(
43 | parquetSchema: MessageType,
44 | catalystSchema: StructType,
45 | schemaConverter: ParquetToSparkSchemaConverter,
46 | convertTz: Option[ZoneId],
47 | datetimeRebaseMode: LegacyBehaviorPolicy.Value,
48 | int96RebaseMode: LegacyBehaviorPolicy.Value)
49 | extends RecordMaterializer[InternalRow] {
50 |
51 | private val rootConverter = new ParquetRowConverter(
52 | schemaConverter,
53 | parquetSchema,
54 | catalystSchema,
55 | convertTz,
56 | datetimeRebaseMode,
57 | int96RebaseMode,
58 | NoopUpdater)
59 |
60 | override def getCurrentRecord: InternalRow = rootConverter.currentRecord
61 |
62 | override def getRootConverter: GroupConverter = rootConverter
63 | }
64 |
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/parquet/VerticaDataSourceUtils.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package org.apache.spark.sql.execution.datasources.parquet.vertica
15 |
16 | import org.apache.spark.sql.catalyst.util.RebaseDateTime
17 | import org.apache.spark.sql.execution.datasources.DataSourceUtils
18 | import org.apache.spark.sql.internal.LegacyBehaviorPolicy
19 |
20 | /**
21 | * Copied from Spark 3.2.0 DataSourceUtils implementation.
22 | *
23 | * Classes under the parquet package were copied from Spark for reading parquet into Spark. They are not part of a public API and
24 | * are thus expected to change. In Spark 3.2.1, the DataSourceUtils interface changed; as a fix, this object copies
25 | * only the functions it needs from Spark 3.2.0.
26 | * */
27 | object VerticaDataSourceUtils {
28 |
29 | /**
30 | * Creates a function that rebases a given date value, expressed in days
31 | * */
32 | def createDateRebaseFuncInRead(
33 | rebaseMode: LegacyBehaviorPolicy.Value,
34 | format: String): Int => Int = rebaseMode match {
35 | case LegacyBehaviorPolicy.EXCEPTION => days: Int =>
36 | if (days < RebaseDateTime.lastSwitchJulianDay) {
37 | throw DataSourceUtils.newRebaseExceptionInRead(format)
38 | }
39 | days
40 | case LegacyBehaviorPolicy.LEGACY => RebaseDateTime.rebaseJulianToGregorianDays
41 | case LegacyBehaviorPolicy.CORRECTED => identity[Int]
42 | }
43 |
44 | /**
45 | * Creates a function that rebases a given timestamp value, expressed in microseconds
46 | * */
47 | def createTimestampRebaseFuncInRead(
48 | rebaseMode: LegacyBehaviorPolicy.Value,
49 | format: String): Long => Long = rebaseMode match {
50 | case LegacyBehaviorPolicy.EXCEPTION => micros: Long =>
51 | if (micros < RebaseDateTime.lastSwitchJulianTs) {
52 | throw DataSourceUtils.newRebaseExceptionInRead(format)
53 | }
54 | micros
55 | case LegacyBehaviorPolicy.LEGACY => RebaseDateTime.rebaseJulianToGregorianMicros
56 | case LegacyBehaviorPolicy.CORRECTED => identity[Long]
57 | }
58 | }
59 |
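For illustration, a small sketch of how the rebase functions behave per policy; the day value passed to the LEGACY function is an arbitrary pre-1582 date:

    import org.apache.spark.sql.execution.datasources.parquet.vertica.VerticaDataSourceUtils
    import org.apache.spark.sql.internal.LegacyBehaviorPolicy

    object RebaseSketch {
      // CORRECTED returns values unchanged.
      val corrected = VerticaDataSourceUtils.createDateRebaseFuncInRead(LegacyBehaviorPolicy.CORRECTED, "Parquet")
      val unchanged = corrected(10000) // 10000

      // LEGACY rebases days from the Julian to the Proleptic Gregorian calendar.
      val legacy = VerticaDataSourceUtils.createDateRebaseFuncInRead(LegacyBehaviorPolicy.LEGACY, "Parquet")
      val rebased = legacy(-141714) // shifted by the Julian-to-Gregorian rebase

      // EXCEPTION throws on values older than the calendar switch instead of rebasing them.
      val strict = VerticaDataSourceUtils.createTimestampRebaseFuncInRead(LegacyBehaviorPolicy.EXCEPTION, "Parquet")
    }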
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/util/Timer.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.util
15 |
16 | import com.typesafe.scalalogging.Logger
17 |
18 | /**
19 | * Class for reporting how long operations take
20 | *
21 | * @param enabled Whether the timer is enabled. If false, timing does not happen and nothing is logged
22 | * @param logger Logger used to report how long the operation took
23 | * @param operationName Name of operation being timed
24 | */
25 | class Timer(val enabled: Boolean, val logger: Logger, val operationName: String) {
26 |
27 | var t0 = 0L
28 |
29 | def startTime(): Unit = {
30 | if (enabled) {
31 | t0 = System.currentTimeMillis()
32 | }
33 | }
34 |
35 | def endTime(): Unit = {
36 | if (enabled) {
37 | val t1 = System.currentTimeMillis()
38 | logger.info("Timed operation: " + operationName + " -- took " + (t1 - t0) + " ms.")
39 | }
40 | }
41 | }
42 |
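For illustration, a minimal usage sketch; the logger name and the timed operation are placeholders:

    import com.typesafe.scalalogging.Logger

    object TimerSketch {
      private val logger = Logger("TimerSketch")

      def timedWork(): Unit = {
        val timer = new Timer(enabled = true, logger, "example operation")
        timer.startTime()
        Thread.sleep(50) // stand-in for the operation being timed
        timer.endTime()  // logs: Timed operation: example operation -- took <n> ms.
      }
    }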
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/util/cleanup/DistributedFilesCleaner.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.util.cleanup
15 |
16 | import com.vertica.spark.config.{DistributedFilesystemReadConfig, LogProvider}
17 | import com.vertica.spark.datasource.fs.FileStoreLayerInterface
18 | import com.vertica.spark.datasource.fs.HadoopFileStoreLayer
19 | import com.vertica.spark.datasource.partitions.mixin.Cleanup
20 |
21 | /**
22 | * Class handles cleanup of exported files on file system. Intended to be used by each worker before exiting.
23 | * */
24 | class DistributedFilesCleaner(val config: DistributedFilesystemReadConfig,
25 | val cleanupUtils: CleanupUtilsInterface,
26 | val optionalFSLayer: Option[FileStoreLayerInterface] = None) {
27 |
28 | private val logger = LogProvider.getLogger(this)
29 | private val fileStoreLayer = optionalFSLayer.getOrElse(HadoopFileStoreLayer.make(config))
30 | private val fileStoreConfig = config.fileStoreConfig
31 |
32 | /**
33 | * The idea is to first write to the filesystem, marking that a portion of a file has been read.
34 | * Then we check whether all portions of the file have been marked; if so, the file is deleted, otherwise it is left alone.
35 | *
36 | * This is done for all partitions.
37 | *
38 | * @param partition The object with [[Cleanup]] information.
39 | * */
40 | def cleanupFiles(partition: Cleanup): Unit = {
41 | logger.info("Removing files before closing read pipe.")
42 |
43 | partition.getPortions.indices.foreach(fileIndex => {
44 | if (!fileStoreConfig.preventCleanup) {
45 | // Cleanup old file if required
46 | getCleanupInfo(partition, fileIndex) match {
47 | case Some(cleanupInfo) => cleanupUtils.checkAndCleanup(fileStoreLayer, cleanupInfo) match {
48 | case Left(err) => logger.warn("Ran into error when cleaning up. Treating as non-fatal. Err: " + err.getFullContext)
49 | case Right(_) => ()
50 | }
51 | case None => logger.warn("No cleanup info found.")
52 | }
53 | }
54 | })
55 | }
56 |
57 | def getCleanupInfo(partition: Cleanup, partitionIndex: Int): Option[FileCleanupInfo] = {
58 | logger.debug("Getting cleanup info for partition with idx " + partitionIndex)
59 | if (partitionIndex >= partition.getPortions.size) {
60 | logger.warn("Invalid fileIdx " + partitionIndex + ", can't perform cleanup.")
61 | None
62 | } else {
63 | val fileRange = partition.getPortions(partitionIndex)
64 | Some(FileCleanupInfo(fileRange.filename, fileRange.index, partition.getPartitioningRecord(fileRange.filename)))
65 | }
66 | }
67 | }
68 |
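For illustration, a hedged sketch of the cleanup bookkeeping; `readConfig` is a hypothetical DistributedFilesystemReadConfig, and the partition is built like the parquet partition shown earlier (ParquetFileRange argument order assumed from its unit test):

    import com.vertica.spark.config.DistributedFilesystemReadConfig
    import com.vertica.spark.datasource.partitions.parquet.{ParquetFileRange, VerticaDistributedFilesystemPartition}

    object CleanerSketch {
      def cleanupAfterRead(readConfig: DistributedFilesystemReadConfig): Unit = {
        val partition = VerticaDistributedFilesystemPartition(
          Seq(ParquetFileRange("/tmp/export/part-0.parquet", 0, 5, 0)),
          Map("/tmp/export/part-0.parquet" -> 1))

        val cleaner = new DistributedFilesCleaner(readConfig, new CleanupUtils)
        cleaner.getCleanupInfo(partition, 0) // Some(FileCleanupInfo("/tmp/export/part-0.parquet", 0, 1))
        cleaner.cleanupFiles(partition)      // deletes the file once all of its portions are marked as read
      }
    }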
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/util/cleanup/FileCleanupInfo.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.util.cleanup
15 |
16 | /**
17 | * Structure containing cleanup information for a given portion of a file.
18 | *
19 | * @param filename The file to check for cleanup.
20 | * @param fileIdx Which portion of the file is done being read.
21 | * @param fileRangeCount How many portions of the file exist.
22 | */
23 | final case class FileCleanupInfo(filename: String, fileIdx: Long, fileRangeCount: Int)
24 |
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/util/complex/ComplexTypeUtils.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.util.complex
15 |
16 | import org.apache.spark.sql.types.{ArrayType, MapType, StructField, StructType}
17 |
18 | import scala.util.Either
19 |
20 | class ComplexTypeUtils {
21 |
22 | def getComplexTypeColumns(schema: StructType): (List[StructField], List[StructField]) = {
23 | val initialAccumulators: (List[StructField], List[StructField]) = (List(), List())
24 | schema
25 | .foldLeft(initialAccumulators)((acc, col) => {
26 | val (nativeCols, complexTypeCols) = acc
27 | if (isNativeType(col)) {
28 | (col :: nativeCols, complexTypeCols)
29 | } else {
30 | (nativeCols, col :: complexTypeCols)
31 | }
32 | })
33 | }
34 |
35 | /*
36 | * Check if a field is a Vertica native type. Vertica native types include 1D arrays of primitives
37 | * */
38 | private def isNativeType(field: StructField): Boolean = {
39 | field.dataType match {
40 | case ArrayType(elementType, _) =>
41 | elementType match {
42 | case MapType(_, _, _) | StructType(_) | ArrayType(_, _) => false
43 | case _ => true
44 | }
45 | case MapType(_, _, _) | StructType(_) => false
46 | case _ => true
47 | }
48 | }
49 | }
50 |
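For illustration, a small self-contained sketch splitting a schema into Vertica-native and complex-type columns:

    import org.apache.spark.sql.types._

    object ComplexTypeSplitSketch {
      val schema = StructType(Seq(
        StructField("id", IntegerType),                            // primitive: native
        StructField("tags", ArrayType(StringType)),                // 1D array of primitives: native
        StructField("matrix", ArrayType(ArrayType(IntegerType))),  // nested array: complex
        StructField("props", MapType(StringType, StringType))      // map: complex
      ))

      val (nativeCols, complexCols) = new ComplexTypeUtils().getComplexTypeColumns(schema)
      // nativeCols holds id and tags; complexCols holds matrix and props (order is not guaranteed).
    }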
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/util/general/Utils.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.util.general
15 |
16 | object Utils {
17 | // Used to explicitly ignore returned values
18 | def ignore[T](t: T): Unit = ()
19 | }
20 |
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/util/listeners/SparkListeners.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.util.listeners
15 |
16 | import com.vertica.spark.config.{DistributedFilesystemReadConfig, LogProvider}
17 | import com.vertica.spark.datasource.fs.HadoopFileStoreLayer
18 | import com.vertica.spark.util.error.ConnectorError
19 | import org.apache.spark.SparkContext
20 | import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationEnd}
21 |
22 | /**
23 | * This wrapper is created solely for compatibility with unit testing.
24 | *
25 | * Because SparkContext cannot be instantiated during unit tests, a dummy class extending
26 | * SparkContext would be needed to override the functions we use. However, SparkContext.addSparkListener's argument
27 | * uses a private interface and thus cannot be overridden.
28 | * */
29 | case class SparkContextWrapper(sparkContext: Option[SparkContext]) {
30 |
31 | def addSparkListener(listener: SparkListener): Unit = {
32 | sparkContext match {
33 | // We may not get a context if this is executed on executor nodes.
34 | case None =>
35 | case Some(context) => context.addSparkListener(listener)
36 | }
37 | }
38 | }
39 | /**
40 | * This listener is called at the end of Spark app to remove the export folder.
41 | * */
42 | class ApplicationParquetCleaner(config: DistributedFilesystemReadConfig) extends SparkListener {
43 | private val logger = LogProvider.getLogger(classOf[ApplicationParquetCleaner])
44 |
45 | private val fileStoreLayer = new HadoopFileStoreLayer(config.fileStoreConfig, config.metadata match {
46 | case Some(metadata) => if (config.getRequiredSchema.nonEmpty) {
47 | Some(config.getRequiredSchema)
48 | } else {
49 | Some(metadata.schema)
50 | }
51 | case _ => None
52 | })
53 |
54 | override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = {
55 | val hdfsPath = config.fileStoreConfig.address
56 | if (!config.fileStoreConfig.preventCleanup) {
57 | fileStoreLayer.removeDir(hdfsPath) match {
58 | case Right(_) => logger.info("Removed " + hdfsPath)
59 | case Left(error) => logger.error(s"Error removing $hdfsPath. ${error.toString}")
60 | }
61 | }
62 | }
63 | }
64 |
65 | case class CleanerRegistrationError() extends ConnectorError {
66 | def getFullContext: String = "Failed to add application shutdown listener to context"
67 | }
68 |
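For illustration, a hedged sketch of registering the cleaner on the driver; `readConfig` is a hypothetical DistributedFilesystemReadConfig:

    import com.vertica.spark.config.DistributedFilesystemReadConfig
    import org.apache.spark.sql.SparkSession

    object ListenerRegistrationSketch {
      def register(readConfig: DistributedFilesystemReadConfig): Unit = {
        // On executors there may be no active session, in which case addSparkListener is a no-op.
        val wrapper = SparkContextWrapper(SparkSession.getActiveSession.map(_.sparkContext))
        wrapper.addSparkListener(new ApplicationParquetCleaner(readConfig))
      }
    }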
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/util/query/ColumnsTable.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.util.query
15 |
16 | import com.vertica.spark.datasource.jdbc.JdbcLayerInterface
17 | import com.vertica.spark.util.error.ErrorHandling.ConnectorResult
18 |
19 | import java.sql.ResultSet
20 |
21 | case class ColumnInfo(verticaType: Long, dataTypeName: String, precision: Long, scale: Long)
22 |
23 | class ColumnsTable(jdbcLayer: JdbcLayerInterface) extends VerticaTable[ColumnInfo](jdbc = jdbcLayer) {
24 |
25 | override def tableName: String = "columns"
26 |
27 | override def columns: Seq[String] = List("data_type_id", "data_type", "numeric_precision", "numeric_scale")
28 |
29 | override def buildRow(resultSet: ResultSet): ColumnInfo = {
30 | // The column names should be in sync with the ones defined above.
31 | ColumnInfo(
32 | resultSet.getLong("data_type_id"),
33 | getTypeName(resultSet.getString("data_type")),
34 | resultSet.getLong("numeric_precision"),
35 | resultSet.getLong("numeric_scale")
36 | )
37 | }
38 |
39 | /**
40 | * Type names reported by Vertica could be INTEGER, ARRAY[...], or ROW(...);
41 | * we want to extract just the type identifier.
42 | * */
43 | def getTypeName(dataType:String) : String = {
44 | dataType
45 | .replaceFirst("\\[",",")
46 | .replaceFirst("\\(",",")
47 | .split(',')
48 | .head
49 | }
50 |
51 | def getColumnInfo(columnName: String, tableName: String, schema: String): ConnectorResult[ColumnInfo] = {
52 | val schemaCond = if(schema.nonEmpty) s" AND table_schema='$schema'" else ""
53 | val conditions = s"table_name='$tableName'$schemaCond AND column_name='$columnName'"
54 | super.selectWhereExpectOne(conditions)
55 | }
56 | }
57 |
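For illustration, a sketch of how getTypeName extracts the type identifier; `jdbcLayer` is a hypothetical JdbcLayerInterface implementation:

    import com.vertica.spark.datasource.jdbc.JdbcLayerInterface

    object TypeNameSketch {
      def typeNames(jdbcLayer: JdbcLayerInterface): Seq[String] = {
        val columnsTable = new ColumnsTable(jdbcLayer)
        Seq(
          columnsTable.getTypeName("ARRAY[int8]"),           // "ARRAY"
          columnsTable.getTypeName("ROW(a int, b varchar)"), // "ROW"
          columnsTable.getTypeName("int8")                   // "int8"
        )
      }
    }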
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/util/query/ComplexTypesTable.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.util.query
15 |
16 | import com.vertica.spark.datasource.jdbc.JdbcLayerInterface
17 | import com.vertica.spark.util.error.ErrorHandling.ConnectorResult
18 |
19 | import java.sql.ResultSet
20 |
21 | /**
22 | * A row of the complex_types table. Represents a component of the complex type identified by type_id.
23 | * [[https://www.vertica.com/docs/latest/HTML/Content/Authoring/SQLReferenceManual/SystemTables/CATALOG/COMPLEX_TYPES.htm?zoom_highlight=complex%20type Documentation]]
24 | *
25 | * @param typeId The vertica type id of the complex structure.
26 | * @param fieldId the vertica type id of the field.
27 | *
28 | * */
29 | case class ComplexTypeInfo(typeId: Long, typeName: String, fieldId: Long, fieldTypeName: String, numericScale: Long, typeKind: String, numericPrecision: Long, fieldName: String)
30 |
31 | /**
32 | * When a complex type is created in Vertica, its structure is recorded in this table.
33 | * Each row then represents a component (a field) of the complex structure, with the type_id being the vertica id of the complex type,
34 | * and field_id being the vertica id of the component. For example, a nested array will have as many rows as
35 | * its depth.
36 | * [[https://www.vertica.com/docs/latest/HTML/Content/Authoring/SQLReferenceManual/SystemTables/CATALOG/COMPLEX_TYPES.htm?zoom_highlight=complex%20type Documentation]]
37 | * */
38 | class ComplexTypesTable(jdbcLayer: JdbcLayerInterface)
39 | extends VerticaTable[ComplexTypeInfo](jdbc = jdbcLayer) {
40 |
41 | override def tableName: String = "complex_types"
42 |
43 | override protected def columns: Seq[String] = List("type_id", "type_name", "field_id", "field_type_name", "numeric_scale", "type_kind", "numeric_precision", "field_name")
44 |
45 | override protected def buildRow(rs: ResultSet): ComplexTypeInfo = {
46 | // The column names should be in sync with the ones defined above.
47 | ComplexTypeInfo(
48 | rs.getLong("type_id"),
49 | rs.getString("type_name"),
50 | rs.getLong("field_id"),
51 | rs.getString("field_type_name"),
52 | rs.getLong("numeric_scale"),
53 | rs.getString("type_kind"),
54 | rs.getLong("numeric_precision"),
55 | rs.getString("field_name"))
56 | }
57 |
58 | def findComplexTypeInfo(verticaTypeId: Long): ConnectorResult[ComplexTypeInfo] = {
59 | val conditions = s"type_id=$verticaTypeId"
60 | super.selectWhereExpectOne(conditions)
61 | }
62 |
63 | def getComplexTypeFields(verticaTypeId: Long): ConnectorResult[Seq[ComplexTypeInfo]] = {
64 | val conditions = s"type_id=$verticaTypeId"
65 | super.selectWhere(conditions)
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/util/query/StringParsingUtils.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.util.query
15 |
16 | import scala.annotation.tailrec
17 |
18 | /**
19 | * Object contains helper methods for parsing Vertica SQL queries. Ideally we would like a more robust parser; however, one
20 | * was not justified because:
21 | * 1. We have not needed to do a lot of SQL parsing, yet!
22 | * 2. We did not find an appropriate library for use
23 | * Should we start handling more SQL parsing, we will need to implement a custom parser.
24 | * */
25 | object StringParsingUtils {
26 |
27 | /**
28 | * Return the indices of the first open parenthesis and its matching closing parenthesis
29 | *
30 | * @return a tuple of (openParenIndex, closingParenIndex)
31 | * */
32 | def findFirstParenGroupIndices(str: String): (Int, Int) = {
33 | val openParenIndex = str.indexOf("(")
34 | val subString = str.substring(openParenIndex + 1)
35 |
36 | /**
37 | * This recursion finds the matching paren by tracking the paren count.
38 | * When the count drops back to zero, we have found the matching paren.
39 | * */
40 | @tailrec
41 | def findMatchingClosingParen(char: Char, tail: String, parenCount: Int): Int = {
42 | char match {
43 | case '(' => findMatchingClosingParen(tail.head, tail.tail, parenCount + 1)
44 | case ')' =>
45 | if (parenCount == 1) {
46 | subString.length - tail.length
47 | } else {
48 | findMatchingClosingParen(tail.head, tail.tail, parenCount - 1)
49 | }
50 | case _ => findMatchingClosingParen(tail.head, tail.tail, parenCount)
51 | }
52 | }
53 |
54 | val closingParenIndex = openParenIndex + findMatchingClosingParen(subString.head, subString.tail, 1)
55 | (openParenIndex, closingParenIndex)
56 | }
57 |
58 | /**
59 | * Split a string by comma. Will not split on a comma if it is between parentheses.
60 | *
61 | * @return a list of separated strings.
62 | * */
63 | def splitByComma(str: String): Seq[String] = {
64 |
65 | @tailrec
66 | def recursion(char: Char, tail: String, currStr: String = "", splits: List[String] = List(), parenCount: Int = 0): List[String] = {
67 | val posParenCount = if(parenCount < 0) 0 else parenCount
68 | val nextStr = currStr :+ char
69 | char match {
70 | // Keeping track of parenthesis to know if it should split or not
71 | case '(' if tail.nonEmpty => recursion(tail.head, tail.tail, nextStr, splits, posParenCount + 1)
72 | case ')' if tail.nonEmpty => recursion(tail.head, tail.tail, nextStr, splits, posParenCount - 1)
73 | case ',' if tail.nonEmpty =>
74 | if (posParenCount > 0) {
75 | recursion(tail.head, tail.tail, nextStr, splits, posParenCount)
76 | } else {
77 | recursion(tail.head, tail.tail, "", splits :+ currStr.trim, posParenCount)
78 | }
79 | case _ =>
80 | if (tail.isEmpty) {
81 | char match {
82 | case ',' if posParenCount == 0 => splits :+ currStr.trim
83 | case _ => splits :+ nextStr.trim
84 | }
85 | } else {
86 | recursion(tail.head, tail.tail, nextStr, splits, posParenCount)
87 | }
88 | }
89 | }
90 |
91 | recursion(str.head, str.tail)
92 | }
93 | }
94 |
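For illustration, the two helpers applied to concrete inputs (results follow from the logic above):

    object StringParsingSketch {
      // The first '(' is at index 3 and its matching ')' is at index 13.
      val indices = StringParsingUtils.findFirstParenGroupIndices("ROW(a, ROW(b))") // (3, 13)

      // Commas nested inside parentheses do not split.
      val parts = StringParsingUtils.splitByComma("a, ROW(b, c), d") // Seq("a", "ROW(b, c)", "d")
    }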
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/util/query/TypesTable.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.util.query
15 |
16 | import com.vertica.spark.datasource.jdbc.JdbcLayerInterface
17 | import com.vertica.spark.util.error.ErrorHandling.ConnectorResult
18 | import com.vertica.spark.util.schema.ColumnDef
19 | import org.apache.spark.sql.types.{DecimalType, Metadata}
20 |
21 | import java.sql.ResultSet
22 |
23 | case class TypeInfo(typeId: Long, jdbcType: Long, typeName: String, maxScale: Long)
24 |
25 | /**
26 | * Vertica's types table contains type information for primitives and for 1D arrays/sets of primitive types.
27 | * */
28 | class TypesTable(jdbcLayer: JdbcLayerInterface) extends VerticaTable[TypeInfo](jdbcLayer) {
29 | override protected def tableName: String = "types"
30 |
31 | override protected def columns: Seq[String] = List("type_id", "jdbc_type", "type_name", "max_scale")
32 |
33 | override protected def buildRow(rs: ResultSet): TypeInfo =
34 | // The column names should be in sync with the ones defined above.
35 | TypeInfo(
36 | rs.getLong("type_id"),
37 | rs.getLong("jdbc_type"),
38 | rs.getString("type_name"),
39 | rs.getLong("max_scale"))
40 |
41 | def getVerticaTypeInfo(verticaType: Long): ConnectorResult[TypeInfo] = {
42 | val conditions = s"type_id=$verticaType"
43 | super.selectWhereExpectOne(conditions)
44 | }
45 |
46 | private val signedList = List(
47 | java.sql.Types.DOUBLE,
48 | java.sql.Types.FLOAT,
49 | java.sql.Types.REAL,
50 | java.sql.Types.INTEGER,
51 | java.sql.Types.BIGINT,
52 | java.sql.Types.TINYINT,
53 | java.sql.Types.SMALLINT
54 | )
55 |
56 | def isSigned(jdbcType: Long): Boolean = signedList.contains(jdbcType)
57 |
58 | def getColumnDef(verticaType: Long): ConnectorResult[ColumnDef] = {
59 | getVerticaTypeInfo(verticaType)
60 | .map(typeInfo =>
61 | ColumnDef("",
62 | typeInfo.jdbcType.toInt,
63 | typeInfo.typeName,
64 | DecimalType.MAX_PRECISION,
65 | typeInfo.maxScale.toInt,
66 | signed = isSigned(typeInfo.jdbcType),
67 | nullable = false,
68 | Metadata.empty))
69 | }
70 |
71 | }
72 |
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/util/reflections/ReflectionTools.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.util.reflections
15 |
16 | import com.vertica.spark.config.ReadConfig
17 | import com.vertica.spark.datasource.core.DSConfigSetupInterface
18 | import com.vertica.spark.datasource.v2.{VerticaScanBuilder, VerticaScanBuilderWithPushdown}
19 | import org.apache.spark.sql.connector.expressions.aggregate.Aggregation
20 |
21 | class ReflectionTools {
22 | def makeScanBuilderWithPushDown(config: ReadConfig, readSetupInterface: DSConfigSetupInterface[ReadConfig]): VerticaScanBuilder = {
23 | classOf[VerticaScanBuilderWithPushdown]
24 | .getDeclaredConstructor(classOf[ReadConfig], classOf[DSConfigSetupInterface[ReadConfig]])
25 | .newInstance(config, readSetupInterface)
26 | }
27 |
28 | def makeScanBuilderWithoutPushDown(config: ReadConfig, readSetupInterface: DSConfigSetupInterface[ReadConfig]): VerticaScanBuilder = {
29 | classOf[VerticaScanBuilder]
30 | .getDeclaredConstructor(classOf[ReadConfig], classOf[DSConfigSetupInterface[ReadConfig]])
31 | .newInstance(config, readSetupInterface)
32 | }
33 |
34 | def aggregationInvokeMethod[T](aggregation: Aggregation, methodName: String): T = {
35 | classOf[Aggregation]
36 | .getDeclaredMethod(methodName)
37 | .invoke(aggregation)
38 | .asInstanceOf[T]
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/util/version/SparkVersionTools.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.util.version
15 |
16 | import com.vertica.spark.config.ReadConfig
17 | import com.vertica.spark.datasource.core.DSConfigSetupInterface
18 | import com.vertica.spark.datasource.v2.VerticaScanBuilder
19 | import com.vertica.spark.util.reflections.ReflectionTools
20 | import com.vertica.spark.util.version.SparkVersionTools.{SPARK_3_2_0, SPARK_3_3_0}
21 | import org.apache.spark.sql.SparkSession
22 | import org.apache.spark.sql.connector.expressions.aggregate.Aggregation
23 | import org.apache.spark.sql.connector.expressions.Expression
24 |
25 | import scala.util.Try
26 |
27 | class SparkVersionTools(reflection: ReflectionTools = new ReflectionTools) {
28 |
29 | /**
30 | * @return the version string of Spark
31 | * */
32 | def getVersionString: Option[String] = SparkSession.getActiveSession.map(_.version)
33 |
34 | /**
35 | * @return a [[Version]] parsed from the active Spark session's version string
36 | * */
37 | def getVersion: Option[Version] = getVersion(getVersionString)
38 |
39 | /**
40 | * @return a [[Version]] from a Spark version string
41 | * */
42 | def getVersion(versionStr: Option[String]): Option[Version] = versionStr match {
43 | case Some(str) =>
44 | val regex = "([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*)".r
45 | Try {
46 | val regex(major, minor, service, _) = str
47 | Some(Version(major.toInt, minor.toInt, service.toInt))
48 | }.getOrElse(None)
49 | case None => None
50 | }
51 |
52 | /**
53 | * @return a compatible [[VerticaScanBuilder]] for the given spark version.
54 | * */
55 | def makeCompatibleVerticaScanBuilder(sparkVersion: Version, config: ReadConfig, readSetupInterface: DSConfigSetupInterface[ReadConfig]): VerticaScanBuilder = {
56 | val sparkSupportsAggregatePushDown = sparkVersion >= SPARK_3_2_0
57 | if (sparkSupportsAggregatePushDown) {
58 | reflection.makeScanBuilderWithPushDown(config, readSetupInterface)
59 | } else {
60 | reflection.makeScanBuilderWithoutPushDown(config, readSetupInterface)
61 | }
62 | }
63 |
64 | /**
65 | * Since the connector compiles against the latest version of Spark, for backward compatibility this function uses
66 | * reflection to invoke the appropriate method that returns group-by expressions.
67 | *
68 | * @return an array of [[Expression]] representing the group-by columns.
69 | * */
70 | def getCompatibleGroupByExpressions(sparkVersion: Version, aggObj: Aggregation): Array[Expression] = {
71 | if(sparkVersion < SPARK_3_3_0){
72 | // $COVERAGE-OFF$
73 | reflection.aggregationInvokeMethod[Array[Expression]](aggObj, "groupByColumns")
74 | // $COVERAGE-ON$
75 | } else {
76 | aggObj.groupByExpressions()
77 | }
78 | }
79 | }
80 |
81 | object SparkVersionTools {
82 | val SPARK_3_3_0: Version = Version(3, 3)
83 | val SPARK_3_2_0: Version = Version(3, 2)
84 | }
85 |
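For illustration, how the version-string parsing behaves for a few inputs:

    object SparkVersionSketch {
      val tools = new SparkVersionTools()

      val release  = tools.getVersion(Some("3.2.1"))          // Some(Version(3, 2, 1))
      val snapshot = tools.getVersion(Some("3.3.0-SNAPSHOT")) // Some(Version(3, 3, 0))
      val invalid  = tools.getVersion(Some("not-a-version"))  // None
    }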
--------------------------------------------------------------------------------
/connector/src/main/scala/com/vertica/spark/util/version/Version.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.util.version
15 |
16 | /**
17 | * A class representing a version of the format major.minor.servicePack-hotfix.
18 | * Only digits are allowed for each component.
19 | * */
20 | case class Version(major: Int, minor: Int = 0, servicePack: Int = 0, hotfix: Int = 0) extends Ordered[Version] {
21 |
22 | override def toString: String = s"${major}.${minor}.${servicePack}-${hotfix}"
23 |
24 | override def compare(that: Version): Int =
25 | Ordering[(Int, Int, Int, Int)].compare((this.major, this.minor, this.servicePack, this.hotfix),
26 | (that.major, that.minor, that.servicePack, that.hotfix))
27 | }
28 |
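For illustration, a few comparisons using the Ordered implementation:

    object VersionCompareSketch {
      val supportsPushdown = Version(3, 2) >= Version(3, 2, 0)        // true
      val olderSpark       = Version(3, 1, 2) < Version(3, 2)         // true
      val newerHotfix      = Version(11, 0, 2, 3) > Version(11, 0, 2) // true
    }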
--------------------------------------------------------------------------------
/connector/src/test/scala/com/vertica/spark/datasource/core/TableNameTest.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.datasource.core
15 |
16 | import com.vertica.spark.config.TableName
17 | import org.scalamock.scalatest.MockFactory
18 | import org.scalatest.BeforeAndAfterAll
19 | import org.scalatest.flatspec.AnyFlatSpec
20 |
21 | class TableNameTest extends AnyFlatSpec with BeforeAndAfterAll with MockFactory {
22 | it should "Escape table name" in {
23 | val tablename = TableName("t\"nam\"e", Some("Sch \" ema"))
24 |
25 | assert(tablename.getFullTableName == "\"Sch \"\" ema\".\"t\"\"nam\"\"e\"")
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/connector/src/test/scala/com/vertica/spark/datasource/json/JsonBatchFactoryTest.scala:
--------------------------------------------------------------------------------
1 | package com.vertica.spark.datasource.json
2 |
3 | import com.vertica.spark.common.TestObjects
4 | import com.vertica.spark.datasource.wrappers.VerticaScanWrapper
5 | import org.apache.spark.sql.SparkSession
6 | import org.scalatest.BeforeAndAfterAll
7 | import org.scalatest.flatspec.AnyFlatSpec
8 |
9 | import java.io.{File, PrintWriter}
10 |
11 | class JsonBatchFactoryTest extends AnyFlatSpec with BeforeAndAfterAll{
12 |
13 | behavior of "JsonBatchFactoryTest"
14 |
15 | val jsonFile = new File("./test.json" )
16 | val pw = new PrintWriter(jsonFile)
17 | pw.write("{\"a\":9}")
18 | pw.close()
19 |
20 | it should "build a VerticaScanWrapper" in {
21 | val spark = SparkSession.builder()
22 | .master("local[*]")
23 | .appName("Vertica Connector Test")
24 | .getOrCreate()
25 |
26 | val batch = new JsonBatchFactory().build("./test.json", None, TestObjects.readConfig, spark)
27 | assert(batch.isInstanceOf[VerticaScanWrapper])
28 |
29 | spark.close()
30 | }
31 |
32 | override protected def afterAll(): Unit = {
33 | jsonFile.delete
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/connector/src/test/scala/com/vertica/spark/datasource/partitions/parquet/ParquetFileRangeTest.scala:
--------------------------------------------------------------------------------
1 | package com.vertica.spark.datasource.partitions.parquet
2 |
3 | import org.scalamock.scalatest.MockFactory
4 | import org.scalatest.flatspec.AnyFlatSpec
5 |
6 | class ParquetFileRangeTest extends AnyFlatSpec with MockFactory{
7 |
8 | behavior of "ParquetFileRangeTest"
9 |
10 | val fileRange = ParquetFileRange("filename", 10, 20, 30)
11 |
12 | it should "return correct filename" in {
13 | assert(fileRange.filename == "filename")
14 | }
15 |
16 | it should "return correct index" in {
17 | assert(fileRange.index == 30)
18 | }
19 |
20 | }
21 |
--------------------------------------------------------------------------------
/connector/src/test/scala/com/vertica/spark/datasource/wrappers/PartitionReaderWrapperFactoryTest.scala:
--------------------------------------------------------------------------------
1 | package com.vertica.spark.datasource.wrappers
2 |
3 | import com.vertica.spark.common.TestObjects
4 | import com.vertica.spark.datasource.partitions.mixin.Cleanup
5 | import org.apache.spark.sql.catalyst.InternalRow
6 | import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory}
7 | import org.scalamock.scalatest.MockFactory
8 | import org.scalatest.flatspec.AnyFlatSpec
9 |
10 | class PartitionReaderWrapperFactoryTest extends AnyFlatSpec with MockFactory{
11 |
12 | behavior of "PartitionReaderWrapperFactoryTest"
13 |
14 | trait MockInputPartition extends InputPartition with Cleanup
15 |
16 | it should "create a PartitionReaderWrapper" in {
17 | val readerFactory = mock[PartitionReaderFactory]
18 | val inputPartition = mock[MockInputPartition]
19 | val partitionReader = mock[PartitionReader[InternalRow]]
20 | (readerFactory.createReader _).expects(inputPartition).returning(partitionReader)
21 |
22 | val reader = new PartitionReaderWrapperFactory(readerFactory, TestObjects.readConfig).createReader(inputPartition)
23 | assert(reader.isInstanceOf[PartitionReaderWrapper])
24 | }
25 |
26 | }
27 |
--------------------------------------------------------------------------------
/connector/src/test/scala/com/vertica/spark/datasource/wrappers/PartitionReaderWrapperTest.scala:
--------------------------------------------------------------------------------
1 | package com.vertica.spark.datasource.wrappers
2 |
3 | import com.vertica.spark.common.TestObjects
4 | import com.vertica.spark.datasource.partitions.file.VerticaFilePartition
5 | import com.vertica.spark.util.cleanup.{CleanupUtils, DistributedFilesCleaner}
6 | import org.apache.spark.sql.catalyst.InternalRow
7 | import org.apache.spark.sql.connector.read.PartitionReader
8 | import org.scalamock.scalatest.MockFactory
9 | import org.scalatest.flatspec.AnyFlatSpec
10 |
11 | class PartitionReaderWrapperTest extends AnyFlatSpec with MockFactory{
12 |
13 | behavior of "PartitionReaderWrapperTest"
14 |
15 | private val config = TestObjects.readConfig
16 |
17 | it should "get" in {
18 | val reader = mock[PartitionReader[InternalRow]]
19 | (reader.get _).expects().returning(mock[InternalRow])
20 | val mockCleanupUtils = new CleanupUtils
21 | val mockCleaner = new DistributedFilesCleaner(config, mockCleanupUtils)
22 |
23 | new PartitionReaderWrapper(reader, mock[VerticaFilePartition], mockCleaner).get()
24 | }
25 |
26 | it should "next" in {
27 | val reader = mock[PartitionReader[InternalRow]]
28 | (reader.next _).expects()
29 | val mockCleanupUtils = new CleanupUtils
30 | val mockCleaner = new DistributedFilesCleaner(config, mockCleanupUtils)
31 |
32 | new PartitionReaderWrapper(reader, mock[VerticaFilePartition], mockCleaner).next()
33 | }
34 |
35 | it should "perform cleanup on close" in {
36 | val reader = mock[PartitionReader[InternalRow]]
37 | (reader.close _).expects()
38 | val partitions = mock[VerticaFilePartition]
39 | (partitions.getPortions _).expects().returning(Seq())
40 |
41 | val cleanupUtils = new CleanupUtils
42 | val cleaner = new DistributedFilesCleaner(config, cleanupUtils)
43 | new PartitionReaderWrapper(reader, partitions, cleaner).close()
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/connector/src/test/scala/com/vertica/spark/datasource/wrappers/VerticaScanWrapperBuilderTest.scala:
--------------------------------------------------------------------------------
1 | package com.vertica.spark.datasource.wrappers
2 |
3 | import com.vertica.spark.common.TestObjects
4 | import org.apache.spark.sql.connector.read.ScanBuilder
5 | import org.scalamock.scalatest.MockFactory
6 | import org.scalatest.flatspec.AnyFlatSpec
7 |
8 | class VerticaScanWrapperBuilderTest extends AnyFlatSpec with MockFactory {
9 |
10 | private val readConfig = TestObjects.readConfig
11 |
12 | it should "build VerticaScanWrapper" in {
13 | val builder = mock[ScanBuilder]
14 | (builder.build _).expects().returning(mock[VerticaScanWrapper])
15 | assert(new VerticaScanWrapperBuilder(builder, readConfig).build().isInstanceOf[VerticaScanWrapper])
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/connector/src/test/scala/com/vertica/spark/datasource/wrappers/json/VerticaJsonTableWrapperTest.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 | package com.vertica.spark.datasource.wrappers.json
14 |
15 | import com.vertica.spark.common.TestObjects
16 | import com.vertica.spark.datasource.wrappers.VerticaScanWrapperBuilder
17 | import org.apache.spark.sql.SparkSession
18 | import org.apache.spark.sql.execution.datasources.FileFormat
19 | import org.apache.spark.sql.execution.datasources.json.JsonFileFormat
20 | import org.apache.spark.sql.execution.datasources.v2.json.JsonTable
21 | import org.apache.spark.sql.types.StructType
22 | import org.apache.spark.sql.util.CaseInsensitiveStringMap
23 | import org.scalamock.scalatest.MockFactory
24 | import org.scalatest.BeforeAndAfterAll
25 | import org.scalatest.flatspec.AnyFlatSpec
26 |
27 | class VerticaJsonTableWrapperTest extends AnyFlatSpec with BeforeAndAfterAll with MockFactory {
28 |
29 | behavior of "VerticaJsonTableTest"
30 |
31 | private val spark: SparkSession = SparkSession.builder()
32 | .master("local[*]")
33 | .appName("Vertica Connector Test")
34 | .getOrCreate()
35 |
36 | class MockJsonTable(_name: String,
37 | sparkSession: SparkSession,
38 | options: CaseInsensitiveStringMap,
39 | paths: Seq[String],
40 | userSpecifiedSchema: Option[StructType],
41 | fallbackFileFormat: Class[_ <: FileFormat])
42 | extends JsonTable(_name, sparkSession, options, paths, userSpecifiedSchema, fallbackFileFormat) {
43 |
44 | override lazy val schema: StructType = StructType(Seq())
45 | }
46 |
47 | private val mockTable = new MockJsonTable("MockJsonTable", spark, CaseInsensitiveStringMap.empty(), List(), Some(StructType(Seq())), classOf[JsonFileFormat])
48 | private val readConfig = TestObjects.readConfig
49 |
50 | it should "return JsonTable capabilities" in {
51 | assert(new VerticaJsonTableWrapper(mockTable, readConfig).capabilities() == mockTable.capabilities())
52 | }
53 |
54 | it should "build VerticaScanWrapperBuilder" in {
55 | assert(new VerticaJsonTableWrapper(mockTable, readConfig).newScanBuilder(CaseInsensitiveStringMap.empty()).isInstanceOf[VerticaScanWrapperBuilder])
56 | }
57 |
58 | it should "return JsonTable name" in {
59 | assert(new VerticaJsonTableWrapper(mockTable, readConfig).name() == "Vertica" + mockTable.name)
60 | }
61 |
62 | it should "return JsonTable schema" in {
63 | // Comparing references
64 | assert(new VerticaJsonTableWrapper(mockTable, readConfig).schema() == mockTable.schema)
65 | }
66 |
67 | override protected def afterAll(): Unit = spark.close()
68 | }
69 |
--------------------------------------------------------------------------------
/connector/src/test/scala/com/vertica/spark/util/pushdown/PushdownUtilsTest.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.util.pushdown
15 |
16 | import org.apache.spark.sql.sources._
17 | import org.scalamock.scalatest.MockFactory
18 | import org.scalatest.BeforeAndAfterAll
19 | import org.scalatest.flatspec.AnyFlatSpec
20 |
21 | class PushdownUtilsTest extends AnyFlatSpec with BeforeAndAfterAll with MockFactory with org.scalatest.OneInstancePerTest {
22 |
23 | val sparkFilters: Array[Filter] = Array(
24 | EqualTo("a", 5),
25 | GreaterThan("a", 6.7),
26 | GreaterThanOrEqual("a", 8.8f),
27 | LessThan("a", 1000000),
28 | LessThanOrEqual("a", -1000000),
29 | In("a", Array("abc", "123", "456")),
30 | IsNull("a"),
31 | IsNotNull("b"),
32 | Not(EqualTo("a", 5)),
33 | StringStartsWith("a", "abc"),
34 | StringEndsWith("a", "qwe"),
35 | StringContains("a", "zxc")
36 | )
37 |
38 | val textFilters: Array[String] = Array(
39 | "(\"a\" = 5)",
40 | "(\"a\" > 6.7)",
41 | "(\"a\" >= 8.8)",
42 | "(\"a\" < 1000000)",
43 | "(\"a\" <= -1000000)",
44 | "(\"a\" IN ('abc', '123', '456'))",
45 | "(\"a\" IS NULL)",
46 | "(\"b\" IS NOT NULL)",
47 | "( NOT (\"a\" = 5))",
48 | "(\"a\" like 'abc%')",
49 | "(\"a\" like '%qwe')",
50 | "(\"a\" like '%zxc%')"
51 | )
52 |
53 | it should "generate all filters" in {
54 | sparkFilters.indices.map( i => {
55 | val filter = sparkFilters(i)
56 | val text = textFilters(i)
57 |
58 | assert(PushdownUtils.genFilter(filter).right.get.filterString.toLowerCase == text.toLowerCase)
59 | })
60 | }
61 |
62 | it should "compose all filters with AND" in {
63 | sparkFilters.indices.map( i => {
64 | sparkFilters.indices.map( j => {
65 | val filter = And(sparkFilters(i), sparkFilters(j))
66 | val text = "(" + textFilters(i) + " AND " + textFilters(j) + ")"
67 |
68 | assert(PushdownUtils.genFilter(filter).right.get.filterString.toLowerCase == text.toLowerCase)
69 | })
70 | })
71 | }
72 |
73 | it should "compose all filters with OR" in {
74 | sparkFilters.indices.map( i => {
75 | sparkFilters.indices.map( j => {
76 | val filter = Or(sparkFilters(i), sparkFilters(j))
77 | val text = "(" + textFilters(i) + " OR " + textFilters(j) + ")"
78 |
79 | assert(PushdownUtils.genFilter(filter).right.get.filterString.toLowerCase == text.toLowerCase)
80 | })
81 | })
82 | }
83 |
84 | it should "compose all filters with AND + OR + NOT" in {
85 | sparkFilters.indices.map( i => {
86 | sparkFilters.indices.map( j => {
87 | val filter = Not(Or(And(sparkFilters(i), sparkFilters(j)), And(sparkFilters(i), sparkFilters(j))) )
88 | val text = "( NOT (" +
89 | "(" + textFilters(i) + " AND " + textFilters(j) + ")" +
90 | " OR " +
91 | "(" + textFilters(i) + " AND " + textFilters(j) + ")" +
92 | "))"
93 |
94 | assert(PushdownUtils.genFilter(filter).right.get.filterString.toLowerCase == text.toLowerCase)
95 | })
96 | })
97 | }
98 | }
99 |
--------------------------------------------------------------------------------
/connector/src/test/scala/com/vertica/spark/util/query/StringParsingUtilsTest.scala:
--------------------------------------------------------------------------------
1 | package com.vertica.spark.util.query
2 |
3 | import org.scalatest.flatspec.AnyFlatSpec
4 |
5 | class StringParsingUtilsTest extends AnyFlatSpec {
6 |
7 | behavior of "VerticaSQLUtilsTest"
8 |
9 | it should "split a comma separated list" in {
10 | val result = StringParsingUtils.splitByComma("cat, cat dog, shark,")
11 | assert(result.length == 3)
12 | assert(result(0) == "cat")
13 | assert(result(1) == "cat dog")
14 | assert(result(2) == "shark")
15 | }
16 |
17 | it should "split a comma separated list with parentheses" in {
18 | val result = StringParsingUtils.splitByComma("col1 (int, col2) (cat, ((dog))), shark")
19 | assert(result.length == 2)
20 | assert(result.head == "col1 (int, col2) (cat, ((dog)))")
21 | assert(result(1) == "shark")
22 |
23 | val result2 = StringParsingUtils.splitByComma("(col1 int, col2 cat dog,)")
24 | assert(result2.length == 1)
25 | assert(result2.head == "(col1 int, col2 cat dog,)")
26 |
27 | val result3 = StringParsingUtils.splitByComma(")(col1 (int, ()col2, shark)), cat, dog")
28 | assert(result3.length == 3)
29 | assert(result3.head == ")(col1 (int, ()col2, shark))")
30 | assert(result3(1) == "cat")
31 | assert(result3(2) == "dog")
32 | }
33 |
34 | it should "split a comma separated list with non matching parentheses" in {
35 | val result = StringParsingUtils.splitByComma("(col1 int, col2 cat dog,")
36 | assert(result.length == 1)
37 | assert(result.head == "(col1 int, col2 cat dog,")
38 |
39 | val result2 = StringParsingUtils.splitByComma(")col1 (int, (col2, shark)), cat, dog")
40 | assert(result2.length == 3)
41 | assert(result2.head == ")col1 (int, (col2, shark))")
42 | assert(result2(1) == "cat")
43 | assert(result2(2) == "dog")
44 |
45 | val result3 = StringParsingUtils.splitByComma(")(col1 (int, (col2, shark)), cat, dog")
46 | assert(result3.length == 1)
47 | assert(result3.head == ")(col1 (int, (col2, shark)), cat, dog")
48 | }
49 |
50 | it should "find the indices of the first matching parentheses" in {
51 | val str = ")cat(dog_(sha(rk)))cat(_(d)og)"
52 | val (openParen, closeParen) = StringParsingUtils.findFirstParenGroupIndices(str)
53 | assert(openParen == 4)
54 | assert(closeParen == 18)
55 | assert(str.substring(openParen + 1, closeParen) == "dog_(sha(rk))")
56 | }
57 |
58 | }
59 |
--------------------------------------------------------------------------------
/connector/src/test/scala/com/vertica/spark/util/version/SparkVersionToolsTests.scala:
--------------------------------------------------------------------------------
1 | package com.vertica.spark.util.version
2 |
3 | import com.vertica.spark.config.ReadConfig
4 | import com.vertica.spark.datasource.core.DSConfigSetupInterface
5 | import com.vertica.spark.datasource.v2.{VerticaScanBuilder, VerticaScanBuilderWithPushdown}
6 | import com.vertica.spark.util.reflections.ReflectionTools
7 | import org.apache.spark.sql.SparkSession
8 | import org.apache.spark.sql.connector.expressions.Expression
9 | import org.apache.spark.sql.connector.expressions.aggregate.{AggregateFunc, Aggregation}
10 | import org.scalamock.scalatest.MockFactory
11 | import org.scalatest.flatspec.AnyFlatSpec
12 |
13 | class SparkVersionToolsTests extends AnyFlatSpec with MockFactory {
14 |
15 | behavior of "SparkUtilsTests"
16 |
17 | it should "correctly parses Spark version string" in {
18 | val version = (new SparkVersionTools).getVersion(Some("3.2.1"))
19 | assert(version.isDefined)
20 | assert(version == Some(Version(3,2,1)))
21 | }
22 |
23 | it should "correctly parses major-minor-patch numbers" in {
24 | val version = (new SparkVersionTools).getVersion(Some("3.2.1-0-vertica-1"))
25 | assert(version.isDefined)
26 | assert(version == Some(Version(3,2,1)))
27 | }
28 |
29 | it should "return a Spark version string" in {
30 | val spark = SparkSession.builder().master("local[1]").getOrCreate()
31 | assert((new SparkVersionTools).getVersionString.isDefined)
32 | spark.close()
33 | }
34 |
35 | private val groupByExpressions = Array[Expression]()
36 | private val aggregates = Array[AggregateFunc]()
37 | // Aggregation is final and can't be mocked with MockFactory
38 | private val mockAggregation = new Aggregation(aggregates, groupByExpressions)
39 |
40 | it should "get group by expressions from groupByColumns method when Spark is than 3.2.x" in {
41 | val sparkVersion = Version(3,2,9)
42 | val reflection = mock[ReflectionTools]
43 | val groupByColumns = Array[Expression]()
44 | (reflection.aggregationInvokeMethod[Array[Expression]] _).expects(mockAggregation, "groupByColumns").returning(groupByColumns)
45 |
46 | assert(new SparkVersionTools(reflection).getCompatibleGroupByExpressions(sparkVersion, mockAggregation) == groupByColumns)
47 | }
48 |
49 | it should "get group by expressions using groupByExpressions method when spark is at least 3.3.0" in {
50 | val sparkVersion = Version(3,3)
51 | assert(new SparkVersionTools(mock[ReflectionTools]).getCompatibleGroupByExpressions(sparkVersion, mockAggregation) == groupByExpressions)
52 | }
53 |
54 | it should "build VerticaScanBuilder for Spark version before than 3.2" in {
55 | val sparkVersion = Version(3, 1, 9)
56 | val reflection = mock[ReflectionTools]
57 | (reflection.makeScanBuilderWithoutPushDown _).expects(*, *).returning(mock[VerticaScanBuilder])
58 |
59 | new SparkVersionTools(reflection).makeCompatibleVerticaScanBuilder(sparkVersion, mock[ReadConfig], mock[DSConfigSetupInterface[ReadConfig]])
60 | }
61 |
62 | it should "build VerticaScanBuilder with aggregates push down for Spark version 3.2 or newer" in {
63 | val sparkVersion = Version(3, 2)
64 | val reflection = mock[ReflectionTools]
65 | (reflection.makeScanBuilderWithPushDown _).expects(*, *).returning(mock[VerticaScanBuilderWithPushdown])
66 |
67 | new SparkVersionTools(reflection).makeCompatibleVerticaScanBuilder(sparkVersion, mock[ReadConfig], mock[DSConfigSetupInterface[ReadConfig]])
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
/connector/src/test/scala/com/vertica/spark/util/version/VersionTest.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.util.version
15 |
16 | import org.scalamock.scalatest.MockFactory
17 | import org.scalatest.flatspec.AnyFlatSpec
18 | import org.scalatest.BeforeAndAfterAll
19 |
20 | //scalastyle:off
21 | class VersionTest extends AnyFlatSpec with BeforeAndAfterAll with MockFactory with org.scalatest.OneInstancePerTest {
22 |
23 | it should "compare to bigger version" in {
24 | assert(Version(11, 1, 5, 3) > (Version(10, 4, 7, 5)))
25 | }
26 |
27 | it should "compare to smaller version" in {
28 | assert(Version(11, 1, 5, 3) < (Version(12, 0, 2, 1)))
29 | }
30 |
31 | it should "compare to smaller or equal versions" in {
32 | assert(Version(11, 1, 5, 3) <= (Version(11, 1, 5, 3)))
33 | assert(Version(11, 1, 5, 3) <= (Version(11, 2, 5, 3)))
34 | }
35 |
36 | it should "compare to bigger or equal versions" in {
37 | assert(Version(11, 1, 5, 3) >= (Version(11, 1, 5, 3)))
38 | assert(Version(11, 1, 5, 3) >= (Version(11, 1, 5, 2)))
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/docker/client-krb/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM centos:7
2 |
3 | ENV SBT_VERSION 1.3.13
4 | ENV JAVA_OPTS="$JAVA_OPTS -Djava.security.auth.login.config=/spark-connector/docker/client-krb/jaas.config"
5 |
6 | RUN yum install -y java-11-openjdk && \
7 | yum install -y krb5-workstation && \
8 | yum install -y epel-release && \
9 | yum update -y && yum install -y wget && \
10 | curl -L https://www.scala-sbt.org/sbt-rpm.repo > sbt-rpm.repo && \
11 | mv sbt-rpm.repo /etc/yum.repos.d/ && \
12 | yum -y install sbt && \
13 | wget https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-without-hadoop.tgz && \
14 | wget https://archive.apache.org/dist/hadoop/common/hadoop-3.3.1/hadoop-3.3.1.tar.gz && \
15 | tar xvf spark-3.1.2-bin-without-hadoop.tgz && \
16 | tar xvf hadoop-3.3.1.tar.gz && \
17 | mv spark-3.1.2-bin-without-hadoop/ /opt/spark && \
18 | cd /opt/spark/conf && \
19 | mv spark-env.sh.template spark-env.sh
20 |
21 | ENTRYPOINT ["/bin/bash"]
22 |
--------------------------------------------------------------------------------
/docker/client-krb/docker-entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | echo "[logging]
4 | default = FILE:/var/log/krb5libs.log
5 | kdc = FILE:/var/log/krb5kdc.log
6 | admin_server = FILE:/var/log/kadmind.log
7 | [libdefaults]
8 | default_realm = $REALM
9 | dns_lookup_realm = false
10 | dns_lookup_kdc = false
11 | ticket_lifetime = 24h
12 | renew_lifetime = 7d
13 | forwardable = true
14 | [realms]
15 | $REALM = {
16 | kdc = $KDC
17 | admin_server = $KDC
18 | }
19 | [domain_realm]
20 | .example.com = $REALM
21 | example.com = $REALM" | tee /etc/krb5.conf
22 |
23 | cp /etc/hadoop/conf/* /hadoop-3.3.1/etc/hadoop/
24 |
25 | keytool -import -file /hadoop-3.3.1/etc/hadoop/hdfs.cert -alias hdfs -keystore cacerts.jks -no-prompt -storepass password
26 |
27 | echo 'user1' | kinit user1
28 |
29 | exec "$@"
30 |
--------------------------------------------------------------------------------
/docker/client-krb/jaas.config:
--------------------------------------------------------------------------------
1 | Client {
2 | com.sun.security.auth.module.Krb5LoginModule required
3 | useKeyTab=false
4 | useTicketCache=true
5 | doNotPrompt=true;
6 | };
7 |
--------------------------------------------------------------------------------
/docker/client-krb/vsql:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/docker/client-krb/vsql
--------------------------------------------------------------------------------
/docker/client/Dockerfile:
--------------------------------------------------------------------------------
1 | ARG SPARK=latest
2 | FROM bitnami/spark:$SPARK
3 |
4 | USER root
5 |
6 | # Install JDK
7 | RUN apt-get update && apt-get install -y openjdk-11-jdk
8 | ENV JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64"
9 | # Prepending our JAVA_HOME so that it takes precedence over Bitnami's Java home path.
10 | ENV PATH=$JAVA_HOME/bin:$PATH
11 |
12 | # Install SBT
13 | RUN apt-get update && \
14 | apt-get -y install apt-transport-https curl gnupg -yqq && \
15 | echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | tee /etc/apt/sources.list.d/sbt.list && \
16 | echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | tee /etc/apt/sources.list.d/sbt_old.list && \
17 | curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | gpg --no-default-keyring --keyring gnupg-ring:/etc/apt/trusted.gpg.d/scalasbt-release.gpg --import && \
18 | chmod 644 /etc/apt/trusted.gpg.d/scalasbt-release.gpg && \
19 | apt-get update && \
20 | apt-get -y install sbt
21 |
22 | # Sync Python version
23 | RUN mkdir -p /tmp/bitnami/pkg/cache/ && cd /tmp/bitnami/pkg/cache/ && \
24 | COMPONENTS=( \
25 | "python-3.10.6-8-linux-${OS_ARCH}-debian-11" \
26 | ) && \
27 | for COMPONENT in "${COMPONENTS[@]}"; do \
28 | if [ ! -f "${COMPONENT}.tar.gz" ]; then \
29 | curl -SsLf "https://downloads.bitnami.com/files/stacksmith/${COMPONENT}.tar.gz" -O ; \
30 | curl -SsLf "https://downloads.bitnami.com/files/stacksmith/${COMPONENT}.tar.gz.sha256" -O ; \
31 | fi && \
32 | sha256sum -c "${COMPONENT}.tar.gz.sha256" && \
33 | tar -zxf "${COMPONENT}.tar.gz" -C /opt/bitnami --strip-components=2 --no-same-owner --wildcards '*/files' && \
34 | rm -rf "${COMPONENT}".tar.gz{,.sha256} ; \
35 | done
36 |
37 |
--------------------------------------------------------------------------------
/docker/docker-compose-kerberos.yml:
--------------------------------------------------------------------------------
1 | version: "3.9"
2 | services:
3 | krb-client:
4 | build: ./client-krb
5 | entrypoint: /client-krb/docker-entrypoint.sh sleep infinity
6 | container_name: client
7 | hostname: client
8 | domainname: example.com
9 | networks:
10 | default:
11 | aliases:
12 | - client.example.com
13 | ports:
14 | - "5005:5005"
15 | volumes:
16 | - ./..:/spark-connector
17 | - ./vertica-hdfs-config/hadoop-kerberized:/etc/hadoop/conf
18 | - ./client-krb:/client-krb
19 | env_file:
20 | - krb.env
21 | environment:
22 | - HADOOP_VERSION
23 | - SPARK_VERSION
24 | - AWS_ACCESS_KEY_ID
25 | - AWS_SECRET_ACCESS_KEY
26 | - GCS_FILEPATH
27 | - GCS_HMAC_KEY_ID
28 | - GCS_HMAC_KEY_SECRET
29 | - GCS_SERVICE_KEY_ID
30 | - GCS_SERVICE_KEY
31 | - GCS_SERVICE_EMAIL
32 |
33 | kdc:
34 | build: ./kdc
35 | entrypoint: /kdc/docker-entrypoint.sh /usr/sbin/init
36 | container_name: kdc
37 | hostname: kdc
38 | domainname: example.com
39 | networks:
40 | default:
41 | aliases:
42 | - kdc.example.com
43 | volumes:
44 | - ./kdc:/kdc
45 | - ./keytabs:/keytabs
46 | env_file:
47 | - krb.env
48 |
49 | vertica:
50 | image: vertica/vertica-k8s:${VERTICA_VERSION:-latest}
51 | container_name: vertica
52 | hostname: vertica
53 | domainname: example.com
54 | networks:
55 | default:
56 | aliases:
57 | - vertica.example.com
58 | ports:
59 | - "5433:5433"
60 | volumes:
61 | - ./vertica-krb/docker-entrypoint.sh:/usr/local/bin/docker-entrypoint.sh
62 | - ./vertica-hdfs-config/hadoop-kerberized:/etc/hadoop/conf
63 | - ./vertica-krb:/vertica-krb
64 | - ./keytabs:/keytabs
65 | env_file:
66 | - krb.env
67 | environment:
68 | - VERTICA_MEMDEBUG=2
69 |
70 | hdfs:
71 | build: ./hdfs-krb
72 | entrypoint: /usr/local/bin/docker-entrypoint.sh sleep infinity
73 | # Must explicitly set container_name or add entries to /etc/hosts in other containers that
74 | # communicate with hdfs (client and vertica), otherwise Kerberos is unable to perform both
75 | # forward and reverse lookup
76 | container_name: hdfs
77 | hostname: hdfs
78 | domainname: example.com
79 | networks:
80 | default:
81 | aliases:
82 | - hdfs.example.com
83 | ports:
84 | - "22022:22"
85 | - "8020:8020"
86 | - "50010:50010"
87 | - "50020:50020"
88 | - "50070:50070"
89 | - "50071:50071"
90 | - "50075:50075"
91 | - "50076:50076"
92 | volumes:
93 | - ./hdfs-krb/docker-entrypoint.sh:/usr/local/bin/docker-entrypoint.sh
94 | - ./vertica-hdfs-config/hadoop-kerberized:/hadoop/conf
95 | - ./hdfs-krb:/hdfs-krb
96 | - ./keytabs:/keytabs
97 | env_file:
98 | - krb.env
99 |
100 | networks:
101 | default:
102 | name: "EXAMPLE.COM"
103 | driver: bridge
104 |
--------------------------------------------------------------------------------
/docker/hdfs-krb/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM openjdk:8
2 | MAINTAINER vertica
3 |
4 | ENV DEBIAN_FRONTEND noninteractive
5 |
6 | # Refresh package lists
7 | RUN apt-get update
8 | RUN apt-get -qy dist-upgrade
9 |
10 | RUN apt-get install -qy rsync curl openssh-server openssh-client vim nfs-common
11 |
12 | RUN mkdir -p /data/hdfs-nfs/
13 | RUN mkdir -p /opt
14 | WORKDIR /opt
15 |
16 | # Install Hadoop
17 | RUN curl -L https://dlcdn.apache.org/hadoop/common/hadoop-3.3.1/hadoop-3.3.1.tar.gz -s -o - | tar -xzf -
18 | RUN mv hadoop-3.3.1 hadoop
19 |
20 | # Setup
21 | WORKDIR /opt/hadoop
22 | ENV PATH /opt/hadoop/bin:/opt/hadoop/sbin:$PATH
23 | RUN echo $JAVA_HOME
24 | ENV JAVA_HOME /usr/local/openjdk-8
25 | RUN sed --in-place='.ori' -e "s/\${JAVA_HOME}/\/usr\/local\/openjdk-8/" etc/hadoop/hadoop-env.sh
26 |
27 | # Configure ssh client
28 | RUN ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa && \
29 | cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
30 | chmod 0600 ~/.ssh/authorized_keys
31 |
32 | RUN echo "\nHost *\n" >> ~/.ssh/config && \
33 | echo " StrictHostKeyChecking no\n" >> ~/.ssh/config && \
34 | echo " UserKnownHostsFile=/dev/null\n" >> ~/.ssh/config
35 |
36 | # Disable sshd authentication
37 | RUN echo "root:root" | chpasswd
38 | RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
39 | RUN sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config
40 |
41 | # SSH login fix. Otherwise user is kicked off after login
42 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
43 |
44 | # Pseudo-Distributed Operation
45 | RUN echo "export JAVA_HOME=/usr/local/openjdk-8" >> /opt/hadoop/etc/hadoop/hadoop-env.sh
46 | RUN hdfs namenode -format
47 |
48 | ENV HDFS_NAMENODE_USER root
49 | ENV HDFS_DATANODE_USER root
50 | ENV HDFS_SECONDARYNAMENODE_USER root
51 |
52 | # SSH
53 | EXPOSE 22
54 | # hdfs://localhost:8020
55 | EXPOSE 8020
56 | # HDFS namenode
57 | EXPOSE 50020
58 | # HDFS Web browser
59 | EXPOSE 50070
60 | # HDFS datanodes
61 | EXPOSE 50075
62 | # HDFS secondary namenode
63 | EXPOSE 50090
64 |
65 | ENTRYPOINT service ssh start \
66 | && start-dfs.sh \
67 | && hadoop-daemon.sh start portmap \
68 | && hadoop-daemon.sh start nfs3 \
69 | && bash || bash
70 |
--------------------------------------------------------------------------------
/docker/hdfs-krb/docker-entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | service ssh start
4 |
5 | # Start HDFS services
6 | rm -f /tmp/*.pid
7 | start-dfs.sh
8 | hadoop-daemon.sh start portmap
9 | hadoop-daemon.sh start nfs3
10 |
11 | # Configure Kerberos
12 | echo "[logging]
13 | default = FILE:/var/log/krb5libs.log
14 | kdc = FILE:/var/log/krb5kdc.log
15 | admin_server = FILE:/var/log/kadmind.log
16 | [libdefaults]
17 | default_realm = $REALM
18 | dns_lookup_realm = false
19 | dns_lookup_kdc = false
20 | ticket_lifetime = 24h
21 | forwardable = true
22 | [realms]
23 | $REALM = {
24 | kdc = $KDC
25 | admin_server = $KDC
26 | }
27 | [domain_realm]
28 | .example.com = $REALM
29 | example.com = $REALM" | tee /etc/krb5.conf
30 |
31 | cp /keytabs/hdfs.keytab /root/.keytab
32 |
33 | cp /hadoop/conf/core-site.xml /opt/hadoop/etc/hadoop/core-site.xml
34 | cp /hadoop/conf/hdfs-site.xml /opt/hadoop/etc/hadoop/hdfs-site.xml
35 | cp /hadoop/conf/ssl-server.xml /opt/hadoop/etc/hadoop/ssl-server.xml
36 | cp /hadoop/conf/keystore /root/.keystore
37 |
38 | export PATH=$PATH:/usr/bin
39 |
40 | rm /hadoop/conf/hdfs.cert
41 | keytool -delete -alias hdfs -keystore /root/.keystore -storepass password
42 | keytool -genkey -keyalg RSA -alias hdfs -keystore /root/.keystore -validity 500 -keysize 2048 -dname "CN=hdfs.example.com, OU=hdfs, O=hdfs, L=hdfs, S=hdfs, C=hdfs" -no-prompt -storepass password -keypass password
43 | echo "password" | keytool -export -alias hdfs -keystore /root/.keystore -rfc -file hdfs.cert
44 | cp hdfs.cert /hadoop/conf/
45 |
46 | # Restart HDFS service
47 | stop-dfs.sh
48 | start-dfs.sh
49 |
50 | echo "HDFS container is now running"
51 |
52 | exec "$@"
53 |
--------------------------------------------------------------------------------
/docker/hdfs/docker-entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | service ssh start
4 |
5 | # Override HDFS config
6 | cp /hadoop/conf/*.xml /opt/hadoop/etc/hadoop
7 |
8 | # Start HDFS services
9 | rm -f /tmp/*.pid
10 | start-dfs.sh
11 | hadoop-daemon.sh start portmap
12 | hadoop-daemon.sh start nfs3
13 |
14 | # Copy test data to HDFS
15 | while [ "$(hdfs dfsadmin -safemode get)" = "Safe mode is ON" ]; do sleep 1; done
16 | hadoop fs -copyFromLocal /partitioned /3.1.1
17 |
18 | echo "HDFS container is now running"
19 |
20 | exec "$@"
21 |
--------------------------------------------------------------------------------
/docker/kdc/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM centos:8
2 |
3 | RUN (cd /lib/systemd/system/sysinit.target.wants/; \
4 | for i in *; do [ $i == systemd-tmpfiles-setup.service ] || rm -f $i; done); \
5 | rm -f /lib/systemd/system/multi-user.target.wants/*;\
6 | rm -f /etc/systemd/system/*.wants/*;\
7 | rm -f /lib/systemd/system/local-fs.target.wants/*; \
8 | rm -f /lib/systemd/system/sockets.target.wants/*udev*; \
9 | rm -f /lib/systemd/system/sockets.target.wants/*initctl*; \
10 | rm -f /lib/systemd/system/basic.target.wants/*;\
11 | rm -f /lib/systemd/system/anaconda.target.wants/* && \
12 | sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && \
13 | sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* && \
14 | yum update -y && \
15 | yum install python2 wget -y && \
16 | wget https://raw.githubusercontent.com/gdraheim/docker-systemctl-replacement/master/files/docker/systemctl.py -O /usr/local/bin/systemctl && \
17 | chmod a+x /usr/local/bin/systemctl && \
18 | yum -y install initscripts && yum clean all && \
19 | yum install krb5-server krb5-libs krb5-workstation -y
20 |
21 | EXPOSE 88
22 |
23 | CMD ["/usr/sbin/init"]
24 |
--------------------------------------------------------------------------------
/docker/kdc/docker-entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | echo "[logging]
4 | default = FILE:/var/log/krb5libs.log
5 | kdc = FILE:/var/log/krb5kdc.log
6 | admin_server = FILE:/var/log/kadmind.log
7 | [libdefaults]
8 | default_realm = $REALM
9 | dns_lookup_realm = false
10 | dns_lookup_kdc = false
11 | ticket_lifetime = 24h
12 | renew_lifetime = 7d
13 | forwardable = true
14 | [realms]
15 | $REALM = {
16 | kdc = localhost
17 | admin_server = localhost
18 | }
19 | [domain_realm]
20 | .example.com = $REALM
21 | example.com = $REALM" | tee /etc/krb5.conf
22 |
23 | kdb5_util -P 'admin' create
24 |
25 | systemctl start kadmin.service
26 | systemctl start krb5kdc.service
27 | chkconfig krb5kdc on
28 | chkconfig kadmin on
29 |
30 | # Create admin
31 | $KADMIN -q "addprinc -pw admin admin/admin"
32 | echo "*/admin@$REALM *" | tee -a /var/kerberos/krb5kdc/kadm5.acl
33 |
34 | # Add user principals
35 | for u in ${USERS//,/ };do
36 | $KADMIN -q "addprinc -pw ${u} ${u}"
37 | done
38 |
39 | $KADMIN -q "addprinc -randkey $V_PRINC"
40 | $KADMIN -q "ktadd -norandkey -k vertica.keytab $V_PRINC"
41 | chmod 777 vertica.keytab
42 | cp vertica.keytab /keytabs
43 |
44 | $KADMIN -q "addprinc -randkey $HDFS_PRINC"
45 | $KADMIN -q "addprinc -randkey $HTTP_HDFS_PRINC"
46 | $KADMIN -q "ktadd -norandkey -k hdfs.keytab $HDFS_PRINC $HTTP_HDFS_PRINC"
47 | chmod 777 hdfs.keytab
48 | cp hdfs.keytab /keytabs
49 |
50 | exec "$@"
51 |
--------------------------------------------------------------------------------
/docker/keytabs/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/docker/keytabs/.gitkeep
--------------------------------------------------------------------------------
/docker/krb.env:
--------------------------------------------------------------------------------
1 | V_PRINC=vertica/vertica.example.com@EXAMPLE.COM
2 | KDC=kdc
3 | KHOST=vertica.example.com
4 | REALM=EXAMPLE.COM
5 | KTAB=/vertica.keytab
6 | DBNAME=docker
7 |
8 | SERVICE_NAME=vertica
9 | USERS=user1,user2,user3
10 | KADMIN=kadmin.local
11 |
12 | HDFS_PRINC=root/hdfs.example.com@EXAMPLE.COM
13 | HTTP_HDFS_PRINC=HTTP/hdfs.example.com@EXAMPLE.COM
14 |
--------------------------------------------------------------------------------
/docker/vertica-hdfs-config/hadoop-kerberized/core-site.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |   <property>
3 |     <name>fs.defaultFS</name>
4 |     <value>hdfs://hdfs.example.com:8020</value>
5 |   </property>
6 |   <property>
7 |     <name>hadoop.security.authentication</name>
8 |     <value>kerberos</value>
9 |   </property>
10 |   <property>
11 |     <name>hadoop.security.authorization</name>
12 |     <value>true</value>
13 |   </property>
14 |   <property>
15 |     <name>hadoop.security.auth_to_local</name>
16 |     <value>
17 |       RULE:[2:$1/$2@$0](.*/.*@EXAMPLE.COM)s/.*/root/
18 |       DEFAULT
19 |     </value>
20 |   </property>
21 |   <property>
22 |     <name>hadoop.rpc.protection</name>
23 |     <value>authentication</value>
24 |   </property>
25 |   <property>
26 |     <name>hadoop.proxyuser.root.groups</name>
27 |     <value>*</value>
28 |   </property>
29 |   <property>
30 |     <name>hadoop.proxyuser.root.hosts</name>
31 |     <value>*</value>
32 |   </property>
33 |   <property>
34 |     <name>hadoop.proxyuser.superuser.hosts</name>
35 |     <value>*</value>
36 |   </property>
37 |   <property>
38 |     <name>hadoop.proxyuser.superuser.groups</name>
39 |     <value>*</value>
40 |   </property>
41 |   <property>
42 |     <name>hadoop.http.authentication.type</name>
43 |     <value>kerberos</value>
44 |   </property>
45 |   <property>
46 |     <name>hadoop.http.authentication.kerberos.keytab</name>
47 |     <value>/root/.keytab</value>
48 |   </property>
49 |   <property>
50 |     <name>hadoop.http.authentication.kerberos.principal</name>
51 |     <value>HTTP/hdfs.example.com@EXAMPLE.COM</value>
52 |   </property>
53 | </configuration>
54 |
--------------------------------------------------------------------------------
/docker/vertica-hdfs-config/hadoop-kerberized/keystore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/docker/vertica-hdfs-config/hadoop-kerberized/keystore
--------------------------------------------------------------------------------
/docker/vertica-hdfs-config/hadoop-kerberized/ssl-client.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |   <property>
3 |     <name>ssl.client.truststore.location</name>
4 |     <value>/cacerts.jks</value>
5 |   </property>
6 |   <property>
7 |     <name>ssl.client.truststore.password</name>
8 |     <value>password</value>
9 |   </property>
10 | </configuration>
11 |
--------------------------------------------------------------------------------
/docker/vertica-hdfs-config/hadoop-kerberized/ssl-server.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |   <property>
3 |     <name>ssl.server.keystore.keypassword</name>
4 |     <value>password</value>
5 |   </property>
6 |   <property>
7 |     <name>ssl.server.keystore.password</name>
8 |     <value>password</value>
9 |   </property>
10 |   <property>
11 |     <name>ssl.server.keystore.location</name>
12 |     <value>/root/.keystore</value>
13 |   </property>
14 | </configuration>
15 |
--------------------------------------------------------------------------------
/docker/vertica-hdfs-config/hadoop/core-site.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |   <property>
3 |     <name>fs.defaultFS</name>
4 |     <value>hdfs://hdfs:8020</value>
5 |   </property>
6 |   <property>
7 |     <name>dfs.client.use.datanode.hostname</name>
8 |     <value>true</value>
9 |   </property>
10 |   <property>
11 |     <name>dfs.datanode.use.datanode.hostname</name>
12 |     <value>true</value>
13 |   </property>
14 |   <property>
15 |     <name>hadoop.proxyuser.root.groups</name>
16 |     <value>*</value>
17 |   </property>
18 |   <property>
19 |     <name>hadoop.proxyuser.root.hosts</name>
20 |     <value>*</value>
21 |   </property>
22 | </configuration>
23 |
--------------------------------------------------------------------------------
/docker/vertica/docker-entrypoint-legacy.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # This file has been modified for use by the Spark Connector
4 | # See original entrypoint script at https://github.com/vertica/vertica-kubernetes/blob/main/docker-vertica/docker-entrypoint.sh
5 |
6 | set -e
7 |
8 | start_cron(){
9 | # daemonizes, no need for &
10 | sudo /usr/sbin/crond
11 | }
12 |
13 | # Kubernetes start-up is a little weird
14 | # - in order to configure the host-list correctly, k8s
15 | # has to do an install_vertica, which writes to
16 | # non-volatile store
17 | # - but the agent needs things that will be created by
18 | # that install.
19 | # - so we don't start the agent until we find the database running
20 | start_agent_when_ready(){
21 | agent_started=No
22 | while [ $agent_started == No ]; do
23 | if [ -f /opt/vertica/config/admintools.conf ]; then
24 | # safe to try to run admintools
25 | db=$(/opt/vertica/bin/admintools -t show_active_db) || true
26 | case "$db"x in
27 | x)
28 | sleep 15
29 | ;;
30 | *)
31 | echo "Starting vertica agent for db $db"
32 | sudo /opt/vertica/sbin/vertica_agent start \
33 | 2> /tmp/agent_start.err \
34 | 1> /tmp/agent_start.out
35 | echo "Agent started"
36 | agent_started=Yes
37 | ;;
38 | esac
39 | else
40 | sleep 15
41 | fi
42 | done
43 | }
44 |
45 | restartNode(){
46 | if [ ! -f /opt/vertica/config/admintools.conf ]
47 | then
48 | echo "Vertica is not installed, expect manual user intervention for install.";
49 | sudo /usr/sbin/sshd -D
50 | # If we get here, fail so that the container is forced to restart:
51 | exit 1
52 | fi
53 | # restart local Vertica node
54 | echo "Restart local node"
55 | /opt/vertica/sbin/python3 /opt/vertica/bin/re-ip-node.py --restart-node
56 | sudo /usr/sbin/sshd -D
57 | }
58 |
59 | reIpNode(){
60 | if [ ! -d /opt/vertica/config/licensing ] || [ -z $(ls -A /opt/vertica/config/licensing/*) ]
61 | then
62 | echo "Installing license..."
63 | mkdir -p /opt/vertica/config/licensing
64 | cp -r /home/dbadmin/licensing/ce/* /opt/vertica/config/licensing
65 | fi
66 | echo "Update IP address on local node"
67 | /opt/vertica/sbin/python3 /opt/vertica/bin/re-ip-node.py --re-ip-node
68 | exit $?
69 | }
70 |
71 | defaultEntrypoint(){
72 | echo "Vertica container is now running"
73 | sudo /usr/sbin/sshd -D
74 | }
75 |
76 | start_cron
77 | start_agent_when_ready &
78 |
79 | # Create database
80 | /opt/vertica/bin/admintools -t list_db --database=docker || \
81 | /opt/vertica/bin/admintools -t create_db --database="${VERTICA_DATABASE:-docker}" --password="${VERTICA_PASSWORD}" --hosts=localhost
82 |
83 | # Start database
84 | if [ "$(/opt/vertica/bin/admintools -t db_status --status=DOWN)" == "${VERTICA_DATABASE:-docker}" ]; then
85 | /opt/vertica/bin/admintools -t start_db --database="${VERTICA_DATABASE:-docker}" --password="${VERTICA_PASSWORD}" --hosts=localhost
86 | fi
87 |
88 | # Configure database
89 | /opt/vertica/bin/vsql -c "ALTER DATABASE docker SET MaxClientSessions=100;"
90 |
91 | case $# in
92 | 1)
93 | case $1 in
94 | restart-vertica-node)
95 | restartNode
96 | ;;
97 | re-ip-vertica-node)
98 | reIpNode
99 | ;;
100 | *)
101 | echo "Invalid argument: $1"
102 | exit 1
103 | ;;
104 | esac
105 | ;;
106 | *)
107 | defaultEntrypoint
108 | ;;
109 | esac
110 |
--------------------------------------------------------------------------------
/docker/vertica/docker-entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # This file has been modified for use by the Spark Connector
4 | # See original entrypoint script at https://github.com/vertica/vertica-kubernetes/blob/main/docker-vertica/docker-entrypoint.sh
5 |
6 | set -e
7 |
8 | start_cron(){
9 | # daemonizes, no need for &
10 | sudo /usr/sbin/cron
11 | }
12 |
13 | # We copy back the files normally stored in /opt/vertica/config/. We do this
14 | # because we have a Persistent Volume that backs /opt/vertica/config, so
15 | # it starts up empty and must be populated
16 | copy_config_files() {
17 | mkdir -p /opt/vertica/config/licensing
18 |
19 | mv /home/dbadmin/logrotate/* /opt/vertica/config/ 2>/dev/null || true
20 |
21 | cp -r /home/dbadmin/licensing/ce/* /opt/vertica/config/licensing 2>/dev/null || true
22 | chmod -R ugo+r,u+rw /opt/vertica/config/licensing
23 | }
24 |
25 | # Ensure all PV paths are owned by dbadmin. This is done for some PVs that
26 | # start with restrictive ownership.
27 | ensure_path_is_owned_by_dbadmin() {
28 | # -z is needed in case the input arg is empty
29 | [ -z "$1" ] || [ "$(stat -c "%U" "$1")" == "dbadmin" ] || sudo chown -R dbadmin:verticadba "$1"
30 | }
31 |
32 | start_cron
33 | ensure_path_is_owned_by_dbadmin /opt/vertica/config
34 | ensure_path_is_owned_by_dbadmin /opt/vertica/log
35 | ensure_path_is_owned_by_dbadmin $DATA_PATH
36 | ensure_path_is_owned_by_dbadmin $DEPOT_PATH
37 | copy_config_files
38 |
39 | # Create database
40 | /opt/vertica/bin/admintools -t list_db --database=docker || \
41 | /opt/vertica/bin/admintools -t create_db --database="${VERTICA_DATABASE:-docker}" --password="${VERTICA_PASSWORD}" --hosts=localhost
42 |
43 | # Start database
44 | if [ "$(/opt/vertica/bin/admintools -t db_status --status=DOWN)" == "${VERTICA_DATABASE:-docker}" ]; then
45 | /opt/vertica/bin/admintools -t start_db --database="${VERTICA_DATABASE:-docker}" --password="${VERTICA_PASSWORD}" --hosts=localhost
46 | fi
47 |
48 | # Configure database
49 | /opt/vertica/bin/vsql -c "ALTER DATABASE docker SET MaxClientSessions=100;"
50 |
51 | echo "Vertica container is now running"
52 |
53 | sudo ssh-keygen -q -A
54 | sudo /usr/sbin/sshd -D
55 |
--------------------------------------------------------------------------------
/docs/gcs-guide.md:
--------------------------------------------------------------------------------
1 | # Google Cloud Storage User Guide
2 |
3 | Since Vertica can be deployed on Google Cloud Platform, it is possible for the Spark Connector to make use of Google Cloud Storage as the intermediary storage.
4 |
5 | * **Running on DataProc clusters:** If your Spark cluster is deployed on GCP, you will need to obtain an HMAC interoperability key, then configure the connector options `gcs_hmac_key_id` and `gcs_hmac_key_secret`. Instructions for obtaining the key can be found [here](https://cloud.google.com/storage/docs/authentication/managing-hmackeys#create).
6 | * **Running outside of DataProc clusters:** In addition to configuring the HMAC key above, you will also need to obtain a GCS service account key in the form of a JSON service keyfile. Instructions on obtaining one can be found [here](https://cloud.google.com/storage/docs/authentication#generating-a-private-key).
7 |
8 | Then, specify the connector option `gcs_service_keyfile` with the path to your keyfile JSON. Alternatively, the connector can pick up the path from the environment variable `GOOGLE_APPLICATION_CREDENTIALS` or from the Spark configuration option `fs.gs.auth.service.account.json.keyfile`.
9 |
10 | Finally, ensure that you include the [Google Hadoop Connector](https://mvnrepository.com/artifact/com.google.cloud.bigdataoss/gcs-connector) dependency in your project. Make sure you select the appropriate connector distribution for your Hadoop version.
11 |
12 | With the credentials specified, you can now configure the connector option `staging_fs_url` to use a GCS path such as `gs://<bucket>/path/to/data`.
13 |
14 | As an alternative to specifying the keyfile path, you can set the following connector options:
15 | ```
16 | gcs_service_key_id =
17 | gcs_service_key =
18 | gcs_service_email =
19 | ```
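
Putting this together, here is a minimal read sketch. It is hypothetical: the host, database, table, bucket, and credential sources are placeholders to replace with your own, and only one of the credential styles above is needed.

```scala
import org.apache.spark.sql.SparkSession

// A minimal sketch, assuming placeholder connection details and a bucket you own.
val spark = SparkSession.builder()
  .appName("Vertica GCS read example")
  .getOrCreate()

val opts = Map(
  "host" -> "vertica.example.com",
  "user" -> "dbadmin",
  "password" -> "",
  "db" -> "docker",
  "table" -> "my_table",
  // GCS staging area and credentials, using the options described above
  "staging_fs_url" -> "gs://my-bucket/spark-staging/",
  "gcs_hmac_key_id" -> sys.env("GCS_HMAC_KEY_ID"),
  "gcs_hmac_key_secret" -> sys.env("GCS_HMAC_KEY_SECRET"),
  "gcs_service_keyfile" -> "/path/to/keyfile.json"
)

val df = spark.read
  .format("com.vertica.spark.datasource.VerticaSource")
  .options(opts)
  .load()

df.show()
```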
20 |
21 | ## Additional Resources
22 |
23 | * [Google Hadoop Connector GitHub](https://github.com/GoogleCloudDataproc/hadoop-connectors)
24 | * [Using Google Hadoop Connector](https://cloud.google.com/dataproc/docs/concepts/connectors/cloud-storage)
25 |
--------------------------------------------------------------------------------
/docs/hdfs-guide.md:
--------------------------------------------------------------------------------
1 | # Setting up a single-node HDFS and using it with the Vertica Spark Connector
2 |
3 | Here, we'll give some instructions for a simple one-node cluster setup on a Linux environment.
4 |
5 | ## 1. Download Hadoop
6 |
7 | Navigate to the desired install location and download Hadoop. You can replace the version number with the version of your choice:
8 |
9 | ```shell
10 | wget https://httpd-mirror.sergal.org/apache/hadoop/common/hadoop-2.9.2/hadoop-2.9.2.tar.gz
11 | ```
12 |
13 | ## 2. Unzip and Change Permissions
14 |
15 | Replace `/hadoop` below with your desired Hadoop install location.
16 |
17 | ```shell
18 | mkdir /hadoop
19 | sudo tar -zxvf hadoop-2.9.2.tar.gz -C /hadoop
20 | cd /hadoop
21 | sudo chmod 750 hadoop-2.9.2
22 | ```
23 |
24 | ## 3. Edit Hadoop Configuration
25 |
26 | Edit etc/hadoop/hadoop-env.sh, setting the HADOOP_CONF_DIR variable to your Hadoop configuration directory. If necessary, you can also set the JAVA_HOME variable here:
27 |
28 | ```shell
29 | export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"//hadoop/hadoop-2.9.2/etc/hadoop"}
30 | export JAVA_HOME=...
31 | ```
32 |
33 | Edit etc/hadoop/core-site.xml with the following configuration (fill in your directory):
34 |
35 | ```xml
36 | <configuration>
37 |   <property>
38 |     <name>fs.defaultFS</name>
39 |     <value>hdfs://localhost:8020</value>
40 |   </property>
41 |   <property>
42 |     <name>hadoop.tmp.dir</name>
43 |     <value>//hadoop/hadooptmpdata</value>
44 |   </property>
45 | </configuration>
46 | ```
47 |
48 | and etc/hadoop/hdfs-site.xml with the following configuration (fill in your directory):
49 |
50 | ```xml
51 | <configuration>
52 |   <property>
53 |     <name>dfs.replication</name>
54 |     <value>1</value>
55 |   </property>
56 |   <property>
57 |     <name>dfs.name.dir</name>
58 |     <value>file:///hadoop/hdfs/namenode</value>
59 |   </property>
60 |   <property>
61 |     <name>dfs.data.dir</name>
62 |     <value>file:///hadoop/hdfs/datanode</value>
63 |   </property>
64 |   <property>
65 |     <name>dfs.webhdfs.enabled</name>
66 |     <value>true</value>
67 |   </property>
68 | </configuration>
69 | ```
70 |
71 | Finally, set the HADOOP_HOME variable in your .bashrc (of whichever user is running hadoop):
72 |
73 | ```shell
74 | export HADOOP_HOME=/hadoop/hadoop-2.9.2
75 | ```
76 |
77 | ## 4. Create directories
78 |
79 | Create the directories referenced above:
80 |
81 | ```shell
82 | cd //hadoop/
83 | mkdir hdfs
84 | mkdir hadooptmpdata
85 | ```
86 |
87 | ## 5. Set up passwordless ssh to localhost:
88 |
89 | ```shell
90 | cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
91 | ```
92 |
93 | and check that this worked:
94 |
95 | ```shell
96 | ssh localhost
97 | ```
98 |
99 | ## 6. Format HDFS:
100 |
101 | ```shell
102 | bin/hdfs namenode -format
103 | ```
104 |
105 | ## 7. Start HDFS
106 |
107 | ```shell
108 | cd /scratch_b//hadoop/hadoop-2.9.2
109 | sbin/start-dfs.sh
110 | ```
111 |
112 | ## 8. Get Vertica to Work with HDFS
113 |
114 | Each Vertica node needs access to a copy of the HDFS configuration. If Vertica and HDFS are on separate machines, you can use a command such as rsync to copy the configuration over. This must be done for each Vertica node.
115 |
116 | ```shell
117 | rsync -R --progress /hadoop/hadoop-2.9.2/etc/hadoop/hdfs-site.xml arehnby@eng-g9-158:/etc/hadoop/conf/
118 | rsync -R --progress /hadoop/hadoop-2.9.2/etc/hadoop/core-site.xml arehnby@eng-g9-158:/etc/hadoop/conf/
119 | ```
120 |
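With HDFS running and the configuration visible to Vertica, the connector can use HDFS as its staging area. Below is a minimal, hypothetical write sketch; the connection details, table name, and staging path are placeholders, and it assumes the single-node HDFS above is reachable from both Spark and Vertica.

```scala
import org.apache.spark.sql.{SaveMode, SparkSession}

// A minimal sketch, assuming placeholder connection details.
val spark = SparkSession.builder()
  .appName("Vertica HDFS write example")
  .getOrCreate()

import spark.implicits._

val opts = Map(
  "host" -> "vertica.example.com",
  "user" -> "dbadmin",
  "password" -> "",
  "db" -> "docker",
  "table" -> "my_table",
  // The HDFS set up above, used as the intermediary staging area
  "staging_fs_url" -> "hdfs://localhost:8020/spark-staging/"
)

Seq((1, "a"), (2, "b")).toDF("id", "value")
  .write
  .format("com.vertica.spark.datasource.VerticaSource")
  .options(opts)
  .mode(SaveMode.Overwrite)
  .save()
```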
--------------------------------------------------------------------------------
/docs/s3-guide.md:
--------------------------------------------------------------------------------
1 | # S3 User Guide
2 |
3 | Apache Hadoop provides an [AWS connector](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) that allows the Hadoop file system to access S3, and in turn allows the Spark Connector to use S3 as its staging area.
4 |
5 | ## Required Dependencies
6 |
7 | What you will need:
8 | - Spark 3.x
9 | - An appropriate `hadoop-aws` version for your Hadoop install. Note that Spark is distributed either bundled with Hadoop or stand-alone (with user-provided Hadoop).
10 | - Importantly, the hadoop-aws version must exactly match your Hadoop install.
11 | - For example, for an sbt project using Hadoop 3.3.0, add to your `build.sbt`:
12 | `libraryDependencies += "org.apache.hadoop" % "hadoop-aws" % "3.3.0"`
13 | - An S3 bucket configured to use either A) access key ID + secret access key or B) IAM roles for authentication
14 |
15 | Some features may work with older versions of hadoop-aws, but we currently only test against the hadoop-aws version compatible with the latest Spark 3.
16 |
17 | ## Spark with User-Provided Hadoop
18 |
19 | The following example sets up a **user-provided Apache Hadoop**. To download Spark, [go here](https://spark.apache.org/downloads.html). Be sure to select package type "Pre-built with user-provided Apache Hadoop".
20 |
21 | You can [download Hadoop 3.3 here](https://hadoop.apache.org/releases.html). Make sure to download the binary.
22 |
23 | ### Setting up Spark with Hadoop
24 | Note: All instructions here are for MacOS or Linux users.
25 |
26 | First, you will need to decompress the Spark tar file and Hadoop tar file:
27 | ```sh
28 | tar xvf spark-3.0.2-bin-without-hadoop.tgz
29 | tar xvf hadoop-3.3.0.tar.gz
30 | ```
31 |
32 | Move the resulting folder to /opt/spark/:
33 | `mv spark-3.0.2-bin-without-hadoop/ /opt/spark`
34 |
35 | Go to the Spark configuration directory:
36 | `cd /opt/spark/conf`
37 |
38 | There should be a spark-env.sh.template file. You will want a real spark-env.sh file, so rename the template to spark-env.sh:
39 | `mv spark-env.sh.template spark-env.sh`
40 |
41 | Next, set the JAVA_HOME environment variable:
42 | `export JAVA_HOME=/usr/lib/jvm/jre-11-openjdk`
43 |
44 | Now, edit spark-env.sh and point SPARK_DIST_CLASSPATH to the Hadoop folder you extracted earlier. For example, if you extracted it to /myhadoop, you should add the following line:
45 | `export SPARK_DIST_CLASSPATH=$(/myhadoop/hadoop-3.3.0/bin/hadoop classpath)`
46 |
47 | See [Spark's documentation](http://spark.apache.org/docs/latest/hadoop-provided.html) for more information.
48 |
49 | Finally, set the SPARK_HOME environment variable:
50 | ```sh
51 | export SPARK_HOME=/opt/spark
52 | export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
53 | ```
54 |
55 | ### Example Application Using S3
56 |
57 | See [here](https://github.com/vertica/spark-connector/tree/main/examples) for an example of how to connect to an S3 bucket with the Spark Connector.
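
As a quick, hypothetical sketch of what such an application can look like (the bucket name, credentials, and connection details are placeholders), S3 credentials can be supplied through the standard Hadoop `fs.s3a.*` settings and the connector's staging area pointed at an `s3a://` path:

```scala
import org.apache.spark.sql.SparkSession

// A minimal sketch, assuming placeholder connection details and an existing bucket.
val spark = SparkSession.builder()
  .appName("Vertica S3 read example")
  .getOrCreate()

// Standard hadoop-aws (S3A) credential settings; IAM roles can be used instead.
val hadoopConf = spark.sparkContext.hadoopConfiguration
hadoopConf.set("fs.s3a.access.key", sys.env("AWS_ACCESS_KEY_ID"))
hadoopConf.set("fs.s3a.secret.key", sys.env("AWS_SECRET_ACCESS_KEY"))

val df = spark.read
  .format("com.vertica.spark.datasource.VerticaSource")
  .option("host", "vertica.example.com")
  .option("user", "dbadmin")
  .option("password", "")
  .option("db", "docker")
  .option("table", "my_table")
  .option("staging_fs_url", "s3a://my-bucket/spark-staging/")
  .load()

df.show()
```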
58 |
59 | ## Troubleshooting
60 |
61 | If you see this error:
62 | `java.lang.NoClassDefFoundError: org/apache/hadoop/fs/StreamCapabilities`
63 | it is likely because you are not using Spark with Hadoop 3.3.0 and hadoop-aws 3.3.0.
64 |
--------------------------------------------------------------------------------
/docs/tls-guide.md:
--------------------------------------------------------------------------------
1 | # Configuring TLS with the Connector
2 |
3 | In order to use TLS with the connector, you will need to set up Vertica as a TLS server and the host running the application that uses the connector as a TLS client.
4 |
5 | The following two sections are meant to be followed in order, with the client configuration following the Vertica configuration. Please note that this guide only uses a self-signed certificate.
6 |
7 | ## Setting up Vertica as a TLS server
8 |
9 | Simply follow the instructions [here](https://www.vertica.com/kb/Using-SSL-Server-Authentication-with-Vertica-Validating-Your-SSL/Content/BestPractices/Using-SSL-Server-Authentication-with-Vertica-Validating-Your-SSL.htm).
10 |
11 | ## Setting up the client machine as a TLS client
12 |
13 | Copy the server.crt certificate created on the Vertica server to the client machine.
14 |
15 | Run the following command on the client machine:
16 | `keytool -keystore truststore.jks -alias bmc -import -file server.crt`
17 |
18 | Note: `keytool` is included as part of the Java runtime. If you do not have it, then you may need to install Java first.
19 |
20 | This will create the truststore file on the client side, prompt you to create a new password for it, and import the server.crt self-signed certificate into the truststore.
21 |
22 | Set the `tls_mode`, `trust_store_path`, and `trust_store_password` properties in the connector options:
23 | ```
24 | "tls_mode" -> "disable"
25 | "trust_store_path" -> "/truststore.jks"
26 | "trust_store_password" -> "testpass"
27 | ```
28 |
29 | Here, the absolute path `/truststore.jks` was used. Set this path to wherever you created your truststore.jks file. You will also need to set `trust_store_password` to the password you set on your truststore.jks file.
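
For illustration, here is a minimal read sketch combining these TLS options with placeholder connection details; the host, database, table, and staging path are assumptions, and the `tls_mode` value shown is just one of the modes the connector accepts.

```scala
import org.apache.spark.sql.SparkSession

// A minimal sketch, assuming placeholder connection details and the truststore created above.
val spark = SparkSession.builder()
  .appName("Vertica TLS read example")
  .getOrCreate()

val df = spark.read
  .format("com.vertica.spark.datasource.VerticaSource")
  .option("host", "vertica.example.com")
  .option("user", "dbadmin")
  .option("password", "")
  .option("db", "docker")
  .option("table", "my_table")
  .option("staging_fs_url", "hdfs://hdfs.example.com:8020/spark-staging/")
  // TLS settings described above
  .option("tls_mode", "require")
  .option("trust_store_path", "/truststore.jks")
  .option("trust_store_password", "testpass")
  .load()

df.show()
```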
30 |
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | # Examples
2 |
3 | These examples are intended to be run either on our provided Docker environment or on your own cluster.
4 |
5 | If you want to try these examples on our Docker environment, then:
6 | 1. Install sbt on your local machine with JDK 11
7 | 2. Clone the project if you haven't already:
8 | ```sh
9 | git clone https://github.com/vertica/spark-connector.git
10 | ```
11 | 3. Start the appropriate configuration:
12 | ```sh
13 | cd spark-connector/docker
14 | docker-compose up -d
15 | # or, for Kerberos
16 | docker-compose -f docker-compose-kerberos.yml up -d
17 | ```
18 | 4. Get a shell to the client container:
19 | ```sh
20 | docker exec -it docker-client-1 bash
21 | # or, for Kerberos
22 | docker exec -it client bash
23 | ```
24 |
25 | Once in the container, navigate to the examples folder using `cd /spark-connector/examples`.
26 |
27 | You can find more information about our docker environment [here](/docker/README.md).
28 |
29 | ### Troubleshooting
30 |
31 | If you are using the thin JAR and running into an error similar to the following:
32 | `java.lang.NoSuchMethodError: 'void cats.kernel.CommutativeSemigroup.$init$(cats.kernel.CommutativeSemigroup)'`, you may need to shade the cats dependency in your project.
33 |
34 | This can be done by adding the following to your build.sbt file:
35 |
36 | ```
37 | assembly / assemblyShadeRules := {
38 | val shadePackage = "com.azavea.shaded.demo"
39 | Seq(
40 | ShadeRule.rename("cats.kernel.**" -> s"$shadePackage.cats.kernel.@1").inAll
41 | )
42 | }
43 | ```
44 |
45 | ### Tear down containers
46 |
47 | To shut down and remove the containers safely:
48 | ```sh
49 | cd spark-connector/docker
50 | docker-compose down
51 | # or, for Kerberos
52 | docker-compose -f docker-compose-kerberos.yml down
53 | ```
54 |
--------------------------------------------------------------------------------
/examples/jupyter/README.md:
--------------------------------------------------------------------------------
1 | # Jupyter Notebook Examples
2 |
3 | ## Creating the Jupyter Notebook Docker Container
4 |
5 | In order to run these examples, the Jupyter container must be created and started. To do that, start the Docker containers with the "jupyter" profile:
6 | ```sh
7 | cd spark-connector/docker
8 | docker-compose --profile jupyter up -d
9 | ```
10 |
11 | An important thing to note is that the Spark and Python versions used by Spark (master and worker nodes) and by Jupyter Notebook must match, otherwise jobs will not run. Our Docker environment keeps the Python and Spark versions of these images in sync.
12 |
13 | For more information see the [Docker README](/docker/README.md).
14 |
15 | ## Running a Notebook
16 |
17 | 1. Go to http://localhost:8888/ and login with the token "test"
18 | 2. Under the File Browser on the left, navigate to the work folder and open the desired example Jupyter Notebook
19 | 3. Execute the cells, in order, using the Run button or by pressing `Shift+Enter`
20 |
21 | ## Examples
22 |
23 | ### Basic Read & Write
24 |
25 | A simple read and write that uses a two column schema of a string and an integer.
26 |
27 | ### Complex Array
28 |
29 | A Spark job that writes a regular array, nested array, and an array representative of a hash map.
30 |
31 | ### Linear Regression
32 |
33 | A Machine Learning example that utilizes Spark's Linear Regression algorithm. This job also reads a .csv file and imports it into Vertica.
34 |
35 | Each Notebook Example is annotated and written in a way to walk the user step-by-step through a Spark job to Vertica.
36 |
37 | ## ARM Limitations
38 |
39 | Due to the limited availability of aarch64 Docker images at this time, running these examples on an ARM-based machine may result in performance issues or connection failures between containers.
40 |
41 | ## General Notebook Configuration
42 |
43 | Jupyter must be able to communicate with Spark, Hadoop, Vertica, etc, so it must be on the same Docker network. Our Docker environment configures this for you.
44 |
45 | The Spark Connector JAR must also be available in order to load the JAR and send it to Spark. The entire Spark Connector repo is mounted in the Docker container, including the directory containing the Spark Connector JAR (if you build it yourself). Otherwise you must download the JAR from [Maven](https://mvnrepository.com/artifact/com.vertica.spark/vertica-spark) and reference the location in your environment.
46 |
47 | A new Spark session must be created, pointing to the Spark master as well as loading the Spark Connector JAR. For example:
48 | ```py
49 | from pyspark.sql import SparkSession
50 |
51 | spark = (SparkSession.builder
52 | .config("spark.master", "spark://spark:7077")
53 | .config("spark.driver.memory", "2G")
54 | .config("spark.executor.memory", "1G")
55 | .config("spark.jars", "/spark-connector/connector/target/scala-2.12/spark-vertica-connector-assembly-<version>.jar")
56 | .getOrCreate())
57 | sc = spark.sparkContext
58 | ```
59 |
60 | Once that is complete the Spark context may be used to read and write data using the Vertica Spark Connector data source ("com.vertica.spark.datasource.VerticaSource"). See the example Jupyter Notebooks in this folder.
61 |
62 | Note that Jupyter Notebook previously bundled the Spylon kernel so that Scala could be used, but that kernel has not been maintained and is no longer included in Jupyter Notebook by default. As a result it is recommended to use the Python kernel in Jupyter Notebook.
63 |
--------------------------------------------------------------------------------
/examples/jupyter/data/faithful_testing.csv:
--------------------------------------------------------------------------------
1 | "id","eruptions","waiting"
2 | "4",2.283,62
3 | "5",4.533,85
4 | "8",3.6,85
5 | "9",1.95,51
6 | "11",1.833,54
7 | "12",3.917,84
8 | "14",1.75,47
9 | "20",4.25,79
10 | "22",1.75,47
11 | "23",3.45,78
12 | "24",3.067,69
13 | "26",3.6,83
14 | "30",4.433,79
15 | "31",4.3,73
16 | "35",3.833,74
17 | "38",4.833,80
18 | "42",1.883,58
19 | "44",1.75,58
20 | "47",3.833,64
21 | "49",4.633,82
22 | "53",1.833,54
23 | "55",1.733,54
24 | "56",4.883,83
25 | "58",1.667,64
26 | "59",4.567,77
27 | "61",2.233,59
28 | "63",1.75,48
29 | "64",4.8,82
30 | "66",4.4,92
31 | "68",4.7,78
32 | "69",2.067,65
33 | "71",4.033,82
34 | "75",1.983,62
35 | "78",4.567,78
36 | "79",3.883,76
37 | "82",4.333,82
38 | "83",4.1,70
39 | "85",4.067,73
40 | "86",4.933,88
41 | "87",3.95,76
42 | "89",2.167,48
43 | "90",4,86
44 | "92",4.333,90
45 | "93",1.867,50
46 | "94",4.817,78
47 | "100",4.9,82
48 | "102",4.367,88
49 | "104",4.5,83
50 | "106",1.867,47
51 | "113",4.9,89
52 | "114",4.417,79
53 | "125",4.6,88
54 | "126",3.767,81
55 | "127",1.917,45
56 | "131",1.867,45
57 | "134",4.333,89
58 | "139",2.033,53
59 | "141",4.233,81
60 | "143",4.533,82
61 | "145",4.333,76
62 | "147",4.633,80
63 | "155",3.567,71
64 | "157",4.5,81
65 | "161",2.2,45
66 | "162",4.15,86
67 | "163",2,58
68 | "165",3.5,66
69 | "167",2.367,63
70 | "171",1.917,49
71 | "181",1.883,55
72 | "184",3.767,83
73 | "190",2.183,55
74 | "194",4.1,84
75 | "197",3.5,87
76 | "198",4.366,77
77 | "203",4.133,91
78 | "204",1.867,53
79 | "207",4.367,77
80 | "211",2.383,71
81 | "215",3.417,64
82 | "216",4.233,76
83 | "217",2.4,53
84 | "218",4.8,94
85 | "219",2,55
86 | "220",4.15,76
87 | "222",4.267,82
88 | "223",1.75,54
89 | "224",4.483,75
90 | "227",4.083,78
91 | "229",3.917,70
92 | "230",4.55,79
93 | "232",2.417,54
94 | "235",4.45,90
95 | "239",3.95,79
96 | "240",2.333,64
97 | "241",4.15,75
98 | "247",2.083,57
99 | "251",2.2,54
100 | "252",4.45,83
101 | "253",3.567,73
102 | "255",4.15,88
103 | "258",4.45,83
104 | "263",1.85,58
105 | "265",1.983,43
106 | "267",4.75,75
107 | "268",4.117,81
108 | "269",2.15,46
109 | "270",4.417,90
110 | "271",1.817,46
111 | "272",4.467,74
112 |
--------------------------------------------------------------------------------
/examples/jupyter/data/faithful_training.csv:
--------------------------------------------------------------------------------
1 | "id","eruptions","waiting"
2 | "1",3.6,79
3 | "2",1.8,54
4 | "3",3.333,74
5 | "6",2.883,55
6 | "7",4.7,88
7 | "10",4.35,85
8 | "13",4.2,78
9 | "15",4.7,83
10 | "16",2.167,52
11 | "17",1.75,62
12 | "18",4.8,84
13 | "19",1.6,52
14 | "21",1.8,51
15 | "25",4.533,74
16 | "27",1.967,55
17 | "28",4.083,76
18 | "29",3.85,78
19 | "32",4.467,77
20 | "33",3.367,66
21 | "34",4.033,80
22 | "36",2.017,52
23 | "37",1.867,48
24 | "39",1.833,59
25 | "40",4.783,90
26 | "41",4.35,80
27 | "43",4.567,84
28 | "45",4.533,73
29 | "46",3.317,83
30 | "48",2.1,53
31 | "50",2,59
32 | "51",4.8,75
33 | "52",4.716,90
34 | "54",4.833,80
35 | "57",3.717,71
36 | "60",4.317,81
37 | "62",4.5,84
38 | "65",1.817,60
39 | "67",4.167,78
40 | "70",4.7,73
41 | "72",1.967,56
42 | "73",4.5,79
43 | "74",4,71
44 | "76",5.067,76
45 | "77",2.017,60
46 | "80",3.6,83
47 | "81",4.133,75
48 | "84",2.633,65
49 | "88",4.517,80
50 | "91",2.2,60
51 | "95",1.833,63
52 | "96",4.3,72
53 | "97",4.667,84
54 | "98",3.75,75
55 | "99",1.867,51
56 | "101",2.483,62
57 | "103",2.1,49
58 | "105",4.05,81
59 | "107",4.7,84
60 | "108",1.783,52
61 | "109",4.85,86
62 | "110",3.683,81
63 | "111",4.733,75
64 | "112",2.3,59
65 | "115",1.7,59
66 | "116",4.633,81
67 | "117",2.317,50
68 | "118",4.6,85
69 | "119",1.817,59
70 | "120",4.417,87
71 | "121",2.617,53
72 | "122",4.067,69
73 | "123",4.25,77
74 | "124",1.967,56
75 | "128",4.5,82
76 | "129",2.267,55
77 | "130",4.65,90
78 | "132",4.167,83
79 | "133",2.8,56
80 | "135",1.833,46
81 | "136",4.383,82
82 | "137",1.883,51
83 | "138",4.933,86
84 | "140",3.733,79
85 | "142",2.233,60
86 | "144",4.817,77
87 | "146",1.983,59
88 | "148",2.017,49
89 | "149",5.1,96
90 | "150",1.8,53
91 | "151",5.033,77
92 | "152",4,77
93 | "153",2.4,65
94 | "154",4.6,81
95 | "156",4,70
96 | "158",4.083,93
97 | "159",1.8,53
98 | "160",3.967,89
99 | "164",3.833,78
100 | "166",4.583,76
101 | "168",5,88
102 | "169",1.933,52
103 | "170",4.617,93
104 | "172",2.083,57
105 | "173",4.583,77
106 | "174",3.333,68
107 | "175",4.167,81
108 | "176",4.333,81
109 | "177",4.5,73
110 | "178",2.417,50
111 | "179",4,85
112 | "180",4.167,74
113 | "182",4.583,77
114 | "183",4.25,83
115 | "185",2.033,51
116 | "186",4.433,78
117 | "187",4.083,84
118 | "188",1.833,46
119 | "189",4.417,83
120 | "191",4.8,81
121 | "192",1.833,57
122 | "193",4.8,76
123 | "195",3.966,77
124 | "196",4.233,81
125 | "199",2.25,51
126 | "200",4.667,78
127 | "201",2.1,60
128 | "202",4.35,82
129 | "205",4.6,78
130 | "206",1.783,46
131 | "208",3.85,84
132 | "209",1.933,49
133 | "210",4.5,83
134 | "212",4.7,80
135 | "213",1.867,49
136 | "214",3.833,75
137 | "221",1.867,50
138 | "225",4,78
139 | "226",4.117,79
140 | "228",4.267,78
141 | "231",4.083,70
142 | "233",4.183,86
143 | "234",2.217,50
144 | "236",1.883,54
145 | "237",1.85,54
146 | "238",4.283,77
147 | "242",2.35,47
148 | "243",4.933,86
149 | "244",2.9,63
150 | "245",4.583,85
151 | "246",3.833,82
152 | "248",4.367,82
153 | "249",2.133,67
154 | "250",4.35,74
155 | "254",4.5,73
156 | "256",3.817,80
157 | "257",3.917,71
158 | "259",2,56
159 | "260",4.283,79
160 | "261",4.767,78
161 | "262",4.533,84
162 | "264",4.25,83
163 | "266",2.25,60
164 |
--------------------------------------------------------------------------------
/examples/pyspark/README.md:
--------------------------------------------------------------------------------
1 | # Pyspark Example
2 |
3 | This example shows how to configure a PySpark application with our connector.
4 |
5 | In general, you would want to define the appropriate connector options in your application, then pass the connector's fat JAR to `spark-submit` through the `--jars` argument. For example:
6 | ```sh
7 | spark-submit --master local[*] --jars <path-to-connector-assembly-jar> example.py
8 | ```
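Inside the application itself, the connector options are what tell Spark how to reach Vertica. A minimal read sketch is shown below; the option values match the Docker environment used by `sparkapp.py` in this folder, so adjust them for your setup:
```py
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Vertica Connector Pyspark Example").getOrCreate()

# Connector options for the example Docker environment
opts = {
    "host": "vertica",
    "user": "dbadmin",
    "password": "",
    "db": "docker",
    "staging_fs_url": "webhdfs://hdfs:50070/data/",
    "table": "pysparktest",
}

# Read the table through the connector and show its contents
df = (spark.read
    .format("com.vertica.spark.datasource.VerticaSource")
    .options(**opts)
    .load())
df.show()
```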
9 |
10 | # How to Run the Example
11 |
12 | First, set up the docker environment as mentioned in [examples](/examples/README.md), then:
13 | 1. Download the Spark Connector "all" JAR from our [releases](https://github.com/vertica/spark-connector/releases) and place it into `/connector/target/scala-2.12/`. You can do this on your local machine, as this folder is mounted. Alternatively, you can build the JAR yourself by following the instructions [here](/CONTRIBUTING.md).
14 | 2. Assuming you are in the client container, run `cd /spark-connector/examples/pyspark` and then run the `./run-python-example.sh` script. This will submit the PySpark example to our [standalone cluster](http://localhost:8080).
15 | 3. To shut down, exit the container with `exit`. Then, on your local machine, navigate to `spark-connector/docker` and tear down the containers by running `docker-compose down`.
16 |
17 | # Other Connector Options
18 |
19 | For examples of other options, refer to our [Scala example](/examples/scala), which demonstrates how to configure the different connector options. While it is in a different language, the ideas are transferable: set the correct options, include our connector JAR, then run `spark-submit`.
20 |
--------------------------------------------------------------------------------
/examples/pyspark/run-python-example.sh:
--------------------------------------------------------------------------------
1 | CONNECTOR_VERSION=$(grep 'connector-version' ../../version.properties | cut -d'=' -f2)
2 | spark-submit --master spark://spark:7077 --jars ../../connector/target/scala-2.12/spark-vertica-connector-assembly-$CONNECTOR_VERSION.jar sparkapp.py
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/examples/pyspark/sparkapp.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkContext, SparkConf
2 | from pyspark.sql import SQLContext, SparkSession
3 | from pyspark import sql
4 |
5 | # Create the spark session
6 | spark = SparkSession \
7 | .builder \
8 | .appName("Vertica Connector Pyspark Example") \
9 | .getOrCreate()
10 | spark_context = spark.sparkContext
11 | sql_context = sql.SQLContext(spark_context)
12 |
13 | # The name of our connector for Spark to look up
14 | format = "com.vertica.spark.datasource.VerticaSource"
15 |
16 | # Set connector options based on our Docker setup
17 | host="vertica"
18 | user="dbadmin"
19 | password=""
20 | db="docker"
21 | staging_fs_url="webhdfs://hdfs:50070/data/"
22 | table="pysparktest"
23 |
24 | # Define data to write to Vertica
25 | columns = ["language","users_count"]
26 | data = [("Java", "20000"), ("Python", "100000"), ("Scala", "3000")]
27 | # Create an RDD from the data
28 | rdd = spark_context.parallelize(data)
29 | # Convert the RDD to a DataFrame
30 | df = rdd.toDF(columns)
31 | # Write the DataFrame to the Vertica table pysparktest
32 | df.write.mode('overwrite').save(
33 | # Spark format
34 | format=format,
35 | # Connector specific options
36 | host=host,
37 | user=user,
38 | password=password,
39 | db=db,
40 | staging_fs_url=staging_fs_url,
41 | table=table)
42 |
43 | # Read the data back into a Spark DataFrame
44 | readDf = spark.read.load(
45 | # Spark format
46 | format=format,
47 | # Connector specific options
48 | host=host,
49 | user=user,
50 | password=password,
51 | db=db,
52 | table=table,
53 | staging_fs_url=staging_fs_url)
54 |
55 | # Print the DataFrame contents
56 | readDf.show()
57 |
--------------------------------------------------------------------------------
/examples/scala/build.sbt:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 | import java.util.Properties
14 |
15 | // Retrieving the common property config containing the connector version number.
16 | val props = settingKey[Properties]("Connector version properties")
17 | props := {
18 | val prop = new Properties()
19 | IO.load(prop, new File("../../version.properties"))
20 | prop
21 | }
22 |
23 | scalaVersion := "2.13.16"
24 | name := "spark-vertica-connector-scala-examples"
25 | organization := "com.vertica"
26 | version := props.value.getProperty("connector-version")
27 |
28 | resolvers += "Artima Maven Repository" at "https://repo.artima.com/releases"
29 | resolvers += "jitpack" at "https://jitpack.io"
30 |
31 | libraryDependencies ++= Seq(
32 | "com.typesafe" % "config" % "1.4.1",
33 | "com.vertica.spark" % "vertica-spark" % s"${version.value}-slim",
34 | "org.apache.spark" %% "spark-core" % "3.5.5",
35 | "org.apache.spark" %% "spark-sql" % "3.5.5",
36 | "com.google.cloud.bigdataoss" % "gcs-connector" % "hadoop3-2.2.6",
37 | // This version needs to match the Hadoop version used by Spark
38 | "org.apache.hadoop" % "hadoop-aws" % "3.3.2"
39 | )
40 |
41 | assembly / assemblyJarName := s"vertica-spark-scala-examples.jar"
42 |
43 | assembly / assemblyMergeStrategy := {
44 | case PathList("META-INF", xs @ _*) => MergeStrategy.discard
45 | case x => MergeStrategy.first
46 | }
47 |
48 | assembly / assemblyShadeRules := Seq(
49 | ShadeRule.rename("cats.**" -> "shadeCats.@1").inAll
50 | )
--------------------------------------------------------------------------------
/examples/scala/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | resolvers += "Artima Maven Repository" at "https://repo.artima.com/releases"
2 |
3 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0")
--------------------------------------------------------------------------------
/examples/scala/src/main/resources/application.conf:
--------------------------------------------------------------------------------
1 | # Base configurations used by all examples
2 | examples {
3 | host="vertica"
4 | port=5433
5 | db="docker"
6 | user="dbadmin"
7 | password=""
8 | filepath="webhdfs://hdfs:50070/data/"
9 | }
10 |
11 | # Used by S3 related examples to override the base configurations
12 | s3 {
13 | filepath="s3a://test"
14 | aws_endpoint="minio:9000"
15 | aws_enable_ssl="false"
16 | aws_enable_path_style="true"
17 | aws_access_key_id="minioadmin"
18 | aws_secret_access_key="minioadmin"
19 | }
20 |
21 | # Used by kerberos related examples to override the base configurations
22 | kerberos {
23 | user="user1"
24 | filepath="hdfs://hdfs.example.com:8020"
25 | kerberos_service_name="vertica"
26 | kerberos_host_name="vertica.example.com"
27 | jaas_config_name="Client"
28 | }
29 |
30 |
--------------------------------------------------------------------------------
/examples/scala/src/main/scala/example/Main.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package example
15 |
16 | import example.PrintUtils._
17 | import example.examples.{BasicReadWriteExamples, ComplexTypeExamples, ConnectorOptionsExamples}
18 | import org.apache.spark.sql.SparkSession
19 |
20 | object Main {
21 | def main(args: Array[String]): Unit = {
22 |
23 | // Define a Spark master here
24 | val spark = SparkSession.builder()
25 | .appName("Vertica-Spark Connector Scala Example")
26 | .getOrCreate()
27 |
28 | val basicExamples = new BasicReadWriteExamples(spark)
29 | val ctExamples = new ComplexTypeExamples(spark)
30 | val optExamples = new ConnectorOptionsExamples(spark)
31 |
32 | val m: Map[String, () => Unit] = Map(
33 | "writeCustomStatement" -> optExamples.writeCustomStatement,
34 | "writeCustomCopyList" -> optExamples.writeCustomCopyList,
35 | "writeThenRead" -> basicExamples.writeThenRead,
36 | "complexArrayExample" -> ctExamples.writeThenReadComplexArray,
37 | "writeThenReadRow" -> ctExamples.writeThenReadRow,
38 | "writeMap" -> ctExamples.writeMap,
39 | "writeThenReadExternalTable" -> basicExamples.writeThenReadExternalTable,
40 | "writeDataUsingMergeKey" -> optExamples.writeDataUsingMergeKey,
41 | "writeThenReadWithS3" -> basicExamples.writeThenReadWithS3,
42 | "writeThenReadWithGCS" -> basicExamples.writeThenReadWithGCS,
43 | "writeThenReadWithKerberos" -> basicExamples.writeThenReadWithKerberos
44 | )
45 |
46 | def printAllExamples(): Unit = {
47 | println("Examples available: ")
48 | m.keySet.foreach(exampleName => println(s"- $exampleName"))
49 | }
50 |
51 | def noCase(): Unit = {
52 | println("No example with that name.")
53 | printAllExamples()
54 | }
55 |
56 | if (args.length != 1) {
57 | println("No example specified!")
58 | println("Usage: ")
59 | printAllExamples()
60 | }
61 | else {
62 | val f: () => Unit = m.getOrElse(args.head, noCase)
63 | try {
64 | f()
65 | }
66 | catch {
67 | case e: Exception => {
68 | e.printStackTrace()
69 | printFailed("Unexpected error.")
70 | }
71 | }
72 | }
73 | spark.close()
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/examples/scala/src/main/scala/example/PrintUtils.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package example
15 |
16 | object PrintUtils {
17 |
18 | def printMessage(msg: String): Unit = println(s"------------------------------------\n-\n- EXAMPLE: $msg \n-\n------------------------------------")
19 |
20 | def printNotes(msg: String): Unit = println(s"------------------------------------\n-\n- NOTES: $msg \n-\n------------------------------------")
21 |
22 | def printSuccess(msg: String): Unit = println(s"------------------------------------\n-\n- SUCCESS: $msg \n-\n------------------------------------")
23 |
24 | def printFailed(msg: String): Unit = println(s"-------------------------------------\n-\n- FAILED: $msg \n-\n------------------------------------")
25 |
26 | }
27 |
--------------------------------------------------------------------------------
/examples/scala/submit-examples-debug.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | export SPARK_SUBMIT_OPTS="-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=*:5005"
4 |
5 | ./submit-examples.sh "$@"
6 |
--------------------------------------------------------------------------------
/examples/scala/submit-examples-kerberos.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | echo 'user1' | kinit user1
4 |
5 | export JAVA_HOME=/usr/lib/jvm/jre-11-openjdk
6 | export SPARK_DIST_CLASSPATH=$(/hadoop-3.3.1/bin/hadoop classpath)
7 | export SPARK_HOME=/opt/spark
8 | export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
9 |
10 | start-master.sh -h localhost
11 | start-worker.sh spark://localhost:7077
12 |
13 | if [[ "$1" == "debug" ]]; then
14 | export SPARK_SUBMIT_OPTS=-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=*:5005
15 | fi
16 |
17 | spark-submit --master spark://localhost:7077 --conf "spark.driver.extraClassPath={$SPARK_HOME}/conf/" --driver-java-options "-Djava.security.auth.login.config=/spark-connector/docker/client-krb/jaas.config" ./target/scala-2.12/vertica-spark-scala-examples.jar writeThenReadWithKerberos
18 |
--------------------------------------------------------------------------------
/examples/scala/submit-examples.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | spark-submit --master spark://spark:7077 --driver-memory 2g target/scala-2.12/vertica-spark-scala-examples.jar "$@"
4 |
--------------------------------------------------------------------------------
/examples/sparklyr/README.md:
--------------------------------------------------------------------------------
1 | # Sparklyr Example
2 |
3 | The connector can be used from R through the sparklyr library.
4 |
5 | In general, you would want to include the connector's fat JAR in the Spark config, then set the appropriate connector options in the options list passed to a read or write (see `sparkapp.r` in this folder).
6 |
7 | # How to run the example
8 |
9 | First, set up the Docker environment as mentioned in [examples](/examples/README.md), then:
10 | 1. Download the Spark Connector "all" JAR from our [releases](https://github.com/vertica/spark-connector/releases) and place it into `/connector/target/scala-2.12/`. You can do this on your local machine, as this folder is mounted. Alternatively, you can build the JAR yourself by following the instructions [here](/CONTRIBUTING.md).
11 | 2. Assuming you are in the client container, run `cd /spark-connector/examples/sparklyr` and then run the `./run-r-example.sh` script. This will install R and the necessary packages before starting the R script. You can see the submitted app on our [standalone cluster](http://localhost:8080).
12 | 3. To shut down, exit the container with `exit`. Then, on your local machine, navigate to `spark-connector/docker` and tear down the containers by running `docker-compose down`.
13 |
14 | # Other Connector Options
15 |
16 | For examples of other options, refer to our [Scala example](/examples/scala), which demonstrates how to configure the different connector options. While it is in a different language, the ideas are transferable: set the correct options, include our connector JAR, then run `spark-submit`.
17 |
--------------------------------------------------------------------------------
/examples/sparklyr/run-r-example.sh:
--------------------------------------------------------------------------------
1 | apt-get install -y r-base
2 | apt-get install -y libssl-dev
3 | apt-get install -y libxml2-dev
4 | apt-get install -y libcurl4-openssl-dev
5 | Rscript sparkapp.r
6 |
--------------------------------------------------------------------------------
/examples/sparklyr/sparkapp.r:
--------------------------------------------------------------------------------
1 | install.packages("curl", repo = "http://cran.us.r-project.org")
2 | install.packages("sparklyr", repo = "http://cran.us.r-project.org")
3 | library(sparklyr)
4 |
5 | install.packages('properties', repo = "http://cran.us.r-project.org")
6 | library('properties')
7 |
8 | props <- read.properties("../../version.properties")
9 | version <- props["connector-version"]
10 | # construct the path to Vertica-Spark connector jar. Replace this if the path to the jar is different
11 | connectorJar <- paste("../../connector/target/scala-2.12/spark-vertica-connector-assembly-", version, ".jar", sep = "")
12 |
13 | # Create a Spark config and disable Hive support to avoid errors
14 | config <- spark_config()
15 | config$sparklyr.jars.default <- connectorJar
16 | config$sparklyr.connect.enablehivesupport <- FALSE
17 | config$sparklyr.appName <- "Vertica Spark Connector Sparklyr example"
18 |
19 | print("Connecting to Spark.")
20 |
21 | # Connect to the Spark cluster
22 | sc <- spark_connect(master="spark://spark:7077", version = "3.1", config = config)
23 |
24 | print("Connected to spark. Getting iris_tbl.")
25 |
26 | # The Iris dataset comes with R and is used as test data
27 | # Get the Iris data and store it in a Spark dataframe
28 | iris_tbl <- sdf_copy_to(sc = sc, x = iris, overwrite = T)
29 |
30 | print("Got iris_tbl. Writing to Vertica.")
31 |
32 | # Write the Iris dataframe to the Vertica database
33 | spark_write_source(iris_tbl, "com.vertica.spark.datasource.VerticaSource", "overwrite", list(
34 | "host" = "vertica",
35 | "user" = "dbadmin",
36 | "password" = "",
37 | "db" = "docker",
38 | "staging_fs_url" = "webhdfs://hdfs:50070/data/dirtest",
39 | "table" = "iris"
40 | ))
41 |
42 | print("Wrote to Vertica. Reading from Vertica.")
43 |
44 | # Read the Iris data back from the Vertica database into a Spark dataframe
45 | result <- spark_read_source(sc = sc, name = "example", source = "com.vertica.spark.datasource.VerticaSource", options = list(
46 | "host" = "vertica",
47 | "user" = "dbadmin",
48 | "password" = "",
49 | "db" = "docker",
50 | "staging_fs_url" = "webhdfs://hdfs:50070/data/dirtest",
51 | "table" = "iris"
52 | ))
53 |
54 | print("Finished reading.")
55 |
56 | # Print the dataframe's contents
57 | print(result)
58 |
59 | # Cleanup Spark connection
60 | spark_disconnect(sc)
61 |
--------------------------------------------------------------------------------
/functional-tests/build.sbt:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 | import java.util.Properties
14 | import java.io.File
15 |
16 | // Retrieving the connector version number from a common file.
17 | val versionProps = settingKey[Properties]("Connector version properties")
18 | versionProps := {
19 | val prop = new Properties()
20 | IO.load(prop, new File("../version.properties"))
21 | prop
22 | }
23 |
24 | scalaVersion := "2.13.16"
25 | name := "spark-vertica-connector-functional-tests"
26 | organization := "com.vertica"
27 | version := versionProps.value.getProperty("connector-version")
28 |
29 | val sparkVersion = Option(System.getProperty("sparkVersion")) match {
30 | case Some(sparkVersion) => sparkVersion
31 | case None => sys.env.getOrElse("SPARK_VERSION", "[3.3.0, 3.4.0, 3.5.5)")
32 | }
33 |
34 | val hadoopVersion = Option(System.getProperty("hadoopVersion")) match {
35 | case Some(hadoopVersion) => hadoopVersion
36 | case None => sys.env.getOrElse("HADOOP_VERSION", "3.3.4")
37 | }
38 |
39 | resolvers += "Artima Maven Repository" at "https://repo.artima.com/releases"
40 | resolvers += "jitpack" at "https://jitpack.io"
41 |
42 | libraryDependencies += "org.scalatest" %% "scalatest" % "3.2.16"
43 | libraryDependencies += "com.typesafe" % "config" % "1.4.1"
44 |
45 | libraryDependencies += "org.scala-lang.modules" %% "scala-parser-combinators" % "2.3.0"
46 | libraryDependencies += "com.vertica.jdbc" % "vertica-jdbc" % "24.4.0-0"
47 | libraryDependencies += "org.apache.spark" %% "spark-core" % "3.5.5"
48 | libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.5.5"
49 | libraryDependencies += "org.scalactic" %% "scalactic" % "3.2.16"
50 | libraryDependencies += "org.scalatest" %% "scalatest" % "3.2.16" % "test"
51 | libraryDependencies += "com.typesafe.scala-logging" %% "scala-logging" % "3.9.5"
52 | libraryDependencies += "org.scalamock" %% "scalamock" % "5.2.0" % Test
53 | libraryDependencies += "org.typelevel" %% "cats-core" % "2.10.0"
54 | libraryDependencies += "org.apache.hadoop" % "hadoop-hdfs" % hadoopVersion
55 | libraryDependencies += "org.apache.hadoop" % "hadoop-aws" % hadoopVersion
56 | libraryDependencies += "com.github.scopt" %% "scopt" % "4.0.1"
57 | libraryDependencies += "com.google.cloud.bigdataoss" % "gcs-connector" % "hadoop3-2.2.6"
58 | //libraryDependencies += file("C:\\Users\\chaitanp\\SourceCode\\spark\\spark-connector\\connector\\target\\scala-2.13\\spark-vertica-connector-assembly-3.3.6.jar")
59 |
60 | Compile / unmanagedJars += file("../connector/target/scala-2.13/spark-vertica-connector-assembly-3.3.6.jar")
61 |
62 |
63 | assembly / assemblyJarName := s"vertica-spark-functional-tests.jar"
64 |
65 | assembly / assemblyMergeStrategy := {
66 | case PathList("META-INF", xs @ _*) => MergeStrategy.discard
67 | case x => MergeStrategy.first
68 | }
69 |
70 | assembly / assemblyShadeRules := Seq(
71 | ShadeRule.rename("cats.**" -> "shadeCats.@1").inAll
72 | )
73 |
74 | //unmanagedClasspath in Runtime += new File("/etc/hadoop/conf/")
75 |
--------------------------------------------------------------------------------
/functional-tests/default-config.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | echo -e 'functional-tests {
4 | host="'"vertica"'"
5 | port=5433
6 | db="'"docker"'"
7 | user="'"dbadmin"'"
8 | password="'""'"
9 | filepath="'"webhdfs://hdfs:50070/data/"'"
10 | tlsmode="disable"
11 | truststorepath="'"/truststore.jks"'"
12 | truststorepassword="'"dbadmin"'"
13 | }' > ./src/main/resources/application.conf
14 |
--------------------------------------------------------------------------------
/functional-tests/pipeline-gcs-config.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | echo -e 'functional-tests={
4 | host="'"vertica"'"
5 | port="'"5433"'"
6 | db="'"docker"'"
7 | user="'"dbadmin"'"
8 | password="'""'"
9 | log='true'
10 | filepath="'"$GCS_FILEPATH"'"
11 | tlsmode="'"disable"'"
12 | truststorepath="'"/truststore.jks"'"
13 | truststorepassword="'"dbadmin"'"
14 | }' > ./src/main/resources/application.conf
15 |
--------------------------------------------------------------------------------
/functional-tests/pipeline-s3-config.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | echo -e 'functional-tests={
4 | host="'"vertica"'"
5 | port="'"5433"'"
6 | db="'"docker"'"
7 | user="'"dbadmin"'"
8 | password="'""'"
9 | log='true'
10 | filepath="'"$S3_FILEPATH"'"
11 | tlsmode="'"disable"'"
12 | truststorepath="'"/truststore.jks"'"
13 | truststorepassword="'"dbadmin"'"
14 | }' > ./src/main/resources/application.conf
15 |
--------------------------------------------------------------------------------
/functional-tests/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | resolvers += "Artima Maven Repository" at "https://repo.artima.com/releases"
2 |
3 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0")
4 | addDependencyTreePlugin
5 |
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/3.1.1/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/functional-tests/src/main/resources/3.1.1/_SUCCESS
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/3.1.1/col1=1/part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet:
--------------------------------------------------------------------------------
1 | PAR1 , (
2 | $ ,Hspark_schema %col2 &5 col2nr&< ( n ,org.apache.spark.version3.0.2 )org.apache.spark.sql.parquet.row.metadataY{"type":"struct","fields":[{"name":"col2","type":"float","nullable":true,"metadata":{}}]} Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1) j PAR1
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/3.1.1/col1=10/part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet:
--------------------------------------------------------------------------------
1 | PAR1 , A A ( A A
2 | $ A,Hspark_schema %col2 &5 col2nr&< A A ( A A n ,org.apache.spark.version3.0.2 )org.apache.spark.sql.parquet.row.metadataY{"type":"struct","fields":[{"name":"col2","type":"float","nullable":true,"metadata":{}}]} Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1) j PAR1
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/3.1.1/col1=11/part-00002-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet:
--------------------------------------------------------------------------------
1 | PAR1 , A A ( A A
2 | $ A,Hspark_schema %col2 &5 col2nr&< A A ( A A n ,org.apache.spark.version3.0.2 )org.apache.spark.sql.parquet.row.metadataY{"type":"struct","fields":[{"name":"col2","type":"float","nullable":true,"metadata":{}}]} Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1) j PAR1
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/3.1.1/col1=12/part-00002-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet:
--------------------------------------------------------------------------------
1 | PAR1 , 0A 0A ( 0A 0A
2 | $ 0A,Hspark_schema %col2 &5 col2nr&< 0A 0A ( 0A 0A n ,org.apache.spark.version3.0.2 )org.apache.spark.sql.parquet.row.metadataY{"type":"struct","fields":[{"name":"col2","type":"float","nullable":true,"metadata":{}}]} Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1) j PAR1
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/3.1.1/col1=13/part-00002-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet:
--------------------------------------------------------------------------------
1 | PAR1 , @A @A ( @A @A
2 | $ @A,Hspark_schema %col2 &5 col2nr&< @A @A ( @A @A n ,org.apache.spark.version3.0.2 )org.apache.spark.sql.parquet.row.metadataY{"type":"struct","fields":[{"name":"col2","type":"float","nullable":true,"metadata":{}}]} Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1) j PAR1
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/3.1.1/col1=14/part-00002-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet:
--------------------------------------------------------------------------------
1 | PAR1 , PA PA ( PA PA
2 | $ PA,Hspark_schema %col2 &5 col2nr&< PA PA ( PA PA n ,org.apache.spark.version3.0.2 )org.apache.spark.sql.parquet.row.metadataY{"type":"struct","fields":[{"name":"col2","type":"float","nullable":true,"metadata":{}}]} Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1) j PAR1
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/3.1.1/col1=15/part-00002-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet:
--------------------------------------------------------------------------------
1 | PAR1 , `A `A ( `A `A
2 | $ `A,Hspark_schema %col2 &5 col2nr&< `A `A ( `A `A n ,org.apache.spark.version3.0.2 )org.apache.spark.sql.parquet.row.metadataY{"type":"struct","fields":[{"name":"col2","type":"float","nullable":true,"metadata":{}}]} Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1) j PAR1
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/3.1.1/col1=16/part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet:
--------------------------------------------------------------------------------
1 | PAR1 , pA pA ( pA pA
2 | $ pA,Hspark_schema %col2 &5 col2nr&< pA pA ( pA pA n ,org.apache.spark.version3.0.2 )org.apache.spark.sql.parquet.row.metadataY{"type":"struct","fields":[{"name":"col2","type":"float","nullable":true,"metadata":{}}]} Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1) j PAR1
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/3.1.1/col1=17/part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/functional-tests/src/main/resources/3.1.1/col1=17/part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/3.1.1/col1=18/part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/functional-tests/src/main/resources/3.1.1/col1=18/part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/3.1.1/col1=19/part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/functional-tests/src/main/resources/3.1.1/col1=19/part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/3.1.1/col1=2/part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/functional-tests/src/main/resources/3.1.1/col1=2/part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/3.1.1/col1=20/part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/functional-tests/src/main/resources/3.1.1/col1=20/part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/3.1.1/col1=3/part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet:
--------------------------------------------------------------------------------
1 | PAR1 , @ @ ( @ @
2 | $ @,Hspark_schema %col2 &5 col2nr&< @ @ ( @ @ n ,org.apache.spark.version3.0.2 )org.apache.spark.sql.parquet.row.metadataY{"type":"struct","fields":[{"name":"col2","type":"float","nullable":true,"metadata":{}}]} Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1) j PAR1
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/3.1.1/col1=4/part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet:
--------------------------------------------------------------------------------
1 | PAR1 , @@ @@ ( @@ @@
2 | $ @@,Hspark_schema %col2 &5 col2nr&< @@ @@ ( @@ @@ n ,org.apache.spark.version3.0.2 )org.apache.spark.sql.parquet.row.metadataY{"type":"struct","fields":[{"name":"col2","type":"float","nullable":true,"metadata":{}}]} Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1) j PAR1
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/3.1.1/col1=5/part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/functional-tests/src/main/resources/3.1.1/col1=5/part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/3.1.1/col1=6/part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/functional-tests/src/main/resources/3.1.1/col1=6/part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/3.1.1/col1=7/part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/functional-tests/src/main/resources/3.1.1/col1=7/part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/3.1.1/col1=8/part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/functional-tests/src/main/resources/3.1.1/col1=8/part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/3.1.1/col1=9/part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet:
--------------------------------------------------------------------------------
1 | PAR1 , A A ( A A
2 | $ A,Hspark_schema %col2 &5 col2nr&< A A ( A A n ,org.apache.spark.version3.0.2 )org.apache.spark.sql.parquet.row.metadataY{"type":"struct","fields":[{"name":"col2","type":"float","nullable":true,"metadata":{}}]} Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1) j PAR1
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/application.conf:
--------------------------------------------------------------------------------
1 | functional-tests {
2 | host="vertica"
3 | port=5433
4 | db="docker"
5 | user="dbadmin"
6 | password=""
7 | filepath="webhdfs://hdfs:50070/data/"
8 | tlsmode="disable"
9 | truststorepath="/truststore.jks"
10 | truststorepassword="dbadmin"
11 | }
12 |
13 |
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/datafile-17_2_test:
--------------------------------------------------------------------------------
1 | 01/02/06,AAA
2 | 10/18/08,AAA
3 | 11/18/08,AAA
4 | 09/46/08,AAA
5 | 09/46/08,AAA
6 | 09/46/08,AAA
7 | 09/46/08,AAA
8 | 09/46/08,AAA
9 | 09/46/08,AAA
10 | 09/46/08,AAA
11 | 09/46/08,AAA
12 | 09/46/08,AAA
13 | 09/46/08,AAA
14 | 09/46/08,AAA
15 | 12/18/08,AAA
16 | 09/21/08,AAA
17 | 09/22/08,AAA
18 | 09/23/08,AAA
19 | 09/46/08,AAA
20 | 09/24/08,AAA
21 | 09/23/08,AAA
22 |
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/datafile-String-Int.txt:
--------------------------------------------------------------------------------
1 | string test one,1
2 | string test two,2
3 | string test three,3
4 | string test four,4
5 | string test five,5
6 | string test six,6
7 | string test seven,7
8 | string test eight,8
9 | string test nine,9
10 | string test ten,10
11 |
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/date_test_file.txt:
--------------------------------------------------------------------------------
1 | 01/02/06,AAA
2 | 10/18/08,AAA
3 | 11/18/08,AAA
4 | 09/46/08,AAA
5 | 09/46/08,AAA
6 | 09/46/08,AAA
7 | 09/46/08,AAA
8 | 09/46/08,AAA
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/diffTypes.txt:
--------------------------------------------------------------------------------
1 | Test string file row 1,12,false,1
2 |
--------------------------------------------------------------------------------
/functional-tests/src/main/resources/diffTypesORC.txt:
--------------------------------------------------------------------------------
1 | teststring,12,false,1
2 |
--------------------------------------------------------------------------------
/functional-tests/src/main/scala/com/vertica/spark/functests/CleanupUtilTests.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.functests
15 |
16 | import com.vertica.spark.config.FileStoreConfig
17 | import com.vertica.spark.datasource.fs.HadoopFileStoreLayer
18 | import com.vertica.spark.util.cleanup.{CleanupUtils, FileCleanupInfo}
19 | import org.scalatest.BeforeAndAfterAll
20 | import org.scalatest.flatspec.AnyFlatSpec
21 |
22 | class CleanupUtilTests(val cfg: FileStoreConfig) extends AnyFlatSpec with BeforeAndAfterAll {
23 |
24 | val fsLayer = new HadoopFileStoreLayer(cfg, None)
25 | val path: String = cfg.address + "/CleanupTest"
26 | private val perms = "777"
27 |
28 | val cleanupUtils = new CleanupUtils
29 |
30 | override def beforeAll(): Unit = {
31 | fsLayer.createDir(path, perms)
32 | }
33 |
34 | override def afterAll(): Unit = {
35 | fsLayer.removeDir(cfg.address)
36 | }
37 |
38 | it should "Clean up a file" in {
39 | val filename = path + "/test.parquet"
40 |
41 | fsLayer.createFile(filename)
42 |
43 | cleanupUtils.checkAndCleanup(fsLayer, FileCleanupInfo(filename, 0, 3))
44 | cleanupUtils.checkAndCleanup(fsLayer, FileCleanupInfo(filename, 1, 3))
45 | cleanupUtils.checkAndCleanup(fsLayer, FileCleanupInfo(filename, 2, 3))
46 |
47 | fsLayer.fileExists(filename) match {
48 | case Left(err) => fail(err.getFullContext)
49 | case Right(exists) => assert(!exists)
50 | }
51 | fsLayer.fileExists(filename+".cleanup0") match {
52 | case Left(err) => fail(err.getFullContext)
53 | case Right(exists) => assert(!exists)
54 | }
55 | fsLayer.fileExists(filename+".cleanup1") match {
56 | case Left(err) => fail(err.getFullContext)
57 | case Right(exists) => assert(!exists)
58 | }
59 | fsLayer.fileExists(filename+".cleanup2") match {
60 | case Left(err) => fail(err.getFullContext)
61 | case Right(exists) => assert(!exists)
62 | }
63 | }
64 |
65 |
66 | it should "Clean up parent unique directory" in {
67 | val uniqueDir = path + "/unique-dir-123"
68 | fsLayer.createDir(uniqueDir, perms)
69 |
70 | val childDir = uniqueDir + "/tablename"
71 | fsLayer.createDir(childDir, perms)
72 |
73 | val filename = childDir + "/test.parquet"
74 | fsLayer.createFile(filename)
75 |
76 | assert(fsLayer.fileExists(uniqueDir).right.get)
77 |
78 | // Now test cleanup
79 | cleanupUtils.cleanupAll(fsLayer, childDir)
80 |
81 | assert(!fsLayer.fileExists(uniqueDir).right.get)
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/functional-tests/src/main/scala/com/vertica/spark/functests/LargeDataTests.scala:
--------------------------------------------------------------------------------
1 | package com.vertica.spark.functests
2 |
3 | import com.vertica.spark.config.{FileStoreConfig, JDBCConfig}
4 | import com.vertica.spark.functests.endtoend.EndToEnd
5 | import com.vertica.spark.util.error.ConnectorException
6 | import org.apache.spark.sql.SaveMode
7 |
8 | class LargeDataTests(readOpts: Map[String, String], writeOpts: Map[String, String], jdbcConfig: JDBCConfig, fileStoreConfig: FileStoreConfig, remote: Boolean = false)
9 | extends EndToEnd(readOpts, writeOpts, jdbcConfig, fileStoreConfig, remote){
10 |
11 | override def sparkAppName: String = "Large Data Tests"
12 |
13 | val numSparkPartitions = 4
14 |
15 | it should "save a 1600 column table using default copy logic." in {
16 | val tableName = "1600ColumnTable"
17 |
18 | val options = writeOpts + ("table" -> tableName)
19 | val df = spark.read.format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat")
20 | .option("header", "true").load("src/main/resources/1600ColumnTable.csv")
21 |
22 | val numDfRows = df.count()
23 | val stmt = conn.createStatement()
24 | stmt.execute("DROP TABLE IF EXISTS " + "\"" + options("table") + "\";")
25 |
26 | val mode = SaveMode.Append
27 |
28 | try {
29 | df.write.format("com.vertica.spark.datasource.VerticaSource").options(options).mode(mode).save()
30 | } catch {
31 | case e: ConnectorException => fail(e.error.getFullContext)
32 | }
33 |
34 | var totalRows = 0
35 | val query = "SELECT COUNT(*) AS count FROM " + "\"" + options("table") + "\";"
36 | try {
37 | val rs = stmt.executeQuery(query)
38 | if (rs.next) {
39 | totalRows = rs.getInt("count")
40 | }
41 | }
42 | finally {
43 | stmt.close()
44 | }
45 | assert (totalRows == numDfRows)
46 | TestUtils.dropTable(conn, tableName)
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/functional-tests/src/main/scala/com/vertica/spark/functests/endtoend/BasicJsonReadTests.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.functests.endtoend
15 |
16 | import com.vertica.spark.config.{FileStoreConfig, JDBCConfig}
17 | import com.vertica.spark.functests.TestUtils
18 | import com.vertica.spark.util.error.{BinaryTypeNotSupported, ConnectorException, ErrorList}
19 |
20 | import scala.util.{Failure, Success, Try}
21 |
22 |
23 | /**
24 | * A few minimal tests for the json feature. Not intended to be comprehensive.
25 | * */
26 | class BasicJsonReadTests(readOpts: Map[String, String], writeOpts: Map[String, String], jdbcConfig: JDBCConfig, fileStoreConfig: FileStoreConfig, remote: Boolean = false)
27 | extends EndToEnd(readOpts, writeOpts, jdbcConfig, fileStoreConfig, remote) {
28 |
29 | override def sparkAppName: String = "Basic JSON Read Tests"
30 |
31 | private val jsonReadOpts = readOpts + ("json" -> "true")
32 |
33 | it should "read primitive types" in {
34 | val tableName1 = "dftest"
35 | val n = 1
36 | val stmt = conn.createStatement
37 | TestUtils.createTableBySQL(conn, tableName1, "create table " + tableName1 + " (a int, b varchar, c float, d array[int])")
38 |
39 | TestUtils.populateTableBySQL(stmt, "insert into dftest values (1, 'heeelo', 3.2, array[3,5])", 10)
40 |
41 | val df = spark.read.format("com.vertica.spark.datasource.VerticaSource")
42 | .options(jsonReadOpts + ("table" -> tableName1)).load()
43 | val result = Try {df.show()}
44 | result match {
45 | case Failure(exception) => fail("Expected to succeed", exception)
46 | case Success(_) =>
47 | }
48 | stmt.close()
49 | TestUtils.dropTable(conn, tableName1)
50 | }
51 |
52 | it should "error on binary types" in {
53 | val tableName = "dftest"
54 | val n = 1
55 | val stmt = conn.createStatement
56 | TestUtils.createTableBySQL(conn, tableName, "create table " + tableName + " (a binary, b varbinary, c array[binary], d array[varbinary], e long varbinary)")
57 |
58 | val df = spark.read.format("com.vertica.spark.datasource.VerticaSource")
59 | .options(jsonReadOpts + ("table" -> tableName)).load()
60 | val result = Try{df.collect}
61 | result match {
62 | case Failure(exception) => exception match {
63 | case ConnectorException(error) => {
64 | assert(error.isInstanceOf[ErrorList])
65 | val errorList = error.asInstanceOf[ErrorList].errors.toList
66 | assert(errorList.forall(_.isInstanceOf[BinaryTypeNotSupported]))
67 | assert(errorList(0).asInstanceOf[BinaryTypeNotSupported].fieldName == "a")
68 | assert(errorList(1).asInstanceOf[BinaryTypeNotSupported].fieldName == "b")
69 | assert(errorList(2).asInstanceOf[BinaryTypeNotSupported].fieldName == "c")
70 | assert(errorList(3).asInstanceOf[BinaryTypeNotSupported].fieldName == "d")
71 | assert(errorList(4).asInstanceOf[BinaryTypeNotSupported].fieldName == "e")
72 | }
73 | }
74 | case Success(_) => fail("Expected to fail")
75 | }
76 | stmt.close()
77 | TestUtils.dropTable(conn, tableName)
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/functional-tests/src/main/scala/com/vertica/spark/functests/endtoend/RemoteTests.scala:
--------------------------------------------------------------------------------
1 | package com.vertica.spark.functests.endtoend
2 |
3 | import com.vertica.spark.config.{FileStoreConfig, JDBCConfig}
4 | import org.apache.spark.sql.{Row, SaveMode}
5 | import org.apache.spark.sql.types.{ArrayType, IntegerType, StructField, StructType}
6 |
7 | /**
8 | * Test suites for submitting to a remote driver. This suite is meant to be configured with a master node when submitting.
9 | * */
10 | class RemoteTests(readOpts: Map[String, String], writeOpts: Map[String, String], jdbcConfig: JDBCConfig, fileStoreConfig: FileStoreConfig)
11 | extends EndToEnd(readOpts, writeOpts, jdbcConfig, fileStoreConfig, true) {
12 |
13 | override def sparkAppName: String = "Remote Tests"
14 |
15 | /**
16 | * This test checks whether multiple JDBC connections are created when remote executors have to perform multiple tasks.
17 | * Note that if executors have more cores than tasks, they may be able to run all tasks in one go and not trigger
18 | * the interactions this test is looking for.
19 | *
20 | * Note: You may get a java.lang.OutOfMemoryError when running locally. To allocate more memory, start sbt with
21 | * sbt -J-Xmx10G, which will increase the heap size to 10 GB. More here: https://www.scala-sbt.org/1.x/docs/Troubleshoot-Memory-Issues.html
22 | * */
23 | it should "only create constant number of jdbc sessions when write and read" in {
24 | val rowCount = 50000
25 | val data = (1 to rowCount).map(i => Row(i, (0 to 1000).map(i => i).toArray)).toList
26 | val schema = new StructType(Array(StructField("col1", IntegerType), StructField("col2", ArrayType(IntegerType))))
27 |
28 | val partitionsCount = 100
29 | val df = spark.createDataFrame(spark.sparkContext.parallelize(data), schema).repartition(partitionsCount)
30 | val getJDBCConnectionsCount = "select count(client_hostname) from v_monitor.user_sessions where client_type='JDBC Driver';"
31 | val stmt = conn.createStatement()
32 | try {
33 | var rs = stmt.executeQuery(getJDBCConnectionsCount)
34 | assert(rs.next)
35 | val initialJdbcSessionCount = rs.getLong(1)
36 |
37 | val tableName = "dftest"
38 | df.write.format(VERTICA_SOURCE)
39 | .options(writeOpts + ("table" -> tableName))
40 | .mode(SaveMode.Overwrite)
41 | .save()
42 |
43 | rs = stmt.executeQuery(getJDBCConnectionsCount)
44 | assert(rs.next)
45 | val sessionCountWrite = rs.getLong(1)
46 | // We expect only 2 new jdbc connections made on write
47 | assert(sessionCountWrite == initialJdbcSessionCount + 2)
48 |
49 | spark.read.format(VERTICA_SOURCE)
50 | .options(readOpts +
51 | ("table" -> "dftest") +
52 | ("num_partitions"-> "30") +
53 | ("max_row_group_size_export_mb" -> "1") +
54 | ("max_file_size_export_mb" -> "1"))
55 | .load()
56 |
57 | rs = stmt.executeQuery(getJDBCConnectionsCount)
58 | assert(rs.next)
59 | val sessionCountRead = rs.getLong(1)
60 | // We expect only 1 new jdbc connections made on read.
61 | assert(sessionCountRead == initialJdbcSessionCount + 3)
62 |
63 | } catch {
64 | case exception: Exception => fail("Unexpected exception", exception)
65 | } finally {
66 | stmt.execute("drop table dftest;")
67 | stmt.close()
68 | }
69 | }
70 |
71 | }
72 |
--------------------------------------------------------------------------------
/functional-tests/src/main/scala/com/vertica/spark/functests/endtoend/SparkConfig.scala:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | package com.vertica.spark.functests.endtoend
15 |
16 | import org.apache.spark.SparkConf
17 |
18 | /**
19 | * Mixin for creating a base [[SparkConf]] for a spark session.
20 | * */
21 | trait SparkConfig {
22 |
23 | /**
24 | * The name that will be displayed on Spark Master UI
25 | * */
26 | def sparkAppName: String
27 |
28 | /**
29 | * Get a base [[SparkConf]]
30 | *
31 | * @param remote if false, the config will set master as local[*], else it will be unset.
32 | * */
33 | def baseSparkConf(remote: Boolean): SparkConf = {
34 | val conf = if (remote) {
35 | new SparkConf()
36 | }
37 | else {
38 | new SparkConf().setMaster("local[*]")
39 | }
40 | conf.setAppName(sparkAppName)
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/functional-tests/submit-functional-tests-debug.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | export SPARK_SUBMIT_OPTS="-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=*:5005"
4 |
5 | ./submit-functional-tests.sh "$@"
6 |
--------------------------------------------------------------------------------
/functional-tests/submit-functional-tests.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Append option -r to the list of args
4 | args=("-r")
5 | args+=("$@")
6 |
7 | spark-submit --master spark://spark:7077 --driver-memory 2g target/scala-2.12/vertica-spark-functional-tests.jar "${args[@]}"
8 |
--------------------------------------------------------------------------------
/img/CoreArchitecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/img/CoreArchitecture.png
--------------------------------------------------------------------------------
/img/Overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/img/Overview.png
--------------------------------------------------------------------------------
/img/SparkInterfaces.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/img/SparkInterfaces.png
--------------------------------------------------------------------------------
/performance-tests/README.md:
--------------------------------------------------------------------------------
1 | # Spark Connector - Performance Tests
2 |
3 | This project runs performance tests of the connector against a set of Spark, HDFS, and Vertica clusters.
4 |
5 | Configuration is specified with `application.conf` (HOCON format).
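For reference, a trimmed excerpt of the defaults that ship in `src/main/resources/application.conf` (see that file for the full set of keys):

```
functional-tests {
  host="vertica"
  port=5433
  db="testdb"
  user="release"
  password="password"
  filepath="hdfs://hdfs:8020/data/"
  colCounts="400"
  rowCounts="5000000,10000000"
  num_partitions=100
}
```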
6 |
7 | ## How to run the tests
8 |
9 | 1. Set up Vertica, HDFS and Spark
10 | 2. From the performance-tests directory, run `mkdir lib` to create the folder for the connector JAR
11 | 3. From the performance-tests directory, run `cd ../connector && sbt assembly && cp target/scala-2.13/spark-vertica-connector-assembly-3.3.6.jar ../performance-tests/lib && cd ../performance-tests` to build and copy the connector JAR (the `3.3.6` version suffix comes from `version.properties`)
12 | 4. From the performance-tests directory, run `sbt assembly` to assemble the test JAR
13 | 5. Use spark-submit on the test JAR, for example: `spark-submit --master spark://hdfs.example.com:7077 --deploy-mode cluster target/scala-2.13/spark-vertica-connector-performance-tests-assembly-1.0.jar` (the `1.0` suffix comes from the version set in `build.sbt`)
14 |
15 | ## Tuning read performance
16 |
17 | The biggest factor in connector performance is the resources available to Vertica and Spark. Vertica, particularly with default settings, may run into a memory bottleneck; this can be improved by configuring resource pools.
18 |
19 | ### Vertica Resource Pool Configuration
20 |
21 | The connector's Vertica-to-Spark functionality relies on a query that exports data from Vertica to an intermediate filestore. This operation reserves a significant amount of memory, and the more memory available to it, the more threads it can create to parallelize the export.
22 |
23 | We suggest giving the resource pool used for the operation as much memory as possible and setting its `plannedconcurrency` value as low as possible.
24 |
25 | The reason: any given Vertica query may only reserve the pool's total memory divided by the `plannedconcurrency` value. `plannedconcurrency` sets how many independent queries are expected to run at once, and the connector only runs one query at a time; Vertica then parallelizes that single query internally. A more detailed explanation can be found [here](https://www.vertica.com/blog/do-you-need-to-put-your-query-on-a-budgetba-p236830/).
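As a minimal, illustrative sketch (not part of this project), a dedicated pool can be created over JDBC and assigned to the connector's Vertica user; the pool name, memory size, connection details, and user below are placeholders:

```scala
// Sketch only: pool name, memory size, credentials, and user are placeholders.
import java.sql.DriverManager

val conn = DriverManager.getConnection(
  "jdbc:vertica://vertica:5433/testdb", "release", "password")
val stmt = conn.createStatement()
try {
  // Give the export query a pool with ample memory and minimal planned concurrency.
  stmt.execute("CREATE RESOURCE POOL export_pool MEMORYSIZE '8G' PLANNEDCONCURRENCY 1")
  stmt.execute("ALTER USER release RESOURCE POOL export_pool")
} finally {
  stmt.close()
  conn.close()
}
```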
26 |
27 | ### Connector Options
28 |
29 | Several connector options can affect the performance of reads from Vertica; a usage sketch follows the list below.
30 |
31 | - `num_partitions`: Sets how many partitions are created, i.e., how many tasks read data from the intermediate location in parallel. This should roughly correspond to the processing power / number of cores in the Spark cluster.
32 | - `max_file_size_export_mb` and `max_row_group_size_export_mb`: Configure the Parquet files exported from Vertica to the intermediate location. The defaults (16 MB row group size, 2048 MB file size) are where we have found export performance to be best, but they can be tweaked for the details of a given cluster.
33 |
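A minimal read sketch showing where these options go. The data source name and the connection options (`host`, credentials, `staging_fs_url`, `table`) are assumptions for illustration and should match your environment; the tuning values mirror the defaults described above.

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("connector-perf-read").getOrCreate()

// Connection options are placeholders; tuning options are the ones described in this README.
val df = spark.read
  .format("com.vertica.spark.datasource.VerticaSource")
  .option("host", "vertica")
  .option("port", "5433")
  .option("db", "testdb")
  .option("user", "release")
  .option("password", "password")
  .option("staging_fs_url", "hdfs://hdfs:8020/data/")
  .option("table", "perf_table")
  .option("num_partitions", "100")              // roughly the number of cores in the Spark cluster
  .option("max_row_group_size_export_mb", "16") // default row group size
  .option("max_file_size_export_mb", "2048")    // default exported file size
  .load()
```
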
34 | ## Tuning write performance
35 |
36 | The same Vertica resource pool tuning described above may also help write performance.
37 |
38 | On write, the number of partitions is decided not by the connector but by the partitioning of the DataFrame passed in. To change it, call `coalesce()` on the DataFrame before writing it, as in the sketch below.
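
A hedged sketch of capping write parallelism; `df`, the target partition count, and the connection options are illustrative:

```scala
// df is an existing DataFrame; 16 partitions and the options below are placeholders.
val connectorOpts = Map(
  "host" -> "vertica", "port" -> "5433", "db" -> "testdb",
  "user" -> "release", "password" -> "password",
  "staging_fs_url" -> "hdfs://hdfs:8020/data/"
)

df.coalesce(16)
  .write
  .format("com.vertica.spark.datasource.VerticaSource")
  .options(connectorOpts + ("table" -> "perf_table"))
  .mode("overwrite")
  .save()
```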
39 |
--------------------------------------------------------------------------------
/performance-tests/build.sbt:
--------------------------------------------------------------------------------
1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | scalaVersion := "2.13.16"
15 | name := "spark-vertica-connector-performance-tests"
16 | organization := "com.vertica"
17 | version := "1.0"
18 |
19 | resolvers += "Artima Maven Repository" at "https://repo.artima.com/releases"
20 | resolvers += "jitpack" at "https://jitpack.io"
21 |
22 | libraryDependencies += "org.scalatest" %% "scalatest" % "3.2.16"
23 | libraryDependencies += "com.typesafe" % "config" % "1.4.1"
24 |
25 | libraryDependencies += "org.scala-lang.modules" %% "scala-parser-combinators" % "2.3.0"
26 | libraryDependencies += "com.vertica.jdbc" % "vertica-jdbc" % "24.4.0-0"
27 | libraryDependencies += "org.apache.spark" %% "spark-core" % "3.5.5"
28 | libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.5.5"
29 | libraryDependencies += "org.apache.hadoop" % "hadoop-hdfs" % "3.3.2"
30 | libraryDependencies += "org.scalactic" %% "scalactic" % "3.2.16"
31 | libraryDependencies += "org.scalatest" %% "scalatest" % "3.2.16" % "test"
32 | libraryDependencies += "com.typesafe.scala-logging" %% "scala-logging" % "3.9.5"
33 | libraryDependencies += "ch.qos.logback" % "logback-classic" % "1.2.3"
34 | libraryDependencies += "org.scalamock" %% "scalamock" % "5.2.0" % Test
35 | libraryDependencies += "org.typelevel" %% "cats-core" % "2.3.0"
36 | Compile / unmanagedJars += file("../connector/target/scala-2.13/spark-vertica-connector-assembly-3.3.6.jar")
37 |
38 | assembly / assemblyMergeStrategy := {
39 | case PathList("META-INF", xs @ _*) => MergeStrategy.discard
40 | case x => MergeStrategy.first
41 | }
42 |
43 | Runtime / unmanagedClasspath += new File("/etc/hadoop/etc/hadoop")
44 |
45 |
--------------------------------------------------------------------------------
/performance-tests/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | resolvers += "Artima Maven Repository" at "https://repo.artima.com/releases"
2 |
3 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0")
4 |
--------------------------------------------------------------------------------
/performance-tests/src/main/resources/application.conf:
--------------------------------------------------------------------------------
1 | functional-tests {
2 | host="vertica"
3 | port=5433
4 | db="testdb"
5 | user="release"
6 | password="password"
7 | log=true
8 | filepath="hdfs://hdfs:8020/data/"
9 | dirpath="hdfs://hdfs:8020/data/dirtest/"
10 | colCounts="400"
11 | rowCounts="5000000,10000000"
12 | runCount=5
13 | testMode=both
14 | max_row_group_size="128"
15 | max_file_size="512"
16 | compareJdbc=true
17 | compareV1=false
18 | num_partitions=100
19 | filter="col1 > 0 AND col1 < 1000"
20 | }
21 |
22 |
--------------------------------------------------------------------------------
/performance-tests/src/main/scala/com/vertica/spark/perftests/DataGenUtils.scala:
--------------------------------------------------------------------------------
1 | package com.vertica.spark.perftests
2 |
3 | import com.vertica.spark.perftests.DataGenUtils.{columnType, genDataSchema}
4 | import org.apache.spark.rdd.RDD
5 | import org.apache.spark.sql.{DataFrame, Row, SparkSession}
6 | import org.apache.spark.sql.types.{DateType, Decimal, DecimalType, IntegerType, StringType, StructField, StructType}
7 |
8 | object DataGenUtils {
9 | val rand = new scala.util.Random(System.currentTimeMillis)
10 |
11 | def randomMsInLast70Years() = {
12 | -946771200000L + // Jan 1, 1940 00:00 UTC in epoch ms
13 | (Math.abs(rand.nextLong) % (
14 | 70L // years
15 | * 365 // days
16 | * 24 // hours
17 | * 60 // minutes
18 | * 60 // seconds
19 | * 1000 // ms
20 | ))
21 | }
22 |
23 | def randomStringGen(length: Int): String = rand.alphanumeric.take(length).mkString
24 |
25 | def randomIntGen(): Int = rand.nextInt()
26 |
27 | def randomDecimalGen(): Decimal = Decimal(rand.nextDouble())
28 |
29 | def randomDateGen(): java.sql.Date = {
30 | val ms = randomMsInLast70Years()
31 | new java.sql.Date(ms)
32 | }
33 |
34 | private def columnType(i: Int) = {
35 | i % 4 match {
36 | case 0 => StringType
37 | case 1 => IntegerType
38 | case 2 => DecimalType(25,10)
39 | case 3 => DateType
40 | }
41 | }
42 |
43 | def genDataRow(colCount: Int): Row = {
44 | val data = (0 until colCount).map(i => columnType(i) match {
45 | case StringType => randomStringGen(10)
46 | case IntegerType => randomIntGen()
47 | case DecimalType() => randomDecimalGen()
48 | case DateType => randomDateGen()
49 | })
50 | Row.fromSeq(data)
51 | }
52 |
53 | def genDataSchema(colCount: Int): StructType = {
54 | StructType(
55 | (0 until colCount).map(i => StructField("col"+i, columnType(i)))
56 | )
57 | }
58 |
59 | def getColumns(colCount: Int): String = {
60 | val cols = (0 until colCount).map(i => {
61 | val colType = columnType(i)
62 |
63 | val t = colType match {
64 | case StringType => "VARCHAR(1024)"
65 | case IntegerType => "INTEGER"
66 | case DecimalType() => "DECIMAL(25, 10)"
67 | case DateType => "DATE"
68 | }
69 |
70 | val n = "col" + i
71 |
72 | n + " " + t
73 | })
74 |
75 | cols.mkString(", ")
76 | }
77 | }
78 |
79 | class DataGenUtils(hdfsPath: String, spark: SparkSession) {
80 |
81 | def loadOrGenerateData(rowsPerPartition: Int, numPartitions: Int, colCount: Int): DataFrame = {
82 | val totalRowCount = rowsPerPartition * numPartitions
83 | println("Getting data for row count " + totalRowCount + " , col count " + colCount)
84 | val dataFileName = hdfsPath + "data_" + totalRowCount + "_" + colCount
85 |
86 | val conf = spark.sparkContext.hadoopConfiguration
87 | val fs = org.apache.hadoop.fs.FileSystem.get(conf)
88 | val exists = fs.exists(new org.apache.hadoop.fs.Path(dataFileName))
89 |
90 | if(exists) {
91 | println("Data already exists, loading")
92 | val df = spark.read.parquet(dataFileName)
93 | df.rdd.count()
94 | df
95 | }
96 | else {
97 | println("Data doesn't exist yet, generating")
98 | val startTime: Long = System.currentTimeMillis()
99 |
100 | val basicData : RDD[Row] = spark.sparkContext.parallelize(Seq[Int](), numPartitions)
101 | .mapPartitions { _ => {
102 | (1 to rowsPerPartition).map{_ => Row(1)}.iterator
103 | }}
104 |
105 | val dataSchema = genDataSchema(colCount)
106 | //println("SCHEMA: " + dataSchema.toString())
107 |
108 | val dataDf = spark.createDataFrame(
109 | basicData.map(_ => DataGenUtils.genDataRow(colCount)),
110 | dataSchema
111 | )
112 |
113 | println("Storing data in file " + dataFileName)
114 | dataDf.write.parquet(dataFileName)
115 |
116 | val endTime: Long = System.currentTimeMillis()
117 | println("start: " + startTime + ", end: " + endTime)
118 | println("it took " + (endTime - startTime) + "MS to generate and write data")
119 |
120 | dataDf
121 | }
122 | }
123 |
124 | }
125 |
--------------------------------------------------------------------------------
/version.properties:
--------------------------------------------------------------------------------
1 | connector-version=3.3.6
2 |
--------------------------------------------------------------------------------