├── .github ├── ISSUE_TEMPLATE │ ├── BUG_REPORT.md │ └── FEATURE_REQUEST.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── README.md │ ├── auto-triage.yml │ ├── main.yml │ ├── nightly.yml │ ├── on-main-push.yml │ ├── on-pull-request.yml │ ├── remove-issue.yml │ └── weekly.yml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── codecov.yml ├── connector ├── .java-version ├── build.sbt ├── project │ ├── build.properties │ └── plugins.sbt ├── scalastyle-config.xml └── src │ ├── main │ └── scala │ │ └── com │ │ └── vertica │ │ └── spark │ │ ├── config │ │ ├── FileStoreConfig.scala │ │ ├── JDBCConfig.scala │ │ ├── LogProvider.scala │ │ ├── ReadConfig.scala │ │ ├── TableName.scala │ │ ├── VerticaMetadata.scala │ │ └── WriteConfig.scala │ │ ├── datasource │ │ ├── VerticaDatasourceV2.scala │ │ ├── core │ │ │ ├── DSConfigSetup.scala │ │ │ ├── DSReader.scala │ │ │ ├── DSWriter.scala │ │ │ ├── SessionId.scala │ │ │ ├── VerticaDistributedFilesystemReadPipe.scala │ │ │ ├── VerticaDistributedFilesystemWritePipe.scala │ │ │ ├── VerticaPipe.scala │ │ │ └── factory │ │ │ │ └── VerticaPipeFactory.scala │ │ ├── fs │ │ │ └── FileStoreLayerInterface.scala │ │ ├── jdbc │ │ │ └── VerticaJdbcLayer.scala │ │ ├── json │ │ │ ├── JsonBatchFactory.scala │ │ │ └── VerticaJsonScan.scala │ │ ├── partitions │ │ │ ├── file │ │ │ │ ├── PartitionedFileIdentity.scala │ │ │ │ └── VerticaFilePartition.scala │ │ │ ├── mixin │ │ │ │ ├── Cleanup.scala │ │ │ │ └── Identifiable.scala │ │ │ └── parquet │ │ │ │ ├── ParquetFileRange.scala │ │ │ │ └── VerticaDistributedFilesystemPartition.scala │ │ ├── v2 │ │ │ ├── VerticaDatasourceV2Catalog.scala │ │ │ ├── VerticaDatasourceV2Read.scala │ │ │ ├── VerticaDatasourceV2Table.scala │ │ │ └── VerticaDatasourceV2Write.scala │ │ └── wrappers │ │ │ ├── PartitionReaderWrapper.scala │ │ │ ├── PartitionReaderWrapperFactory.scala │ │ │ ├── VerticaScanWrapper.scala │ │ │ ├── VerticaScanWrapperBuilder.scala │ │ │ └── json │ │ │ └── VerticaJsonTableWrapper.scala │ │ ├── parquet │ │ ├── ParquetReadSupport.scala │ │ ├── ParquetRecordMaterializer.scala │ │ ├── ParquetRowConverter.scala │ │ ├── ParquetSchemaConverter.scala │ │ └── VerticaDataSourceUtils.scala │ │ └── util │ │ ├── Timer.scala │ │ ├── cleanup │ │ ├── CleanupUtils.scala │ │ ├── DistributedFilesCleaner.scala │ │ └── FileCleanupInfo.scala │ │ ├── complex │ │ └── ComplexTypeUtils.scala │ │ ├── error │ │ └── ErrorHandling.scala │ │ ├── general │ │ └── Utils.scala │ │ ├── listeners │ │ └── SparkListeners.scala │ │ ├── pushdown │ │ └── PushdownUtils.scala │ │ ├── query │ │ ├── ColumnsTable.scala │ │ ├── ComplexTypesTable.scala │ │ ├── StringParsingUtils.scala │ │ ├── TypesTable.scala │ │ └── VerticaTable.scala │ │ ├── reflections │ │ └── ReflectionTools.scala │ │ ├── schema │ │ ├── ComplexTypesSchemaTools.scala │ │ └── SchemaTools.scala │ │ ├── table │ │ └── TableUtils.scala │ │ └── version │ │ ├── SparkVersionTools.scala │ │ ├── Version.scala │ │ └── VerticaVersionUtils.scala │ └── test │ └── scala │ └── com │ └── vertica │ └── spark │ ├── common │ └── TestObjects.scala │ ├── datasource │ ├── core │ │ ├── DSConfigSetupTest.scala │ │ ├── DSConfigSetupUtilsTest.scala │ │ ├── DSReaderTest.scala │ │ ├── DSWriterTest.scala │ │ ├── JDBCConfigParserTests.scala │ │ ├── TableNameTest.scala │ │ ├── VerticaDistributedFilesystemReadPipeTests.scala │ │ └── VerticaDistributedFilesystemWritePipeTest.scala │ ├── json │ │ ├── JsonBatchFactoryTest.scala │ │ └── VerticaJsonScanTest.scala │ ├── partitions │ │ └── parquet │ │ │ └── 
ParquetFileRangeTest.scala │ ├── v2 │ │ └── VerticaV2SourceTest.scala │ └── wrappers │ │ ├── PartitionReaderWrapperFactoryTest.scala │ │ ├── PartitionReaderWrapperTest.scala │ │ ├── VerticaScanWrapperBuilderTest.scala │ │ ├── VerticaScanWrapperTest.scala │ │ └── json │ │ └── VerticaJsonTableWrapperTest.scala │ └── util │ ├── cleanup │ └── CleanupUtilsTest.scala │ ├── error │ └── ErrorHandlingTest.scala │ ├── pushdown │ └── PushdownUtilsTest.scala │ ├── query │ ├── StringParsingUtilsTest.scala │ └── VerticaTableTests.scala │ ├── schema │ ├── ComplexTypesSchemaToolsTest.scala │ ├── SchemaToolsTest.scala │ └── SchemaToolsV10Test.scala │ ├── table │ └── TableUtilsTest.scala │ └── version │ ├── SparkVersionToolsTests.scala │ ├── VersionTest.scala │ └── VerticaVersionUtilsTest.scala ├── docker ├── README.md ├── client-krb │ ├── Dockerfile │ ├── docker-entrypoint.sh │ ├── jaas.config │ └── vsql ├── client │ └── Dockerfile ├── docker-compose-kerberos.yml ├── docker-compose.yml ├── hdfs-krb │ ├── Dockerfile │ └── docker-entrypoint.sh ├── hdfs │ └── docker-entrypoint.sh ├── kdc │ ├── Dockerfile │ └── docker-entrypoint.sh ├── keytabs │ └── .gitkeep ├── krb.env ├── vertica-hdfs-config │ ├── hadoop-kerberized │ │ ├── core-site.xml │ │ ├── hdfs-site.xml │ │ ├── keystore │ │ ├── ssl-client.xml │ │ └── ssl-server.xml │ └── hadoop │ │ ├── core-site.xml │ │ └── hdfs-site.xml ├── vertica-krb │ └── docker-entrypoint.sh └── vertica │ ├── docker-entrypoint-legacy.sh │ └── docker-entrypoint.sh ├── docs ├── gcs-guide.md ├── hdfs-guide.md ├── kerberos-guide.md ├── s3-guide.md ├── tls-guide.md └── troubleshooting-guide.md ├── examples ├── README.md ├── jupyter │ ├── README.md │ ├── basic-read-and-write-example.ipynb │ ├── complex-array-example.ipynb │ ├── data │ │ ├── faithful.csv │ │ ├── faithful_testing.csv │ │ └── faithful_training.csv │ ├── linear-regression-example-spark.ipynb │ ├── linear-regression-example-vertica-direct.ipynb │ └── linear-regression-example-verticapy.ipynb ├── pyspark │ ├── README.md │ ├── run-python-example.sh │ └── sparkapp.py ├── scala │ ├── README.md │ ├── build.sbt │ ├── project │ │ └── plugins.sbt │ ├── src │ │ └── main │ │ │ ├── resources │ │ │ └── application.conf │ │ │ └── scala │ │ │ └── example │ │ │ ├── Main.scala │ │ │ ├── PrintUtils.scala │ │ │ ├── TestUtils.scala │ │ │ └── examples │ │ │ ├── BasicReadWriteExamples.scala │ │ │ ├── ComplexTypeExamples.scala │ │ │ └── ConnectorOptionsExamples.scala │ ├── submit-examples-debug.sh │ ├── submit-examples-kerberos.sh │ └── submit-examples.sh └── sparklyr │ ├── README.md │ ├── run-r-example.sh │ └── sparkapp.r ├── functional-tests ├── README.md ├── build.sbt ├── default-config.sh ├── pipeline-gcs-config.sh ├── pipeline-s3-config.sh ├── project │ └── plugins.sbt ├── src │ └── main │ │ ├── resources │ │ ├── 1600ColumnTable.csv │ │ ├── 3.1.1 │ │ │ ├── _SUCCESS │ │ │ ├── col1=1 │ │ │ │ └── part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet │ │ │ ├── col1=10 │ │ │ │ └── part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet │ │ │ ├── col1=11 │ │ │ │ └── part-00002-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet │ │ │ ├── col1=12 │ │ │ │ └── part-00002-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet │ │ │ ├── col1=13 │ │ │ │ └── part-00002-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet │ │ │ ├── col1=14 │ │ │ │ └── part-00002-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet │ │ │ ├── col1=15 │ │ │ │ └── part-00002-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet │ │ │ 
├── col1=16 │ │ │ │ └── part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet │ │ │ ├── col1=17 │ │ │ │ └── part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet │ │ │ ├── col1=18 │ │ │ │ └── part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet │ │ │ ├── col1=19 │ │ │ │ └── part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet │ │ │ ├── col1=2 │ │ │ │ └── part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet │ │ │ ├── col1=20 │ │ │ │ └── part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet │ │ │ ├── col1=3 │ │ │ │ └── part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet │ │ │ ├── col1=4 │ │ │ │ └── part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet │ │ │ ├── col1=5 │ │ │ │ └── part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet │ │ │ ├── col1=6 │ │ │ │ └── part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet │ │ │ ├── col1=7 │ │ │ │ └── part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet │ │ │ ├── col1=8 │ │ │ │ └── part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet │ │ │ └── col1=9 │ │ │ │ └── part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet │ │ ├── AuditSVM.xml │ │ ├── application.conf │ │ ├── datafile-100cols-100rows.csv │ │ ├── datafile-17_2_test │ │ ├── datafile-String-Int.txt │ │ ├── date_test_file.txt │ │ ├── diffTypes.txt │ │ ├── diffTypesORC.txt │ │ └── sample_libsvm_data.txt │ │ └── scala │ │ ├── Main.scala │ │ └── com │ │ └── vertica │ │ └── spark │ │ └── functests │ │ ├── CleanupUtilTests.scala │ │ ├── HDFSTests.scala │ │ ├── JDBCTests.scala │ │ ├── LargeDataTests.scala │ │ ├── TestUtils.scala │ │ └── endtoend │ │ ├── BasicJsonReadTests.scala │ │ ├── ComplexTypeTests.scala │ │ ├── ComplexTypeTestsV10.scala │ │ ├── EndToEndTests.scala │ │ ├── RemoteTests.scala │ │ └── SparkConfig.scala ├── submit-functional-tests-debug.sh └── submit-functional-tests.sh ├── img ├── CoreArchitecture.png ├── Overview.png └── SparkInterfaces.png ├── performance-tests ├── README.md ├── build.sbt ├── project │ └── plugins.sbt └── src │ └── main │ ├── resources │ └── application.conf │ └── scala │ ├── Main.scala │ └── com │ └── vertica │ └── spark │ └── perftests │ ├── DataGenUtils.scala │ └── PerformanceTestSuite.scala └── version.properties /.github/ISSUE_TEMPLATE/BUG_REPORT.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Report an issue for the Vertica Spark Connector 4 | title: "[BUG]" 5 | labels: 'bug' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Environment 11 | - Spark version: 12 | - Hadoop version: 13 | - Vertica version: 14 | - Vertica Spark Connector version: 15 | - Java version: 16 | - Additional Environment Information: 17 | 18 | --- 19 | 20 | ## Problem Description 21 | - Describe the issue in as much details as possible, so it is possible to reproduce it. 22 | 23 | 1. Steps to reproduce: 24 | 2. Expected behaviour: 25 | 3. Actual behaviour: 26 | 4. Error message/stack trace: 27 | 5. Code sample or example on how to reproduce the issue: 28 | 29 | --- 30 | 31 | ## Spark Connector Logs 32 | - Add related logs entries here. 
33 | 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/FEATURE_REQUEST.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for the Vertica Spark Connector 4 | title: '[FEATURE]' 5 | labels: 'enhancement' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Is your feature request related to a problem? Please describe. 11 | 12 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 13 | 14 | ## Describe the solution you'd like 15 | 16 | A clear and concise description of what you want to happen. 17 | 18 | ## Describe alternatives you've considered 19 | 20 | A clear and concise description of any alternative solutions or features you've considered. 21 | 22 | ## Additional context 23 | 24 | Add any other context or screenshots about the feature request here. 25 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### Summary 2 | 3 | 4 | 5 | ### Description 6 | 7 | 8 | 9 | ### Related Issue 10 | 11 | 12 | 13 | ### Additional Reviewers 14 | 15 | 16 | -------------------------------------------------------------------------------- /.github/workflows/README.md: -------------------------------------------------------------------------------- 1 | # GitHub Workflows 2 | 3 | The following are descriptions of the workflows used in the repository. 4 | 5 | ## Main Tests 6 | 7 | The workflow `main.yml` is a reusable workflow that runs our critical tests. 8 | 9 | Currently, this includes: 10 | * Compile checks 11 | * Unit test checks 12 | * Test coverage checks: 13 | * Requires at least 80% coverage when the PR is merged 14 | * [Patch coverage](https://docs.codecov.com/docs/commit-status#patch-status) of at least 80%. Patch coverage only measures the coverage of changes made in the PR 15 | * Scalastyle checks 16 | * Integration tests against the latest Vertica. Uses the default Spark and Hadoop versions from the functional tests, which should be the latest. 17 | 18 | It is used by `on-main-push.yml`, which executes when there is a push to the `main` branch (for example, when a PR is merged). 19 | 20 | ## On Pull Request 21 | 22 | Runs `main.yml` on pull requests to `main` (when a PR is created or has content pushed to it). 23 | 24 | ## Nightly Tests 25 | 26 | The workflow `nightly.yml` runs nightly, from Monday to Friday at 9:18 AM GMT (or 2:18 AM Pacific Time), executing 27 | non-critical tests against the `main` branch. It currently performs regression testing on combinations of Spark 3.x, with 28 | the appropriate Hadoop HDFS, against Vertica 11.1.1-2 and 12.0.4-0. We also test against the latest Spark 3.x on a 29 | standalone Spark cluster. 30 | 31 | ## Weekly Tests 32 | 33 | `weekly.yml` performs weekly tests every Monday at 10:18 AM GMT (or 3:18 AM Pacific Time), executing the following tests: 34 | * Integration tests against different intermediary file stores: 35 | * S3, using a MinIO object store container to mimic S3 36 | * GCS, against an actual GCS bucket provided by Vertica. We have not yet found a way to mock a GCS environment 37 | * Testing the `json` option against Spark 3.x 38 | * Tests against Vertica 10.1.1-0 39 | 40 | Unless otherwise specified, all tests use the latest Vertica Docker image. This helps us catch breaking changes early. 41 | 42 | ## Auto Triage and Remove Issue 43 | 44 | When an issue is labeled with a priority, the `auto-triage.yml` workflow moves it to the backlog, into the respective 45 | priority column. 46 | 47 | The `remove-issue.yml` workflow triggers when an issue is closed, removing it from the backlog. 48 |
-------------------------------------------------------------------------------- /.github/workflows/auto-triage.yml: -------------------------------------------------------------------------------- 1 | name: Move labeled issue 2 | on: 3 | issues: 4 | types: 5 | - labeled 6 | workflow_dispatch: 7 | jobs: 8 | move-low-priority: 9 | if: github.event.label.name == 'Low Priority' 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: alex-page/github-project-automation-plus@v0.8.1 13 | with: 14 | project: Backlog 15 | column: Low Priority 16 | repo-token: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 17 | 18 | move-normal-priority: 19 | if: github.event.label.name == 'Normal Priority' 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: alex-page/github-project-automation-plus@v0.8.1 23 | with: 24 | project: Backlog 25 | column: Normal Priority 26 | repo-token: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 27 | 28 | move-high-priority: 29 | if: github.event.label.name == 'High Priority' 30 | runs-on: ubuntu-latest 31 | steps: 32 | - uses: alex-page/github-project-automation-plus@v0.8.1 33 | with: 34 | project: Backlog 35 | column: High Priority 36 | repo-token: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 37 |
-------------------------------------------------------------------------------- /.github/workflows/on-main-push.yml: -------------------------------------------------------------------------------- 1 | name: main 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | workflow_dispatch: 7 | 8 | jobs: 9 | functional-tests: 10 | uses: ./.github/workflows/main.yml
-------------------------------------------------------------------------------- /.github/workflows/on-pull-request.yml: -------------------------------------------------------------------------------- 1 | name: On Pull Requests 2 | 3 | on: 4 | pull_request: 5 | branches: [ main ] 6 | workflow_dispatch: 7 | 8 | jobs: 9 | functional-tests: 10 | uses: ./.github/workflows/main.yml
-------------------------------------------------------------------------------- /.github/workflows/remove-issue.yml: -------------------------------------------------------------------------------- 1 | name: Remove Closed Issue 2 | on: 3 | issues: 4 | types: 5 | - closed 6 | workflow_dispatch: 7 | jobs: 8 | remove-low-priority: 9 | if: contains(github.event.issue.labels.*.name, 'Low Priority') 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: alex-page/github-project-automation-plus@v0.8.1 13 | with: 14 | project: Backlog 15 | column: Low Priority 16 | repo-token: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 17 | action: delete 18 | 19 | remove-normal-priority: 20 | if: contains(github.event.issue.labels.*.name, 'Normal Priority') 21 | runs-on: ubuntu-latest 22 | steps: 23 | - uses: alex-page/github-project-automation-plus@v0.8.1 24 | with: 25 | project: Backlog 26 | column: Normal Priority 27 | repo-token: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 28 | action: delete 29 | 30 | remove-high-priority: 31 | if: contains(github.event.issue.labels.*.name, 'High Priority') 32 | runs-on: ubuntu-latest 33 | steps: 34 | - uses: alex-page/github-project-automation-plus@v0.8.1 35 | with: 36 | project: Backlog 37 | column: High Priority 38 | repo-token: ${{
secrets.PERSONAL_ACCESS_TOKEN }} 39 | action: delete 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # sbt-specific 2 | .bsp/ 3 | target/ 4 | 5 | .bloop/ 6 | .idea/ 7 | .metals/ 8 | .vscode/ 9 | *.iml 10 | 11 | .DS_Store 12 | *.class 13 | *.log 14 | *.jar 15 | 16 | connector/project/project/ 17 | connector/.scannerwork/ 18 | functional-tests/lib/ 19 | functional-tests/project/build.properties 20 | functional-tests/project/project/ 21 | examples/*/lib/* 22 | examples/*/project/build.properties 23 | examples/*/project/project/ 24 | examples/jupyter/.ipynb_checkpoints 25 | 26 | docker/.env 27 | docker/keytabs/*.keytab 28 | !docker/keytabs/.gitkeep 29 | docker/vertica-hdfs-config/hadoop-kerberized/*.cert 30 | 31 | # Scala .gitignore 32 | /lib/*.jar 33 | /test/files/codelib/*.jar 34 | /test/files/lib/*.jar 35 | /test/files/speclib/instrumented.jar 36 | /tools/*.jar 37 | 38 | # Developer specific properties 39 | /**/build.properties 40 | /buildcharacter.properties 41 | 42 | # might get generated when testing Jenkins scripts locally 43 | /jenkins.properties 44 | 45 | # target directory for build 46 | /build/ 47 | 48 | # other 49 | /out/ 50 | /bin/ 51 | /sandbox/ 52 | 53 | # intellij 54 | /src/intellij*/*.iml 55 | /src/intellij*/*.ipr 56 | /src/intellij*/*.iws 57 | **/.cache 58 | /.idea 59 | /.settings 60 | 61 | # vscode 62 | /.vscode 63 | 64 | # Standard symbolic link to build/quick/bin 65 | /qbin 66 | 67 | # sbt's target directories 68 | /target/ 69 | /project/**/target/ 70 | /test/macro-annot/target/ 71 | /test/files/target/ 72 | /test/target/ 73 | /build-sbt/ 74 | local.sbt 75 | jitwatch.out 76 | 77 | # Used by the restarr/restarrFull commands as target directories 78 | /build-restarr/ 79 | /target-restarr/ 80 | 81 | # metals 82 | .metals 83 | .bloop 84 | **/project/**/metals.sbt 85 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | project: 4 | default: 5 | # basic 6 | target: 80 7 | threshold: 1% 8 | patch: 9 | default: 10 | target: 80 11 | -------------------------------------------------------------------------------- /connector/.java-version: -------------------------------------------------------------------------------- 1 | 1.8 2 | -------------------------------------------------------------------------------- /connector/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.5.5 2 | -------------------------------------------------------------------------------- /connector/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += "Artima Maven Repository" at "https://repo.artima.com/releases" 2 | 3 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0") 4 | addSbtPlugin("com.artima.supersafe" % "sbtplugin" % "1.1.12") 5 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.1") 6 | addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.2") 7 | addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.8.1") 8 | addSbtPlugin("com.github.mwz" % "sbt-sonar" % "2.2.0") 9 | addSbtPlugin("com.sksamuel.scapegoat" % "sbt-scapegoat" % "1.1.0") 10 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0") 11 | addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % 
"0.9.0") 12 | addDependencyTreePlugin 13 | 14 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/config/JDBCConfig.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.config 15 | 16 | import com.vertica.spark.datasource.core.TLSMode 17 | 18 | /** 19 | * Represents any config necessary for authenticating to JDBC. 20 | * 21 | * Abstract as there are multiple possible methods of authentication. 22 | */ 23 | sealed trait JdbcAuth { 24 | def user: String 25 | } 26 | 27 | /** 28 | * Authentication to Vertica using username and password 29 | */ 30 | case class BasicJdbcAuth(username: String, password: String) extends JdbcAuth { 31 | override def user: String = username 32 | } 33 | 34 | /** 35 | * Authentication using kerberos 36 | * @param kerberosServiceName the Kerberos service name, as specified when creating the service principal 37 | * @param kerberosHostname the Kerberos host name, as specified when creating the service principal 38 | * @param jaasConfigName the name of the JAAS configuration used for Kerberos authentication 39 | */ 40 | case class KerberosAuth(username: String, 41 | kerberosServiceName: String, 42 | kerberosHostname: String, 43 | jaasConfigName: String) extends JdbcAuth { 44 | override def user: String = username 45 | } 46 | 47 | /** 48 | * Configuration for a JDBC connection to Vertica. 49 | * 50 | * @param host hostname for the JDBC connection 51 | * @param port port for the JDBC connection 52 | * @param db name of the Vertica database to connect to 53 | * @param auth the authentication details, varies depending on method used 54 | * @param tlsConfig the TLS configuration settings for the JDBC connection 55 | * @param backupServerNodes the comma separates list of vertica backup nodes. The host name or IP can optionally be 56 | * followed by a colon and a port number. If not supplied, defaults to the standard Vertica 57 | * port number (5433). To list multiple hosts, separate them by a comma. 58 | */ 59 | final case class JDBCConfig(host: String, 60 | port: Int, 61 | db: String, 62 | auth: JdbcAuth, 63 | tlsConfig: JDBCTLSConfig, 64 | backupServerNodes: Option[String] = None) 65 | 66 | /** 67 | * TLS configuration settings for a JDBC connection to Vertica. 
68 | * 69 | * @param tlsMode flag indicating whether to enable TLS for the connection or not 70 | * @param keyStorePath path to the key store 71 | * @param keyStorePassword password for the key store 72 | * @param trustStorePath path to the trust store 73 | * @param trustStorePassword password for the trust store 74 | */ 75 | case class JDBCTLSConfig(tlsMode: TLSMode, 76 | keyStorePath: Option[String], 77 | keyStorePassword: Option[String], 78 | trustStorePath: Option[String], 79 | trustStorePassword: Option[String]) 80 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/config/LogProvider.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.config 15 | 16 | import com.typesafe.scalalogging.Logger 17 | 18 | /** 19 | * Used to provide a logger for a given class, configured with a given log level. 20 | */ 21 | case object LogProvider { 22 | def getLogger(c: Class[_]): Logger = Logger(c) 23 | def getLogger(obj: Object): Logger = Logger(obj.getClass) 24 | } 25 | 26 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/config/TableName.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.config 15 | 16 | object EscapeUtils { 17 | def sqlEscape(str: String, char: Char = '\"'): String = { 18 | val c = char.toString 19 | str.replace(c, c + c) 20 | } 21 | 22 | def sqlEscapeAndQuote(str: String): String = { 23 | "\"" + sqlEscape(str) + "\"" 24 | } 25 | } 26 | 27 | /** 28 | * Parent trait representing a set of data being read from 29 | */ 30 | trait TableSource { 31 | /** 32 | * Get a unique identifier for the operation. 33 | * 34 | * This value is used in a filepath. 35 | */ 36 | def identifier : String 37 | } 38 | 39 | /** 40 | * Represents a fully qualified tablename in Vertica. 41 | * 42 | * @param name Name of the table 43 | * @param dbschema Optionally, the schema of the table. Public schema will be assumed if not specified. 
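 *
 * For illustration (hypothetical table and schema names), the escaping and quoting behaves as follows:
 * {{{
 * TableName("my_table", None).getFullTableName                // returns "my_table" (with the double quotes)
 * TableName("my_table", Some("my_schema")).getFullTableName   // returns "my_schema"."my_table"
 * }}}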
44 | */ 45 | final case class TableName(name: String, dbschema: Option[String]) extends TableSource { 46 | 47 | /** 48 | * Returns the full name of the table, escaped and surrounded with double quotes to prevent injection 49 | * and allow for special characters. 50 | */ 51 | def getFullTableName : String = { 52 | dbschema match { 53 | case None => EscapeUtils.sqlEscapeAndQuote(name) 54 | case Some(schema) => EscapeUtils.sqlEscapeAndQuote(schema) + "." + EscapeUtils.sqlEscapeAndQuote(name) 55 | } 56 | } 57 | 58 | def getTableName : String = EscapeUtils.sqlEscapeAndQuote(name) 59 | 60 | def getDbSchema : String = { 61 | dbschema match { 62 | case None => "" 63 | case Some(schema) => EscapeUtils.sqlEscapeAndQuote(schema) 64 | } 65 | } 66 | 67 | /** 68 | * The table's name is used as an identifier for the operation. 69 | */ 70 | override def identifier: String = name 71 | } 72 | 73 | final case class TableQuery(query: String, uniqueId: String, dbSchema: Option[String]) extends TableSource { 74 | override def identifier: String = uniqueId 75 | } 76 | 77 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/config/VerticaMetadata.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.config 15 | 16 | import org.apache.spark.sql.types._ 17 | import com.vertica.spark.util.version.Version 18 | 19 | /** 20 | * Abstract trait for passing metadata of a table retrieved from Vertica. 21 | */ 22 | trait VerticaMetadata 23 | 24 | /** 25 | * Metadata for read operation. 26 | * @param schema Schema of the table being read in Vertica. 27 | * @param version Version of Vertica being used. 28 | */ 29 | final case class VerticaReadMetadata(schema: StructType, version: Version) extends VerticaMetadata 30 | 31 | /** 32 | * Empty class; No metadata retrieval required for current write operation. 33 | */ 34 | final case class VerticaWriteMetadata() extends VerticaMetadata 35 | 36 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/datasource/VerticaDatasourceV2.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.datasource 15 | 16 | import com.vertica.spark.datasource.v2._ 17 | import com.vertica.spark.util.error.{ConnectorException, MissingSparkSessionError} 18 | import org.apache.spark.sql.connector.catalog._ 19 | import org.apache.spark.sql.connector.expressions.Transform 20 | import org.apache.spark.sql.types._ 21 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 22 | import org.apache.spark.sql.SparkSession 23 | 24 | import java.util 25 | import scala.collection.JavaConverters._ 26 | 27 | /** 28 | * Entry-Point for Spark V2 Datasource. 29 | * 30 | * Implements Spark V2 datasource class [[http://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/connector/catalog/TableProvider.html here]] 31 | * 32 | * This and the tree of classes returned by it are to be kept light, and hook into the core of the connector 33 | */ 34 | class VerticaSource extends TableProvider with SupportsCatalogOptions { 35 | 36 | /** 37 | * Used for read operation to get the schema for the table being read from 38 | * 39 | * @param caseInsensitiveStringMap A string map of options that was passed in by user to datasource 40 | * @return The table's schema in spark StructType format 41 | */ 42 | override def inferSchema(caseInsensitiveStringMap: CaseInsensitiveStringMap): StructType = { 43 | val table = getTable(schema = StructType(Nil), partitioning = Array.empty[Transform], properties = caseInsensitiveStringMap) 44 | table.schema() 45 | } 46 | 47 | /** 48 | * Gets the structure representing a Vertica table 49 | * 50 | * @param schema StructType representing table schema, used for write 51 | * @param partitioning specified partitioning for the table 52 | * @param properties A string map of options that was passed in by user to datasource 53 | * @return [[VerticaTable]] 54 | */ 55 | override def getTable(schema: StructType, 56 | partitioning: Array[Transform], 57 | properties: util.Map[String, String]): Table = { 58 | new VerticaTable(new CaseInsensitiveStringMap(properties)) 59 | } 60 | 61 | 62 | override def extractIdentifier(options: CaseInsensitiveStringMap): Identifier = { 63 | val name = options.asScala.toMap.getOrElse("table", "") 64 | Identifier.of(Array[String](), name) 65 | } 66 | 67 | private val CATALOG_NAME = VerticaDatasourceV2Catalog.NAME 68 | override def extractCatalog(options: CaseInsensitiveStringMap): String = { 69 | // Add all passed in options to spark catalog options 70 | VerticaDatasourceV2Catalog.setOptions(options) 71 | 72 | // Set the spark conf for catalog class 73 | SparkSession.getActiveSession match { 74 | case Some(session) => session.conf.set("spark.sql.catalog." + CATALOG_NAME, "com.vertica.spark.datasource.v2.VerticaDatasourceV2Catalog") 75 | case None => throw new ConnectorException(MissingSparkSessionError()) 76 | } 77 | 78 | CATALOG_NAME 79 | } 80 | 81 | } 82 | 83 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/datasource/core/DSWriter.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 
4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.datasource.core 15 | 16 | import com.typesafe.scalalogging.Logger 17 | import com.vertica.spark.config._ 18 | import com.vertica.spark.datasource.core.factory.{VerticaPipeFactory, VerticaPipeFactoryInterface} 19 | import com.vertica.spark.util.error.ErrorHandling.ConnectorResult 20 | import org.apache.spark.sql.catalyst.InternalRow 21 | 22 | /** 23 | * Interface responsible for writing to the Vertica source. 24 | * 25 | * This interface is instantiated and called from each Spark worker. 26 | */ 27 | trait DSWriterInterface { 28 | /** 29 | * Called before writing to perform any needed setup with the given configuration. 30 | */ 31 | def openWrite(): ConnectorResult[Unit] 32 | 33 | /** 34 | * Called to write an individual row to the datasource. 35 | */ 36 | def writeRow(row: InternalRow): ConnectorResult[Unit] 37 | 38 | /** 39 | * Called from the executor to clean up the individual write operation 40 | */ 41 | def closeWrite(): ConnectorResult[Unit] 42 | 43 | /** 44 | * Called by the driver to commit all the write results 45 | */ 46 | def commitRows(): ConnectorResult[Unit] 47 | } 48 | 49 | /** 50 | * Writer class, agnostic to the kind of pipe used for the operation (which VerticaPipe implementation is used) 51 | * 52 | * @param config Configuration data defining the write operation. 53 | * @param uniqueId Unique identifier for this specific writer. The writer for each partition should have a different ID. 54 | * @param pipeFactory Factory returning the underlying implementation of a pipe between us and Vertica, to use for write. 55 | * @param isOnDriver true if the writer will be executed by a driver.
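 *
 * A minimal sketch of the intended call sequence; `config`, `uniqueId` and `rows` are hypothetical,
 * and the ConnectorResult returned by each call is ignored here for brevity:
 * {{{
 * val writer = new DSWriter(config, uniqueId, isOnDriver = false)
 * writer.openWrite()
 * rows.foreach(row => writer.writeRow(row)) // rows are buffered and flushed in blocks
 * writer.closeWrite()                       // flushes remaining rows and ends the partition write
 * // On the driver side, a writer commits all results once every partition has finished:
 * writer.commitRows()
 * }}}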
56 | */ 57 | class DSWriter(config: WriteConfig, uniqueId: String, pipeFactory: VerticaPipeFactoryInterface = VerticaPipeFactory, isOnDriver: Boolean) extends DSWriterInterface { 58 | private val logger: Logger = LogProvider.getLogger(classOf[DSWriter]) 59 | private val thread = Thread.currentThread().getName + ": " 60 | logger.debug(thread + "Initializing writer") 61 | 62 | private val pipe = pipeFactory.getWritePipe(config, isOnDriver) 63 | private var blockSize = 0L 64 | 65 | private var data = List[InternalRow]() 66 | 67 | def openWrite(): ConnectorResult[Unit] = { 68 | for { 69 | size <- pipe.getDataBlockSize 70 | _ <- pipe.startPartitionWrite(uniqueId) 71 | _ = this.blockSize = size 72 | } yield () 73 | } 74 | 75 | def writeRow(row: InternalRow): ConnectorResult[Unit] = { 76 | data = data :+ row 77 | if(data.length >= blockSize) { 78 | pipe.writeData(DataBlock(data)) match { 79 | case Right(_) => 80 | data = List[InternalRow]() 81 | Right(()) 82 | case Left(errors) => Left(errors) 83 | } 84 | } 85 | else { 86 | Right(()) 87 | } 88 | } 89 | 90 | def closeWrite(): ConnectorResult[Unit] = { 91 | if(data.nonEmpty) { 92 | val ret = pipe.writeData(DataBlock(data)) 93 | pipe.endPartitionWrite() 94 | ret 95 | } 96 | else { 97 | pipe.endPartitionWrite() 98 | } 99 | } 100 | 101 | def commitRows(): ConnectorResult[Unit] = { 102 | val ret = pipe.commit() 103 | // Ensure all connections are closed, including read connections used by the write operation 104 | val _ = pipeFactory.closeJdbcLayers() 105 | ret 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/datasource/core/SessionId.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.datasource.core 15 | 16 | /** 17 | * Interface for getting a unique session ID 18 | */ 19 | trait SessionIdInterface { 20 | def getId : String 21 | } 22 | 23 | /** 24 | * Implementation generating unique session ID 25 | */ 26 | object SessionId extends SessionIdInterface { 27 | def getId : String = java.util.UUID.randomUUID.toString.replace("-", "_") 28 | } 29 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/datasource/json/JsonBatchFactory.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 
4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.datasource.json 15 | 16 | import com.vertica.spark.config.ReadConfig 17 | import com.vertica.spark.datasource.wrappers.json.VerticaJsonTableWrapper 18 | import org.apache.spark.sql.SparkSession 19 | import org.apache.spark.sql.connector.read.Batch 20 | import org.apache.spark.sql.execution.datasources.json.JsonFileFormat 21 | import org.apache.spark.sql.execution.datasources.v2.json.JsonTable 22 | import org.apache.spark.sql.types.StructType 23 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 24 | 25 | import scala.collection.JavaConverters.mapAsJavaMapConverter 26 | 27 | class JsonBatchFactory { 28 | def build(filePath: String, schema: Option[StructType], readConfig: ReadConfig, sparkSession: SparkSession): Batch = { 29 | val paths = List(filePath) 30 | val options = CaseInsensitiveStringMap.empty() 31 | val fallback = classOf[JsonFileFormat] 32 | val jsonTable = JsonTable("Vertica Table", sparkSession, options, paths, schema, fallback) 33 | val verticaJsonTable = new VerticaJsonTableWrapper(jsonTable, readConfig) 34 | val builderOpts = new CaseInsensitiveStringMap(Map[String, String]().asJava) 35 | verticaJsonTable.newScanBuilder(builderOpts).build().toBatch 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/datasource/json/VerticaJsonScan.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.datasource.json 15 | 16 | import com.vertica.spark.config.{DistributedFilesystemReadConfig, LogProvider, ReadConfig} 17 | import com.vertica.spark.datasource.core.{DSConfigSetupInterface, TableMetaInterface} 18 | import com.vertica.spark.datasource.fs.FileStoreLayerInterface 19 | import com.vertica.spark.datasource.v2.VerticaScan 20 | import com.vertica.spark.util.cleanup.CleanupUtils 21 | import com.vertica.spark.util.error.{ErrorHandling, InitialSetupPartitioningError} 22 | import org.apache.spark.sql.SparkSession 23 | import org.apache.spark.sql.connector.read.{Batch, InputPartition, PartitionReaderFactory, Scan} 24 | import org.apache.spark.sql.types.StructType 25 | 26 | /** 27 | * We support reading JSON files by re-using Spark's JSON support implemented in [[JsonTable]]. 
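 *
 * The scan first has Vertica export the requested data as JSON files under a staging path
 * (see `performInitialSetup` below), then delegates partition planning and reading to a Spark
 * JSON [[Batch]] built over that path by [[JsonBatchFactory]].
 *
 * For illustration only (option keys are assumed here rather than defined in this file), a read
 * that goes through this scan might be started as:
 * {{{
 * spark.read.format("com.vertica.spark.datasource.VerticaSource")
 *   .option("table", "my_table") // plus the usual connection and staging options
 *   .option("json", "true")      // assumed key for enabling the JSON intermediary format
 *   .load()
 * }}}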
28 | * */ 29 | class VerticaJsonScan(config: ReadConfig, readConfigSetup: DSConfigSetupInterface[ReadConfig] with TableMetaInterface[ReadConfig], batchFactory: JsonBatchFactory, fsLayer: FileStoreLayerInterface) extends Scan with Batch { 30 | 31 | private val logger = LogProvider.getLogger(classOf[VerticaScan]) 32 | 33 | private val jsonReadConfig = config match { 34 | case cfg: DistributedFilesystemReadConfig => 35 | val copied = cfg.copy(useJson = true) 36 | copied.setGroupBy(cfg.getGroupBy) 37 | copied.setPushdownAgg(cfg.isAggPushedDown) 38 | copied.setPushdownFilters(cfg.getPushdownFilters) 39 | copied.setRequiredSchema(cfg.getRequiredSchema) 40 | copied 41 | case _ => config 42 | } 43 | 44 | private lazy val batch: Batch = { 45 | // Export JSON before initializing Spark's JSON support. 46 | readConfigSetup.performInitialSetup(jsonReadConfig) match { 47 | case Left(err) => ErrorHandling.logAndThrowError(logger, err) 48 | case Right(opt) => opt match { 49 | case None => ErrorHandling.logAndThrowError(logger, InitialSetupPartitioningError()) 50 | case Some(partitionInfo) => 51 | val sparkSession = SparkSession.getActiveSession.getOrElse(ErrorHandling.logAndThrowError(logger, InitialSetupPartitioningError())) 52 | val batch = batchFactory.build(partitionInfo.rootPath, Some(readSchema()), jsonReadConfig, sparkSession) 53 | 54 | val files = fsLayer.getFileList(partitionInfo.rootPath).getOrElse(ErrorHandling.logAndThrowError(logger, InitialSetupPartitioningError())) 55 | if (files.isEmpty) { 56 | new CleanupUtils().cleanupAll(fsLayer, partitionInfo.rootPath) 57 | } 58 | batch 59 | } 60 | } 61 | } 62 | 63 | override def readSchema(): StructType = { 64 | (readConfigSetup.getTableMetadata(config), jsonReadConfig.getRequiredSchema) match { 65 | case (Right(metadata), requiredSchema) => if (requiredSchema.nonEmpty) { requiredSchema } else { metadata.schema } 66 | case (Left(err), _) => ErrorHandling.logAndThrowError(logger, err) 67 | } 68 | } 69 | 70 | override def planInputPartitions(): Array[InputPartition] = batch.planInputPartitions() 71 | 72 | override def createReaderFactory(): PartitionReaderFactory = batch.createReaderFactory() 73 | 74 | override def toBatch: Batch = this 75 | 76 | } 77 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/datasource/partitions/file/PartitionedFileIdentity.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 
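// PartitionedFileIdentity pairs a file path with an index identifying which portion of that
// file a particular reader was assigned, so the cleanup machinery can track consumed portions.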
13 | 14 | package com.vertica.spark.datasource.partitions.file 15 | 16 | import com.vertica.spark.datasource.partitions.mixin.Identifiable 17 | 18 | case class PartitionedFileIdentity(filename: String, index: Long) extends Identifiable 19 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/datasource/partitions/file/VerticaFilePartition.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.datasource.partitions.file 15 | 16 | import com.vertica.spark.datasource.partitions.mixin.{Cleanup, Identifiable} 17 | import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile} 18 | 19 | /** 20 | * Extends Spark's FilePartition to hold extra partitioning data. 21 | * 22 | * @param partitioningRecords A record of the partition count for all file partitions created, with the key being the 23 | * file path. 24 | * */ 25 | class VerticaFilePartition(override val index: Int, 26 | override val files: Array[PartitionedFile], 27 | val filesIdentity: Array[PartitionedFileIdentity], 28 | val partitioningRecords: Map[String, Int]) 29 | extends FilePartition(index, files) with Cleanup { 30 | 31 | override def getPortions: Seq[Identifiable] = this.filesIdentity 32 | 33 | override def getPartitioningRecord: Map[String, Int] = this.partitioningRecords 34 | } 35 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/datasource/partitions/mixin/Cleanup.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License.
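// Cleanup is mixed into the connector's InputPartition implementations (VerticaFilePartition,
// VerticaDistributedFilesystemPartition) so that, once a partition has been read, the reader
// wrapper can look up which file portions it covered and clean up the staging files.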
13 | 14 | package com.vertica.spark.datasource.partitions.mixin 15 | 16 | import org.apache.spark.sql.connector.read.InputPartition 17 | 18 | /** 19 | * Mixin trait for [[InputPartition]] that contains information for cleanup 20 | * */ 21 | trait Cleanup { 22 | 23 | /** 24 | * @return returns any [[Identifiable]] object 25 | * */ 26 | def getPortions: Seq[Identifiable] 27 | 28 | /** 29 | * @return return a mapping of filename to their portion count 30 | * */ 31 | def getPartitioningRecord: Map[String, Int] 32 | } 33 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/datasource/partitions/mixin/Identifiable.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.datasource.partitions.mixin 15 | 16 | /** 17 | * Mixin trait for data portion containing information that identify itself amongst other portions. 18 | * */ 19 | trait Identifiable { 20 | 21 | /** 22 | * @return the name of the file the portion belongs to 23 | * */ 24 | def filename: String 25 | 26 | /** 27 | * @return the portion's index amongst the other portions of a file. 28 | * */ 29 | def index: Long 30 | } 31 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/datasource/partitions/parquet/ParquetFileRange.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.datasource.partitions.parquet 15 | 16 | import com.vertica.spark.datasource.partitions.mixin.Identifiable 17 | 18 | /** 19 | * Represents a portion of a parquet file 20 | * 21 | * @param filename Full path with name of the parquet file 22 | * @param minRowGroup First row group to read from parquet file 23 | * @param maxRowGroup Last row group to read from parquet file 24 | * @param rangeIdx Range index for this file. Used to track access to this file / cleanup among different nodes. 
25 | * If there are three ranges for a given file this will be a value between 0 and 2 26 | */ 27 | final case class ParquetFileRange(filename: String, minRowGroup: Int, maxRowGroup: Int, rangeIdx: Int) extends Identifiable { 28 | 29 | override def index: Long = this.rangeIdx 30 | } 31 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/datasource/partitions/parquet/VerticaDistributedFilesystemPartition.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.datasource.partitions.parquet 15 | 16 | import com.vertica.spark.datasource.core.VerticaPartition 17 | import com.vertica.spark.datasource.partitions.mixin.{Cleanup, Identifiable} 18 | 19 | /** 20 | * Partition for distributed filesystem transport method using parquet files 21 | * 22 | * @param fileRanges List of files and ranges of row groups to read for those files 23 | * @param rangeCountMap Map representing how many file ranges exist for each file. Used for tracking and cleanup. 24 | */ 25 | final case class VerticaDistributedFilesystemPartition(fileRanges: Seq[ParquetFileRange], rangeCountMap: Map[String, Int]) 26 | extends VerticaPartition with Cleanup { 27 | override def getPortions: Seq[Identifiable] = this.fileRanges 28 | 29 | override def getPartitioningRecord: Map[String, Int] = this.rangeCountMap 30 | } 31 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/datasource/wrappers/PartitionReaderWrapper.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.datasource.wrappers 15 | 16 | import com.vertica.spark.config.LogProvider 17 | import com.vertica.spark.datasource.partitions.mixin.Cleanup 18 | import com.vertica.spark.util.cleanup.DistributedFilesCleaner 19 | import org.apache.spark.sql.catalyst.InternalRow 20 | import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader} 21 | 22 | /** 23 | * Wraps a [[PartitionReader]], allowing us to intercept it's methods and add additional functionalities. 
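 *
 * In particular, close() first closes the wrapped reader and then asks the
 * [[DistributedFilesCleaner]] to remove any intermediate files recorded for this partition.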
24 | * */ 25 | class PartitionReaderWrapper(val reader: PartitionReader[InternalRow], 26 | val partitions: Cleanup, 27 | val cleaner: DistributedFilesCleaner) 28 | extends PartitionReader[InternalRow] { 29 | 30 | private val logger = LogProvider.getLogger(classOf[PartitionReaderWrapper]) 31 | 32 | override def next(): Boolean = reader.next() 33 | 34 | override def get(): InternalRow = reader.get() 35 | 36 | override def close(): Unit = { 37 | reader.close() 38 | cleaner.cleanupFiles(partitions) 39 | logger.info("Cleaning up") 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/datasource/wrappers/PartitionReaderWrapperFactory.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.datasource.wrappers 15 | 16 | import com.vertica.spark.config.{DistributedFilesystemReadConfig, ReadConfig} 17 | import com.vertica.spark.datasource.partitions.mixin.Cleanup 18 | import com.vertica.spark.util.cleanup.{CleanupUtils, DistributedFilesCleaner} 19 | import org.apache.spark.sql.catalyst.InternalRow 20 | import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory} 21 | 22 | /** 23 | * Wraps a [[PartitionReaderFactory]] so it will create a [[PartitionReaderWrapper]] 24 | * 25 | * planInputPartition() will also record partitioning information. 26 | * */ 27 | class PartitionReaderWrapperFactory(val readerFactory: PartitionReaderFactory, val config: ReadConfig) 28 | extends PartitionReaderFactory { 29 | 30 | override def createReader(inputPartition: InputPartition): PartitionReader[InternalRow] = { 31 | config match { 32 | case readConfig: DistributedFilesystemReadConfig => 33 | val reader = readerFactory.createReader(inputPartition) 34 | val partition = inputPartition.asInstanceOf[Cleanup] 35 | val cleaner = new DistributedFilesCleaner(readConfig, new CleanupUtils) 36 | new PartitionReaderWrapper(reader, partition, cleaner) 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/datasource/wrappers/VerticaScanWrapper.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.datasource.wrappers 15 | 16 | import com.vertica.spark.config.ReadConfig 17 | import com.vertica.spark.datasource.partitions.file.{PartitionedFileIdentity, VerticaFilePartition} 18 | import org.apache.spark.sql.connector.read.{Batch, InputPartition, PartitionReaderFactory, Scan} 19 | import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile} 20 | import org.apache.spark.sql.types.StructType 21 | import org.apache.spark.util.SerializableConfiguration 22 | 23 | /** 24 | * Wraps a [[Scan]] so that it will create a [[PartitionReaderWrapperFactory]] 25 | * 26 | * planInputPartitions() will also record partitioning information. 27 | * */ 28 | class VerticaScanWrapper(val scan: Scan, val config: ReadConfig) extends Scan with Batch { 29 | 30 |   override def readSchema(): StructType = scan.readSchema() 31 | 32 |   /** 33 | * Calls the wrapped scan to plan inputs, then processes them into [[VerticaFilePartition]]s with partitioning info 34 | * */ 35 |   override def planInputPartitions(): Array[InputPartition] = { 36 |     val partitioningCounts = scala.collection.mutable.Map[String, Int]() 37 | 38 |     def makeFilesIdentity(files: Array[PartitionedFile]): Array[PartitionedFileIdentity] = { 39 |       // Record each file in the count and create an identity for each 40 |       files.map(file => { 41 |         val key = file.filePath.toString 42 |         val count = partitioningCounts.getOrElse(key, 0) 43 |         partitioningCounts.put(key, count + 1) 44 |         PartitionedFileIdentity(key, file.start) 45 |       }) 46 |     } 47 | 48 |     scan.toBatch.planInputPartitions() 49 |       .map(partition => partition.asInstanceOf[FilePartition]) 50 |       .map(filePartition => (filePartition, makeFilesIdentity(filePartition.files))) 51 |       .map(result => { 52 |         val (filePartition, fileIdentities) = result 53 |         new VerticaFilePartition(filePartition.index, filePartition.files, fileIdentities, partitioningCounts.toMap) 54 |       }) 55 |   } 56 | 57 |   override def createReaderFactory(): PartitionReaderFactory = { 58 |     new PartitionReaderWrapperFactory(scan.toBatch.createReaderFactory(), config) 59 |   } 60 | 61 |   override def toBatch: Batch = this 62 | } 63 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/datasource/wrappers/VerticaScanWrapperBuilder.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License.
13 | 14 | package com.vertica.spark.datasource.wrappers 15 | 16 | import com.vertica.spark.config.ReadConfig 17 | import org.apache.spark.sql.connector.read.{Scan, ScanBuilder} 18 | 19 | /** 20 | * Wraps a [[ScanBuilder]] to create a [[VerticaScanWrapper]] 21 | * */ 22 | class VerticaScanWrapperBuilder(val builder: ScanBuilder, val config: ReadConfig) extends ScanBuilder { 23 |   override def build(): Scan = new VerticaScanWrapper(builder.build(), config) 24 | } 25 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/datasource/wrappers/json/VerticaJsonTableWrapper.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.datasource.wrappers.json 15 | 16 | import com.vertica.spark.config.{DistributedFilesystemReadConfig, ReadConfig} 17 | import com.vertica.spark.datasource.wrappers.VerticaScanWrapperBuilder 18 | import org.apache.spark.sql.connector.catalog.{SupportsRead, Table, TableCapability} 19 | import org.apache.spark.sql.connector.read.ScanBuilder 20 | import org.apache.spark.sql.execution.datasources.v2.json.JsonTable 21 | import org.apache.spark.sql.types.StructType 22 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 23 | 24 | import java.util 25 | 26 | 27 | /** 28 | * Wraps a [[JsonTable]] so that it will create a [[VerticaScanWrapperBuilder]]. 29 | * */ 30 | class VerticaJsonTableWrapper(val jsonTable: JsonTable, config: ReadConfig) extends Table with SupportsRead { 31 |   override def name(): String = "Vertica" + jsonTable.name 32 | 33 |   override def schema(): StructType = jsonTable.schema 34 | 35 |   override def capabilities(): util.Set[TableCapability] = jsonTable.capabilities() 36 | 37 |   override def newScanBuilder(caseInsensitiveStringMap: CaseInsensitiveStringMap): ScanBuilder = 38 |     new VerticaScanWrapperBuilder(jsonTable.newScanBuilder(caseInsensitiveStringMap), config) 39 | } 40 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/parquet/ParquetRecordMaterializer.scala: -------------------------------------------------------------------------------- 1 | // scalastyle:off 2 | /* 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements.  See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License.
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package org.apache.spark.sql.execution.datasources.parquet.vertica 20 | 21 | import java.time.ZoneId 22 | 23 | import org.apache.parquet.io.api.{GroupConverter, RecordMaterializer} 24 | import org.apache.parquet.schema.MessageType 25 | 26 | import org.apache.spark.sql.catalyst.InternalRow 27 | import org.apache.spark.sql.internal.LegacyBehaviorPolicy 28 | import org.apache.spark.sql.types.StructType 29 | 30 | /** 31 | * A [[RecordMaterializer]] for Catalyst rows. 32 | * 33 | * @param parquetSchema Parquet schema of the records to be read 34 | * @param catalystSchema Catalyst schema of the rows to be constructed 35 | * @param schemaConverter A Parquet-Catalyst schema converter that helps initializing row converters 36 | * @param convertTz the optional time zone to convert to int96 data 37 | * @param datetimeRebaseMode the mode of rebasing date/timestamp from Julian to Proleptic Gregorian 38 | * calendar 39 | * @param int96RebaseMode the mode of rebasing INT96 timestamp from Julian to Proleptic Gregorian 40 | * calendar 41 | */ 42 | private[parquet] class ParquetRecordMaterializer( 43 | parquetSchema: MessageType, 44 | catalystSchema: StructType, 45 | schemaConverter: ParquetToSparkSchemaConverter, 46 | convertTz: Option[ZoneId], 47 | datetimeRebaseMode: LegacyBehaviorPolicy.Value, 48 | int96RebaseMode: LegacyBehaviorPolicy.Value) 49 | extends RecordMaterializer[InternalRow] { 50 | 51 | private val rootConverter = new ParquetRowConverter( 52 | schemaConverter, 53 | parquetSchema, 54 | catalystSchema, 55 | convertTz, 56 | datetimeRebaseMode, 57 | int96RebaseMode, 58 | NoopUpdater) 59 | 60 | override def getCurrentRecord: InternalRow = rootConverter.currentRecord 61 | 62 | override def getRootConverter: GroupConverter = rootConverter 63 | } 64 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/parquet/VerticaDataSourceUtils.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package org.apache.spark.sql.execution.datasources.parquet.vertica 15 | 16 | import org.apache.spark.sql.catalyst.util.RebaseDateTime 17 | import org.apache.spark.sql.execution.datasources.DataSourceUtils 18 | import org.apache.spark.sql.internal.LegacyBehaviorPolicy 19 | 20 | /** 21 | * Copied from Spark 3.2.0 DataSourceUtils implementation. 
22 | * 23 | * Classes under the parquet package were copied from Spark for reading parquet into Spark. It is not a public API and 24 | * thus is expected to change. In Spark 3.2.1, the DataSourceUtils interface was changed. As a fix, this class 25 | * copies only the needed functions from Spark 3.2.0. 26 | * */ 27 | object VerticaDataSourceUtils { 28 | 29 |   /** 30 | * Create a function that rebases a given datetime value 31 | * */ 32 |   def createDateRebaseFuncInRead( 33 |       rebaseMode: LegacyBehaviorPolicy.Value, 34 |       format: String): Int => Int = rebaseMode match { 35 |     case LegacyBehaviorPolicy.EXCEPTION => days: Int => 36 |       if (days < RebaseDateTime.lastSwitchJulianDay) { 37 |         throw DataSourceUtils.newRebaseExceptionInRead(format) 38 |       } 39 |       days 40 |     case LegacyBehaviorPolicy.LEGACY => RebaseDateTime.rebaseJulianToGregorianDays 41 |     case LegacyBehaviorPolicy.CORRECTED => identity[Int] 42 |   } 43 | 44 |   /** 45 | * Create a function that rebases a given timestamp value 46 | * */ 47 |   def createTimestampRebaseFuncInRead( 48 |       rebaseMode: LegacyBehaviorPolicy.Value, 49 |       format: String): Long => Long = rebaseMode match { 50 |     case LegacyBehaviorPolicy.EXCEPTION => micros: Long => 51 |       if (micros < RebaseDateTime.lastSwitchJulianTs) { 52 |         throw DataSourceUtils.newRebaseExceptionInRead(format) 53 |       } 54 |       micros 55 |     case LegacyBehaviorPolicy.LEGACY => RebaseDateTime.rebaseJulianToGregorianMicros 56 |     case LegacyBehaviorPolicy.CORRECTED => identity[Long] 57 |   } 58 | } 59 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/util/Timer.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.util 15 | 16 | import com.typesafe.scalalogging.Logger 17 | 18 | /** 19 | * Class for reporting how long operations take 20 | * 21 | * @param enabled Whether the timer is enabled. If false, timing will not happen and nothing is logged 22 | * @param logger Logger for logging how long the operation took 23 | * @param operationName Name of the operation being timed 24 | */ 25 | class Timer (val enabled: Boolean, val logger: Logger, val operationName: String ) { 26 | 27 |   var t0 = 0L 28 | 29 |   def startTime(): Unit = { 30 |     if(enabled) { 31 |       t0 = System.currentTimeMillis(); 32 |     } 33 |   } 34 | 35 |   def endTime(): Unit = { 36 |     if(enabled) { 37 |       val t1 = System.currentTimeMillis(); 38 |       logger.info("Timed operation: " + operationName + " -- took " + (t1-t0) + " ms."); 39 |     } 40 |   } 41 | } 42 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/util/cleanup/DistributedFilesCleaner.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates.
2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.util.cleanup 15 | 16 | import com.vertica.spark.config.{DistributedFilesystemReadConfig, LogProvider} 17 | import com.vertica.spark.datasource.fs.FileStoreLayerInterface 18 | import com.vertica.spark.datasource.fs.HadoopFileStoreLayer 19 | import com.vertica.spark.datasource.partitions.mixin.Cleanup 20 | 21 | /** 22 | * This class handles cleanup of exported files on the file system. Intended to be used by each worker before exiting. 23 | * */ 24 | class DistributedFilesCleaner(val config: DistributedFilesystemReadConfig, 25 |                               val cleanupUtils: CleanupUtilsInterface, 26 |                               val optionalFSLayer: Option[FileStoreLayerInterface] = None) { 27 | 28 |   private val logger = LogProvider.getLogger(this) 29 |   private val fileStoreLayer = optionalFSLayer.getOrElse(HadoopFileStoreLayer.make(config)) 30 |   private val fileStoreConfig = config.fileStoreConfig 31 | 32 |   /** 33 | * The idea is to first write to the filesystem, marking that a portion of a file has been read. 34 | * Then, we check whether all portions of the file are present. If so, delete the file; otherwise, ignore it. 35 | * 36 | * This is done for all partitions. 37 | * 38 | * @param partition The object with [[Cleanup]] information. 39 | * */ 40 |   def cleanupFiles(partition: Cleanup): Unit = { 41 |     logger.info("Removing files before closing read pipe.") 42 | 43 |     partition.getPortions.indices.foreach(fileIndex => { 44 |       if (!fileStoreConfig.preventCleanup) { 45 |         // Cleanup old file if required 46 |         getCleanupInfo(partition, fileIndex) match { 47 |           case Some(cleanupInfo) => cleanupUtils.checkAndCleanup(fileStoreLayer, cleanupInfo) match { 48 |             case Left(err) => logger.warn("Ran into error when calling cleaning up. Treating as non-fatal. Err: " + err.getFullContext) 49 |             case Right(_) => () 50 |           } 51 |           case None => logger.warn("No cleanup info found.") 52 |         } 53 |       } 54 |     }) 55 |   } 56 | 57 |   def getCleanupInfo(partition: Cleanup, partitionIndex: Int): Option[FileCleanupInfo] = { 58 |     logger.debug("Getting cleanup info for partition with idx " + partitionIndex) 59 |     if (partitionIndex >= partition.getPortions.size) { 60 |       logger.warn("Invalid fileIdx " + partitionIndex + ", can't perform cleanup.") 61 |       None 62 |     } else { 63 |       val fileRange = partition.getPortions(partitionIndex) 64 |       Some(FileCleanupInfo(fileRange.filename, fileRange.index, partition.getPartitioningRecord(fileRange.filename))) 65 |     } 66 |   } 67 | } 68 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/util/cleanup/FileCleanupInfo.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.util.cleanup 15 | 16 | /** 17 | * Structure containing cleanup information for a given portion of a file. 18 | * 19 | * @param filename The file to check for cleanup. 20 | * @param fileIdx Which portion of the file is done being read. 21 | * @param fileRangeCount How many portions of the file exist. 22 | */ 23 | final case class FileCleanupInfo(filename: String, fileIdx: Long, fileRangeCount: Int) 24 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/util/complex/ComplexTypeUtils.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.util.complex 15 | 16 | import org.apache.spark.sql.types.{ArrayType, MapType, StructField, StructType} 17 | 18 | import scala.util.Either 19 | 20 | class ComplexTypeUtils { 21 | 22 | def getComplexTypeColumns(schema: StructType): (List[StructField], List[StructField]) = { 23 | val initialAccumulators: (List[StructField], List[StructField]) = (List(), List()) 24 | schema 25 | .foldLeft(initialAccumulators)((acc, col) => { 26 | val (nativeCols, complexTypeCols) = acc 27 | if (isNativeType(col)) { 28 | (col :: nativeCols, complexTypeCols) 29 | } else { 30 | (nativeCols, col :: complexTypeCols) 31 | } 32 | }) 33 | } 34 | 35 | /* 36 | * Check if field is a vertica native type. Vertica native types contains 1D arrays 37 | * */ 38 | private def isNativeType(field: StructField): Boolean = { 39 | field.dataType match { 40 | case ArrayType(elementType, _) => 41 | elementType match { 42 | case MapType(_, _, _) | StructType(_) | ArrayType(_, _) => false 43 | case _ => true 44 | } 45 | case MapType(_, _, _) | StructType(_) => false 46 | case _ => true 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/util/general/Utils.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 
4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.util.general 15 | 16 | object Utils { 17 | // Used to explicitly ignore returned values 18 | def ignore[T](t: T): Unit = () 19 | } 20 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/util/listeners/SparkListeners.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.util.listeners 15 | 16 | import com.vertica.spark.config.{DistributedFilesystemReadConfig, LogProvider} 17 | import com.vertica.spark.datasource.fs.HadoopFileStoreLayer 18 | import com.vertica.spark.util.error.ConnectorError 19 | import org.apache.spark.SparkContext 20 | import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationEnd} 21 | 22 | /** 23 | * This wrapper is created solely for compatibility with unit testing. 24 | *
25 | * Because we could not instantiate a SparkContext during unit tests, a dummy class that extends 26 | * SparkContext is needed to override the functions we are using. However, SparkContext.addSparkListener's argument 27 | * uses a private interface and thus can't be overridden. 28 | * */ 29 | case class SparkContextWrapper(sparkContext: Option[SparkContext]){ 30 | 31 |   def addSparkListener(listener:SparkListener): Unit ={ 32 |     sparkContext match { 33 |       // We may not get a context if this is executed on executor nodes. 34 |       case None => 35 |       case Some(context) => context.addSparkListener(listener) 36 |     } 37 |   } 38 | } 39 | /** 40 | * This listener is called at the end of the Spark app to remove the export folder. 41 | * */ 42 | class ApplicationParquetCleaner(config: DistributedFilesystemReadConfig) extends SparkListener { 43 |   private val logger = LogProvider.getLogger(classOf[ApplicationParquetCleaner]) 44 | 45 |   private val fileStoreLayer = new HadoopFileStoreLayer(config.fileStoreConfig, config.metadata match { 46 |     case Some(metadata) => if (config.getRequiredSchema.nonEmpty) { 47 |       Some(config.getRequiredSchema) 48 |     } else { 49 |       Some(metadata.schema) 50 |     } 51 |     case _ => None 52 |   }) 53 | 54 |   override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = { 55 |     val hdfsPath = config.fileStoreConfig.address 56 |     if (!config.fileStoreConfig.preventCleanup) { 57 |       fileStoreLayer.removeDir(hdfsPath) match { 58 |         case Right(_) => logger.info("Removed " + hdfsPath) 59 |         case Left(error) => logger.error(s"Error removing $hdfsPath. ${error.toString}") 60 |       } 61 |     } 62 |   } 63 | } 64 | 65 | case class CleanerRegistrationError() extends ConnectorError { 66 |   def getFullContext: String = "Failed to add application shutdown listener to context" 67 | } 68 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/util/query/ColumnsTable.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.util.query 15 | 16 | import com.vertica.spark.datasource.jdbc.JdbcLayerInterface 17 | import com.vertica.spark.util.error.ErrorHandling.ConnectorResult 18 | 19 | import java.sql.ResultSet 20 | 21 | case class ColumnInfo(verticaType: Long, dataTypeName: String, precision: Long, scale: Long) 22 | 23 | class ColumnsTable(jdbcLayer: JdbcLayerInterface) extends VerticaTable[ColumnInfo](jdbc = jdbcLayer) { 24 | 25 |   override def tableName: String = "columns" 26 | 27 |   override def columns: Seq[String] = List("data_type_id", "data_type", "numeric_precision", "numeric_scale") 28 | 29 |   override def buildRow(resultSet: ResultSet): ColumnInfo = { 30 |     // The column names should be in sync with the ones defined above.
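    // Note: data_type strings such as ARRAY[...] or ROW(...) are reduced to their base type identifier by getTypeName below.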
31 |     ColumnInfo( 32 |       resultSet.getLong("data_type_id"), 33 |       getTypeName(resultSet.getString("data_type")), 34 |       resultSet.getLong("numeric_precision"), 35 |       resultSet.getLong("numeric_scale") 36 |     ) 37 |   } 38 | 39 |   /** 40 | * The type name reported by Vertica could be INTEGER or ARRAY[...] or ROW(...) 41 | * and we want to extract just the type identifier 42 | * */ 43 |   def getTypeName(dataType:String) : String = { 44 |     dataType 45 |       .replaceFirst("\\[",",") 46 |       .replaceFirst("\\(",",") 47 |       .split(',') 48 |       .head 49 |   } 50 | 51 |   def getColumnInfo(columnName: String, tableName: String, schema: String): ConnectorResult[ColumnInfo] = { 52 |     val schemaCond = if(schema.nonEmpty) s" AND table_schema='$schema'" else "" 53 |     val conditions = s"table_name='$tableName'$schemaCond AND column_name='$columnName'" 54 |     super.selectWhereExpectOne(conditions) 55 |   } 56 | } 57 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/util/query/ComplexTypesTable.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.util.query 15 | 16 | import com.vertica.spark.datasource.jdbc.JdbcLayerInterface 17 | import com.vertica.spark.util.error.ErrorHandling.ConnectorResult 18 | 19 | import java.sql.ResultSet 20 | 21 | /** 22 | * A row of the complex_types table. Represents a component of the data structure identified by type_id. 23 | * [[https://www.vertica.com/docs/latest/HTML/Content/Authoring/SQLReferenceManual/SystemTables/CATALOG/COMPLEX_TYPES.htm?zoom_highlight=complex%20type Documentations]] 24 | * 25 | * @param typeId The vertica type id of the complex structure. 26 | * @param fieldId the vertica type id of the field. 27 | * 28 | * */ 29 | case class ComplexTypeInfo(typeId: Long, typeName: String, fieldId: Long, fieldTypeName: String, numericScale: Long, typeKind: String, numericPrecision: Long, fieldName: String) 30 | 31 | /** 32 | * When a complex type is created in Vertica, its structure is recorded in this table. 33 | * Each row then represents a component (a field) of the complex structure, with the type_id being the vertica id of the complex type, 34 | * and field_id being the vertica id of the component. For example, a nested array will have as many rows as 35 | * its depth.
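 * For example, an array nested two levels deep is described by two rows: one for the outer array and one for the inner array.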
36 | * [[https://www.vertica.com/docs/latest/HTML/Content/Authoring/SQLReferenceManual/SystemTables/CATALOG/COMPLEX_TYPES.htm?zoom_highlight=complex%20type Documentations]] 37 | * */ 38 | class ComplexTypesTable(jdbcLayer: JdbcLayerInterface) 39 | extends VerticaTable[ComplexTypeInfo](jdbc = jdbcLayer) { 40 | 41 | override def tableName: String = "complex_types" 42 | 43 | override protected def columns: Seq[String] = List("type_id", "type_name", "field_id", "field_type_name", "numeric_scale", "type_kind", "numeric_precision", "field_name") 44 | 45 | override protected def buildRow(rs: ResultSet): ComplexTypeInfo = { 46 | // The column name should be in sync with the ones defined above. 47 | ComplexTypeInfo( 48 | rs.getLong("type_id"), 49 | rs.getString("type_name"), 50 | rs.getLong("field_id"), 51 | rs.getString("field_type_name"), 52 | rs.getLong("numeric_scale"), 53 | rs.getString("type_kind"), 54 | rs.getLong("numeric_precision"), 55 | rs.getString("field_name")) 56 | } 57 | 58 | def findComplexTypeInfo(verticaTypeId: Long): ConnectorResult[ComplexTypeInfo] = { 59 | val conditions = s"type_id=$verticaTypeId" 60 | super.selectWhereExpectOne(conditions) 61 | } 62 | 63 | def getComplexTypeFields(verticaTypeId: Long): ConnectorResult[Seq[ComplexTypeInfo]] = { 64 | val conditions = s"type_id=$verticaTypeId" 65 | super.selectWhere(conditions) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/util/query/StringParsingUtils.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.util.query 15 | 16 | import scala.annotation.tailrec 17 | 18 | /** 19 | * Class contains helper methods for parsing Vertica SQL queries. Ideally we would like a more robust parser however it 20 | * was not justified because: 21 | * 1. We have not needed to do a lot of SQL parsing, yet! 22 | * 2. We did not find an appropriate library for use 23 | * Should we start handling more SQL parsing, we will need to implement a custom parser. 24 | * */ 25 | object StringParsingUtils { 26 | 27 | /** 28 | * Return the indices of the first open parenthesis and its matching closing parenthesis 29 | * 30 | * @return a tuple of (openParenIndex, closingParenIndex) 31 | * */ 32 | def findFirstParenGroupIndices(str: String): (Int, Int) = { 33 | val openParenIndex = str.indexOf("(") 34 | val subString = str.substring(openParenIndex + 1) 35 | 36 | /** 37 | * This recursion finds the matching paren by tracking the paren count. 38 | * When it is 0 then we have the matching paren. 
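 * e.g. for the input ")cat(dog)" the first paren group is "(dog)", giving indices (4, 8).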
39 | * */ 40 | @tailrec 41 | def findMatchingClosingParen(char: Char, tail: String, parenCount: Int): Int = { 42 | char match { 43 | case '(' => findMatchingClosingParen(tail.head, tail.tail, parenCount + 1) 44 | case ')' => 45 | if (parenCount == 1) { 46 | subString.length - tail.length 47 | } else { 48 | findMatchingClosingParen(tail.head, tail.tail, parenCount - 1) 49 | } 50 | case _ => findMatchingClosingParen(tail.head, tail.tail, parenCount) 51 | } 52 | } 53 | 54 | val closingParenIndex = openParenIndex + findMatchingClosingParen(subString.head, subString.tail, 1) 55 | (openParenIndex, closingParenIndex) 56 | } 57 | 58 | /** 59 | * Split a string by comma. Will not split on a comma if it is between parentheses. 60 | * 61 | * @return a list of separated strings. 62 | * */ 63 | def splitByComma(str: String): Seq[String] = { 64 | 65 | @tailrec 66 | def recursion(char: Char, tail: String, currStr: String = "", splits: List[String] = List(), parenCount: Int = 0): List[String] = { 67 | val posParenCount = if(parenCount < 0) 0 else parenCount 68 | val nextStr = currStr :+ char 69 | char match { 70 | // Keeping track of parenthesis to know if it should split or not 71 | case '(' if tail.nonEmpty => recursion(tail.head, tail.tail, nextStr, splits, posParenCount + 1) 72 | case ')' if tail.nonEmpty => recursion(tail.head, tail.tail, nextStr, splits, posParenCount - 1) 73 | case ',' if tail.nonEmpty => 74 | if (posParenCount > 0) { 75 | recursion(tail.head, tail.tail, nextStr, splits, posParenCount) 76 | } else { 77 | recursion(tail.head, tail.tail, "", splits :+ currStr.trim, posParenCount) 78 | } 79 | case _ => 80 | if (tail.isEmpty) { 81 | char match { 82 | case ',' if posParenCount == 0 => splits :+ currStr.trim 83 | case _ => splits :+ nextStr.trim 84 | } 85 | } else { 86 | recursion(tail.head, tail.tail, nextStr, splits, posParenCount) 87 | } 88 | } 89 | } 90 | 91 | recursion(str.head, str.tail) 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/util/query/TypesTable.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.util.query 15 | 16 | import com.vertica.spark.datasource.jdbc.JdbcLayerInterface 17 | import com.vertica.spark.util.error.ErrorHandling.ConnectorResult 18 | import com.vertica.spark.util.schema.ColumnDef 19 | import org.apache.spark.sql.types.{DecimalType, Metadata} 20 | 21 | import java.sql.ResultSet 22 | 23 | case class TypeInfo(typeId: Long, jdbcType: Long, typeName: String, maxScale: Long) 24 | 25 | /** 26 | * Vertica's types table contains type information of primitives and 1D array/set of primitive type. 
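 * Complex types such as nested arrays and rows are described in the complex_types table instead; see [[ComplexTypesTable]].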
27 | * */ 28 | class TypesTable(jdbcLayer: JdbcLayerInterface) extends VerticaTable[TypeInfo](jdbcLayer) { 29 | override protected def tableName: String = "types" 30 | 31 | override protected def columns: Seq[String] = List("type_id", "jdbc_type", "type_name", "max_scale") 32 | 33 | override protected def buildRow(rs: ResultSet): TypeInfo = 34 | // The column name should be in sync with the ones defined above. 35 | TypeInfo( 36 | rs.getLong("type_id"), 37 | rs.getLong("jdbc_type"), 38 | rs.getString("type_name"), 39 | rs.getLong("max_scale")) 40 | 41 | def getVerticaTypeInfo(verticaType: Long): ConnectorResult[TypeInfo] = { 42 | val conditions = s"type_id=$verticaType" 43 | super.selectWhereExpectOne(conditions) 44 | } 45 | 46 | private val signedList = List( 47 | java.sql.Types.DOUBLE, 48 | java.sql.Types.FLOAT, 49 | java.sql.Types.REAL, 50 | java.sql.Types.INTEGER, 51 | java.sql.Types.BIGINT, 52 | java.sql.Types.TINYINT, 53 | java.sql.Types.SMALLINT 54 | ) 55 | 56 | def isSigned(jdbcType: Long): Boolean = signedList.contains(jdbcType) 57 | 58 | def getColumnDef(verticaType: Long): ConnectorResult[ColumnDef] = { 59 | getVerticaTypeInfo(verticaType) 60 | .map(typeInfo => 61 | ColumnDef("", 62 | typeInfo.jdbcType.toInt, 63 | typeInfo.typeName, 64 | DecimalType.MAX_PRECISION, 65 | typeInfo.maxScale.toInt, 66 | signed = isSigned(typeInfo.jdbcType), 67 | nullable = false, 68 | Metadata.empty)) 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/util/reflections/ReflectionTools.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 
13 | 14 | package com.vertica.spark.util.reflections 15 | 16 | import com.vertica.spark.config.ReadConfig 17 | import com.vertica.spark.datasource.core.DSConfigSetupInterface 18 | import com.vertica.spark.datasource.v2.{VerticaScanBuilder, VerticaScanBuilderWithPushdown} 19 | import org.apache.spark.sql.connector.expressions.aggregate.Aggregation 20 | 21 | class ReflectionTools { 22 | def makeScanBuilderWithPushDown(config: ReadConfig, readSetupInterface: DSConfigSetupInterface[ReadConfig]): VerticaScanBuilder = { 23 | classOf[VerticaScanBuilderWithPushdown] 24 | .getDeclaredConstructor(classOf[ReadConfig], classOf[DSConfigSetupInterface[ReadConfig]]) 25 | .newInstance(config, readSetupInterface) 26 | } 27 | 28 | def makeScanBuilderWithoutPushDown(config: ReadConfig, readSetupInterface: DSConfigSetupInterface[ReadConfig]): VerticaScanBuilder = { 29 | classOf[VerticaScanBuilder] 30 | .getDeclaredConstructor(classOf[ReadConfig], classOf[DSConfigSetupInterface[ReadConfig]]) 31 | .newInstance(config, readSetupInterface) 32 | } 33 | 34 | def aggregationInvokeMethod[T](aggregation: Aggregation, methodName: String): T = { 35 | classOf[Aggregation] 36 | .getDeclaredMethod(methodName) 37 | .invoke(aggregation) 38 | .asInstanceOf[T] 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/util/version/SparkVersionTools.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 
13 | 14 | package com.vertica.spark.util.version 15 | 16 | import com.vertica.spark.config.ReadConfig 17 | import com.vertica.spark.datasource.core.DSConfigSetupInterface 18 | import com.vertica.spark.datasource.v2.VerticaScanBuilder 19 | import com.vertica.spark.util.reflections.ReflectionTools 20 | import com.vertica.spark.util.version.SparkVersionTools.{SPARK_3_2_0, SPARK_3_3_0} 21 | import org.apache.spark.sql.SparkSession 22 | import org.apache.spark.sql.connector.expressions.aggregate.Aggregation 23 | import org.apache.spark.sql.connector.expressions.Expression 24 | 25 | import scala.util.Try 26 | 27 | class SparkVersionTools(reflection: ReflectionTools = new ReflectionTools) { 28 | 29 |   /** 30 | * @return the version string of Spark 31 | * */ 32 |   def getVersionString: Option[String] = SparkSession.getActiveSession.map(_.version) 33 | 34 |   /** 35 | * @return a [[Version]] from a Spark version string 36 | * */ 37 |   def getVersion: Option[Version] = getVersion(getVersionString) 38 | 39 |   /** 40 | * @return a [[Version]] from a Spark version string 41 | * */ 42 |   def getVersion(versionStr: Option[String]): Option[Version] = versionStr match { 43 |     case Some(str) => 44 |       val regex = "([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*)".r 45 |       Try { 46 |         val regex(major, minor, service, _) = str 47 |         Some(Version(major.toInt, minor.toInt, service.toInt)) 48 |       }.getOrElse(None) 49 |     case None => None 50 |   } 51 | 52 |   /** 53 | * @return a compatible [[VerticaScanBuilder]] for the given spark version. 54 | * */ 55 |   def makeCompatibleVerticaScanBuilder(sparkVersion: Version, config: ReadConfig, readSetupInterface: DSConfigSetupInterface[ReadConfig]): VerticaScanBuilder = { 56 |     val sparkSupportsAggregatePushDown = sparkVersion >= SPARK_3_2_0 57 |     if (sparkSupportsAggregatePushDown) { 58 |       reflection.makeScanBuilderWithPushDown(config, readSetupInterface) 59 |     } else { 60 |       reflection.makeScanBuilderWithoutPushDown(config, readSetupInterface) 61 |     } 62 |   } 63 | 64 |   /** 65 | * Since the connector compiles against the latest version of Spark, for backward compatibility this function uses 66 | * reflection to invoke the appropriate method that returns group-by expressions. 67 | * 68 | * @return an array of [[Expression]] representing the group-by columns. 69 | * */ 70 |   def getCompatibleGroupByExpressions(sparkVersion: Version, aggObj: Aggregation): Array[Expression] = { 71 |     if(sparkVersion < SPARK_3_3_0){ 72 |       // $COVERAGE-OFF$ 73 |       reflection.aggregationInvokeMethod[Array[Expression]](aggObj, "groupByColumns") 74 |       // $COVERAGE-ON$ 75 |     } else { 76 |       aggObj.groupByExpressions() 77 |     } 78 |   } 79 | } 80 | 81 | object SparkVersionTools { 82 |   val SPARK_3_3_0: Version = Version(3, 3) 83 |   val SPARK_3_2_0: Version = Version(3, 2) 84 | } 85 | -------------------------------------------------------------------------------- /connector/src/main/scala/com/vertica/spark/util/version/Version.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.util.version 15 | 16 | /** 17 | * A class representing a version string of format major.minor.patch-hotfix. 18 | * Only digits are allowed for minor, patch, and hotfix. 19 | * */ 20 | case class Version(major: Int, minor: Int = 0, servicePack: Int = 0, hotfix: Int = 0) extends Ordered[Version] { 21 | 22 | override def toString: String = s"${major}.${minor}.${servicePack}-${hotfix}" 23 | 24 | override def compare(that: Version): Int = 25 | (this.major * 1000 + this.minor * 100 + this.servicePack * 10 + this.hotfix) - 26 | (that.major * 1000 + that.minor * 100 + that.servicePack * 10 + that.hotfix) 27 | } 28 | -------------------------------------------------------------------------------- /connector/src/test/scala/com/vertica/spark/datasource/core/TableNameTest.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 
13 | 14 | package com.vertica.spark.datasource.core 15 | 16 | import com.vertica.spark.config.TableName 17 | import org.scalamock.scalatest.MockFactory 18 | import org.scalatest.BeforeAndAfterAll 19 | import org.scalatest.flatspec.AnyFlatSpec 20 | 21 | class TableNameTest extends AnyFlatSpec with BeforeAndAfterAll with MockFactory { 22 | it should "Escape table name" in { 23 | val tablename = TableName("t\"nam\"e", Some("Sch \" ema")) 24 | 25 | assert(tablename.getFullTableName == "\"Sch \"\" ema\".\"t\"\"nam\"\"e\"") 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /connector/src/test/scala/com/vertica/spark/datasource/json/JsonBatchFactoryTest.scala: -------------------------------------------------------------------------------- 1 | package com.vertica.spark.datasource.json 2 | 3 | import com.vertica.spark.common.TestObjects 4 | import com.vertica.spark.datasource.wrappers.VerticaScanWrapper 5 | import org.apache.spark.sql.SparkSession 6 | import org.scalatest.BeforeAndAfterAll 7 | import org.scalatest.flatspec.AnyFlatSpec 8 | 9 | import java.io.{File, PrintWriter} 10 | 11 | class JsonBatchFactoryTest extends AnyFlatSpec with BeforeAndAfterAll{ 12 | 13 | behavior of "JsonBatchFactoryTest" 14 | 15 | val jsonFile = new File("./test.json" ) 16 | val pw = new PrintWriter(jsonFile) 17 | pw.write("{\"a\":9}") 18 | pw.close() 19 | 20 | it should "should build a VerticaScanWrapper" in { 21 | val spark = SparkSession.builder() 22 | .master("local[*]") 23 | .appName("Vertica Connector Test") 24 | .getOrCreate() 25 | 26 | val batch = new JsonBatchFactory().build("./test.json", None, TestObjects.readConfig, spark) 27 | assert(batch.isInstanceOf[VerticaScanWrapper]) 28 | 29 | spark.close() 30 | } 31 | 32 | override protected def afterAll(): Unit = { 33 | jsonFile.delete 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /connector/src/test/scala/com/vertica/spark/datasource/partitions/parquet/ParquetFileRangeTest.scala: -------------------------------------------------------------------------------- 1 | package com.vertica.spark.datasource.partitions.parquet 2 | 3 | import org.scalamock.scalatest.MockFactory 4 | import org.scalatest.flatspec.AnyFlatSpec 5 | 6 | class ParquetFileRangeTest extends AnyFlatSpec with MockFactory{ 7 | 8 | behavior of "ParquetFileRangeTest" 9 | 10 | val fileRange = ParquetFileRange("filename", 10, 20, 30) 11 | 12 | it should "return correct filename" in { 13 | assert(fileRange.filename == "filename") 14 | } 15 | 16 | it should "return correct index" in { 17 | assert(fileRange.index == 30) 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /connector/src/test/scala/com/vertica/spark/datasource/wrappers/PartitionReaderWrapperFactoryTest.scala: -------------------------------------------------------------------------------- 1 | package com.vertica.spark.datasource.wrappers 2 | 3 | import com.vertica.spark.common.TestObjects 4 | import com.vertica.spark.datasource.partitions.mixin.Cleanup 5 | import org.apache.spark.sql.catalyst.InternalRow 6 | import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory} 7 | import org.scalamock.scalatest.MockFactory 8 | import org.scalatest.flatspec.AnyFlatSpec 9 | 10 | class PartitionReaderWrapperFactoryTest extends AnyFlatSpec with MockFactory{ 11 | 12 | behavior of "PartitionReaderWrapperFactoryTest" 13 | 14 | trait MockInputPartition extends 
InputPartition with Cleanup 15 | 16 | it should "create a PartitionReaderWrapper" in { 17 | val readerFactory = mock[PartitionReaderFactory] 18 | val inputPartition = mock[MockInputPartition] 19 | val partitionReader = mock[PartitionReader[InternalRow]] 20 | (readerFactory.createReader _).expects(inputPartition).returning(partitionReader) 21 | 22 | val reader = new PartitionReaderWrapperFactory(readerFactory, TestObjects.readConfig).createReader(inputPartition) 23 | assert(reader.isInstanceOf[PartitionReaderWrapper]) 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /connector/src/test/scala/com/vertica/spark/datasource/wrappers/PartitionReaderWrapperTest.scala: -------------------------------------------------------------------------------- 1 | package com.vertica.spark.datasource.wrappers 2 | 3 | import com.vertica.spark.common.TestObjects 4 | import com.vertica.spark.datasource.partitions.file.VerticaFilePartition 5 | import com.vertica.spark.util.cleanup.{CleanupUtils, DistributedFilesCleaner} 6 | import org.apache.spark.sql.catalyst.InternalRow 7 | import org.apache.spark.sql.connector.read.PartitionReader 8 | import org.scalamock.scalatest.MockFactory 9 | import org.scalatest.flatspec.AnyFlatSpec 10 | 11 | class PartitionReaderWrapperTest extends AnyFlatSpec with MockFactory{ 12 | 13 | behavior of "PartitionReaderWrapperTest" 14 | 15 | private val config = TestObjects.readConfig 16 | 17 | it should "get" in { 18 | val reader = mock[PartitionReader[InternalRow]] 19 | (reader.get _).expects().returning(mock[InternalRow]) 20 | val mockCleanupUtils = new CleanupUtils 21 | val mockCleaner = new DistributedFilesCleaner(config, mockCleanupUtils) 22 | 23 | new PartitionReaderWrapper(reader, mock[VerticaFilePartition], mockCleaner).get() 24 | } 25 | 26 | it should "next" in { 27 | val reader = mock[PartitionReader[InternalRow]] 28 | (reader.next _).expects() 29 | val mockCleanupUtils = new CleanupUtils 30 | val mockCleaner = new DistributedFilesCleaner(config, mockCleanupUtils) 31 | 32 | new PartitionReaderWrapper(reader, mock[VerticaFilePartition], mockCleaner).next() 33 | } 34 | 35 | it should "perform cleanup on close" in { 36 | val reader = mock[PartitionReader[InternalRow]] 37 | (reader.close _).expects() 38 | val partitions = mock[VerticaFilePartition] 39 | (partitions.getPortions _).expects().returning(Seq()) 40 | 41 | val cleanupUtils = new CleanupUtils 42 | val cleaner = new DistributedFilesCleaner(config, cleanupUtils) 43 | new PartitionReaderWrapper(reader, partitions, cleaner).close() 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /connector/src/test/scala/com/vertica/spark/datasource/wrappers/VerticaScanWrapperBuilderTest.scala: -------------------------------------------------------------------------------- 1 | package com.vertica.spark.datasource.wrappers 2 | 3 | import com.vertica.spark.common.TestObjects 4 | import org.apache.spark.sql.connector.read.ScanBuilder 5 | import org.scalamock.scalatest.MockFactory 6 | import org.scalatest.flatspec.AnyFlatSpec 7 | 8 | class VerticaScanWrapperBuilderTest extends AnyFlatSpec with MockFactory { 9 | 10 | private val readConfig = TestObjects.readConfig 11 | 12 | it should "build VerticaScanWrapper" in { 13 | val builder = mock[ScanBuilder] 14 | (builder.build _).expects().returning(mock[VerticaScanWrapper]) 15 | assert(new VerticaScanWrapperBuilder(builder, readConfig).build().isInstanceOf[VerticaScanWrapper]) 16 | } 17 | } 18 | 
-------------------------------------------------------------------------------- /connector/src/test/scala/com/vertica/spark/datasource/wrappers/json/VerticaJsonTableWrapperTest.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | package com.vertica.spark.datasource.wrappers.json 14 | 15 | import com.vertica.spark.common.TestObjects 16 | import com.vertica.spark.datasource.wrappers.VerticaScanWrapperBuilder 17 | import org.apache.spark.sql.SparkSession 18 | import org.apache.spark.sql.execution.datasources.FileFormat 19 | import org.apache.spark.sql.execution.datasources.json.JsonFileFormat 20 | import org.apache.spark.sql.execution.datasources.v2.json.JsonTable 21 | import org.apache.spark.sql.types.StructType 22 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 23 | import org.scalamock.scalatest.MockFactory 24 | import org.scalatest.BeforeAndAfterAll 25 | import org.scalatest.flatspec.AnyFlatSpec 26 | 27 | class VerticaJsonTableWrapperTest extends AnyFlatSpec with BeforeAndAfterAll with MockFactory { 28 | 29 | behavior of "VerticaJsonTableTest" 30 | 31 | private val spark: SparkSession = SparkSession.builder() 32 | .master("local[*]") 33 | .appName("Vertica Connector Test") 34 | .getOrCreate() 35 | 36 | class MockJsonTable(_name: String, 37 | sparkSession: SparkSession, 38 | options: CaseInsensitiveStringMap, 39 | paths: Seq[String], 40 | userSpecifiedSchema: Option[StructType], 41 | fallbackFileFormat: Class[_ <: FileFormat]) 42 | extends JsonTable(_name, sparkSession, options, paths, userSpecifiedSchema, fallbackFileFormat) { 43 | 44 | override lazy val schema: StructType = StructType(Seq()) 45 | } 46 | 47 | private val mockTable = new MockJsonTable("MockJsonTable", spark, CaseInsensitiveStringMap.empty(), List(), Some(StructType(Seq())), classOf[JsonFileFormat]) 48 | private val readConfig = TestObjects.readConfig 49 | 50 | it should "return JsonTable capabilities" in { 51 | assert(new VerticaJsonTableWrapper(mockTable, readConfig).capabilities() == mockTable.capabilities()) 52 | } 53 | 54 | it should "build VerticaScanWrapperBuilder" in { 55 | assert(new VerticaJsonTableWrapper(mockTable, readConfig).newScanBuilder(CaseInsensitiveStringMap.empty()).isInstanceOf[VerticaScanWrapperBuilder]) 56 | } 57 | 58 | it should "return JsonTable name" in { 59 | assert(new VerticaJsonTableWrapper(mockTable, readConfig).name() == "Vertica" + mockTable.name) 60 | } 61 | 62 | it should "return JsonTable schema" in { 63 | // Comparing references 64 | assert(new VerticaJsonTableWrapper(mockTable, readConfig).schema() == mockTable.schema) 65 | } 66 | 67 | override protected def afterAll(): Unit = spark.close() 68 | } 69 | -------------------------------------------------------------------------------- /connector/src/test/scala/com/vertica/spark/util/pushdown/PushdownUtilsTest.scala: 
-------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.util.pushdown 15 | 16 | import org.apache.spark.sql.sources._ 17 | import org.scalamock.scalatest.MockFactory 18 | import org.scalatest.BeforeAndAfterAll 19 | import org.scalatest.flatspec.AnyFlatSpec 20 | 21 | class PushdownUtilsTest extends AnyFlatSpec with BeforeAndAfterAll with MockFactory with org.scalatest.OneInstancePerTest { 22 | 23 | val sparkFilters: Array[Filter] = Array( 24 | EqualTo("a", 5), 25 | GreaterThan("a", 6.7), 26 | GreaterThanOrEqual("a", 8.8f), 27 | LessThan("a", 1000000), 28 | LessThanOrEqual("a", -1000000), 29 | In("a", Array("abc", "123", "456")), 30 | IsNull("a"), 31 | IsNotNull("b"), 32 | Not(EqualTo("a", 5)), 33 | StringStartsWith("a", "abc"), 34 | StringEndsWith("a", "qwe"), 35 | StringContains("a", "zxc") 36 | ) 37 | 38 | val textFilters: Array[String] = Array( 39 | "(\"a\" = 5)", 40 | "(\"a\" > 6.7)", 41 | "(\"a\" >= 8.8)", 42 | "(\"a\" < 1000000)", 43 | "(\"a\" <= -1000000)", 44 | "(\"a\" IN ('abc', '123', '456'))", 45 | "(\"a\" IS NULL)", 46 | "(\"b\" IS NOT NULL)", 47 | "( NOT (\"a\" = 5))", 48 | "(\"a\" like 'abc%')", 49 | "(\"a\" like '%qwe')", 50 | "(\"a\" like '%zxc%')" 51 | ) 52 | 53 | it should "generate all filters" in { 54 | sparkFilters.indices.map( i => { 55 | val filter = sparkFilters(i) 56 | val text = textFilters(i) 57 | 58 | assert(PushdownUtils.genFilter(filter).right.get.filterString.toLowerCase == text.toLowerCase) 59 | }) 60 | } 61 | 62 | it should "compose all filters with AND" in { 63 | sparkFilters.indices.map( i => { 64 | sparkFilters.indices.map( j => { 65 | val filter = And(sparkFilters(i), sparkFilters(j)) 66 | val text = "(" + textFilters(i) + " AND " + textFilters(j) + ")" 67 | 68 | assert(PushdownUtils.genFilter(filter).right.get.filterString.toLowerCase == text.toLowerCase) 69 | }) 70 | }) 71 | } 72 | 73 | it should "compose all filters with OR" in { 74 | sparkFilters.indices.map( i => { 75 | sparkFilters.indices.map( j => { 76 | val filter = Or(sparkFilters(i), sparkFilters(j)) 77 | val text = "(" + textFilters(i) + " OR " + textFilters(j) + ")" 78 | 79 | assert(PushdownUtils.genFilter(filter).right.get.filterString.toLowerCase == text.toLowerCase) 80 | }) 81 | }) 82 | } 83 | 84 | it should "compose all filters with AND + OR + NOT" in { 85 | sparkFilters.indices.map( i => { 86 | sparkFilters.indices.map( j => { 87 | val filter = Not(Or(And(sparkFilters(i), sparkFilters(j)), And(sparkFilters(i), sparkFilters(j))) ) 88 | val text = "( NOT (" + 89 | "(" + textFilters(i) + " AND " + textFilters(j) + ")" + 90 | " OR " + 91 | "(" + textFilters(i) + " AND " + textFilters(j) + ")" + 92 | "))" 93 | 94 | assert(PushdownUtils.genFilter(filter).right.get.filterString.toLowerCase == text.toLowerCase) 95 | }) 96 | }) 97 | } 98 | } 99 | 
-------------------------------------------------------------------------------- /connector/src/test/scala/com/vertica/spark/util/query/StringParsingUtilsTest.scala: -------------------------------------------------------------------------------- 1 | package com.vertica.spark.util.query 2 | 3 | import org.scalatest.flatspec.AnyFlatSpec 4 | 5 | class StringParsingUtilsTest extends AnyFlatSpec { 6 | 7 | behavior of "VerticaSQLUtilsTest" 8 | 9 | it should "split a comma separated list" in { 10 | val result = StringParsingUtils.splitByComma("cat, cat dog, shark,") 11 | assert(result.length == 3) 12 | assert(result(0) == "cat") 13 | assert(result(1) == "cat dog") 14 | assert(result(2) == "shark") 15 | } 16 | 17 | it should "split a comma separated list with parentheses" in { 18 | val result = StringParsingUtils.splitByComma("col1 (int, col2) (cat, ((dog))), shark") 19 | assert(result.length == 2) 20 | assert(result.head == "col1 (int, col2) (cat, ((dog)))") 21 | assert(result(1) == "shark") 22 | 23 | val result2 = StringParsingUtils.splitByComma("(col1 int, col2 cat dog,)") 24 | assert(result2.length == 1) 25 | assert(result2.head == "(col1 int, col2 cat dog,)") 26 | 27 | val result3 = StringParsingUtils.splitByComma(")(col1 (int, ()col2, shark)), cat, dog") 28 | assert(result3.length == 3) 29 | assert(result3.head == ")(col1 (int, ()col2, shark))") 30 | assert(result3(1) == "cat") 31 | assert(result3(2) == "dog") 32 | } 33 | 34 | it should "split a comma separated list with non matching parentheses" in { 35 | val result = StringParsingUtils.splitByComma("(col1 int, col2 cat dog,") 36 | assert(result.length == 1) 37 | assert(result.head == "(col1 int, col2 cat dog,") 38 | 39 | val result2 = StringParsingUtils.splitByComma(")col1 (int, (col2, shark)), cat, dog") 40 | assert(result2.length == 3) 41 | assert(result2.head == ")col1 (int, (col2, shark))") 42 | assert(result2(1) == "cat") 43 | assert(result2(2) == "dog") 44 | 45 | val result3 = StringParsingUtils.splitByComma(")(col1 (int, (col2, shark)), cat, dog") 46 | assert(result3.length == 1) 47 | assert(result3.head == ")(col1 (int, (col2, shark)), cat, dog") 48 | } 49 | 50 | it should "find the indices of the first matching parentheses" in { 51 | val str = ")cat(dog_(sha(rk)))cat(_(d)og)" 52 | val (openParen, closeParen) = StringParsingUtils.findFirstParenGroupIndices(str) 53 | assert(openParen == 4) 54 | assert(closeParen == 18) 55 | assert(str.substring(openParen + 1, closeParen) == "dog_(sha(rk))") 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /connector/src/test/scala/com/vertica/spark/util/version/SparkVersionToolsTests.scala: -------------------------------------------------------------------------------- 1 | package com.vertica.spark.util.version 2 | 3 | import com.vertica.spark.config.ReadConfig 4 | import com.vertica.spark.datasource.core.DSConfigSetupInterface 5 | import com.vertica.spark.datasource.v2.{VerticaScanBuilder, VerticaScanBuilderWithPushdown} 6 | import com.vertica.spark.util.reflections.ReflectionTools 7 | import org.apache.spark.sql.SparkSession 8 | import org.apache.spark.sql.connector.expressions.Expression 9 | import org.apache.spark.sql.connector.expressions.aggregate.{AggregateFunc, Aggregation} 10 | import org.scalamock.scalatest.MockFactory 11 | import org.scalatest.flatspec.AnyFlatSpec 12 | 13 | class SparkVersionToolsTests extends AnyFlatSpec with MockFactory{ 14 | 15 | behavior of "SparkUtilsTests" 16 | 17 | it should "correctly parses Spark version 
string" in { 18 | val version = (new SparkVersionTools).getVersion(Some("3.2.1")) 19 | assert(version.isDefined) 20 | assert(version == Some(Version(3,2,1))) 21 | } 22 | 23 | it should "correctly parses major-minor-patch numbers" in { 24 | val version = (new SparkVersionTools).getVersion(Some("3.2.1-0-vertica-1")) 25 | assert(version.isDefined) 26 | assert(version == Some(Version(3,2,1))) 27 | } 28 | 29 | it should "return a Spark version string" in { 30 | val spark = SparkSession.builder().master("local[1]").getOrCreate() 31 | assert((new SparkVersionTools).getVersionString.isDefined) 32 | spark.close() 33 | } 34 | 35 | private val groupByExpressions = Array[Expression]() 36 | private val aggregates = Array[AggregateFunc]() 37 | // Aggregation is final and can't be mocked with MockFactory 38 | private val mockAggregation = new Aggregation(aggregates, groupByExpressions) 39 | 40 | it should "get group by expressions from groupByColumns method when Spark is than 3.2.x" in { 41 | val sparkVersion = Version(3,2,9) 42 | val reflection = mock[ReflectionTools] 43 | val groupByColumns = Array[Expression]() 44 | (reflection.aggregationInvokeMethod[Array[Expression]] _).expects(mockAggregation, "groupByColumns").returning(groupByColumns) 45 | 46 | assert(new SparkVersionTools(reflection).getCompatibleGroupByExpressions(sparkVersion, mockAggregation) == groupByColumns) 47 | } 48 | 49 | it should "get group by expressions using groupByExpressions method when spark is at least 3.3.0" in { 50 | val sparkVersion = Version(3,3) 51 | assert(new SparkVersionTools(mock[ReflectionTools]).getCompatibleGroupByExpressions(sparkVersion, mockAggregation) == groupByExpressions) 52 | } 53 | 54 | it should "build VerticaScanBuilder for Spark version before than 3.2" in { 55 | val sparkVersion = Version(3, 1, 9) 56 | val reflection = mock[ReflectionTools] 57 | (reflection.makeScanBuilderWithoutPushDown _).expects(*, *).returning(mock[VerticaScanBuilder]) 58 | 59 | new SparkVersionTools(reflection).makeCompatibleVerticaScanBuilder(sparkVersion, mock[ReadConfig], mock[DSConfigSetupInterface[ReadConfig]]) 60 | } 61 | 62 | it should "build VerticaScanBuilder with aggregates push down for Spark version 3.2 or newer" in { 63 | val sparkVersion = Version(3, 2) 64 | val reflection = mock[ReflectionTools] 65 | (reflection.makeScanBuilderWithPushDown _).expects(*, *).returning(mock[VerticaScanBuilderWithPushdown]) 66 | 67 | new SparkVersionTools(reflection).makeCompatibleVerticaScanBuilder(sparkVersion, mock[ReadConfig], mock[DSConfigSetupInterface[ReadConfig]]) 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /connector/src/test/scala/com/vertica/spark/util/version/VersionTest.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 
13 | 14 | package com.vertica.spark.util.version 15 | 16 | import org.scalamock.scalatest.MockFactory 17 | import org.scalatest.flatspec.AnyFlatSpec 18 | import org.scalatest.BeforeAndAfterAll 19 | 20 | //scalastyle:off 21 | class VersionTest extends AnyFlatSpec with BeforeAndAfterAll with MockFactory with org.scalatest.OneInstancePerTest { 22 | 23 | it should "compare to bigger version" in { 24 | assert(Version(11, 1, 5, 3) > (Version(10, 4, 7, 5))) 25 | } 26 | 27 | it should "compare to smaller version" in { 28 | assert(Version(11, 1, 5, 3) < (Version(12, 0, 2, 1))) 29 | } 30 | 31 | it should "compare to smaller or equal versions" in { 32 | assert(Version(11, 1, 5, 3) <= (Version(11, 1, 5, 3))) 33 | assert(Version(11, 1, 5, 3) <= (Version(11, 2, 5, 3))) 34 | } 35 | 36 | it should "compare to bigger or equal versions" in { 37 | assert(Version(11, 1, 5, 3) >= (Version(11, 1, 5, 3))) 38 | assert(Version(11, 1, 5, 3) >= (Version(11, 1, 5, 2))) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /docker/client-krb/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM centos:7 2 | 3 | ENV SBT_VERSION 1.3.13 4 | ENV JAVA_OPTS="$JAVA_OPTS -Djava.security.auth.login.config=/spark-connector/docker/client-krb/jaas.config" 5 | 6 | RUN yum install -y java-11-openjdk && \ 7 | yum install -y krb5-workstation && \ 8 | yum install -y epel-release && \ 9 | yum update -y && yum install -y wget && \ 10 | curl -L https://www.scala-sbt.org/sbt-rpm.repo > sbt-rpm.repo && \ 11 | mv sbt-rpm.repo /etc/yum.repos.d/ && \ 12 | yum -y install sbt && \ 13 | wget https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-without-hadoop.tgz && \ 14 | wget https://archive.apache.org/dist/hadoop/common/hadoop-3.3.1/hadoop-3.3.1.tar.gz && \ 15 | tar xvf spark-3.1.2-bin-without-hadoop.tgz && \ 16 | tar xvf hadoop-3.3.1.tar.gz && \ 17 | mv spark-3.1.2-bin-without-hadoop/ /opt/spark && \ 18 | cd /opt/spark/conf && \ 19 | mv spark-env.sh.template spark-env.sh 20 | 21 | ENTRYPOINT ["/bin/bash"] 22 | -------------------------------------------------------------------------------- /docker/client-krb/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo "[logging] 4 | default = FILE:/var/log/krb5libs.log 5 | kdc = FILE:/var/log/krb5kdc.log 6 | admin_server = FILE:/var/log/kadmind.log 7 | [libdefaults] 8 | default_realm = $REALM 9 | dns_lookup_realm = false 10 | dns_lookup_kdc = false 11 | ticket_lifetime = 24h 12 | renew_lifetime = 7d 13 | forwardable = true 14 | [realms] 15 | $REALM = { 16 | kdc = $KDC 17 | admin_server = $KDC 18 | } 19 | [domain_realm] 20 | .example.com = $REALM 21 | example.com = $REALM" | tee /etc/krb5.conf 22 | 23 | cp /etc/hadoop/conf/* /hadoop-3.3.1/etc/hadoop/ 24 | 25 | keytool -import -file /hadoop-3.3.1/etc/hadoop/hdfs.cert -alias hdfs -keystore cacerts.jks -no-prompt -storepass password 26 | 27 | echo 'user1' | kinit user1 28 | 29 | exec "$@" 30 | -------------------------------------------------------------------------------- /docker/client-krb/jaas.config: -------------------------------------------------------------------------------- 1 | Client { 2 | com.sun.security.auth.module.Krb5LoginModule required 3 | useKeyTab=false 4 | useTicketCache=true 5 | doNotPrompt=true; 6 | }; 7 | -------------------------------------------------------------------------------- /docker/client-krb/vsql: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/docker/client-krb/vsql -------------------------------------------------------------------------------- /docker/client/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG SPARK=latest 2 | FROM bitnami/spark:$SPARK 3 | 4 | USER root 5 | 6 | # Install JDK 7 | RUN apt-get update && apt-get install -y openjdk-11-jdk 8 | ENV JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64" 9 | # Prepending our JAVA_HOME so that it would take precedent over bitnami's java home path. 10 | ENV PATH=$JAVA_HOME/bin:$PATH 11 | 12 | # Install SBT 13 | RUN apt-get update && \ 14 | apt-get -y install apt-transport-https curl gnupg -yqq && \ 15 | echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | tee /etc/apt/sources.list.d/sbt.list && \ 16 | echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | tee /etc/apt/sources.list.d/sbt_old.list && \ 17 | curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | gpg --no-default-keyring --keyring gnupg-ring:/etc/apt/trusted.gpg.d/scalasbt-release.gpg --import && \ 18 | chmod 644 /etc/apt/trusted.gpg.d/scalasbt-release.gpg && \ 19 | apt-get update && \ 20 | apt-get -y install sbt 21 | 22 | # Sync Python version 23 | RUN mkdir -p /tmp/bitnami/pkg/cache/ && cd /tmp/bitnami/pkg/cache/ && \ 24 | COMPONENTS=( \ 25 | "python-3.10.6-8-linux-${OS_ARCH}-debian-11" \ 26 | ) && \ 27 | for COMPONENT in "${COMPONENTS[@]}"; do \ 28 | if [ ! -f "${COMPONENT}.tar.gz" ]; then \ 29 | curl -SsLf "https://downloads.bitnami.com/files/stacksmith/${COMPONENT}.tar.gz" -O ; \ 30 | curl -SsLf "https://downloads.bitnami.com/files/stacksmith/${COMPONENT}.tar.gz.sha256" -O ; \ 31 | fi && \ 32 | sha256sum -c "${COMPONENT}.tar.gz.sha256" && \ 33 | tar -zxf "${COMPONENT}.tar.gz" -C /opt/bitnami --strip-components=2 --no-same-owner --wildcards '*/files' && \ 34 | rm -rf "${COMPONENT}".tar.gz{,.sha256} ; \ 35 | done 36 | 37 | -------------------------------------------------------------------------------- /docker/docker-compose-kerberos.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | services: 3 | krb-client: 4 | build: ./client-krb 5 | entrypoint: /client-krb/docker-entrypoint.sh sleep infinity 6 | container_name: client 7 | hostname: client 8 | domainname: example.com 9 | networks: 10 | default: 11 | aliases: 12 | - client.example.com 13 | ports: 14 | - "5005:5005" 15 | volumes: 16 | - ./..:/spark-connector 17 | - ./vertica-hdfs-config/hadoop-kerberized:/etc/hadoop/conf 18 | - ./client-krb:/client-krb 19 | env_file: 20 | - krb.env 21 | environment: 22 | - HADOOP_VERSION 23 | - SPARK_VERSION 24 | - AWS_ACCESS_KEY_ID 25 | - AWS_SECRET_ACCESS_KEY 26 | - GCS_FILEPATH 27 | - GCS_HMAC_KEY_ID 28 | - GCS_HMAC_KEY_SECRET 29 | - GCS_SERVICE_KEY_ID 30 | - GCS_SERVICE_KEY 31 | - GCS_SERVICE_EMAIL 32 | 33 | kdc: 34 | build: ./kdc 35 | entrypoint: /kdc/docker-entrypoint.sh /usr/sbin/init 36 | container_name: kdc 37 | hostname: kdc 38 | domainname: example.com 39 | networks: 40 | default: 41 | aliases: 42 | - kdc.example.com 43 | volumes: 44 | - ./kdc:/kdc 45 | - ./keytabs:/keytabs 46 | env_file: 47 | - krb.env 48 | 49 | vertica: 50 | image: vertica/vertica-k8s:${VERTICA_VERSION:-latest} 51 | container_name: vertica 52 | hostname: vertica 53 | domainname: example.com 54 | 
networks: 55 | default: 56 | aliases: 57 | - vertica.example.com 58 | ports: 59 | - "5433:5433" 60 | volumes: 61 | - ./vertica-krb/docker-entrypoint.sh:/usr/local/bin/docker-entrypoint.sh 62 | - ./vertica-hdfs-config/hadoop-kerberized:/etc/hadoop/conf 63 | - ./vertica-krb:/vertica-krb 64 | - ./keytabs:/keytabs 65 | env_file: 66 | - krb.env 67 | environment: 68 | - VERTICA_MEMDEBUG=2 69 | 70 | hdfs: 71 | build: ./hdfs-krb 72 | entrypoint: /usr/local/bin/docker-entrypoint.sh sleep infinity 73 | # Must explicitly set container_name or add entries to /etc/hosts in other containers that 74 | # communicate with hdfs (client and vertica), otherwise Kerberos is unable to perform both 75 | # forward and reverse lookup 76 | container_name: hdfs 77 | hostname: hdfs 78 | domainname: example.com 79 | networks: 80 | default: 81 | aliases: 82 | - hdfs.example.com 83 | ports: 84 | - "22022:22" 85 | - "8020:8020" 86 | - "50010:50010" 87 | - "50020:50020" 88 | - "50070:50070" 89 | - "50071:50071" 90 | - "50075:50075" 91 | - "50076:50076" 92 | volumes: 93 | - ./hdfs-krb/docker-entrypoint.sh:/usr/local/bin/docker-entrypoint.sh 94 | - ./vertica-hdfs-config/hadoop-kerberized:/hadoop/conf 95 | - ./hdfs-krb:/hdfs-krb 96 | - ./keytabs:/keytabs 97 | env_file: 98 | - krb.env 99 | 100 | networks: 101 | default: 102 | name: "EXAMPLE.COM" 103 | driver: bridge 104 | -------------------------------------------------------------------------------- /docker/hdfs-krb/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:8 2 | MAINTAINER vertica 3 | 4 | ENV DEBIAN_FRONTEND noninteractive 5 | 6 | # Refresh package lists 7 | RUN apt-get update 8 | RUN apt-get -qy dist-upgrade 9 | 10 | RUN apt-get install -qy rsync curl openssh-server openssh-client vim nfs-common 11 | 12 | RUN mkdir -p /data/hdfs-nfs/ 13 | RUN mkdir -p /opt 14 | WORKDIR /opt 15 | 16 | # Install Hadoop 17 | RUN curl -L https://dlcdn.apache.org/hadoop/common/hadoop-3.3.1/hadoop-3.3.1.tar.gz -s -o - | tar -xzf - 18 | RUN mv hadoop-3.3.1 hadoop 19 | 20 | # Setup 21 | WORKDIR /opt/hadoop 22 | ENV PATH /opt/hadoop/bin:/opt/hadoop/sbin:$PATH 23 | RUN echo $JAVA_HOME 24 | ENV JAVA_HOME /usr/local/openjdk-8 25 | RUN sed --in-place='.ori' -e "s/\${JAVA_HOME}/\/usr\/local\/openjdk-8/" etc/hadoop/hadoop-env.sh 26 | 27 | # Configure ssh client 28 | RUN ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa && \ 29 | cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \ 30 | chmod 0600 ~/.ssh/authorized_keys 31 | 32 | RUN echo "\nHost *\n" >> ~/.ssh/config && \ 33 | echo " StrictHostKeyChecking no\n" >> ~/.ssh/config && \ 34 | echo " UserKnownHostsFile=/dev/null\n" >> ~/.ssh/config 35 | 36 | # Disable sshd authentication 37 | RUN echo "root:root" | chpasswd 38 | RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config 39 | RUN sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config 40 | 41 | # SSH login fix. 
Otherwise user is kicked off after login 42 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 43 | 44 | # Pseudo-Distributed Operation 45 | RUN echo "export JAVA_HOME=/usr/local/openjdk-8" >> /opt/hadoop/etc/hadoop/hadoop-env.sh 46 | RUN hdfs namenode -format 47 | 48 | ENV HDFS_NAMENODE_USER root 49 | ENV HDFS_DATANODE_USER root 50 | ENV HDFS_SECONDARYNAMENODE_USER root 51 | 52 | # SSH 53 | EXPOSE 22 54 | # hdfs://localhost:8020 55 | EXPOSE 8020 56 | # HDFS namenode 57 | EXPOSE 50020 58 | # HDFS Web browser 59 | EXPOSE 50070 60 | # HDFS datanodes 61 | EXPOSE 50075 62 | # HDFS secondary namenode 63 | EXPOSE 50090 64 | 65 | ENTRYPOINT service ssh start \ 66 | && start-dfs.sh \ 67 | && hadoop-daemon.sh start portmap \ 68 | && hadoop-daemon.sh start nfs3 \ 69 | && bash || bash 70 | -------------------------------------------------------------------------------- /docker/hdfs-krb/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | service ssh start 4 | 5 | # Start HDFS services 6 | rm -f /tmp/*.pid 7 | start-dfs.sh 8 | hadoop-daemon.sh start portmap 9 | hadoop-daemon.sh start nfs3 10 | 11 | # Configure Kerberos 12 | echo "[logging] 13 | default = FILE:/var/log/krb5libs.log 14 | kdc = FILE:/var/log/krb5kdc.log 15 | admin_server = FILE:/var/log/kadmind.log 16 | [libdefaults] 17 | default_realm = $REALM 18 | dns_lookup_realm = false 19 | dns_lookup_kdc = false 20 | ticket_lifetime = 24h 21 | forwardable = true 22 | [realms] 23 | $REALM = { 24 | kdc = $KDC 25 | admin_server = $KDC 26 | } 27 | [domain_realm] 28 | .example.com = $REALM 29 | example.com = $REALM" | tee /etc/krb5.conf 30 | 31 | cp /keytabs/hdfs.keytab /root/.keytab 32 | 33 | cp /hadoop/conf/core-site.xml /opt/hadoop/etc/hadoop/core-site.xml 34 | cp /hadoop/conf/hdfs-site.xml /opt/hadoop/etc/hadoop/hdfs-site.xml 35 | cp /hadoop/conf/ssl-server.xml /opt/hadoop/etc/hadoop/ssl-server.xml 36 | cp /hadoop/conf/keystore /root/.keystore 37 | 38 | export PATH=$PATH:/usr/bin 39 | 40 | rm /hadoop/conf/hdfs.cert 41 | keytool -delete -alias hdfs -keystore /root/.keystore -storepass password 42 | keytool -genkey -keyalg RSA -alias hdfs -keystore /root/.keystore -validity 500 -keysize 2048 -dname "CN=hdfs.example.com, OU=hdfs, O=hdfs, L=hdfs, S=hdfs, C=hdfs" -no-prompt -storepass password -keypass password 43 | echo "password" | keytool -export -alias hdfs -keystore /root/.keystore -rfc -file hdfs.cert 44 | cp hdfs.cert /hadoop/conf/ 45 | 46 | # Restart HDFS service 47 | stop-dfs.sh 48 | start-dfs.sh 49 | 50 | echo "HDFS container is now running" 51 | 52 | exec "$@" 53 | -------------------------------------------------------------------------------- /docker/hdfs/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | service ssh start 4 | 5 | # Override HDFS config 6 | cp /hadoop/conf/*.xml /opt/hadoop/etc/hadoop 7 | 8 | # Start HDFS services 9 | rm -f /tmp/*.pid 10 | start-dfs.sh 11 | hadoop-daemon.sh start portmap 12 | hadoop-daemon.sh start nfs3 13 | 14 | # Copy test data to HDFS 15 | while [ "$(hdfs dfsadmin -safemode get)" = "Safe mode is ON" ]; do sleep 1; done 16 | hadoop fs -copyFromLocal /partitioned /3.1.1 17 | 18 | echo "HDFS container is now running" 19 | 20 | exec "$@" 21 | -------------------------------------------------------------------------------- /docker/kdc/Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM centos:8 2 | 3 | RUN (cd /lib/systemd/system/sysinit.target.wants/; \ 4 | for i in *; do [ $i == systemd-tmpfiles-setup.service ] || rm -f $i; done); \ 5 | rm -f /lib/systemd/system/multi-user.target.wants/*;\ 6 | rm -f /etc/systemd/system/*.wants/*;\ 7 | rm -f /lib/systemd/system/local-fs.target.wants/*; \ 8 | rm -f /lib/systemd/system/sockets.target.wants/*udev*; \ 9 | rm -f /lib/systemd/system/sockets.target.wants/*initctl*; \ 10 | rm -f /lib/systemd/system/basic.target.wants/*;\ 11 | rm -f /lib/systemd/system/anaconda.target.wants/* && \ 12 | sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && \ 13 | sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* && \ 14 | yum update -y && \ 15 | yum install python2 wget -y && \ 16 | wget https://raw.githubusercontent.com/gdraheim/docker-systemctl-replacement/master/files/docker/systemctl.py -O /usr/local/bin/systemctl && \ 17 | chmod a+x /usr/local/bin/systemctl && \ 18 | yum -y install initscripts && yum clean all && \ 19 | yum install krb5-server krb5-libs krb5-workstation -y 20 | 21 | EXPOSE 88 22 | 23 | CMD ["/usr/sbin/init"] 24 | -------------------------------------------------------------------------------- /docker/kdc/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo "[logging] 4 | default = FILE:/var/log/krb5libs.log 5 | kdc = FILE:/var/log/krb5kdc.log 6 | admin_server = FILE:/var/log/kadmind.log 7 | [libdefaults] 8 | default_realm = $REALM 9 | dns_lookup_realm = false 10 | dns_lookup_kdc = false 11 | ticket_lifetime = 24h 12 | renew_lifetime = 7d 13 | forwardable = true 14 | [realms] 15 | $REALM = { 16 | kdc = localhost 17 | admin_server = localhost 18 | } 19 | [domain_realm] 20 | .example.com = $REALM 21 | example.com = $REALM" | tee /etc/krb5.conf 22 | 23 | kdb5_util -P 'admin' create 24 | 25 | systemctl start kadmin.service 26 | systemctl start krb5kdc.service 27 | chkconfig krb5kdc on 28 | chkconfig kadmin on 29 | 30 | # Create admin 31 | $KADMIN -q "addprinc -pw admin admin/admin" 32 | echo "*/admin@$REALM *" | tee -a /var/kerberos/krb5kdc/kadm5.acl 33 | 34 | # Add user principals 35 | for u in ${USERS//,/ };do 36 | $KADMIN -q "addprinc -pw ${u} ${u}" 37 | done 38 | 39 | $KADMIN -q "addprinc -randkey $V_PRINC" 40 | $KADMIN -q "ktadd -norandkey -k vertica.keytab $V_PRINC" 41 | chmod 777 vertica.keytab 42 | cp vertica.keytab /keytabs 43 | 44 | $KADMIN -q "addprinc -randkey $HDFS_PRINC" 45 | $KADMIN -q "addprinc -randkey $HTTP_HDFS_PRINC" 46 | $KADMIN -q "ktadd -norandkey -k hdfs.keytab $HDFS_PRINC $HTTP_HDFS_PRINC" 47 | chmod 777 hdfs.keytab 48 | cp hdfs.keytab /keytabs 49 | 50 | exec "$@" 51 | -------------------------------------------------------------------------------- /docker/keytabs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/docker/keytabs/.gitkeep -------------------------------------------------------------------------------- /docker/krb.env: -------------------------------------------------------------------------------- 1 | V_PRINC=vertica/vertica.example.com@EXAMPLE.COM 2 | KDC=kdc 3 | KHOST=vertica.example.com 4 | REALM=EXAMPLE.COM 5 | KTAB=/vertica.keytab 6 | DBNAME=docker 7 | 8 | SERVICE_NAME=vertica 9 | USERS=user1,user2,user3 10 | 
KADMIN=kadmin.local 11 | 12 | HDFS_PRINC=root/hdfs.example.com@EXAMPLE.COM 13 | HTTP_HDFS_PRINC=HTTP/hdfs.example.com@EXAMPLE.COM 14 | -------------------------------------------------------------------------------- /docker/vertica-hdfs-config/hadoop-kerberized/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | fs.defaultFS 4 | hdfs://hdfs.example.com:8020 5 | 6 | 7 | 8 | hadoop.security.authentication 9 | kerberos 10 | 11 | 12 | 13 | hadoop.security.authorization 14 | true 15 | 16 | 17 | 18 | hadoop.security.auth_to_local 19 | 20 | RULE:[2:$1/$2@$0](.*/.*@EXAMPLE.COM)s/.*/root/ 21 | DEFAULT 22 | 23 | 24 | 25 | 26 | hadoop.rpc.protection 27 | authentication 28 | 29 | 30 | 31 | hadoop.proxyuser.root.groups 32 | * 33 | 34 | 35 | 36 | hadoop.proxyuser.root.hosts 37 | * 38 | 39 | 40 | 41 | hadoop.proxyuser.superuser.hosts 42 | * 43 | 44 | 45 | 46 | hadoop.proxyuser.superuser.groups 47 | * 48 | 49 | 50 | 51 | hadoop.http.authentication.type 52 | kerberos 53 | 54 | 55 | 56 | hadoop.http.authentication.kerberos.keytab 57 | /root/.keytab 58 | 59 | 60 | 61 | hadoop.http.authentication.kerberos.principal 62 | HTTP/hdfs.example.com@EXAMPLE.COM 63 | 64 | 65 | -------------------------------------------------------------------------------- /docker/vertica-hdfs-config/hadoop-kerberized/keystore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/docker/vertica-hdfs-config/hadoop-kerberized/keystore -------------------------------------------------------------------------------- /docker/vertica-hdfs-config/hadoop-kerberized/ssl-client.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | ssl.client.truststore.location 4 | /cacerts.jks 5 | 6 | 7 | 8 | ssl.client.truststore.password 9 | password 10 | 11 | 12 | -------------------------------------------------------------------------------- /docker/vertica-hdfs-config/hadoop-kerberized/ssl-server.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | ssl.server.keystore.keypassword 4 | password 5 | 6 | 7 | 8 | ssl.server.keystore.password 9 | password 10 | 11 | 12 | 13 | ssl.server.keystore.location 14 | /root/.keystore 15 | 16 | 17 | -------------------------------------------------------------------------------- /docker/vertica-hdfs-config/hadoop/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | fs.defaultFS 4 | hdfs://hdfs:8020 5 | 6 | 7 | 8 | dfs.client.use.datanode.hostname 9 | true 10 | 11 | 12 | 13 | dfs.datanode.use.datanode.hostname 14 | true 15 | 16 | 17 | 18 | 19 | 20 | hadoop.proxyuser.root.groups 21 | * 22 | 23 | 24 | 25 | hadoop.proxyuser.root.hosts 26 | * 27 | 28 | 29 | -------------------------------------------------------------------------------- /docker/vertica/docker-entrypoint-legacy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This file has been modified for use by the Spark Connector 4 | # See original entrypoint script at https://github.com/vertica/vertica-kubernetes/blob/main/docker-vertica/docker-entrypoint.sh 5 | 6 | set -e 7 | 8 | start_cron(){ 9 | # daemonizes, no need for & 10 | sudo /usr/sbin/crond 11 | } 12 | 13 | # Kubernetes start-up is a little weird 14 | # - in order to configure the host-list correctly, k8s 15 | # has to do an 
install_vertica, which writes to 16 | # non-volatile store 17 | # - but the agent needs things that will be created by 18 | # that install. 19 | # - so we don't start the agent until we find the database running 20 | start_agent_when_ready(){ 21 | agent_started=No 22 | while [ $agent_started == No ]; do 23 | if [ -f /opt/vertica/config/admintools.conf ]; then 24 | # safe to try to run admintools 25 | db=$(/opt/vertica/bin/admintools -t show_active_db) || true 26 | case "$db"x in 27 | x) 28 | sleep 15 29 | ;; 30 | *) 31 | echo "Starting vertica agent for db $db" 32 | sudo /opt/vertica/sbin/vertica_agent start \ 33 | 2> /tmp/agent_start.err \ 34 | 1> /tmp/agent_start.out 35 | echo "Agent started" 36 | agent_started=Yes 37 | ;; 38 | esac 39 | else 40 | sleep 15 41 | fi 42 | done 43 | } 44 | 45 | restartNode(){ 46 | if [ ! -f /opt/vertica/config/admintools.conf ] 47 | then 48 | echo "Vertica is not installed, expect manual user intervention for install."; 49 | sudo /usr/sbin/sshd -D 50 | # If we get here we fail to force restart of container: 51 | exit 1 52 | fi 53 | # restart local Vertica node 54 | echo "Restart local node" 55 | /opt/vertica/sbin/python3 /opt/vertica/bin/re-ip-node.py --restart-node 56 | sudo /usr/sbin/sshd -D 57 | } 58 | 59 | reIpNode(){ 60 | if [ ! -d /opt/vertica/config/licensing ] || [ -z $(ls -A /opt/vertica/config/licensing/*) ] 61 | then 62 | echo "Installing license..." 63 | mkdir -p /opt/vertica/config/licensing 64 | cp -r /home/dbadmin/licensing/ce/* /opt/vertica/config/licensing 65 | fi 66 | echo "Update IP address on local node" 67 | /opt/vertica/sbin/python3 /opt/vertica/bin/re-ip-node.py --re-ip-node 68 | exit $? 69 | } 70 | 71 | defaultEntrypoint(){ 72 | echo "Vertica container is now running" 73 | sudo /usr/sbin/sshd -D 74 | } 75 | 76 | start_cron 77 | start_agent_when_ready & 78 | 79 | # Create database 80 | /opt/vertica/bin/admintools -t list_db --database=docker || \ 81 | /opt/vertica/bin/admintools -t create_db --database="${VERTICA_DATABASE:-docker}" --password="${VERTICA_PASSWORD}" --hosts=localhost 82 | 83 | # Start database 84 | if [ "$(/opt/vertica/bin/admintools -t db_status --status=DOWN)" == "${VERTICA_DATABASE:-docker}" ]; then 85 | /opt/vertica/bin/admintools -t start_db --database="${VERTICA_DATABASE:-docker}" --password="${VERTICA_PASSWORD}" --hosts=localhost 86 | fi 87 | 88 | # Configure database 89 | /opt/vertica/bin/vsql -c "ALTER DATABASE docker SET MaxClientSessions=100;" 90 | 91 | case $# in 92 | 1) 93 | case $1 in 94 | restart-vertica-node) 95 | restartNode 96 | ;; 97 | re-ip-vertica-node) 98 | reIpNode 99 | ;; 100 | *) 101 | echo "Invalid argument: $1" 102 | exit 1 103 | ;; 104 | esac 105 | ;; 106 | *) 107 | defaultEntrypoint 108 | ;; 109 | esac 110 | -------------------------------------------------------------------------------- /docker/vertica/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This file has been modified for use by the Spark Connector 4 | # See original entrypoint script at https://github.com/vertica/vertica-kubernetes/blob/main/docker-vertica/docker-entrypoint.sh 5 | 6 | set -e 7 | 8 | start_cron(){ 9 | # daemonizes, no need for & 10 | sudo /usr/sbin/cron 11 | } 12 | 13 | # We copy back the files normally stored in /opt/vertica/config/. 
We do this 14 | # because we have a Persistent Volume that backs /opt/vertica/config, so 15 | # it starts up empty and must be populated 16 | copy_config_files() { 17 | mkdir -p /opt/vertica/config/licensing 18 | 19 | mv /home/dbadmin/logrotate/* /opt/vertica/config/ 2>/dev/null || true 20 | 21 | cp -r /home/dbadmin/licensing/ce/* /opt/vertica/config/licensing 2>/dev/null || true 22 | chmod -R ugo+r,u+rw /opt/vertica/config/licensing 23 | } 24 | 25 | # Ensure all PV paths are owned by dbadmin. This is done for some PVs that 26 | # start with restrictive ownership. 27 | ensure_path_is_owned_by_dbadmin() { 28 | # -z is to needed in case input arg is empty 29 | [ -z "$1" ] || [ "$(stat -c "%U" "$1")" == "dbadmin" ] || sudo chown -R dbadmin:verticadba "$1" 30 | } 31 | 32 | start_cron 33 | ensure_path_is_owned_by_dbadmin /opt/vertica/config 34 | ensure_path_is_owned_by_dbadmin /opt/vertica/log 35 | ensure_path_is_owned_by_dbadmin $DATA_PATH 36 | ensure_path_is_owned_by_dbadmin $DEPOT_PATH 37 | copy_config_files 38 | 39 | # Create database 40 | /opt/vertica/bin/admintools -t list_db --database=docker || \ 41 | /opt/vertica/bin/admintools -t create_db --database="${VERTICA_DATABASE:-docker}" --password="${VERTICA_PASSWORD}" --hosts=localhost 42 | 43 | # Start database 44 | if [ "$(/opt/vertica/bin/admintools -t db_status --status=DOWN)" == "${VERTICA_DATABASE:-docker}" ]; then 45 | /opt/vertica/bin/admintools -t start_db --database="${VERTICA_DATABASE:-docker}" --password="${VERTICA_PASSWORD}" --hosts=localhost 46 | fi 47 | 48 | # Configure database 49 | /opt/vertica/bin/vsql -c "ALTER DATABASE docker SET MaxClientSessions=100;" 50 | 51 | echo "Vertica container is now running" 52 | 53 | sudo ssh-keygen -q -A 54 | sudo /usr/sbin/sshd -D 55 | -------------------------------------------------------------------------------- /docs/gcs-guide.md: -------------------------------------------------------------------------------- 1 | # Google Cloud Storage User Guide 2 | 3 | Since Vertica can be deployed on Google Cloud Platform, it is possible for the Spark Connector to make use of Google Cloud Storage as the intermediary storage. 4 | 5 | * **Running on DataProc clusters:** If your Spark cluster deployed on GCP, you will need to obtain an HMAC interoperability key. Then configure connector options `gcs_hmac_key_id` and `gcs_hmac_key_secret`. The instruction for obtaining the key can be found [here](https://cloud.google.com/storage/docs/authentication/managing-hmackeys#create). 6 | * **Running outside of DataProc clusters:** In addition to configuring the HMAC key above, you will obtain a GCS service account key in the form of a JSON service keyfile. Instruction on obtaining one can be found [here](https://cloud.google.com/storage/docs/authentication#generating-a-private-key). 7 | 8 | Then, specify the connector option `gcs_service_keyfile` with the path to your keyfile JSON. Alternatively, the connector can pick up the option from the environment variable `GOOGLE_APPLICATION_CREDENTIALS` as well as the spark configuration option `fs.gs.auth.service.account.json.keyfile`. 9 | 10 | Finally, ensure that you include the [Google Hadoop Connector](https://mvnrepository.com/artifact/com.google.cloud.bigdataoss/gcs-connector) dependency into your project. Make sure your select the appropriate connector distribution for your Hadoop version. 11 | 12 | With the credential specified, you can now configure the connector option `staging_fs_url` to use GCS paths `gs:///path/to/data`. 
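
For illustration only, here is a minimal Scala sketch of a read that uses the GCS options described above. The host, database, table, bucket, and keyfile path are placeholders rather than values from this guide, and the sketch assumes the Google Hadoop Connector dependency mentioned earlier is on the classpath.

```scala
import org.apache.spark.sql.SparkSession

object GcsReadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Vertica Connector GCS Example")
      .getOrCreate()

    // Connector options from this guide; credential values are read from the
    // environment here purely for illustration.
    val opts = Map(
      "host" -> "vertica.example.com",          // placeholder
      "user" -> "dbadmin",                      // placeholder
      "password" -> "",
      "db" -> "docker",                         // placeholder
      "table" -> "my_table",                    // placeholder
      "staging_fs_url" -> "gs://my-bucket/path/to/data", // placeholder bucket
      "gcs_hmac_key_id" -> sys.env.getOrElse("GCS_HMAC_KEY_ID", ""),
      "gcs_hmac_key_secret" -> sys.env.getOrElse("GCS_HMAC_KEY_SECRET", ""),
      "gcs_service_keyfile" -> "/path/to/keyfile.json"   // placeholder path
    )

    val df = spark.read
      .format("com.vertica.spark.datasource.VerticaSource")
      .options(opts)
      .load()

    df.show()
    spark.close()
  }
}
```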
13 | 14 | Another option to specifying the keyfile path is to set the following connector options: 15 | ``` 16 | gcs_service_key_id = 17 | gcs_service_key = 18 | gcs_service_email = 19 | ``` 20 | 21 | ## Additional Resources 22 | 23 | * [Google Hadoop Connector GitHub](https://github.com/GoogleCloudDataproc/hadoop-connectors) 24 | * [Using Google Hadoop Connector](https://cloud.google.com/dataproc/docs/concepts/connectors/cloud-storage) 25 | -------------------------------------------------------------------------------- /docs/hdfs-guide.md: -------------------------------------------------------------------------------- 1 | # Setting up a single-node HDFS and using it with the Vertica Spark Connector 2 | 3 | Here, we'll give some instructions for a simple one-node cluster setup on a Linux environment. 4 | 5 | ## 1. Download Hadoop 6 | 7 | Navigate to the desired install location and download hadoop. You can replace version number with version of your choice: 8 | 9 | ```shell 10 | wget https://httpd-mirror.sergal.org/apache/hadoop/common/hadoop-2.9.2/hadoop-2.9.2.tar.gz 11 | ``` 12 | 13 | ## 2. Unzip and Change Permissions 14 | 15 | Replace with desired hadoop install location. 16 | 17 | ```shell 18 | mkdir /hadoop 19 | sudo tar -zxvf hadoop-2.7.3.tar.gz -C /hadoop 20 | cd /hadoop 21 | sudo chmod 750 hadoop-2.9.2 22 | ``` 23 | 24 | ## 3. Edit Hadoop Configuration 25 | 26 | Edit etc/hadoop/hadoop-env.sh with the HADOOP_CONF_DIR variable to your directory. If necessary, you can also set the JAVA_HOME variable here 27 | 28 | ```shell 29 | export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"//hadoop/hadoop-2.9.2/etc/hadoop"} 30 | export JAVA_HOME=... 31 | ``` 32 | 33 | Edit etc/hadoop/core-site.xml with the following configuration (fill in your directory): 34 | 35 | ```shell 36 | 37 | 38 | fs.defaultFS 39 | hdfs://localhost:8020 40 | 41 | 42 | hadoop.tmp.dir 43 | //hadoop/hadooptmpdata 44 | 45 | 46 | ``` 47 | 48 | and etc/hadoop/hdfs-site.xml with the following configuration (fill in your directory): 49 | 50 | ```shell 51 | 52 | 53 | dfs.replication 54 | 1 55 | 56 | 57 | dfs.name.dir 58 | file:///hadoop/hdfs/namenode 59 | 60 | 61 | dfs.data.dir 62 | file:///hadoop/hdfs/datanode 63 | 64 | 65 | dfs.webhdfs.enabled 66 | true 67 | 68 | 69 | ``` 70 | 71 | Finally, set the HADOOP_HOME variable in your .bashrc (of whichever user is running hadoop): 72 | 73 | ```shell 74 | export HADOOP_HOME=/hadoop/hadoop-2.9.2 75 | ``` 76 | 77 | ## 4. Create directories 78 | 79 | Create the directories referenced above: 80 | 81 | ```shell 82 | cd //hadoop/ 83 | mkdir hdfs 84 | mkdir hadooptmpdata 85 | ``` 86 | 87 | ## 5. Set up passwordless ssh to localhost: 88 | 89 | ```shell 90 | cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys 91 | ``` 92 | 93 | and check that this worked: 94 | 95 | ```shell 96 | ssh localhost 97 | ``` 98 | 99 | ## 6. Format HDFS: 100 | 101 | ```shell 102 | bin/hdfs namenode -format 103 | ``` 104 | 105 | ## 7. Start HDFS 106 | 107 | ```shell 108 | cd /scratch_b//hadoop/hadoop-2.9.2 109 | sbin/start-dfs.sh 110 | ``` 111 | 112 | ## 8. Get Vertica to Work with HDFS 113 | 114 | Each Vertica node needs to have access to a copy of the HDFS configuration. If these are on seperate machines, you can use a command such as rsync to copy the configuration over. This must be done for each Vertica node. 
115 | 116 | ```shell 117 | rsync -R --progress /hadoop/hadoop-2.9.2/etc/hadoop/hdfs-site.xml arehnby@eng-g9-158:/etc/hadoop/conf/ 118 | rsync -R --progress /hadoop/hadoop-2.9.2/etc/hadoop/core-site.xml arehnby@eng-g9-158:/etc/hadoop/conf/ 119 | ``` 120 | -------------------------------------------------------------------------------- /docs/s3-guide.md: -------------------------------------------------------------------------------- 1 | # S3 User Guide 2 | 3 | Apache Hadoop provides an [AWS connector](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) allowing Hadoop file system to access S3, and subsequently allowing the connector to use S3 as the staging area. 4 | 5 | ## Required Dependencies 6 | 7 | What you will need: 8 | - Spark 3.x 9 | - An appropriate `hadoop-aws` version for your hadoop install. Note that Spark comes bundled with Hadoop or stand-alone. 10 | - Importantly, the versions of hadoop-aws must be identical to the hadoop install. 11 | - For example, for a sbt project using Hadoop 3.3.0, add to your `build.sbt`: 12 | `libraryDependencies += "org.apache.hadoop" % "hadoop-aws" % "3.3.0"` 13 | - An S3 bucket configured to use either A) access key ID + secret access key or B) IAM roles for authentication 14 | 15 | Some features may work with older versions of hadoop-aws, but we currently only test against the hadoop-aws version compatible with the latest Spark 3. 16 | 17 | ## Spark with User-Provided Hadoop 18 | 19 | The following example sets up a **user-provided Apache Hadoop**. To download Spark, [go here](https://spark.apache.org/downloads.html). Be sure to select package type "Pre-built with user-provided Apache Hadoop". 20 | 21 | You can [download Hadoop 3.3 here](https://hadoop.apache.org/releases.html). Make sure to download the binary. 22 | 23 | ### Setting up Spark with Hadoop 24 | Note: All instructions here are for MacOS or Linux users. 25 | 26 | First, you will need to decompress the Spark tar file and Hadoop tar file: 27 | ```sh 28 | tar xvf spark-3.0.2-bin-without-hadoop.tgz 29 | tar xvf hadoop-3.3.0.tar.gz 30 | ``` 31 | 32 | Move the resulting folder to /opt/spark/: 33 | `mv spark-3.0.2-bin-without-hadoop/ /opt/spark` 34 | 35 | Go to the Spark configuration directory: 36 | `cd /opt/spark/conf` 37 | 38 | There should be a spark-env.sh.template file. You will want a real spark-env.sh file, so rename the template to spark-env.sh: 39 | `mv spark-env.sh.template spark-env.sh` 40 | 41 | Next, set the JAVA_HOME environment variable: 42 | `export JAVA_HOME=/usr/lib/jvm/jre-11-openjdk` 43 | 44 | Now, edit spark-env.sh and point SPARK_DIST_CLASSPATH to the Hadoop folder you extracted earlier. For example, if you extracted it to /myhadoop, you should add the following line: 45 | `export SPARK_DIST_CLASSPATH=$(/myhadoop/hadoop-3.3.0/bin/hadoop classpath)` 46 | 47 | See [Spark's documentation](http://spark.apache.org/docs/latest/hadoop-provided.html) for more information. 48 | 49 | Finally, set the SPARK_HOME environment variable: 50 | ```sh 51 | export SPARK_HOME=/opt/spark 52 | export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin 53 | ``` 54 | 55 | ### Example Application Using S3 56 | 57 | See [here](https://github.com/vertica/spark-connector/tree/main/examples) for an example of how to connect to an S3 bucket with the Spark Connector. 
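
As a rough sketch of what such an application might look like, the snippet below writes a small DataFrame through an S3 staging area. The bucket, credentials, and table name are placeholders, and the `aws_*` option names follow the S3 settings used in the example configuration elsewhere in this repository rather than an exhaustive list.

```scala
import org.apache.spark.sql.{SaveMode, SparkSession}

object S3WriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Vertica Connector S3 Example")
      .getOrCreate()

    import spark.implicits._
    // A tiny DataFrame to write; values are arbitrary.
    val df = Seq(("Java", 20000), ("Scala", 3000)).toDF("language", "users_count")

    // Placeholder connection details; credentials are taken from the
    // environment here purely for illustration.
    val opts = Map(
      "host" -> "vertica.example.com",
      "user" -> "dbadmin",
      "password" -> "",
      "db" -> "docker",
      "table" -> "s3test",
      "staging_fs_url" -> "s3a://my-bucket/spark-staging/",
      "aws_access_key_id" -> sys.env.getOrElse("AWS_ACCESS_KEY_ID", ""),
      "aws_secret_access_key" -> sys.env.getOrElse("AWS_SECRET_ACCESS_KEY", "")
    )

    df.write
      .format("com.vertica.spark.datasource.VerticaSource")
      .options(opts)
      .mode(SaveMode.Overwrite)
      .save()

    spark.close()
  }
}
```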
58 | 59 | ## Troubleshooting 60 | 61 | If you see this error: 62 | `java.lang.NoClassDefFoundError: org/apache/hadoop/fs/StreamCapabilities` 63 | it is likely because you are not using Spark with Hadoop 3.3.0 and hadoop-aws 3.3.0. 64 | -------------------------------------------------------------------------------- /docs/tls-guide.md: -------------------------------------------------------------------------------- 1 | # Configuring TLS with the Connector 2 | 3 | In order to use TLS with the connector, you will need to setup Vertica as a TLS server, and the host containing the application that uses the connector as a TLS client. 4 | 5 | The following two sections are meant to be followed in order, with the client configuration following the Vertica configuration. Please note that this guide only uses a self-signed certificate. 6 | 7 | ## Setting up Vertica as a TLS server 8 | 9 | Simply follow the instructions [here](https://www.vertica.com/kb/Using-SSL-Server-Authentication-with-Vertica-Validating-Your-SSL/Content/BestPractices/Using-SSL-Server-Authentication-with-Vertica-Validating-Your-SSL.htm). 10 | 11 | ## Setting up the client machine as a TLS client 12 | 13 | Copy the server.crt certificate created on the Vertica server to the client machine. 14 | 15 | Run the following command on the client machine: 16 | `keytool -keystore truststore.jks -alias bmc -import -file server.crt` 17 | 18 | Note: `keytool` is included as part of the Java runtime. If you do not have it, then you may need to install Java first. 19 | 20 | This will create the truststore file on the client side, prompt you to create a new password for it, and import the server.crt self-signed certificate into the truststore. 21 | 22 | Set the `tls_mode`, `trust_store_path`, and `trust_store_password` properties in the connector options: 23 | ``` 24 | "tls_mode" -> "disable" 25 | "trust_store_path" -> "/truststore.jks" 26 | "trust_store_password" -> "testpass" 27 | ``` 28 | 29 | Here, the absolute path `/truststore.jks` was used. Set this path to wherever you created your truststore.jks file. You will also need to set `trust_store_password` to the password you set on your truststore.jks file. 30 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | These examples are intended to be run either on our provided Docker environment or on your own cluster. 4 | 5 | If you want to try these examples on our Docker environment, then: 6 | 1. Install sbt on your local machine with JDK 11 7 | 2. Clone the project if you haven't already: 8 | ```sh 9 | git clone https://github.com/vertica/spark-connector.git 10 | ``` 11 | 3. Start the appropriate configuration: 12 | ```sh 13 | cd spark-connector/docker 14 | docker-compose up -d 15 | # or, for Kerberos 16 | docker-compose -f docker-compose-kerberos.yml up -d 17 | ``` 18 | 4. Get a shell to the client container: 19 | ```sh 20 | docker exec -it docker-client-1 bash 21 | # or, for Kerberos 22 | docker exec -it client bash 23 | ``` 24 | 25 | Once in the container, navigate to the examples folder using `cd /spark-connector/examples`. 26 | 27 | You can find more information about our docker environment [here](/docker/README.md). 
28 | 29 | ### Troubleshooting 30 | 31 | If you are using the thin JAR and running into an error similar to the following: 32 | `java.lang.NoSuchMethodError: 'void cats.kernel.CommutativeSemigroup.$init$(cats.kernel.CommutativeSemigroup)'`, you may need to shade the cats dependency in your project. 33 | 34 | This can be done by adding the following to your build.sbt file: 35 | 36 | ``` 37 | assembly / assemblyShadeRules := { 38 | val shadePackage = "com.azavea.shaded.demo" 39 | Seq( 40 | ShadeRule.rename("cats.kernel.**" -> s"$shadePackage.cats.kernel.@1").inAll 41 | ) 42 | } 43 | ``` 44 | 45 | ### Tear down containers 46 | 47 | To shut down and remove the containers safely: 48 | ```sh 49 | cd spark-connector/docker 50 | docker-compose down 51 | # or, for Kerberos 52 | docker-compose -f docker-compose-kerberos.yml down 53 | ``` 54 | -------------------------------------------------------------------------------- /examples/jupyter/README.md: -------------------------------------------------------------------------------- 1 | # Jupyter Notebook Examples 2 | 3 | ## Creating the Jupyter Notebook Docker Container 4 | 5 | In order to run these examples the Jupyter container must be created and started. To do that start the Docker containers with the "jupyter" profile: 6 | ```sh 7 | cd spark-connector/docker 8 | docker-compose --profile jupyter up -d 9 | ``` 10 | 11 | An important thing to note is that the Spark and Python versions for Spark (master and worker nodes) and Jupyter Notebook must match, otherwise it will not work. Our Docker environment ensures the Python and Spark versions between these images are in-sync. 12 | 13 | For more information see the [Docker README](/docker/README.md). 14 | 15 | ## Running a Notebook 16 | 17 | 1. Go to http://localhost:8888/ and login with the token "test" 18 | 2. Under the File Browser on the left, navigate to the work folder and open the desired example Jupyter Notebook 19 | 3. Execute the cells, in order, using the Run button or by pressing `Shift+Enter` 20 | 21 | ## Examples 22 | 23 | ### Basic Read & Write 24 | 25 | A simple read and write that uses a two column schema of a string and an integer. 26 | 27 | ### Complex Array 28 | 29 | A Spark job that writes a regular array, nested array, and an array representative of a hash map. 30 | 31 | ### Linear Regression 32 | 33 | A Machine Learning example that utilizes Spark's Linear Regression algorithm. This job also contains reading and importing a .csv into Vertica. 34 | 35 | Each Notebook Example is annotated and written in a way to walk the user step-by-step through a Spark job to Vertica. 36 | 37 | ## ARM Limitations 38 | 39 | Due to limited availability of aarch64 images in Docker at this time, if you are running these examples on an ARM-based machine note that there may be performance issues or connection failures between containers. 40 | 41 | ## General Notebook Configuration 42 | 43 | Jupyter must be able to communicate with Spark, Hadoop, Vertica, etc, so it must be on the same Docker network. Our Docker environment configures this for you. 44 | 45 | The Spark Connector JAR must also be available in order to load the JAR and send it to Spark. The entire Spark Connector repo is mounted in the Docker container, including the directory containing the Spark Connector JAR (if you build it yourself). Otherwise you must download the JAR from [Maven](https://mvnrepository.com/artifact/com.vertica.spark/vertica-spark) and reference the location in your environment. 
46 | 47 | A new Spark session must be created, pointing to the Spark master as well as loading the Spark Connector JAR. For example: 48 | ```py 49 | from pyspark.sql import SparkSession 50 | 51 | spark = (SparkSession.builder 52 | .config("spark.master", "spark://spark:7077") 53 | .config("spark.driver.memory", "2G") 54 | .config("spark.executor.memory", "1G") 55 | .config("spark.jars", "/spark-connector/connector/target/scala-2.12/spark-vertica-connector-assembly-.jar") 56 | .getOrCreate()) 57 | sc = spark.sparkContext 58 | ``` 59 | 60 | Once that is complete the Spark context may be used to read and write data using the Vertica Spark Connector data source ("com.vertica.spark.datasource.VerticaSource"). See the example Jupyter Notebooks in this folder. 61 | 62 | Note that Jupyter Notebook previously bundled the Spylon kernel so that Scala could be used, but that kernel has not been maintained and is no longer included in Jupyter Notebook by default. As a result it is recommended to use the Python kernel in Jupyter Notebook. 63 | -------------------------------------------------------------------------------- /examples/jupyter/data/faithful_testing.csv: -------------------------------------------------------------------------------- 1 | "id","eruptions","waiting" 2 | "4",2.283,62 3 | "5",4.533,85 4 | "8",3.6,85 5 | "9",1.95,51 6 | "11",1.833,54 7 | "12",3.917,84 8 | "14",1.75,47 9 | "20",4.25,79 10 | "22",1.75,47 11 | "23",3.45,78 12 | "24",3.067,69 13 | "26",3.6,83 14 | "30",4.433,79 15 | "31",4.3,73 16 | "35",3.833,74 17 | "38",4.833,80 18 | "42",1.883,58 19 | "44",1.75,58 20 | "47",3.833,64 21 | "49",4.633,82 22 | "53",1.833,54 23 | "55",1.733,54 24 | "56",4.883,83 25 | "58",1.667,64 26 | "59",4.567,77 27 | "61",2.233,59 28 | "63",1.75,48 29 | "64",4.8,82 30 | "66",4.4,92 31 | "68",4.7,78 32 | "69",2.067,65 33 | "71",4.033,82 34 | "75",1.983,62 35 | "78",4.567,78 36 | "79",3.883,76 37 | "82",4.333,82 38 | "83",4.1,70 39 | "85",4.067,73 40 | "86",4.933,88 41 | "87",3.95,76 42 | "89",2.167,48 43 | "90",4,86 44 | "92",4.333,90 45 | "93",1.867,50 46 | "94",4.817,78 47 | "100",4.9,82 48 | "102",4.367,88 49 | "104",4.5,83 50 | "106",1.867,47 51 | "113",4.9,89 52 | "114",4.417,79 53 | "125",4.6,88 54 | "126",3.767,81 55 | "127",1.917,45 56 | "131",1.867,45 57 | "134",4.333,89 58 | "139",2.033,53 59 | "141",4.233,81 60 | "143",4.533,82 61 | "145",4.333,76 62 | "147",4.633,80 63 | "155",3.567,71 64 | "157",4.5,81 65 | "161",2.2,45 66 | "162",4.15,86 67 | "163",2,58 68 | "165",3.5,66 69 | "167",2.367,63 70 | "171",1.917,49 71 | "181",1.883,55 72 | "184",3.767,83 73 | "190",2.183,55 74 | "194",4.1,84 75 | "197",3.5,87 76 | "198",4.366,77 77 | "203",4.133,91 78 | "204",1.867,53 79 | "207",4.367,77 80 | "211",2.383,71 81 | "215",3.417,64 82 | "216",4.233,76 83 | "217",2.4,53 84 | "218",4.8,94 85 | "219",2,55 86 | "220",4.15,76 87 | "222",4.267,82 88 | "223",1.75,54 89 | "224",4.483,75 90 | "227",4.083,78 91 | "229",3.917,70 92 | "230",4.55,79 93 | "232",2.417,54 94 | "235",4.45,90 95 | "239",3.95,79 96 | "240",2.333,64 97 | "241",4.15,75 98 | "247",2.083,57 99 | "251",2.2,54 100 | "252",4.45,83 101 | "253",3.567,73 102 | "255",4.15,88 103 | "258",4.45,83 104 | "263",1.85,58 105 | "265",1.983,43 106 | "267",4.75,75 107 | "268",4.117,81 108 | "269",2.15,46 109 | "270",4.417,90 110 | "271",1.817,46 111 | "272",4.467,74 112 | -------------------------------------------------------------------------------- /examples/jupyter/data/faithful_training.csv: 
-------------------------------------------------------------------------------- 1 | "id","eruptions","waiting" 2 | "1",3.6,79 3 | "2",1.8,54 4 | "3",3.333,74 5 | "6",2.883,55 6 | "7",4.7,88 7 | "10",4.35,85 8 | "13",4.2,78 9 | "15",4.7,83 10 | "16",2.167,52 11 | "17",1.75,62 12 | "18",4.8,84 13 | "19",1.6,52 14 | "21",1.8,51 15 | "25",4.533,74 16 | "27",1.967,55 17 | "28",4.083,76 18 | "29",3.85,78 19 | "32",4.467,77 20 | "33",3.367,66 21 | "34",4.033,80 22 | "36",2.017,52 23 | "37",1.867,48 24 | "39",1.833,59 25 | "40",4.783,90 26 | "41",4.35,80 27 | "43",4.567,84 28 | "45",4.533,73 29 | "46",3.317,83 30 | "48",2.1,53 31 | "50",2,59 32 | "51",4.8,75 33 | "52",4.716,90 34 | "54",4.833,80 35 | "57",3.717,71 36 | "60",4.317,81 37 | "62",4.5,84 38 | "65",1.817,60 39 | "67",4.167,78 40 | "70",4.7,73 41 | "72",1.967,56 42 | "73",4.5,79 43 | "74",4,71 44 | "76",5.067,76 45 | "77",2.017,60 46 | "80",3.6,83 47 | "81",4.133,75 48 | "84",2.633,65 49 | "88",4.517,80 50 | "91",2.2,60 51 | "95",1.833,63 52 | "96",4.3,72 53 | "97",4.667,84 54 | "98",3.75,75 55 | "99",1.867,51 56 | "101",2.483,62 57 | "103",2.1,49 58 | "105",4.05,81 59 | "107",4.7,84 60 | "108",1.783,52 61 | "109",4.85,86 62 | "110",3.683,81 63 | "111",4.733,75 64 | "112",2.3,59 65 | "115",1.7,59 66 | "116",4.633,81 67 | "117",2.317,50 68 | "118",4.6,85 69 | "119",1.817,59 70 | "120",4.417,87 71 | "121",2.617,53 72 | "122",4.067,69 73 | "123",4.25,77 74 | "124",1.967,56 75 | "128",4.5,82 76 | "129",2.267,55 77 | "130",4.65,90 78 | "132",4.167,83 79 | "133",2.8,56 80 | "135",1.833,46 81 | "136",4.383,82 82 | "137",1.883,51 83 | "138",4.933,86 84 | "140",3.733,79 85 | "142",2.233,60 86 | "144",4.817,77 87 | "146",1.983,59 88 | "148",2.017,49 89 | "149",5.1,96 90 | "150",1.8,53 91 | "151",5.033,77 92 | "152",4,77 93 | "153",2.4,65 94 | "154",4.6,81 95 | "156",4,70 96 | "158",4.083,93 97 | "159",1.8,53 98 | "160",3.967,89 99 | "164",3.833,78 100 | "166",4.583,76 101 | "168",5,88 102 | "169",1.933,52 103 | "170",4.617,93 104 | "172",2.083,57 105 | "173",4.583,77 106 | "174",3.333,68 107 | "175",4.167,81 108 | "176",4.333,81 109 | "177",4.5,73 110 | "178",2.417,50 111 | "179",4,85 112 | "180",4.167,74 113 | "182",4.583,77 114 | "183",4.25,83 115 | "185",2.033,51 116 | "186",4.433,78 117 | "187",4.083,84 118 | "188",1.833,46 119 | "189",4.417,83 120 | "191",4.8,81 121 | "192",1.833,57 122 | "193",4.8,76 123 | "195",3.966,77 124 | "196",4.233,81 125 | "199",2.25,51 126 | "200",4.667,78 127 | "201",2.1,60 128 | "202",4.35,82 129 | "205",4.6,78 130 | "206",1.783,46 131 | "208",3.85,84 132 | "209",1.933,49 133 | "210",4.5,83 134 | "212",4.7,80 135 | "213",1.867,49 136 | "214",3.833,75 137 | "221",1.867,50 138 | "225",4,78 139 | "226",4.117,79 140 | "228",4.267,78 141 | "231",4.083,70 142 | "233",4.183,86 143 | "234",2.217,50 144 | "236",1.883,54 145 | "237",1.85,54 146 | "238",4.283,77 147 | "242",2.35,47 148 | "243",4.933,86 149 | "244",2.9,63 150 | "245",4.583,85 151 | "246",3.833,82 152 | "248",4.367,82 153 | "249",2.133,67 154 | "250",4.35,74 155 | "254",4.5,73 156 | "256",3.817,80 157 | "257",3.917,71 158 | "259",2,56 159 | "260",4.283,79 160 | "261",4.767,78 161 | "262",4.533,84 162 | "264",4.25,83 163 | "266",2.25,60 164 | -------------------------------------------------------------------------------- /examples/pyspark/README.md: -------------------------------------------------------------------------------- 1 | # Pyspark Example 2 | 3 | This example show how to configure a PySpark application with our connector. 
4 | 5 | In general, you would want to define the appropriate connector options. Then, include the connector's fat JAR into `spark-submit` argument `--jars`, For example: 6 | ```sh 7 | spark-submit --master local[*] --jars example.py 8 | ``` 9 | 10 | # How to Run the Example 11 | 12 | First, set up the docker environment as mentioned in [examples](/examples/README.md), then: 13 | 1. Download the spark connector "all" JAR from our [releases](https://github.com/vertica/spark-connector/releases) and place it in to `/connector/target/scala-2.12/`. You can do this on your local machine as this folder is mounted. Alternatively, you could build the JAR yourself by following the instructions [here](/CONTRIBUTING.md) 14 | 2. Assuming you are in the client container, use `cd /spark-connector/examples/pyspark` then run the `./run-python-example.sh` script. This will submit the pyspark example to our [standalone cluster](localhost:8080) 15 | 3. To shut down, exit out of the container with `exit`. Then on your local machine navigate to `spark-connector/docker` and tear down containers by running `docker-compose down` 16 | 17 | # Other Connector Options 18 | 19 | For examples of other options, refer to our [scala example](/examples/scala) which demonstrate how to configure the different connector options. While it is in a different language, the ideas are transferable; set the correct options, include our connector JAR, then spark-submit. 20 | -------------------------------------------------------------------------------- /examples/pyspark/run-python-example.sh: -------------------------------------------------------------------------------- 1 | CONNECTOR_VERSION=$(cat ../../version.properties | grep ${connector-version} | cut -d'=' -f2) 2 | spark-submit --master spark://spark:7077 --jars ../../connector/target/scala-2.12/spark-vertica-connector-assembly-$CONNECTOR_VERSION.jar sparkapp.py 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /examples/pyspark/sparkapp.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext, SparkConf 2 | from pyspark.sql import SQLContext, SparkSession 3 | from pyspark import sql 4 | 5 | # Create the spark session 6 | spark = SparkSession \ 7 | .builder \ 8 | .appName("Vertica Connector Pyspark Example") \ 9 | .getOrCreate() 10 | spark_context = spark.sparkContext 11 | sql_context = sql.SQLContext(spark_context) 12 | 13 | # The name of our connector for Spark to look up 14 | format = "com.vertica.spark.datasource.VerticaSource" 15 | 16 | # Set connector options based on our Docker setup 17 | host="vertica" 18 | user="dbadmin" 19 | password="" 20 | db="docker" 21 | staging_fs_url="webhdfs://hdfs:50070/data/" 22 | table="pysparktest" 23 | 24 | # Define data to write to Vertica 25 | columns = ["language","users_count"] 26 | data = [("Java", "20000"), ("Python", "100000"), ("Scala", "3000")] 27 | # Create an RDD from the data 28 | rdd = spark_context.parallelize(data) 29 | # Convert the RDD to a DataFrame 30 | df = rdd.toDF(columns) 31 | # Write the DataFrame to the Vertica table pysparktest 32 | df.write.mode('overwrite').save( 33 | # Spark format 34 | format=format, 35 | # Connector specific options 36 | host=host, 37 | user=user, 38 | password=password, 39 | db=db, 40 | staging_fs_url=staging_fs_url, 41 | table=table) 42 | 43 | # Read the data back into a Spark DataFrame 44 | readDf = spark.read.load( 45 | # Spark format 46 | format=format, 47 | # Connector 
specific options 48 | host=host, 49 | user=user, 50 | password=password, 51 | db=db, 52 | table=table, 53 | staging_fs_url=staging_fs_url) 54 | 55 | # Print the DataFrame contents 56 | readDf.show() 57 | -------------------------------------------------------------------------------- /examples/scala/build.sbt: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | import java.util.Properties 14 | 15 | // Retrieving the common property config containing the connector version number. 16 | val props = settingKey[Properties]("Connector version properties") 17 | props := { 18 | val prop = new Properties() 19 | IO.load(prop, new File("../../version.properties")) 20 | prop 21 | } 22 | 23 | scalaVersion := "2.13.16" 24 | name := "spark-vertica-connector-scala-examples" 25 | organization := "com.vertica" 26 | version := props.value.getProperty("connector-version") 27 | 28 | resolvers += "Artima Maven Repository" at "https://repo.artima.com/releases" 29 | resolvers += "jitpack" at "https://jitpack.io" 30 | 31 | libraryDependencies ++= Seq( 32 | "com.typesafe" % "config" % "1.4.1", 33 | "com.vertica.spark" % "vertica-spark" % s"${version.value}-slim", 34 | "org.apache.spark" %% "spark-core" % "3.5.5", 35 | "org.apache.spark" %% "spark-sql" % "3.5.5", 36 | "com.google.cloud.bigdataoss" % "gcs-connector" % "hadoop3-2.2.6", 37 | // This version needs to match the Hadoop version used by Spark 38 | "org.apache.hadoop" % "hadoop-aws" % "3.3.2" 39 | ) 40 | 41 | assembly / assemblyJarName := s"vertica-spark-scala-examples.jar" 42 | 43 | assembly / assemblyMergeStrategy := { 44 | case PathList("META-INF", xs @ _*) => MergeStrategy.discard 45 | case x => MergeStrategy.first 46 | } 47 | 48 | assembly / assemblyShadeRules := Seq( 49 | ShadeRule.rename("cats.**" -> "shadeCats.@1").inAll 50 | ) -------------------------------------------------------------------------------- /examples/scala/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += "Artima Maven Repository" at "https://repo.artima.com/releases" 2 | 3 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0") -------------------------------------------------------------------------------- /examples/scala/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | # Base configurations used by all examples 2 | examples { 3 | host="vertica" 4 | port=5433 5 | db="docker" 6 | user="dbadmin" 7 | password="" 8 | filepath="webhdfs://hdfs:50070/data/" 9 | } 10 | 11 | # Used by S3 related examples to override the base configurations 12 | s3 { 13 | filepath="s3a://test" 14 | aws_endpoint="minio:9000" 15 | aws_enable_ssl="false" 16 | aws_enable_path_style="true" 17 | aws_access_key_id="minioadmin" 18 | aws_secret_access_key="minioadmin" 19 | } 20 | 21 | 
# Used by kerberos related examples to override the base configurations 22 | kerberos { 23 | user="user1" 24 | filepath="hdfs://hdfs.example.com:8020" 25 | kerberos_service_name="vertica" 26 | kerberos_host_name="vertica.example.com" 27 | jaas_config_name="Client" 28 | } 29 | 30 | -------------------------------------------------------------------------------- /examples/scala/src/main/scala/example/Main.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package example 15 | 16 | import example.PrintUtils._ 17 | import example.examples.{BasicReadWriteExamples, ComplexTypeExamples, ConnectorOptionsExamples} 18 | import org.apache.spark.sql.SparkSession 19 | 20 | object Main { 21 | def main(args: Array[String]): Unit = { 22 | 23 | // Define a Spark master here 24 | val spark = SparkSession.builder() 25 | .appName("Vertica-Spark Connector Scala Example") 26 | .getOrCreate() 27 | 28 | val basicExamples = new BasicReadWriteExamples(spark) 29 | val ctExamples = new ComplexTypeExamples(spark) 30 | val optExamples = new ConnectorOptionsExamples(spark) 31 | 32 | val m: Map[String, () => Unit] = Map( 33 | "writeCustomStatement" -> optExamples.writeCustomStatement, 34 | "writeCustomCopyList" -> optExamples.writeCustomCopyList, 35 | "writeThenRead" -> basicExamples.writeThenRead, 36 | "complexArrayExample" -> ctExamples.writeThenReadComplexArray, 37 | "writeThenReadRow" -> ctExamples.writeThenReadRow, 38 | "writeMap" -> ctExamples.writeMap, 39 | "writeThenReadExternalTable" -> basicExamples.writeThenReadExternalTable, 40 | "writeDataUsingMergeKey" -> optExamples.writeDataUsingMergeKey, 41 | "writeThenReadWithS3" -> basicExamples.writeThenReadWithS3, 42 | "writeThenReadWithGCS" -> basicExamples.writeThenReadWithGCS, 43 | "writeThenReadWithKerberos" -> basicExamples.writeThenReadWithKerberos 44 | ) 45 | 46 | def printAllExamples(): Unit = { 47 | println("Examples available: ") 48 | m.keySet.foreach(exampleName => println(s"- $exampleName")) 49 | } 50 | 51 | def noCase(): Unit = { 52 | println("No example with that name.") 53 | printAllExamples() 54 | } 55 | 56 | if (args.length != 1) { 57 | println("No example specified!") 58 | println("Usage: ") 59 | printAllExamples() 60 | } 61 | else { 62 | val f: () => Unit = m.getOrElse(args.head, noCase) 63 | try { 64 | f() 65 | } 66 | catch { 67 | case e: Exception => { 68 | e.printStackTrace() 69 | printFailed("Unexpected error.") 70 | } 71 | } 72 | } 73 | spark.close() 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /examples/scala/src/main/scala/example/PrintUtils.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 
2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package example 15 | 16 | object PrintUtils { 17 | 18 | def printMessage(msg: String): Unit = println(s"------------------------------------\n-\n- EXAMPLE: $msg \n-\n------------------------------------") 19 | 20 | def printNotes(msg: String): Unit = println(s"------------------------------------\n-\n- NOTES: $msg \n-\n------------------------------------") 21 | 22 | def printSuccess(msg: String): Unit = println(s"------------------------------------\n-\n- SUCCESS: $msg \n-\n------------------------------------") 23 | 24 | def printFailed(msg: String): Unit = println(s"-------------------------------------\n-\n- FAILED: $msg \n-\n------------------------------------") 25 | 26 | } 27 | -------------------------------------------------------------------------------- /examples/scala/submit-examples-debug.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export SPARK_SUBMIT_OPTS="-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=*:5005" 4 | 5 | ./submit-examples.sh "$@" 6 | -------------------------------------------------------------------------------- /examples/scala/submit-examples-kerberos.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo 'user1' | kinit user1 4 | 5 | export JAVA_HOME=/usr/lib/jvm/jre-11-openjdk 6 | export SPARK_DIST_CLASSPATH=$(/hadoop-3.3.1/bin/hadoop classpath) 7 | export SPARK_HOME=/opt/spark 8 | export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin 9 | 10 | start-master.sh -h localhost 11 | start-worker.sh spark://localhost:7077 12 | 13 | if [[ "$1" == "debug" ]]; then 14 | export SPARK_SUBMIT_OPTS=-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=*:5005 15 | fi 16 | 17 | spark-submit --master spark://localhost:7077 --conf "spark.driver.extraClassPath={$SPARK_HOME}/conf/" --driver-java-options "-Djava.security.auth.login.config=/spark-connector/docker/client-krb/jaas.config" ./target/scala-2.12/vertica-spark-scala-examples.jar writeThenReadWithKerberos 18 | -------------------------------------------------------------------------------- /examples/scala/submit-examples.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | spark-submit --master spark://spark:7077 --driver-memory 2g target/scala-2.12/vertica-spark-scala-examples.jar "$@" 4 | -------------------------------------------------------------------------------- /examples/sparklyr/README.md: -------------------------------------------------------------------------------- 1 | # Sparklyr Example 2 | 3 | The connector can be used with R by using the sparklyr library. 4 | 5 | In general, you would want to include the connector's fat JAR into Spark's config, then define the appropriate connector options into the option list for a read or write. 
6 | 7 | # How to run the example 8 | 9 | First, set up the Docker environment as mentioned in [examples](/examples/README.md), then: 10 | 1. Download the spark connector "all" JAR from our [releases](https://github.com/vertica/spark-connector/releases) and place it in to `/connector/target/scala-2.12/`. You can do this on your local machine as this folder is mounted. Alternatively, you could build the JAR yourself by following the instructions [here](/CONTRIBUTING.md) 11 | 2. Assuming you are in the client container, use `cd /spark-connector/examples/sparklyr` then run the `./run-r-example.sh` script. This will install R and necessary packages before starting the r script. You can see the submitted app on our [standalone cluster](localhost:8080) 12 | 3. To shut down, exit out of the container with `exit`. Then on your local machine navigate to `spark-connector/docker` and tear down containers by running `docker-compose down` 13 | 14 | # Other Connector Options 15 | 16 | For examples of other options, refer to our [Scala example](/examples/scala) which demonstrate how to configure the different connector options. While it is in a different language, the ideas are transferable; set the correct options, include our connector JAR, then spark-submit. 17 | -------------------------------------------------------------------------------- /examples/sparklyr/run-r-example.sh: -------------------------------------------------------------------------------- 1 | apt-get install -y r-base 2 | apt-get install -y libssl-dev 3 | apt-get install -y libxml2-dev 4 | apt-get install -y libcurl4-openssl-dev 5 | Rscript sparkapp.r 6 | -------------------------------------------------------------------------------- /examples/sparklyr/sparkapp.r: -------------------------------------------------------------------------------- 1 | install.packages("curl", repo = "http://cran.us.r-project.org") 2 | install.packages("sparklyr", repo = "http://cran.us.r-project.org") 3 | library(sparklyr) 4 | 5 | install.packages('properties', repo = "http://cran.us.r-project.org") 6 | library('properties') 7 | 8 | props <- read.properties("../../version.properties") 9 | version <- props["connector-version"] 10 | # construct the path to Vertica-Spark connector jar. Replace this if the path to the jar is different 11 | connectorJar <- paste("../../connector/target/scala-2.12/spark-vertica-connector-assembly-", version, ".jar", sep = "") 12 | 13 | # Create a Spark config and disable Hive support to avoid errors 14 | config <- spark_config() 15 | config$sparklyr.jars.default <- connectorJar 16 | config$sparklyr.connect.enablehivesupport <- FALSE 17 | config$sparklyr.appName <- "Vertica Spark Connector Sparklyr example" 18 | 19 | print("Connecting to Spark.") 20 | 21 | # Connect to the Spark cluster 22 | sc <- spark_connect(master="spark://spark:7077", version = "3.1", config = config) 23 | 24 | print("Connected to spark. Getting iris_tbl.") 25 | 26 | # The Iris dataset comes with R and is used as test data 27 | # Get the Iris data and store it in a Spark dataframe 28 | iris_tbl <- sdf_copy_to(sc = sc, x = iris, overwrite = T) 29 | 30 | print("Got iris_tbl. 
Writing to Vertica.") 31 | 32 | # Write the Iris dataframe to the Vertica database 33 | spark_write_source(iris_tbl, "com.vertica.spark.datasource.VerticaSource", "overwrite", list( 34 | "host" = "vertica", 35 | "user" = "dbadmin", 36 | "password" = "", 37 | "db" = "docker", 38 | "staging_fs_url" = "webhdfs://hdfs:50070/data/dirtest", 39 | "table" = "iris" 40 | )) 41 | 42 | print("Wrote to Vertica. Reading from Vertica.") 43 | 44 | # Read the Iris data back from the Vertica database into a Spark dataframe 45 | result <- spark_read_source(sc = sc, name = "example", source = "com.vertica.spark.datasource.VerticaSource", options = list( 46 | "host" = "vertica", 47 | "user" = "dbadmin", 48 | "password" = "", 49 | "db" = "docker", 50 | "staging_fs_url" = "webhdfs://hdfs:50070/data/dirtest", 51 | "table" = "iris" 52 | )) 53 | 54 | print("Finished reading.") 55 | 56 | # Print the dataframe's contents 57 | print(result) 58 | 59 | # Cleanup Spark connection 60 | spark_disconnect(sc) 61 | -------------------------------------------------------------------------------- /functional-tests/build.sbt: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | import java.util.Properties 14 | import java.io.File 15 | 16 | // Retrieving the connector version number from a common file. 
17 | val versionProps = settingKey[Properties]("Connector version properties") 18 | versionProps := { 19 | val prop = new Properties() 20 | IO.load(prop, new File("../version.properties")) 21 | prop 22 | } 23 | 24 | scalaVersion := "2.13.16" 25 | name := "spark-vertica-connector-functional-tests" 26 | organization := "com.vertica" 27 | version := versionProps.value.getProperty("connector-version") 28 | 29 | val sparkVersion = Option(System.getProperty("sparkVersion")) match { 30 | case Some(sparkVersion) => sparkVersion 31 | case None => sys.env.getOrElse("SPARK_VERSION", "[3.3.0, 3.4.0, 3.5.5)") 32 | } 33 | 34 | val hadoopVersion = Option(System.getProperty("hadoopVersion")) match { 35 | case Some(hadoopVersion) => hadoopVersion 36 | case None => sys.env.getOrElse("HADOOP_VERSION", "3.3.4") 37 | } 38 | 39 | resolvers += "Artima Maven Repository" at "https://repo.artima.com/releases" 40 | resolvers += "jitpack" at "https://jitpack.io" 41 | 42 | libraryDependencies += "org.scalatest" %% "scalatest" % "3.2.16" 43 | libraryDependencies += "com.typesafe" % "config" % "1.4.1" 44 | 45 | libraryDependencies += "org.scala-lang.modules" %% "scala-parser-combinators" % "2.3.0" 46 | libraryDependencies += "com.vertica.jdbc" % "vertica-jdbc" % "24.4.0-0" 47 | libraryDependencies += "org.apache.spark" %% "spark-core" % "3.5.5" 48 | libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.5.5" 49 | libraryDependencies += "org.scalactic" %% "scalactic" % "3.2.16" 50 | libraryDependencies += "org.scalatest" %% "scalatest" % "3.2.16" % "test" 51 | libraryDependencies += "com.typesafe.scala-logging" %% "scala-logging" % "3.9.5" 52 | libraryDependencies += "org.scalamock" %% "scalamock" % "5.2.0" % Test 53 | libraryDependencies += "org.typelevel" %% "cats-core" % "2.10.0" 54 | libraryDependencies += "org.apache.hadoop" % "hadoop-hdfs" % hadoopVersion 55 | libraryDependencies += "org.apache.hadoop" % "hadoop-aws" % hadoopVersion 56 | libraryDependencies += "com.github.scopt" %% "scopt" % "4.0.1" 57 | libraryDependencies += "com.google.cloud.bigdataoss" % "gcs-connector" % "hadoop3-2.2.6" 58 | //libraryDependencies += file("C:\\Users\\chaitanp\\SourceCode\\spark\\spark-connector\\connector\\target\\scala-2.13\\spark-vertica-connector-assembly-3.3.6.jar") 59 | 60 | Compile / unmanagedJars += file("../connector/target/scala-2.13/spark-vertica-connector-assembly-3.3.6.jar") 61 | 62 | 63 | assembly / assemblyJarName := s"vertica-spark-functional-tests.jar" 64 | 65 | assembly / assemblyMergeStrategy := { 66 | case PathList("META-INF", xs @ _*) => MergeStrategy.discard 67 | case x => MergeStrategy.first 68 | } 69 | 70 | assembly / assemblyShadeRules := Seq( 71 | ShadeRule.rename("cats.**" -> "shadeCats.@1").inAll 72 | ) 73 | 74 | //unmanagedClasspath in Runtime += new File("/etc/hadoop/conf/") 75 | -------------------------------------------------------------------------------- /functional-tests/default-config.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo -e 'functional-tests { 4 | host="'"vertica"'" 5 | port=5433 6 | db="'"docker"'" 7 | user="'"dbadmin"'" 8 | password="'""'" 9 | filepath="'"webhdfs://hdfs:50070/data/"'" 10 | tlsmode="disable" 11 | truststorepath="'"/truststore.jks"'" 12 | truststorepassword="'"dbadmin"'" 13 | }' > ./src/main/resources/application.conf 14 | -------------------------------------------------------------------------------- /functional-tests/pipeline-gcs-config.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo -e 'functional-tests={ 4 | host="'"vertica"'" 5 | port="'"5433"'" 6 | db="'"docker"'" 7 | user="'"dbadmin"'" 8 | password="'""'" 9 | log='true' 10 | filepath="'"$GCS_FILEPATH"'" 11 | tlsmode="'"disable"'" 12 | truststorepath="'"/truststore.jks"'" 13 | truststorepassword="'"dbadmin"'" 14 | }' > ./src/main/resources/application.conf 15 | -------------------------------------------------------------------------------- /functional-tests/pipeline-s3-config.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo -e 'functional-tests={ 4 | host="'"vertica"'" 5 | port="'"5433"'" 6 | db="'"docker"'" 7 | user="'"dbadmin"'" 8 | password="'""'" 9 | log='true' 10 | filepath="'"$S3_FILEPATH"'" 11 | tlsmode="'"disable"'" 12 | truststorepath="'"/truststore.jks"'" 13 | truststorepassword="'"dbadmin"'" 14 | }' > ./src/main/resources/application.conf 15 | -------------------------------------------------------------------------------- /functional-tests/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += "Artima Maven Repository" at "https://repo.artima.com/releases" 2 | 3 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0") 4 | addDependencyTreePlugin 5 | -------------------------------------------------------------------------------- /functional-tests/src/main/resources/3.1.1/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/functional-tests/src/main/resources/3.1.1/_SUCCESS -------------------------------------------------------------------------------- /functional-tests/src/main/resources/3.1.1/col1=1/part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet: -------------------------------------------------------------------------------- 1 | PAR1,( 2 | $,H spark_schema%col2&5col2nr&<(n,org.apache.spark.version3.0.2)org.apache.spark.sql.parquet.row.metadataY{"type":"struct","fields":[{"name":"col2","type":"float","nullable":true,"metadata":{}}]}Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1)jPAR1 -------------------------------------------------------------------------------- /functional-tests/src/main/resources/3.1.1/col1=10/part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet: -------------------------------------------------------------------------------- 1 | PAR1,AA(AA 2 | $A,H spark_schema%col2&5col2nr&<AA(AAn,org.apache.spark.version3.0.2)org.apache.spark.sql.parquet.row.metadataY{"type":"struct","fields":[{"name":"col2","type":"float","nullable":true,"metadata":{}}]}Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1)jPAR1 -------------------------------------------------------------------------------- /functional-tests/src/main/resources/3.1.1/col1=11/part-00002-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet: -------------------------------------------------------------------------------- 1 | PAR1, A A( A A 2 | $ A,H spark_schema%col2&5col2nr&< A A( A An,org.apache.spark.version3.0.2)org.apache.spark.sql.parquet.row.metadataY{"type":"struct","fields":[{"name":"col2","type":"float","nullable":true,"metadata":{}}]}Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1)jPAR1 
-------------------------------------------------------------------------------- /functional-tests/src/main/resources/3.1.1/col1=12/part-00002-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet: -------------------------------------------------------------------------------- 1 | PAR1,0A0A(0A0A 2 | $0A,H spark_schema%col2&5col2nr&<0A0A(0A0An,org.apache.spark.version3.0.2)org.apache.spark.sql.parquet.row.metadataY{"type":"struct","fields":[{"name":"col2","type":"float","nullable":true,"metadata":{}}]}Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1)jPAR1 -------------------------------------------------------------------------------- /functional-tests/src/main/resources/3.1.1/col1=13/part-00002-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet: -------------------------------------------------------------------------------- 1 | PAR1,@A@A(@A@A 2 | $@A,H spark_schema%col2&5col2nr&<@A@A(@A@An,org.apache.spark.version3.0.2)org.apache.spark.sql.parquet.row.metadataY{"type":"struct","fields":[{"name":"col2","type":"float","nullable":true,"metadata":{}}]}Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1)jPAR1 -------------------------------------------------------------------------------- /functional-tests/src/main/resources/3.1.1/col1=14/part-00002-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet: -------------------------------------------------------------------------------- 1 | PAR1,PAPA(PAPA 2 | $PA,H spark_schema%col2&5col2nr&<PAPA(PAPAn,org.apache.spark.version3.0.2)org.apache.spark.sql.parquet.row.metadataY{"type":"struct","fields":[{"name":"col2","type":"float","nullable":true,"metadata":{}}]}Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1)jPAR1 -------------------------------------------------------------------------------- /functional-tests/src/main/resources/3.1.1/col1=15/part-00002-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet: -------------------------------------------------------------------------------- 1 | PAR1,`A`A(`A`A 2 | $`A,H spark_schema%col2&5col2nr&<`A`A(`A`An,org.apache.spark.version3.0.2)org.apache.spark.sql.parquet.row.metadataY{"type":"struct","fields":[{"name":"col2","type":"float","nullable":true,"metadata":{}}]}Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1)jPAR1 -------------------------------------------------------------------------------- /functional-tests/src/main/resources/3.1.1/col1=16/part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet: -------------------------------------------------------------------------------- 1 | PAR1,pApA(pApA 2 | $pA,H spark_schema%col2&5col2nr&<pApA(pApAn,org.apache.spark.version3.0.2)org.apache.spark.sql.parquet.row.metadataY{"type":"struct","fields":[{"name":"col2","type":"float","nullable":true,"metadata":{}}]}Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1)jPAR1 -------------------------------------------------------------------------------- /functional-tests/src/main/resources/3.1.1/col1=17/part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/functional-tests/src/main/resources/3.1.1/col1=17/part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet -------------------------------------------------------------------------------- 
/functional-tests/src/main/resources/3.1.1/col1=18/part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/functional-tests/src/main/resources/3.1.1/col1=18/part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet -------------------------------------------------------------------------------- /functional-tests/src/main/resources/3.1.1/col1=19/part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/functional-tests/src/main/resources/3.1.1/col1=19/part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet -------------------------------------------------------------------------------- /functional-tests/src/main/resources/3.1.1/col1=2/part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/functional-tests/src/main/resources/3.1.1/col1=2/part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet -------------------------------------------------------------------------------- /functional-tests/src/main/resources/3.1.1/col1=20/part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/functional-tests/src/main/resources/3.1.1/col1=20/part-00003-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet -------------------------------------------------------------------------------- /functional-tests/src/main/resources/3.1.1/col1=3/part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet: -------------------------------------------------------------------------------- 1 | PAR1,@@(@@ 2 | $@,H spark_schema%col2&5col2nr&<@@(@@n,org.apache.spark.version3.0.2)org.apache.spark.sql.parquet.row.metadataY{"type":"struct","fields":[{"name":"col2","type":"float","nullable":true,"metadata":{}}]}Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1)jPAR1 -------------------------------------------------------------------------------- /functional-tests/src/main/resources/3.1.1/col1=4/part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet: -------------------------------------------------------------------------------- 1 | PAR1,@@@@(@@@@ 2 | $@@,H spark_schema%col2&5col2nr&<@@@@(@@@@n,org.apache.spark.version3.0.2)org.apache.spark.sql.parquet.row.metadataY{"type":"struct","fields":[{"name":"col2","type":"float","nullable":true,"metadata":{}}]}Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1)jPAR1 -------------------------------------------------------------------------------- /functional-tests/src/main/resources/3.1.1/col1=5/part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/functional-tests/src/main/resources/3.1.1/col1=5/part-00000-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet -------------------------------------------------------------------------------- /functional-tests/src/main/resources/3.1.1/col1=6/part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/functional-tests/src/main/resources/3.1.1/col1=6/part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet -------------------------------------------------------------------------------- /functional-tests/src/main/resources/3.1.1/col1=7/part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/functional-tests/src/main/resources/3.1.1/col1=7/part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet -------------------------------------------------------------------------------- /functional-tests/src/main/resources/3.1.1/col1=8/part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/functional-tests/src/main/resources/3.1.1/col1=8/part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet -------------------------------------------------------------------------------- /functional-tests/src/main/resources/3.1.1/col1=9/part-00001-a69ed01f-68f6-48f7-9a69-60c3952d2ac5.c000.snappy.parquet: -------------------------------------------------------------------------------- 1 | PAR1,AA(AA 2 | $A,H spark_schema%col2&5col2nr&<AA(AAn,org.apache.spark.version3.0.2)org.apache.spark.sql.parquet.row.metadataY{"type":"struct","fields":[{"name":"col2","type":"float","nullable":true,"metadata":{}}]}Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1)jPAR1 -------------------------------------------------------------------------------- /functional-tests/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | functional-tests { 2 | host="vertica" 3 | port=5433 4 | db="docker" 5 | user="dbadmin" 6 | password="" 7 | filepath="webhdfs://hdfs:50070/data/" 8 | tlsmode="disable" 9 | truststorepath="/truststore.jks" 10 | truststorepassword="dbadmin" 11 | } 12 | 13 | -------------------------------------------------------------------------------- /functional-tests/src/main/resources/datafile-17_2_test: -------------------------------------------------------------------------------- 1 | 01/02/06,AAA 2 | 10/18/08,AAA 3 | 11/18/08,AAA 4 | 09/46/08,AAA 5 | 09/46/08,AAA 6 | 09/46/08,AAA 7 | 09/46/08,AAA 8 | 09/46/08,AAA 9 | 09/46/08,AAA 10 | 09/46/08,AAA 11 | 09/46/08,AAA 12 | 09/46/08,AAA 13 | 09/46/08,AAA 14 | 09/46/08,AAA 15 | 12/18/08,AAA 16 | 09/21/08,AAA 17 | 09/22/08,AAA 18 | 09/23/08,AAA 19 | 09/46/08,AAA 20 | 09/24/08,AAA 21 | 09/23/08,AAA 22 | -------------------------------------------------------------------------------- /functional-tests/src/main/resources/datafile-String-Int.txt: 
-------------------------------------------------------------------------------- 1 | string test one,1 2 | string test two,2 3 | string test three,3 4 | string test four,4 5 | string test five,5 6 | string test six,6 7 | string test seven,7 8 | string test eight,8 9 | string test nine,9 10 | string test ten,10 11 | -------------------------------------------------------------------------------- /functional-tests/src/main/resources/date_test_file.txt: -------------------------------------------------------------------------------- 1 | 01/02/06,AAA 2 | 10/18/08,AAA 3 | 11/18/08,AAA 4 | 09/46/08,AAA 5 | 09/46/08,AAA 6 | 09/46/08,AAA 7 | 09/46/08,AAA 8 | 09/46/08,AAA -------------------------------------------------------------------------------- /functional-tests/src/main/resources/diffTypes.txt: -------------------------------------------------------------------------------- 1 | Test string file row 1,12,false,1 2 | -------------------------------------------------------------------------------- /functional-tests/src/main/resources/diffTypesORC.txt: -------------------------------------------------------------------------------- 1 | teststring,12,false,1 2 | -------------------------------------------------------------------------------- /functional-tests/src/main/scala/com/vertica/spark/functests/CleanupUtilTests.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 
13 | 14 | package com.vertica.spark.functests 15 | 16 | import com.vertica.spark.config.FileStoreConfig 17 | import com.vertica.spark.datasource.fs.HadoopFileStoreLayer 18 | import com.vertica.spark.util.cleanup.{CleanupUtils, FileCleanupInfo} 19 | import org.scalatest.BeforeAndAfterAll 20 | import org.scalatest.flatspec.AnyFlatSpec 21 | 22 | class CleanupUtilTests(val cfg: FileStoreConfig) extends AnyFlatSpec with BeforeAndAfterAll { 23 | 24 | val fsLayer = new HadoopFileStoreLayer(cfg, None) 25 | val path: String = cfg.address + "/CleanupTest" 26 | private val perms = "777" 27 | 28 | val cleanupUtils = new CleanupUtils 29 | 30 | override def beforeAll(): Unit = { 31 | fsLayer.createDir(path, perms) 32 | } 33 | 34 | override def afterAll(): Unit = { 35 | fsLayer.removeDir(cfg.address) 36 | } 37 | 38 | it should "Clean up a file" in { 39 | val filename = path + "/test.parquet" 40 | 41 | fsLayer.createFile(filename) 42 | 43 | cleanupUtils.checkAndCleanup(fsLayer, FileCleanupInfo(filename, 0, 3)) 44 | cleanupUtils.checkAndCleanup(fsLayer, FileCleanupInfo(filename, 1, 3)) 45 | cleanupUtils.checkAndCleanup(fsLayer, FileCleanupInfo(filename, 2, 3)) 46 | 47 | fsLayer.fileExists(filename) match { 48 | case Left(err) => fail(err.getFullContext) 49 | case Right(exists) => assert(!exists) 50 | } 51 | fsLayer.fileExists(filename+".cleanup0") match { 52 | case Left(err) => fail(err.getFullContext) 53 | case Right(exists) => assert(!exists) 54 | } 55 | fsLayer.fileExists(filename+".cleanup1") match { 56 | case Left(err) => fail(err.getFullContext) 57 | case Right(exists) => assert(!exists) 58 | } 59 | fsLayer.fileExists(filename+".cleanup2") match { 60 | case Left(err) => fail(err.getFullContext) 61 | case Right(exists) => assert(!exists) 62 | } 63 | } 64 | 65 | 66 | it should "Clean up parent unique directory" in { 67 | val uniqueDir = path + "/unique-dir-123" 68 | fsLayer.createDir(uniqueDir, perms) 69 | 70 | val childDir = uniqueDir + "/tablename" 71 | fsLayer.createDir(childDir, perms) 72 | 73 | val filename = childDir + "/test.parquet" 74 | fsLayer.createFile(filename) 75 | 76 | assert(fsLayer.fileExists(uniqueDir).right.get) 77 | 78 | // Now test cleanup 79 | cleanupUtils.cleanupAll(fsLayer, childDir) 80 | 81 | assert(!fsLayer.fileExists(uniqueDir).right.get) 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /functional-tests/src/main/scala/com/vertica/spark/functests/LargeDataTests.scala: -------------------------------------------------------------------------------- 1 | package com.vertica.spark.functests 2 | 3 | import com.vertica.spark.config.{FileStoreConfig, JDBCConfig} 4 | import com.vertica.spark.functests.endtoend.EndToEnd 5 | import com.vertica.spark.util.error.ConnectorException 6 | import org.apache.spark.sql.SaveMode 7 | 8 | class LargeDataTests(readOpts: Map[String, String], writeOpts: Map[String, String], jdbcConfig: JDBCConfig, fileStoreConfig: FileStoreConfig, remote: Boolean = false) 9 | extends EndToEnd(readOpts, writeOpts, jdbcConfig, fileStoreConfig, remote){ 10 | 11 | override def sparkAppName: String = "Large Data Tests" 12 | 13 | val numSparkPartitions = 4 14 | 15 | it should "save a 1600 column table using default copy logic." 
in { 16 | val tableName = "1600ColumnTable" 17 | 18 | val options = writeOpts + ("table" -> tableName) 19 | val df = spark.read.format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat") 20 | .option("header", "true").load("src/main/resources/1600ColumnTable.csv") 21 | 22 | val numDfRows = df.count() 23 | val stmt = conn.createStatement() 24 | stmt.execute("DROP TABLE IF EXISTS " + "\"" + options("table") + "\";") 25 | 26 | val mode = SaveMode.Append 27 | 28 | try { 29 | df.write.format("com.vertica.spark.datasource.VerticaSource").options(options).mode(mode).save() 30 | } catch { 31 | case e: ConnectorException => fail(e.error.getFullContext) 32 | } 33 | 34 | var totalRows = 0 35 | val query = "SELECT COUNT(*) AS count FROM " + "\"" + options("table") + "\";" 36 | try { 37 | val rs = stmt.executeQuery(query) 38 | if (rs.next) { 39 | totalRows = rs.getInt("count") 40 | } 41 | } 42 | finally { 43 | stmt.close() 44 | } 45 | assert (totalRows == numDfRows) 46 | TestUtils.dropTable(conn, tableName) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /functional-tests/src/main/scala/com/vertica/spark/functests/endtoend/BasicJsonReadTests.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.functests.endtoend 15 | 16 | import com.vertica.spark.config.{FileStoreConfig, JDBCConfig} 17 | import com.vertica.spark.functests.TestUtils 18 | import com.vertica.spark.util.error.{BinaryTypeNotSupported, ConnectorException, ErrorList} 19 | 20 | import scala.util.{Failure, Success, Try} 21 | 22 | 23 | /** 24 | * A few minimal tests for the json feature. Not intended to be comprehensive. 
25 | * */ 26 | class BasicJsonReadTests(readOpts: Map[String, String], writeOpts: Map[String, String], jdbcConfig: JDBCConfig, fileStoreConfig: FileStoreConfig, remote: Boolean = false) 27 | extends EndToEnd(readOpts, writeOpts, jdbcConfig, fileStoreConfig, remote) { 28 | 29 | override def sparkAppName: String = "Basic JSON Read Tests" 30 | 31 | private val jsonReadOpts = readOpts + ("json" -> "true") 32 | 33 | it should "read primitive types" in { 34 | val tableName1 = "dftest" 35 | val n = 1 36 | val stmt = conn.createStatement 37 | TestUtils.createTableBySQL(conn, tableName1, "create table " + tableName1 + " (a int, b varchar, c float, d array[int])") 38 | 39 | TestUtils.populateTableBySQL(stmt, "insert into dftest values (1, 'heeelo', 3.2, array[3,5])", 10) 40 | 41 | val df = spark.read.format("com.vertica.spark.datasource.VerticaSource") 42 | .options(jsonReadOpts + ("table" -> tableName1)).load() 43 | val result = Try {df.show()} 44 | result match { 45 | case Failure(exception) => fail("Expected to succeed", exception) 46 | case Success(_) => 47 | } 48 | stmt.close() 49 | TestUtils.dropTable(conn, tableName1) 50 | } 51 | 52 | it should "error on binary types" in { 53 | val tableName = "dftest" 54 | val n = 1 55 | val stmt = conn.createStatement 56 | TestUtils.createTableBySQL(conn, tableName, "create table " + tableName + " (a binary, b varbinary, c array[binary], d array[varbinary], e long varbinary)") 57 | 58 | val df = spark.read.format("com.vertica.spark.datasource.VerticaSource") 59 | .options(jsonReadOpts + ("table" -> tableName)).load() 60 | val result = Try{df.collect} 61 | result match { 62 | case Failure(exception) => exception match { 63 | case ConnectorException(error) => { 64 | assert(error.isInstanceOf[ErrorList]) 65 | val errorList = error.asInstanceOf[ErrorList].errors.toList 66 | assert(errorList.forall(_.isInstanceOf[BinaryTypeNotSupported])) 67 | assert(errorList(0).asInstanceOf[BinaryTypeNotSupported].fieldName == "a") 68 | assert(errorList(1).asInstanceOf[BinaryTypeNotSupported].fieldName == "b") 69 | assert(errorList(2).asInstanceOf[BinaryTypeNotSupported].fieldName == "c") 70 | assert(errorList(3).asInstanceOf[BinaryTypeNotSupported].fieldName == "d") 71 | assert(errorList(4).asInstanceOf[BinaryTypeNotSupported].fieldName == "e") 72 | } 73 | } 74 | case Success(_) => fail("Expected to fail") 75 | } 76 | stmt.close() 77 | TestUtils.dropTable(conn, tableName) 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /functional-tests/src/main/scala/com/vertica/spark/functests/endtoend/RemoteTests.scala: -------------------------------------------------------------------------------- 1 | package com.vertica.spark.functests.endtoend 2 | 3 | import com.vertica.spark.config.{FileStoreConfig, JDBCConfig} 4 | import org.apache.spark.sql.{Row, SaveMode} 5 | import org.apache.spark.sql.types.{ArrayType, IntegerType, StructField, StructType} 6 | 7 | /** 8 | * Test suites for submitting to a remote driver. This suite is meant to be configured with a master node when submitting. 9 | * */ 10 | class RemoteTests(readOpts: Map[String, String], writeOpts: Map[String, String], jdbcConfig: JDBCConfig, fileStoreConfig: FileStoreConfig) 11 | extends EndToEnd(readOpts, writeOpts, jdbcConfig, fileStoreConfig, true) { 12 | 13 | override def sparkAppName: String = "Remote Tests" 14 | 15 | /** 16 | * This test checks the case where remote executors have to perform multiple tasks and see if multiple connections are 17 | * created. 
Note that if executors have more cores tasks, then they may be able run all tasks in one go and not trigger 18 | * the needed interactions. 19 | * 20 | * Note: You may get a java.lang.OutOfMemoryError when running locally. To allocate more memeory, start sbt with 21 | * sbt -J-Xmx10G, which will increase heap size to 10gb. More here: https://www.scala-sbt.org/1.x/docs/Troubleshoot-Memory-Issues.html 22 | * */ 23 | it should "only create constant number of jdbc sessions when write and read" in { 24 | val rowCount = 50000 25 | val data = (1 to rowCount).map(i => Row(i, (0 to 1000).map(i => i).toArray)).toList 26 | val schema = new StructType(Array(StructField("col1", IntegerType), StructField("col2", ArrayType(IntegerType)))) 27 | 28 | val partitionsCount = 100 29 | val df = spark.createDataFrame(spark.sparkContext.parallelize(data), schema).repartition(partitionsCount) 30 | val getJDBCConnectionsCount = "select count(client_hostname) from v_monitor.user_sessions where client_type='JDBC Driver';" 31 | val stmt = conn.createStatement() 32 | try { 33 | var rs = stmt.executeQuery(getJDBCConnectionsCount) 34 | assert(rs.next) 35 | val initialJdbcSessionCount = rs.getLong(1) 36 | 37 | val tableName = "dftest" 38 | df.write.format(VERTICA_SOURCE) 39 | .options(writeOpts + ("table" -> tableName)) 40 | .mode(SaveMode.Overwrite) 41 | .save() 42 | 43 | rs = stmt.executeQuery(getJDBCConnectionsCount) 44 | assert(rs.next) 45 | val sessionCountWrite = rs.getLong(1) 46 | // We expect only 2 new jdbc connections made on write 47 | assert(sessionCountWrite == initialJdbcSessionCount + 2) 48 | 49 | spark.read.format(VERTICA_SOURCE) 50 | .options(readOpts + 51 | ("table" -> "dftest") + 52 | ("num_partitions"-> "30") + 53 | ("max_row_group_size_export_mb" -> "1") + 54 | ("max_file_size_export_mb" -> "1")) 55 | .load() 56 | 57 | rs = stmt.executeQuery(getJDBCConnectionsCount) 58 | assert(rs.next) 59 | val sessionCountRead = rs.getLong(1) 60 | // We expect only 1 new jdbc connections made on read. 61 | assert(sessionCountRead == initialJdbcSessionCount + 3) 62 | 63 | } catch { 64 | case exception: Exception => fail("Unexpected exception", exception) 65 | } finally { 66 | stmt.execute("drop table dftest;") 67 | stmt.close() 68 | } 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /functional-tests/src/main/scala/com/vertica/spark/functests/endtoend/SparkConfig.scala: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | package com.vertica.spark.functests.endtoend 15 | 16 | import org.apache.spark.SparkConf 17 | 18 | /** 19 | * Mixin for creating a base [[SparkConf]] for a spark session. 
20 | * */ 21 | trait SparkConfig { 22 | 23 | /** 24 | * The name that will be displayed on Spark Master UI 25 | * */ 26 | def sparkAppName: String 27 | 28 | /** 29 | * Get a base [[SparkConf]] 30 | * 31 | * @param remote if false, the config will set master as local[*], else it will be unset. 32 | * */ 33 | def baseSparkConf(remote: Boolean): SparkConf = { 34 | val conf = if (remote) { 35 | new SparkConf() 36 | } 37 | else { 38 | new SparkConf().setMaster("local[*]") 39 | } 40 | conf.setAppName(sparkAppName) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /functional-tests/submit-functional-tests-debug.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export SPARK_SUBMIT_OPTS="-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=*:5005" 4 | 5 | ./submit-functional-tests.sh "$@" 6 | -------------------------------------------------------------------------------- /functional-tests/submit-functional-tests.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Append option -r to the list of args 4 | args=("-r") 5 | args+=("$@") 6 | 7 | spark-submit --master spark://spark:7077 --driver-memory 2g target/scala-2.12/vertica-spark-functional-tests.jar "${args[@]}" 8 | -------------------------------------------------------------------------------- /img/CoreArchitecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/img/CoreArchitecture.png -------------------------------------------------------------------------------- /img/Overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/img/Overview.png -------------------------------------------------------------------------------- /img/SparkInterfaces.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vertica/spark-connector/a350adbc58eb65859e712f410a7596cc3539adad/img/SparkInterfaces.png -------------------------------------------------------------------------------- /performance-tests/README.md: -------------------------------------------------------------------------------- 1 | # Spark Connector - Performance Tests 2 | 3 | This project is in place to run performance tests of the connector against a set of Spark, HDFS, and Vertica clusters. 4 | 5 | Configuration is specified with `application.conf` (HOCON format). 6 | 7 | ## How to run the tests 8 | 9 | 1. Set up Vertica, HDFS and Spark 10 | 2. From the performance-tests directory, run `mkdir lib` to create the folder for the connector JAR 11 | 3. From the performance-tests directory, run `cd ../connector && sbt assembly && cp target/scala-2.13/spark-vertica-connector-assembly-.jar ../performance-tests/lib && cd ../performance-tests` to build and copy the connector JAR 12 | 4. From the performance-tests directory, run `sbt assembly` to assemble the test JAR 13 | 5. 
Use spark-submit on the test JAR, such as `spark-submit --master spark://hdfs.example.com:7077 --deploy-mode cluster target/scala-2.13/spark-vertica-connector-performance-tests-assembly-.jar` 14 | 15 | ## Tuning read performance 16 | 17 | The biggest factor in connector performance will be resources for Vertica and Spark. Vertica, particularly with default settings may run into a memory bottleneck. This can be improved via configuration of resource pools. 18 | 19 | ### Vertica Resource Pool Configuration 20 | 21 | The connector's Vertica-to-Spark functionality relies on a query to export data from Vertica to an intermediate filestore. This operation reserves a lot of memory, and the more memory available to it, the more threads it can create to parallelize the operation. 22 | 23 | It is suggested that the resource pool used for the operation is given as much memory as possible, and has its `plannedconcurrency` value set to as low as possible. 24 | 25 | For an explanation of this, any given Vertica query may only reserve its total provided memory divided by the `plannedconcurrency` value. A more detailed explanation can be found [here](https://www.vertica.com/blog/do-you-need-to-put-your-query-on-a-budgetba-p236830/). The `plannedconcurrency` value sets how many independent queries are expected to be run, and the connector only uses one query at a time. This query is then parallelized by Vertica. 26 | 27 | ### Connector Options 28 | 29 | There are some connector parameters that may affect the performance of a read from Vertica operation. 30 | 31 | - `num_partitions`: Will set how many partitions are created, representing how many parallel executors will be reading data from the intermediate location at once. This should roughly correspond to the processing power / number of cores in the Spark cluster. 32 | - `max_file_size_export_mb` and `max_row_group_size_export_mb`: Represent configuration of the parquet files exported from Vertica to the intermediary location. These values default to where we find the best export performance lies: 16MB Row Group Size and 2048MB file size. However, these can be tweaked depending on details of the given clusters. 33 | 34 | ## Tuning write performance 35 | 36 | Similar steps to the above for tuning Vertica resource pools may be helpful for write performance. 37 | 38 | On writing, the number of partitions is decided not by the connector, but by the number of partitions passed in. To change this, you can call the `coalesce()` function on a dataframe before writing it. 39 | -------------------------------------------------------------------------------- /performance-tests/build.sbt: -------------------------------------------------------------------------------- 1 | // (c) Copyright [2020-2021] Micro Focus or one of its affiliates. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // You may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 
13 | 14 | scalaVersion := "2.13.16" 15 | name := "spark-vertica-connector-performance-tests" 16 | organization := "com.vertica" 17 | version := "1.0" 18 | 19 | resolvers += "Artima Maven Repository" at "https://repo.artima.com/releases" 20 | resolvers += "jitpack" at "https://jitpack.io" 21 | 22 | libraryDependencies += "org.scalatest" %% "scalatest" % "3.2.16" 23 | libraryDependencies += "com.typesafe" % "config" % "1.4.1" 24 | 25 | libraryDependencies += "org.scala-lang.modules" %% "scala-parser-combinators" % "2.3.0" 26 | libraryDependencies += "com.vertica.jdbc" % "vertica-jdbc" % "24.4.0-0" 27 | libraryDependencies += "org.apache.spark" %% "spark-core" % "3.5.5" 28 | libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.5.5" 29 | libraryDependencies += "org.apache.hadoop" % "hadoop-hdfs" % "3.3.2" 30 | libraryDependencies += "org.scalactic" %% "scalactic" % "3.2.16" 31 | libraryDependencies += "org.scalatest" %% "scalatest" % "3.2.16" % "test" 32 | libraryDependencies += "com.typesafe.scala-logging" %% "scala-logging" % "3.9.5" 33 | libraryDependencies += "ch.qos.logback" % "logback-classic" % "1.2.3" 34 | libraryDependencies += "org.scalamock" %% "scalamock" % "5.2.0" % Test 35 | libraryDependencies += "org.typelevel" %% "cats-core" % "2.3.0" 36 | Compile / unmanagedJars += file("../connector/target/scala-2.13/spark-vertica-connector-assembly-3.3.6.jar") 37 | 38 | assembly / assemblyMergeStrategy := { 39 | case PathList("META-INF", xs @ _*) => MergeStrategy.discard 40 | case x => MergeStrategy.first 41 | } 42 | 43 | Runtime / unmanagedClasspath += new File("/etc/hadoop/etc/hadoop") 44 | 45 | -------------------------------------------------------------------------------- /performance-tests/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += "Artima Maven Repository" at "https://repo.artima.com/releases" 2 | 3 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0") 4 | -------------------------------------------------------------------------------- /performance-tests/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | functional-tests { 2 | host="vertica" 3 | port=5433 4 | db="testdb" 5 | user="release" 6 | password="password" 7 | log=true 8 | filepath="hdfs://hdfs:8020/data/" 9 | dirpath="hdfs://hdfs:8020/data/dirtest/" 10 | colCounts="400" 11 | rowCounts="5000000,10000000" 12 | runCount=5 13 | testMode=both 14 | max_row_group_size="128" 15 | max_file_size="512" 16 | compareJdbc=true 17 | compareV1=false 18 | num_partitions=100 19 | filter="col1 > 0 AND col1 < 1000" 20 | } 21 | 22 | -------------------------------------------------------------------------------- /performance-tests/src/main/scala/com/vertica/spark/perftests/DataGenUtils.scala: -------------------------------------------------------------------------------- 1 | package com.vertica.spark.perftests 2 | 3 | import com.vertica.spark.perftests.DataGenUtils.{columnType, genDataSchema} 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 6 | import org.apache.spark.sql.types.{DateType, Decimal, DecimalType, IntegerType, StringType, StructField, StructType} 7 | 8 | object DataGenUtils { 9 | val rand = new scala.util.Random(System.currentTimeMillis) 10 | 11 | def randomMsInLast70Years() = { 12 | -946771200000L + // Time in past 13 | (Math.abs(rand.nextLong) % ( 14 | 70L // years 15 | * 365 // days 16 | * 24 // hours 17 | * 60 // minutes 
18 | * 60 // seconds 19 | * 1000 // ms 20 | )) 21 | } 22 | 23 | def randomStringGen(length: Int): String = rand.alphanumeric.take(length).mkString 24 | 25 | def randomIntGen(): Int = rand.nextInt() 26 | 27 | def randomDecimalGen(): Decimal = Decimal(rand.nextDouble()) 28 | 29 | def randomDateGen(): java.sql.Date = { 30 | val ms = randomMsInLast70Years() 31 | new java.sql.Date(ms) 32 | } 33 | 34 | private def columnType(i: Int) = { 35 | i % 4 match { 36 | case 0 => StringType 37 | case 1 => IntegerType 38 | case 2 => DecimalType(25,10) 39 | case 3 => DateType 40 | } 41 | } 42 | 43 | def genDataRow(colCount: Int): Row = { 44 | val data = (0 until colCount).map(i => columnType(i) match { 45 | case StringType => randomStringGen(10) 46 | case IntegerType => randomIntGen() 47 | case DecimalType() => randomDecimalGen() 48 | case DateType => randomDateGen() 49 | }) 50 | Row.fromSeq(data) 51 | } 52 | 53 | def genDataSchema(colCount: Int): StructType = { 54 | StructType( 55 | (0 until colCount).map(i => StructField("col"+i, columnType(i))) 56 | ) 57 | } 58 | 59 | def getColumns(colCount: Int): String = { 60 | val cols = (0 until colCount).map(i => { 61 | val colType = columnType(i) 62 | 63 | val t = colType match { 64 | case StringType => "VARCHAR(1024)" 65 | case IntegerType => "INTEGER" 66 | case DecimalType() => "DECIMAL(25, 10)" 67 | case DateType => "DATE" 68 | } 69 | 70 | val n = "col" + i 71 | 72 | n + " " + t 73 | }) 74 | 75 | cols.mkString(", ") 76 | } 77 | } 78 | 79 | class DataGenUtils(hdfsPath: String, spark: SparkSession) { 80 | 81 | def loadOrGenerateData(rowsPerPartition: Int, numPartitions: Int, colCount: Int): DataFrame = { 82 | val totalRowCount = rowsPerPartition * numPartitions 83 | println("Getting data for row count " + totalRowCount + " , col count " + colCount) 84 | val dataFileName = hdfsPath + "data_" + totalRowCount + "_" + colCount 85 | 86 | val conf = spark.sparkContext.hadoopConfiguration 87 | val fs = org.apache.hadoop.fs.FileSystem.get(conf) 88 | val exists = fs.exists(new org.apache.hadoop.fs.Path(dataFileName)) 89 | 90 | if(exists) { 91 | println("Data already exists, loading") 92 | val df = spark.read.parquet(dataFileName) 93 | df.rdd.count() 94 | df 95 | } 96 | else { 97 | println("Data doesn't exist yet, generating") 98 | val startTime: Long = System.currentTimeMillis() 99 | 100 | val basicData : RDD[Row] = spark.sparkContext.parallelize(Seq[Int](), numPartitions) 101 | .mapPartitions { _ => { 102 | (1 to rowsPerPartition).map{_ => Row(1)}.iterator 103 | }} 104 | 105 | val dataSchema = genDataSchema(colCount) 106 | //println("SCHEMA: " + dataSchema.toString()) 107 | 108 | val dataDf = spark.createDataFrame( 109 | basicData.map(_ => DataGenUtils.genDataRow(colCount)), 110 | dataSchema 111 | ) 112 | 113 | println("Storing data in file " + dataFileName) 114 | dataDf.write.parquet(dataFileName) 115 | 116 | val endTime: Long = System.currentTimeMillis() 117 | println("start: " + startTime + ", end: " + endTime) 118 | println("it took " + (endTime - startTime) + "MS to generate and write data") 119 | 120 | dataDf 121 | } 122 | } 123 | 124 | } 125 | -------------------------------------------------------------------------------- /version.properties: -------------------------------------------------------------------------------- 1 | connector-version=3.3.6 2 | --------------------------------------------------------------------------------
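
The performance tuning sections in the performance-tests README above name the relevant connector options (`num_partitions`, `max_file_size_export_mb`, `max_row_group_size_export_mb`) and suggest calling `coalesce()` before a write, but do not show them in code. Below is a minimal, hypothetical Scala sketch of how those options might be passed. The option names and the `VerticaSource` format string come from the examples in this repository; the connection values mirror the Docker setup used elsewhere, the table name `perftest` is a placeholder, and the specific partition counts and sizes are illustrative only, not recommendations.

```scala
import org.apache.spark.sql.{SaveMode, SparkSession}

object TuningSketch {
  def main(args: Array[String]): Unit = {
    // Master is expected to be supplied by spark-submit, as in the other examples.
    val spark = SparkSession.builder()
      .appName("Vertica connector tuning sketch")
      .getOrCreate()

    // Connection options; values mirror the Docker environment used throughout
    // the examples and are placeholders for a real deployment.
    val opts = Map(
      "host" -> "vertica",
      "user" -> "dbadmin",
      "password" -> "",
      "db" -> "docker",
      "staging_fs_url" -> "webhdfs://hdfs:50070/data/",
      "table" -> "perftest"
    )

    // Read: num_partitions controls how many partitions (and therefore parallel
    // readers) pull the exported parquet data; the export sizes shown here are
    // the defaults mentioned in the README and can be overridden per run.
    val df = spark.read.format("com.vertica.spark.datasource.VerticaSource")
      .options(opts ++ Map(
        "num_partitions" -> "16",
        "max_file_size_export_mb" -> "2048",
        "max_row_group_size_export_mb" -> "16"
      ))
      .load()

    // Write: the connector uses the DataFrame's own partition count, so
    // coalesce() (or repartition()) before the write is how write parallelism
    // is tuned.
    df.coalesce(8)
      .write.format("com.vertica.spark.datasource.VerticaSource")
      .options(opts)
      .mode(SaveMode.Overwrite)
      .save()

    spark.close()
  }
}
```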