├── docs ├── Conf.md ├── ConfigLoader.md ├── Deliverable.md ├── _config.yml ├── Logging.md ├── img │ ├── logo_setl.png │ ├── logo_setl_1280_640.png │ └── old_logo │ │ ├── logo_setl.png │ │ └── logo_setl_1280_640.png ├── Architecture.md ├── Condition.md ├── vocabulary.md ├── StructAnalyser.md ├── SparkSessionBuilder.md ├── SchemaConverter.md ├── utils │ └── Compressor_Archiver.md ├── Stage.md ├── Transformer.md ├── data_access_layer │ ├── SparkRepositoryBuilder.md │ ├── CustomConnector.md │ ├── Structured-Streaming-Connector.md │ ├── ConnectorBuilder.md │ └── SparkRepositoryAdapter.md ├── Factory.md ├── index.md └── SparkRepository-caching.md ├── src ├── test │ ├── resources │ │ ├── myconf.conf │ │ ├── test_base_path.csv │ │ ├── test-archiver │ │ │ ├── test-input-file.txt │ │ │ └── test-input │ │ │ │ ├── col3=c │ │ │ │ ├── file1-1-1.csv │ │ │ │ ├── file1-2-1.csv │ │ │ │ └── file1-2-2.csv │ │ │ │ └── col3=cc │ │ │ │ └── file2-1.csv │ │ ├── test-list-files │ │ │ ├── file1.csv │ │ │ ├── subdir2 │ │ │ │ └── file2-1.csv │ │ │ └── subdir1 │ │ │ │ ├── subsubdir1 │ │ │ │ ├── file1-1-1.csv │ │ │ │ └── wrongfile1-1-1.csv │ │ │ │ └── subsubdir2 │ │ │ │ ├── file1-2-1.csv │ │ │ │ └── file1-2-2.csv │ │ ├── test-list-files2 │ │ │ ├── col3=cc │ │ │ │ └── file2-1.csv │ │ │ └── col3=c │ │ │ │ ├── file1-1-1.csv │ │ │ │ ├── file1-2-1.csv │ │ │ │ └── file1-2-2.csv │ │ ├── streaming_test_resources │ │ │ ├── input2 │ │ │ │ └── input2.csv │ │ │ ├── streaming.conf │ │ │ └── input │ │ │ │ └── text.txt │ │ ├── test_schema_converter.csv │ │ ├── test_connector_builder.conf │ │ ├── dynamodb.conf │ │ ├── test_priority.conf │ │ ├── test-json.json │ │ ├── log4j.properties │ │ └── local.conf │ └── scala │ │ └── io │ │ └── github │ │ └── setl │ │ ├── workflow │ │ ├── package.scala │ │ └── FlowSuite.scala │ │ ├── config │ │ ├── DeltaConnectorConfSuite.scala │ │ ├── DynamoDBConnectorConfSuite.scala │ │ ├── PropertiesSuite.scala │ │ ├── HudiConnectorConfSuite.scala │ │ ├── StructuredStreamingConnectorConfSuite.scala │ │ ├── Properties.scala │ │ ├── ConfLoaderSuite.scala │ │ ├── FileConnectorConfSuite.scala │ │ └── JDBCConnectorConfSuite.scala │ │ ├── storage │ │ ├── connector │ │ │ ├── ConnectorSuite.scala │ │ │ ├── SparkSQLConnectorSuite.scala │ │ │ ├── HudiConnectorSuite.scala │ │ │ └── StructuredStreamingConnectorSuite.scala │ │ ├── repository │ │ │ ├── streaming │ │ │ │ └── StreamingRepositorySuite.scala │ │ │ ├── package.scala │ │ │ └── RepositoryAdapterSuite.scala │ │ ├── XZCompressorSuite.scala │ │ ├── SnappyCompressorSuite.scala │ │ ├── GZIPCompressorSuite.scala │ │ └── ConditionSuite.scala │ │ ├── SparkTestUtils.scala │ │ ├── factory │ │ └── FactoryDeliveryMetadataSuite.scala │ │ ├── TestObject.scala │ │ ├── util │ │ ├── TypesafeConfigUtilsSuite.scala │ │ └── IOUtils.scala │ │ └── internal │ │ ├── BenchmarkInvocationHandlerSuite.scala │ │ ├── TestClasses.scala │ │ └── StructAnalyserSuite.scala └── main │ ├── java │ └── io │ │ └── github │ │ └── setl │ │ ├── enums │ │ ├── PathFormat.java │ │ ├── ValueType.java │ │ └── Storage.java │ │ ├── exception │ │ ├── ConfException.java │ │ ├── AlreadyExistsException.java │ │ ├── BaseException.java │ │ ├── ConnectorException.java │ │ ├── RepositoryException.java │ │ ├── InvalidSchemaException.java │ │ ├── InvalidConnectorException.java │ │ ├── InvalidDeliveryException.java │ │ └── UnknownException.java │ │ ├── annotation │ │ ├── Experimental.java │ │ ├── Benchmark.java │ │ ├── InterfaceStability.java │ │ ├── Compress.java │ │ └── Delivery.java │ │ ├── storage │ │ ├── 
SnappyCompressor.java │ │ ├── XZCompressor.java │ │ └── GZIPCompressor.java │ │ └── internal │ │ └── BenchmarkInvocationHandler.java │ └── scala │ └── io │ └── github │ └── setl │ ├── internal │ ├── HasReaderWriter.scala │ ├── HasReader.scala │ ├── HasWriter.scala │ ├── Configurable.scala │ ├── HasType.scala │ ├── CanDrop.scala │ ├── HasDescription.scala │ ├── CanDelete.scala │ ├── Identifiable.scala │ ├── Writable.scala │ ├── CanUpdate.scala │ ├── CanCreate.scala │ ├── HasDiagram.scala │ ├── HasBenchmark.scala │ ├── CanWait.scala │ ├── CanPartition.scala │ ├── CanVacuum.scala │ ├── Logging.scala │ └── HasRegistry.scala │ ├── storage │ ├── connector │ │ ├── StreamingConnector.scala │ │ ├── DBConnector.scala │ │ ├── ACIDConnector.scala │ │ ├── ConnectorInterface.scala │ │ ├── ParquetConnector.scala │ │ ├── SparkSQLConnector.scala │ │ ├── Connector.scala │ │ ├── HudiConnector.scala │ │ └── StructuredStreamingConnector.scala │ ├── DatasetConverter.scala │ ├── Compressor.scala │ ├── repository │ │ ├── RepositoryAdapter.scala │ │ ├── ImplicitRepositoryAdapter.scala │ │ └── Repository.scala │ └── Archiver.scala │ ├── config │ ├── ConnectorConf.scala │ ├── HudiConnectorConf.scala │ ├── DeltaConnectorConf.scala │ ├── StructuredStreamingConnectorConf.scala │ └── DynamoDBConnectorConf.scala │ ├── transformation │ ├── AbstractFactory.scala │ ├── Transformer.scala │ ├── MLTransformer.scala │ ├── FactoryInput.scala │ ├── Factory.scala │ └── FactoryOutput.scala │ ├── workflow │ ├── PipelineOptimizer.scala │ ├── External.scala │ ├── Flow.scala │ ├── DAG.scala │ └── SimplePipelineOptimizer.scala │ ├── annotation │ ├── ColumnName.scala │ └── CompoundKey.scala │ ├── util │ ├── ExpectedDeliverable.scala │ ├── ReflectUtils.scala │ ├── HasSparkSession.scala │ ├── FilterImplicits.scala │ ├── MermaidUtils.scala │ ├── SparkUtils.scala │ └── TypesafeConfigUtils.scala │ ├── Builder.scala │ ├── BenchmarkResult.scala │ └── Converter.scala ├── dev ├── test.sh ├── deploy-snapshot.sh ├── deploy-release.sh ├── docker-compose.yml └── change-scala-version.sh ├── .github ├── dependabot.yml ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── stale.yml └── workflows │ ├── test.yml │ ├── snapshot.yml │ └── release.yml ├── .gitignore ├── CONTRIBUTING.md └── CODE_OF_CONDUCT.md /docs/Conf.md: -------------------------------------------------------------------------------- 1 | ## Definition -------------------------------------------------------------------------------- /docs/ConfigLoader.md: -------------------------------------------------------------------------------- 1 | ## Definition -------------------------------------------------------------------------------- /docs/Deliverable.md: -------------------------------------------------------------------------------- 1 | ## Definition -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /docs/Logging.md: -------------------------------------------------------------------------------- 1 | ## Definition 2 | Logging module -------------------------------------------------------------------------------- /src/test/resources/myconf.conf: -------------------------------------------------------------------------------- 1 | my_test_variable = "haha" -------------------------------------------------------------------------------- 
/src/test/resources/test_base_path.csv: -------------------------------------------------------------------------------- 1 | col1,col2 2 | a,b 3 | c,d -------------------------------------------------------------------------------- /src/test/resources/test-archiver/test-input-file.txt: -------------------------------------------------------------------------------- 1 | Hello, world! -------------------------------------------------------------------------------- /src/test/resources/test-list-files/file1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "D", "d" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-list-files/subdir2/file2-1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "D", "d" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-list-files2/col3=cc/file2-1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "D", "d" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-list-files2/col3=c/file1-1-1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "A", "a" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-list-files2/col3=c/file1-2-1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "B", "b" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-list-files2/col3=c/file1-2-2.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "C", "c" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-archiver/test-input/col3=c/file1-1-1.csv: -------------------------------------------------------------------------------- 1 | col1,col2 2 | "A","a" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-archiver/test-input/col3=c/file1-2-1.csv: -------------------------------------------------------------------------------- 1 | col1,col2 2 | "B","b" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-archiver/test-input/col3=c/file1-2-2.csv: -------------------------------------------------------------------------------- 1 | col1,col2 2 | "C","c" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-archiver/test-input/col3=cc/file2-1.csv: -------------------------------------------------------------------------------- 1 | col1,col2 2 | "D","d" 3 | -------------------------------------------------------------------------------- /src/test/resources/streaming_test_resources/input2/input2.csv: -------------------------------------------------------------------------------- 1 | text 2 | "hello" 3 | "world" 4 | -------------------------------------------------------------------------------- /src/test/resources/test-list-files/subdir1/subsubdir1/file1-1-1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "A", "a" 3 | -------------------------------------------------------------------------------- 
/src/test/resources/test-list-files/subdir1/subsubdir2/file1-2-1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "B", "b" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-list-files/subdir1/subsubdir2/file1-2-2.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "C", "c" 3 | -------------------------------------------------------------------------------- /docs/img/logo_setl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SETL-Framework/setl/HEAD/docs/img/logo_setl.png -------------------------------------------------------------------------------- /src/test/resources/test-list-files/subdir1/subsubdir1/wrongfile1-1-1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "A", "a" 3 | -------------------------------------------------------------------------------- /src/test/resources/test_schema_converter.csv: -------------------------------------------------------------------------------- 1 | col1,col2,col3,col4 2 | 1,"1","A","a" 3 | 2,"2","B","b" -------------------------------------------------------------------------------- /docs/img/logo_setl_1280_640.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SETL-Framework/setl/HEAD/docs/img/logo_setl_1280_640.png -------------------------------------------------------------------------------- /docs/img/old_logo/logo_setl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SETL-Framework/setl/HEAD/docs/img/old_logo/logo_setl.png -------------------------------------------------------------------------------- /docs/img/old_logo/logo_setl_1280_640.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SETL-Framework/setl/HEAD/docs/img/old_logo/logo_setl_1280_640.png -------------------------------------------------------------------------------- /src/test/resources/test_connector_builder.conf: -------------------------------------------------------------------------------- 1 | customConnector { 2 | storage = "OTHER" 3 | class = "io.github.setl.CustomConnector" 4 | } 5 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/enums/PathFormat.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.enums; 2 | 3 | public enum PathFormat { 4 | WILDCARD, 5 | REGEX; 6 | } 7 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/HasReaderWriter.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | trait HasReaderWriter extends HasReader with HasWriter { Connector => 4 | 5 | } 6 | -------------------------------------------------------------------------------- /src/test/resources/dynamodb.conf: -------------------------------------------------------------------------------- 1 | dynamodb { 2 | connector { 3 | storage = "DYNAMODB" 4 | region = "eu-west-1" 5 | table = "test-table" 6 | saveMode = "Overwrite" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- 
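The `dynamodb.conf` fixture above uses the same HOCON layout as the other connector configurations under `src/test/resources`. A minimal sketch of inspecting such a resource with plain Typesafe Config (only `ConfigFactory` and `Config` from `com.typesafe.config` are used here; no SETL-specific builder API is assumed):

```scala
import com.typesafe.config.{Config, ConfigFactory}

// Parse the classpath resource and read the connector block.
val conf: Config = ConfigFactory.parseResources("dynamodb.conf")
val connector: Config = conf.getConfig("dynamodb.connector")

println(connector.getString("storage"))   // DYNAMODB
println(connector.getString("region"))    // eu-west-1
println(connector.getString("table"))     // test-table
println(connector.getString("saveMode"))  // Overwrite
```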
/docs/Architecture.md: -------------------------------------------------------------------------------- 1 |  2 | 3 |  4 | 5 |  -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/HasReader.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import org.apache.spark.sql.DataFrameReader 4 | 5 | trait HasReader { Connector => 6 | 7 | protected val reader: DataFrameReader 8 | 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/connector/StreamingConnector.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import io.github.setl.internal.CanWait 4 | 5 | abstract class StreamingConnector extends Connector 6 | with CanWait { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/config/ConnectorConf.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.config 2 | 3 | abstract class ConnectorConf extends Conf { 4 | 5 | def getReaderConf: Map[String, String] 6 | 7 | def getWriterConf: Map[String, String] 8 | 9 | } 10 | -------------------------------------------------------------------------------- /dev/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | export AWS_ACCESS_KEY_ID="fakeAccess" 6 | export AWS_SECRET_ACCESS_KEY="fakeSecret" 7 | export AWS_REGION="eu-west-1" 8 | 9 | mvn -B -ntp clean:clean scoverage:report -P snapshot,spark_${SPARK_VER} 10 | -------------------------------------------------------------------------------- /dev/deploy-snapshot.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | echo ${MVN_SETTINGS} | base64 -d > ${HOME}/.m2/settings.xml 6 | echo ${MVN_SECURITY} | base64 -d > ${HOME}/.m2/settings-security.xml 7 | 8 | mvn clean deploy scala:doc -ntp -B -DskipTests -P snapshot,spark_${SPARK_VER} 9 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/HasWriter.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import org.apache.spark.sql.{DataFrame, DataFrameWriter, Row} 4 | 5 | trait HasWriter { Connector => 6 | 7 | protected val writer: DataFrame => DataFrameWriter[Row] 8 | 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/transformation/AbstractFactory.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.transformation 2 | 3 | trait AbstractFactory[A] { 4 | 5 | def read(): this.type 6 | 7 | def process(): this.type 8 | 9 | def write(): this.type 10 | 11 | def get(): A 12 | 13 | } 14 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/workflow/PipelineOptimizer.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.workflow 2 | 3 | trait PipelineOptimizer { 4 | 5 | def setExecutionPlan(dag: DAG): this.type 6 | 7 | def optimize(stages: Iterable[Stage]): Array[Stage] 8 | 9 | 
def getOptimizedExecutionPlan: DAG 10 | 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/Configurable.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | 5 | @InterfaceStability.Evolving 6 | trait Configurable { 7 | 8 | def set(key: String, value: String): this.type 9 | 10 | def get(key: String): Option[String] 11 | 12 | } 13 | -------------------------------------------------------------------------------- /docs/Condition.md: -------------------------------------------------------------------------------- 1 | ## Definition 2 | 3 | **Condition** is used by the `findBy` method of a **Repository** 4 | 5 | ```scala 6 | val cond = Set( 7 | Condition("column1", ">", 100), 8 | Condition("column2", "=", "value2") 9 | ) 10 | 11 | myRepository.findBy(cond) 12 | ``` 13 | 14 | ## Operation 15 | - `>` 16 | - `<` 17 | - `>=` 18 | - `<=` 19 | - `=` -------------------------------------------------------------------------------- /dev/deploy-release.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | echo ${MVN_SETTINGS} | base64 -d > ${HOME}/.m2/settings.xml 6 | echo ${MVN_SECURITY} | base64 -d > ${HOME}/.m2/settings-security.xml 7 | echo ${GPG_KEY} | base64 -d | gpg --import --batch > /dev/null 2>&1 8 | 9 | mvn clean deploy scala:doc -ntp -B -DskipTests -P release,spark_${SPARK_VER} 10 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/annotation/ColumnName.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.annotation 2 | 3 | import scala.annotation.StaticAnnotation 4 | 5 | /** 6 | * Define an alias for the current field in the table 7 | * 8 | * @param name alias of the current field name 9 | */ 10 | @InterfaceStability.Stable 11 | final case class ColumnName(name: String) extends StaticAnnotation 12 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/HasType.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | 5 | import scala.reflect.runtime 6 | 7 | /** 8 | * HasType should be used on classed having a payload 9 | */ 10 | @InterfaceStability.Evolving 11 | trait HasType { 12 | 13 | val runtimeType: runtime.universe.Type 14 | 15 | } 16 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/CanDrop.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.storage.connector.Connector 4 | 5 | /** 6 | * Connectors that inherit CanDrop should be able to drop the entire data table 7 | */ 8 | trait CanDrop { 9 | self: Connector => 10 | 11 | /** 12 | * Drop the entire table. 
13 | */ 14 | def drop(): Unit 15 | 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/connector/DBConnector.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import io.github.setl.internal.{CanCreate, CanDelete, CanDrop} 5 | 6 | @InterfaceStability.Evolving 7 | abstract class DBConnector extends Connector 8 | with CanCreate 9 | with CanDrop 10 | with CanDelete { 11 | 12 | } 13 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/util/ExpectedDeliverable.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.util 2 | 3 | import io.github.setl.transformation.Factory 4 | 5 | case class ExpectedDeliverable(deliverableType: String, 6 | deliveryId: String, 7 | producer: Class[_], 8 | consumer: Class[_ <: Factory[_]]) { 9 | 10 | } 11 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/workflow/package.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl 2 | 3 | package object workflow { 4 | 5 | case class Product1(x: String) 6 | 7 | case class Product2(x: String, y: String) 8 | 9 | case class Product(x: String) 10 | 11 | case class Product23(x: String) 12 | 13 | case class Container[T](content: T) 14 | 15 | case class Container2[T](content: T) 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/connector/ACIDConnector.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import io.github.setl.internal.{CanDelete, CanDrop, CanUpdate, CanVacuum} 5 | 6 | @InterfaceStability.Evolving 7 | abstract class ACIDConnector extends Connector 8 | with CanUpdate 9 | with CanDrop 10 | with CanDelete 11 | with CanVacuum { 12 | 13 | } 14 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/HasDescription.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import io.github.setl.util.ReflectUtils 5 | 6 | 7 | @InterfaceStability.Evolving 8 | trait HasDescription { 9 | 10 | def getPrettyName: String = ReflectUtils.getPrettyName(this.getClass) 11 | 12 | /** Describe the current class */ 13 | def describe(): this.type 14 | 15 | } 16 | -------------------------------------------------------------------------------- /docs/vocabulary.md: -------------------------------------------------------------------------------- 1 | #### Data access layer 2 | Data access layer is a layer of a computer program which provides simplified access (saving and retrieving) data to data stored in persistent storage. 3 | 4 | #### Business logic layer 5 | Business logic layer contains code which works with the data, processing it according to the rules of the business logic. 6 | 7 | #### Persistence storage 8 | A storage of data, *e.g* a database, a distributed filesystem, etc. 
9 | 10 | 11 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/enums/ValueType.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.enums; 2 | 3 | public enum ValueType { 4 | STRING("string"), 5 | DATETIME("timestamp"), 6 | DATE("date"), 7 | NUMBER("number"), 8 | SET("set"), 9 | COLUMN("column"); 10 | 11 | private final String value; 12 | 13 | ValueType(String value) { 14 | this.value = value; 15 | } 16 | 17 | public String value() { 18 | return value; 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/util/ReflectUtils.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.util 2 | 3 | import scala.reflect.runtime 4 | 5 | object ReflectUtils { 6 | 7 | def getPrettyName(tpe: runtime.universe.Type): String = tpe.toString.split("\\[").map(getPrettyName).mkString("[") 8 | 9 | def getPrettyName(cls: Class[_]): String = getPrettyName(cls.getCanonicalName) 10 | 11 | def getPrettyName(canonicalName: String): String = canonicalName.split("\\.").last 12 | 13 | } 14 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/CanDelete.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.storage.connector.Connector 4 | 5 | /** 6 | * Connectors that inherit CanDelete should be able to delete records for a given query string 7 | */ 8 | trait CanDelete { 9 | self: Connector => 10 | 11 | /** 12 | * Delete rows according to the query 13 | * 14 | * @param query a query string 15 | */ 16 | def delete(query: String): Unit 17 | 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/workflow/External.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.workflow 2 | 3 | import java.util.UUID 4 | 5 | import io.github.setl.transformation.Factory 6 | 7 | sealed abstract class External private extends Factory[External] 8 | 9 | /** 10 | * Singleton for external data source 11 | */ 12 | object External { 13 | val NODE: Node = Node( 14 | classOf[External], 15 | UUID.fromString("00000000-0000-0000-0000-000000000000"), 16 | -1, 17 | List(), 18 | null 19 | ) 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/ConfException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class ConfException extends BaseException { 4 | 5 | public ConfException(String errorMessage) { 6 | super(errorMessage); 7 | } 8 | 9 | public static class Format extends ConfException { 10 | /** 11 | * @param errorMessage error message 12 | */ 13 | public Format(String errorMessage) { 14 | super(errorMessage); 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/AlreadyExistsException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class AlreadyExistsException extends BaseException { 4 | public AlreadyExistsException() { 5 | } 6 | 7 | public 
AlreadyExistsException(String message) { 8 | super(message); 9 | } 10 | 11 | public AlreadyExistsException(String message, Throwable cause) { 12 | super(message, cause); 13 | } 14 | 15 | public AlreadyExistsException(Throwable cause) { 16 | super(cause); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/Identifiable.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import java.util.UUID 4 | 5 | import io.github.setl.annotation.InterfaceStability 6 | 7 | /** 8 | * Identifiable generates an UUID for any object that implement the trait 9 | */ 10 | @InterfaceStability.Evolving 11 | trait Identifiable { 12 | 13 | private[this] val _uuid: UUID = UUID.randomUUID 14 | 15 | private[this] val _name: String = getClass.getCanonicalName 16 | 17 | def getUUID: UUID = _uuid 18 | 19 | def getCanonicalName: String = _name 20 | 21 | } 22 | -------------------------------------------------------------------------------- /dev/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.2' 2 | services: 3 | psql: 4 | image: "postgres" 5 | container_name: "postgres-unit-test" 6 | environment: 7 | - POSTGRES_USER=postgres 8 | - POSTGRES_PASSWORD=postgres 9 | - POSTGRES_DB=framework_dev 10 | ports: 11 | - "5432:5432" 12 | 13 | cassandra: 14 | image: "cassandra" 15 | container_name: "cassandra-unit-test" 16 | ports: 17 | - "9042:9042" 18 | 19 | dynamodb: 20 | image: "amazon/dynamodb-local" 21 | container_name: "dynamodb-unit-test" 22 | ports: 23 | - "8000:8000" 24 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/Builder.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import io.github.setl.internal.Logging 5 | 6 | /** 7 | * Builder could be used to build or initialize objects 8 | * 9 | * @tparam A the type of object that the builder is supposed to produce 10 | */ 11 | @InterfaceStability.Evolving 12 | trait Builder[+A] extends Logging { 13 | 14 | /** 15 | * Build an object 16 | * 17 | * @return 18 | */ 19 | def build(): this.type 20 | 21 | def get(): A 22 | 23 | def getOrCreate(): A = this.build().get() 24 | } 25 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "maven" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "daily" 12 | ignore: 13 | - dependency-name: "scala*" 14 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/annotation/Experimental.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.annotation; 2 | 3 | import java.lang.annotation.ElementType; 4 | import java.lang.annotation.Retention; 5 | import java.lang.annotation.RetentionPolicy; 6 | import java.lang.annotation.Target; 7 | 8 | /** 9 | * The Experimental annotation indicate that the annotated class/method/field is supposed to be an experimental feature, 10 | * thus the stability can't be guaranteed. 11 | */ 12 | @Retention(RetentionPolicy.CLASS) 13 | @Target({ElementType.FIELD, ElementType.METHOD, ElementType.TYPE}) 14 | public @interface Experimental { 15 | } 16 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/DatasetConverter.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage 2 | 3 | import io.github.setl.Converter 4 | import io.github.setl.annotation.InterfaceStability 5 | import org.apache.spark.sql.Dataset 6 | 7 | /** 8 | * DatasetConverter inherits from a Converter. It can convert between two Dataset: Dataset[A] and Dataset[B] 9 | * 10 | * @tparam A Type of Dataset[A] 11 | * @tparam B Type of Dataset[B] 12 | */ 13 | @InterfaceStability.Evolving 14 | abstract class DatasetConverter[A, B] extends Converter { 15 | 16 | override type T1 = Dataset[A] 17 | override type T2 = Dataset[B] 18 | 19 | } 20 | -------------------------------------------------------------------------------- /docs/StructAnalyser.md: -------------------------------------------------------------------------------- 1 | ## Definition 2 | 3 | **StructAnalyser** provides functionalities to retrieve annotation information from a class. 4 | 5 | It scans the class' metadata and returns a **StructType** so that the **SchemaConverter** could use to transform the schema of a DataFrame/Dataset. 
6 | 7 | You can access the metadata of your class by getting the metadata of **StructField** 8 | 9 | ### Demo 10 | 11 | ```scala 12 | case class MyClass(col1: String, @ColumnName("column_2") col2: String) 13 | 14 | // analyseSchema will return a StructType of MyClass 15 | val structType = StructAnalyser.analyseSchema[MyClass] 16 | ``` 17 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/storage/SnappyCompressor.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage; 2 | 3 | import org.xerial.snappy.Snappy; 4 | 5 | import java.io.IOException; 6 | import java.nio.charset.StandardCharsets; 7 | 8 | public class SnappyCompressor implements Compressor { 9 | 10 | @Override 11 | public byte[] compress(String input) throws IOException { 12 | return Snappy.compress(input, StandardCharsets.UTF_8); 13 | } 14 | 15 | @Override 16 | public String decompress(byte[] bytes) throws IOException { 17 | return Snappy.uncompressString(bytes, StandardCharsets.UTF_8); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/BenchmarkResult.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl 2 | 3 | case class BenchmarkResult(cls: String, read: Double, process: Double, write: Double, get: Double, total: Double) { 4 | 5 | override def toString: String = { 6 | 7 | val formatter = java.text.NumberFormat.getNumberInstance 8 | 9 | s"Benchmark class: $cls\n" + 10 | s"Total elapsed time: ${formatter.format(total)} s\n" + 11 | s"read: ${formatter.format(read)} s\n" + 12 | s"process: ${formatter.format(process)} s\n" + 13 | s"write: ${formatter.format(write)} s\n" + 14 | "=================" 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/transformation/Transformer.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.transformation 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import io.github.setl.internal.{Identifiable, Logging} 5 | 6 | /** 7 | * A transformer can transform data into a type A 8 | * 9 | * @tparam T : Type of output data 10 | */ 11 | @InterfaceStability.Evolving 12 | trait Transformer[T] extends Logging with Identifiable { 13 | 14 | /** 15 | * Get the transformed data 16 | * 17 | * @return 18 | */ 19 | def transformed: T 20 | 21 | /** 22 | * Transform the current data 23 | */ 24 | def transform(): this.type 25 | } 26 | -------------------------------------------------------------------------------- /docs/SparkSessionBuilder.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | The class `SparkSessionBuilder` is used to configure and build new spark session for the given usage(s). 
4 | 5 | ## Code Example 6 | 7 | ```scala 8 | import com.jcdecaux.datacorp.spark.SparkSessionBuilder 9 | 10 | // Auto-configure 11 | val spark1: SparkSession = new SparkSessionBuilder("cassandra") 12 | .setAppName("myApp") 13 | .setEnv("dev") // or AppEnv.DEV 14 | .setCassandraHost("localhost") 15 | .build() 16 | .get() 17 | 18 | // Build with your own SparkConf 19 | val spark2: SparkSession = new SparkSessionBuilder() 20 | .configure(yourSparkConf) 21 | .build() 22 | .get() 23 | 24 | ``` 25 | 26 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/Writable.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | /** 4 | * Indicate that users can activate or deactivate the write of the class 5 | */ 6 | trait Writable { 7 | 8 | protected var _write: Boolean = true 9 | 10 | /** 11 | * Whether invoke the write method or not 12 | * 13 | * @param write if set to true, then the write method of the factory will be invoked 14 | * @return 15 | */ 16 | def writable(write: Boolean): this.type = { 17 | this._write = write 18 | this 19 | } 20 | 21 | /** Return true if the write method will be invoked by the pipeline */ 22 | def writable: Boolean = this._write 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/Compressor.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage 2 | 3 | import java.io.IOException 4 | 5 | import io.github.setl.annotation.InterfaceStability 6 | 7 | /** 8 | * A Compressor is able to compress an input string into a byte array and vice versa. 9 | */ 10 | @InterfaceStability.Evolving 11 | trait Compressor extends Serializable { 12 | 13 | /** 14 | * Compress an input string into a byte array 15 | */ 16 | @throws[IOException] 17 | def compress(input: String): Array[Byte] 18 | 19 | /** 20 | * Decompress a byte array into an input string 21 | */ 22 | @throws[IOException] 23 | def decompress(bytes: Array[Byte]): String 24 | 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/BaseException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class BaseException extends RuntimeException { 4 | 5 | public BaseException() { 6 | } 7 | 8 | public BaseException(String message) { 9 | super(message); 10 | } 11 | 12 | public BaseException(String message, Throwable cause) { 13 | super(message, cause); 14 | } 15 | 16 | public BaseException(Throwable cause) { 17 | super(cause); 18 | } 19 | 20 | public BaseException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { 21 | super(message, cause, enableSuppression, writableStackTrace); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /docs/SchemaConverter.md: -------------------------------------------------------------------------------- 1 | ## Definition 2 | 3 | **SchemaConverter** can: 4 | - Convert a Dataset[A] to a DataFrame with the metadata of class **A** (extracted by **StructAnalyser**) 5 | - Convert a DataFrame to a Dataset[A] 6 | 7 | For each of the three annotations: ColumnName, CompoundKey and Compress, SchemaConverter will 8 | - rename the column 9 | - create/drop the compound key column(s) 10 
| - compress/decompress the column(s) having Compress annotation. 11 | 12 | ## Demo 13 | 14 | ### Dataset to DataFrame 15 | ```scala 16 | val ds: Dataset[MyClass] = ... 17 | SchemaConverter.toDF(ds) 18 | ``` 19 | 20 | ### DataFrame to Dataset 21 | ```scala 22 | val df: DataFrame = ... 23 | SchemaConverter.fromDF[MyClass](df) 24 | ``` -------------------------------------------------------------------------------- /src/test/resources/test_priority.conf: -------------------------------------------------------------------------------- 1 | my.value = "haha" 2 | 3 | setl.config { 4 | spark { 5 | spark.master = "local" 6 | spark.app.name = "my_app_2" 7 | spark.sql.shuffle.partitions = "1000" 8 | } 9 | } 10 | 11 | setl.config_2 { 12 | spark { 13 | spark.master = "local" 14 | spark.app.name = "my_app_context_2" 15 | spark.sql.shuffle.partitions = "2000" 16 | } 17 | } 18 | 19 | test { 20 | string = "abc" 21 | int = 1 22 | long = 2 23 | float = 3.1 24 | float2 = "3.1" 25 | double = 4.4 26 | boolean = false 27 | boolean2 = "true" 28 | list = [1,2,3] 29 | listFloat = [1.2,2,3] 30 | listString = ["1.2","2","3"] 31 | 32 | map { 33 | v1 = "a" 34 | v2 = "b" 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "Feature request title" 5 | labels: feature 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/ConnectorException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class ConnectorException extends BaseException { 4 | public ConnectorException() { 5 | } 6 | 7 | public ConnectorException(String message) { 8 | super(message); 9 | } 10 | 11 | public ConnectorException(String message, Throwable cause) { 12 | super(message, cause); 13 | } 14 | 15 | public ConnectorException(Throwable cause) { 16 | super(cause); 17 | } 18 | 19 | public ConnectorException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { 20 | super(message, cause, enableSuppression, writableStackTrace); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "Issue title" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 
18 | 19 | **Screenshots** 20 | If applicable, add screenshots to help explain your problem. 21 | 22 | **Environment (please complete the following information):** 23 | - OS: [e.g. iOS] 24 | - Version [e.g. 22] 25 | - Dependencies: 26 | 27 | **Additional context** 28 | Add any other context about the problem here. 29 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/RepositoryException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class RepositoryException extends BaseException { 4 | public RepositoryException() { 5 | } 6 | 7 | public RepositoryException(String message) { 8 | super(message); 9 | } 10 | 11 | public RepositoryException(String message, Throwable cause) { 12 | super(message, cause); 13 | } 14 | 15 | public RepositoryException(Throwable cause) { 16 | super(cause); 17 | } 18 | 19 | public RepositoryException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { 20 | super(message, cause, enableSuppression, writableStackTrace); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/Converter.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | 5 | /** 6 | * A converter should be able to convert between two types T1 and T2. 7 | */ 8 | @InterfaceStability.Evolving 9 | trait Converter { 10 | type T1 11 | type T2 12 | 13 | /** 14 | * Convert from an object of type T2 to an object of type T1 15 | * 16 | * @param t2 object of type T2 17 | * @return an object of type T1 18 | */ 19 | def convertFrom(t2: T2): T1 20 | 21 | /** 22 | * Convert an object of type T1 to an object of type T2 23 | * 24 | * @param t1 object of type T1 to be convert to T2 25 | * @return an object of type T2 26 | */ 27 | def convertTo(t1: T1): T2 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/CanUpdate.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.storage.connector.Connector 4 | import org.apache.spark.sql.DataFrame 5 | 6 | /** 7 | * Connectors that inherit CanUpdate should be able to update the data store with a new data frame and a given matching 8 | * columns. 9 | */ 10 | trait CanUpdate { 11 | self: Connector => 12 | 13 | /** 14 | * Update the data store with a new data frame and the given matching columns. 
15 | * 16 | * All the matched data will be updated, the non-matched data will be inserted 17 | * 18 | * @param df new data 19 | * @param columns other columns to be matched 20 | */ 21 | def update(df: DataFrame, columns: String*): Unit 22 | 23 | } 24 | -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 60 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 7 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - pinned 8 | - security 9 | # Label to use when marking an issue as stale 10 | staleLabel: stale 11 | # Comment to post when marking an issue as stale. Set to `false` to disable 12 | markComment: > 13 | This issue has been automatically marked as stale because it has not had 14 | recent activity. It will be closed if no further activity occurs. Thank you 15 | for your contributions. 16 | # Comment to post when closing a stale issue. Set to `false` to disable 17 | closeComment: false 18 | -------------------------------------------------------------------------------- /docs/utils/Compressor_Archiver.md: -------------------------------------------------------------------------------- 1 | # Compressor 2 | 3 | A [compressor](https://github.com/SETL-Developers/setl/blob/master/src/main/scala/com/jcdecaux/setl/storage/Compressor.scala) 4 | can: 5 | - compress a string to a byte array 6 | - decompress a byte array to a string 7 | 8 | ## Example: 9 | 10 | ```scala 11 | import io.github.setl.storage.GZIPCompressor 12 | 13 | val compressor = new GZIPCompressor() 14 | 15 | val compressed = compressor.compress("data to be compressed") 16 | val data = compressor.decompress(compressed) 17 | ``` 18 | 19 | # Archiver 20 | 21 | An [Archiver](https://github.com/SETL-Developers/setl/blob/master/src/main/scala/com/jcdecaux/setl/storage/Archiver.scala) can 22 | package files and directories into a single data archive file. 
23 | 24 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/InvalidSchemaException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class InvalidSchemaException extends BaseException { 4 | public InvalidSchemaException() { 5 | } 6 | 7 | public InvalidSchemaException(String message) { 8 | super(message); 9 | } 10 | 11 | public InvalidSchemaException(String message, Throwable cause) { 12 | super(message, cause); 13 | } 14 | 15 | public InvalidSchemaException(Throwable cause) { 16 | super(cause); 17 | } 18 | 19 | public InvalidSchemaException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { 20 | super(message, cause, enableSuppression, writableStackTrace); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/InvalidConnectorException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class InvalidConnectorException extends BaseException { 4 | public InvalidConnectorException() { 5 | } 6 | 7 | public InvalidConnectorException(String message) { 8 | super(message); 9 | } 10 | 11 | public InvalidConnectorException(String message, Throwable cause) { 12 | super(message, cause); 13 | } 14 | 15 | public InvalidConnectorException(Throwable cause) { 16 | super(cause); 17 | } 18 | 19 | public InvalidConnectorException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { 20 | super(message, cause, enableSuppression, writableStackTrace); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/InvalidDeliveryException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class InvalidDeliveryException extends BaseException { 4 | 5 | public InvalidDeliveryException() { 6 | } 7 | 8 | public InvalidDeliveryException(String message) { 9 | super(message); 10 | } 11 | 12 | public InvalidDeliveryException(String message, Throwable cause) { 13 | super(message, cause); 14 | } 15 | 16 | public InvalidDeliveryException(Throwable cause) { 17 | super(cause); 18 | } 19 | 20 | public InvalidDeliveryException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { 21 | super(message, cause, enableSuppression, writableStackTrace); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/util/HasSparkSession.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.util 2 | 3 | import org.apache.spark.SparkException 4 | import org.apache.spark.sql.SparkSession 5 | 6 | trait HasSparkSession { 7 | 8 | val spark: SparkSession = SparkSession.getActiveSession match { 9 | case Some(ss) => ss 10 | case _ => throw new SparkException("No active Spark session") 11 | } 12 | 13 | def setJobDescription(desc: String): Unit = spark.sparkContext.setJobDescription(desc) 14 | 15 | def setJobGroup(group: String): Unit = spark.sparkContext.setJobGroup(group, null) 16 | 17 | def setJobGroup(group: String, description: String): Unit = spark.sparkContext.setJobGroup(group, description) 18 | 19 | def 
clearJobGroup(): Unit = spark.sparkContext.clearJobGroup() 20 | 21 | } 22 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/config/DeltaConnectorConfSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.config 2 | 3 | import org.apache.spark.sql.SaveMode 4 | import org.scalatest.funsuite.AnyFunSuite 5 | 6 | class DeltaConnectorConfSuite extends AnyFunSuite { 7 | 8 | val conf = new DeltaConnectorConf() 9 | 10 | test("Set DeltaConnectorConf") { 11 | assert(conf.get("path") === None) 12 | assert(conf.get("saveMode") === None) 13 | conf.setPath("./path") 14 | conf.setSaveMode(SaveMode.Overwrite) 15 | 16 | assert(conf.get("path").get === "./path") 17 | assert(conf.get("saveMode").get === "Overwrite") 18 | } 19 | 20 | test("Getters of DynamoDBConnectorConf") { 21 | assert(conf.getPath === "./path") 22 | assert(conf.getSaveMode === SaveMode.Overwrite) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/storage/connector/ConnectorSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | import org.scalatest.BeforeAndAfterAll 6 | import org.scalatest.funsuite.AnyFunSuite 7 | 8 | class ConnectorSuite extends AnyFunSuite with BeforeAndAfterAll { 9 | 10 | test("Connector object") { 11 | val spark: SparkSession = SparkSession.builder().config(new SparkConf()).master("local[*]").getOrCreate() 12 | 13 | val df = spark.emptyDataFrame 14 | 15 | assert(Connector.empty.spark === null) 16 | assert(Connector.empty.storage === null) 17 | assert(Connector.empty.read() === null) 18 | Connector.empty.write(df) 19 | Connector.empty.write(df, Some("suffix")) 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/config/DynamoDBConnectorConfSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.config 2 | 3 | import org.scalatest.funsuite.AnyFunSuite 4 | 5 | class DynamoDBConnectorConfSuite extends AnyFunSuite { 6 | 7 | val conf = new DynamoDBConnectorConf() 8 | 9 | test("Set DynamoDBConnectorConf") { 10 | assert(conf.get("table") === None) 11 | assert(conf.get("readPartitions") === None) 12 | conf.setTable("realTable") 13 | conf.setReadPartitions("realReadPartitions") 14 | 15 | assert(conf.get("table").get === "realTable") 16 | assert(conf.get("readPartitions").get === "realReadPartitions") 17 | } 18 | 19 | test("Getters of DynamoDBConnectorConf") { 20 | assert(conf.getTable === Some("realTable")) 21 | assert(conf.getReadPartitions === Some("realReadPartitions")) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/test/resources/test-json.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "col1": "a", 4 | "col2": 1, 5 | "col3": { 6 | "col3-1": "haha", 7 | "col3-2": "hehe" 8 | }, 9 | "col4": true, 10 | "col5": 1.1 11 | }, 12 | { 13 | "col1": "b", 14 | "col2": 2, 15 | "col3": { 16 | "col3-1": "hahahaha", 17 | "col3-2": "hehehehe" 18 | }, 19 | "col4": true, 20 | "col5": 1.2 21 | }, 22 | { 23 | "col1": "c", 24 | "col2": 3, 25 | "col3": { 26 | "col3-1": "hahahahahaha", 27 | "col3-2": "hehehehehehe" 
28 | }, 29 | "col4": false, 30 | "col5": 1.3 31 | }, 32 | { 33 | "col1": "d", 34 | "col2": 4, 35 | "col3": { 36 | "col3-1": "hahahahahahahaha", 37 | "col3-2": "hehehehehehehehe" 38 | }, 39 | "col4": false, 40 | "col5": 1.4 41 | } 42 | ] -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/SparkTestUtils.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl 2 | 3 | import io.github.setl.util.SparkUtils 4 | import org.apache.spark.SparkContext 5 | 6 | private[setl] object SparkTestUtils { 7 | 8 | def getActiveSparkContext: Option[SparkContext] = { 9 | val method = SparkContext.getClass.getDeclaredMethod("getActive") 10 | method.setAccessible(true) 11 | method.invoke(SparkContext).asInstanceOf[Option[SparkContext]] 12 | } 13 | 14 | def checkSparkVersion(requiredVersion: String): Boolean = SparkUtils.checkSparkVersion(requiredVersion) 15 | 16 | def testConsolePrint(test: => Any, expected: String): Boolean = { 17 | val stream = new java.io.ByteArrayOutputStream() 18 | Console.withOut(stream)(test) 19 | val result = stream.toString().trim() 20 | result == expected 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/annotation/Benchmark.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.annotation; 2 | 3 | import java.lang.annotation.ElementType; 4 | import java.lang.annotation.Retention; 5 | import java.lang.annotation.RetentionPolicy; 6 | import java.lang.annotation.Target; 7 | 8 | 9 | /** 10 | *
The Benchmark annotation should be put on any class of Factory[T] to enable the benchmark process. 11 | * The total elapsed time of the factory will then be recorded.
12 | * 13 | *In addition, user can also put it onto any the "read", "process" or "write" methods that are defined 14 | * in AbstractFactory[T], and the elapsed time of each method will be recorded as well.
15 | */ 16 | @InterfaceStability.Evolving 17 | @Retention(RetentionPolicy.RUNTIME) 18 | @Target({ElementType.METHOD, ElementType.TYPE}) 19 | public @interface Benchmark { 20 | 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/transformation/MLTransformer.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.transformation 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import org.apache.hadoop.fs.Path 5 | import org.apache.spark.ml.Model 6 | 7 | /** 8 | * A MLTransformer is a basic transformer with a ML model and ML-related functionality. 9 | * 10 | * @tparam T Data type of the transformer 11 | */ 12 | @InterfaceStability.Evolving 13 | trait MLTransformer[T, M <: Model[_]] extends Transformer[T] { 14 | 15 | var model: M = _ 16 | val modelPath: Path 17 | var overwriteModel: Boolean = false 18 | 19 | /** Fit a model with the current data */ 20 | def fit(): MLTransformer.this.type 21 | 22 | /** Load a model from a given path */ 23 | def loadModel(): MLTransformer.this.type 24 | 25 | /** Save the current model */ 26 | def saveModel(): MLTransformer.this.type 27 | } 28 | -------------------------------------------------------------------------------- /docs/Stage.md: -------------------------------------------------------------------------------- 1 | ## Definition 2 | A **Stage** is a collection of independent **Factories**. All the stages of a pipeline will be executed sequentially at runtime. Within a stage, all factories could be executed parallelly or sequentially. 3 | 4 | ## Demo 5 | 6 | You could instantiate a stage like the follows: 7 | ```scala 8 | val stage = new Stage() 9 | ``` 10 | 11 | Run in sequential mode: 12 | ```scala 13 | stage.parallel(false) 14 | ``` 15 | 16 | Add a factory into this stage: 17 | ```scala 18 | // Add an already existed instance of factory 19 | val myFactory = new MyFactory() 20 | stage.addFactory(myFactory) 21 | 22 | // Or let the framework handle the instantiation 23 | stage.addFactory(classOf[MyFactory], constructorArguments...) 24 | ``` 25 | 26 | Describe the current stage: 27 | ```scala 28 | stage.describe() 29 | ``` 30 | 31 | Run the current stage: 32 | ```scala 33 | stage.run() 34 | ``` -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/CanCreate.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.storage.connector.Connector 4 | import org.apache.spark.sql.DataFrame 5 | 6 | /** 7 | * Connectors that inherit CanCreate should be able to create a table in a database or a file/folder in a file system 8 | */ 9 | trait CanCreate { 10 | self: Connector => 11 | 12 | /** 13 | * Create a data storage (e.g. table in a database or file/folder in a file system) with a suffix 14 | * 15 | * @param t data frame to be written 16 | * @param suffix suffix to be appended at the end of the data storage name 17 | */ 18 | def create(t: DataFrame, suffix: Option[String]): Unit 19 | 20 | /** 21 | * Create a data storage (e.g. 
table in a database or file/folder in a file system) 22 | * 23 | * @param t data frame to be written 24 | */ 25 | def create(t: DataFrame): Unit 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/HasDiagram.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import scala.reflect.runtime 4 | 5 | trait HasDiagram { 6 | 7 | /** Generate the diagram */ 8 | def toDiagram: String 9 | 10 | /** Get the diagram ID */ 11 | def diagramId: String 12 | 13 | protected def getTypeArgList(tpe: runtime.universe.Type): List[runtime.universe.Symbol] = { 14 | tpe 15 | .baseClasses.head 16 | .asClass 17 | .primaryConstructor 18 | .typeSignature 19 | .paramLists 20 | .head 21 | } 22 | 23 | protected def formatDiagramId(prettyName: String, 24 | deliveryId: String, 25 | suffix: String): String = { 26 | prettyName.replaceAll("[\\[\\]]", "") + deliveryId.capitalize + suffix 27 | } 28 | 29 | /** Display the diagram */ 30 | def showDiagram(): Unit = println(toDiagram) 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/factory/FactoryDeliveryMetadataSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.factory 2 | 3 | import io.github.setl.internal.TestClasses.TestFactory 4 | import io.github.setl.transformation.FactoryDeliveryMetadata 5 | import io.github.setl.workflow.External 6 | import org.scalatest.funsuite.AnyFunSuite 7 | 8 | class FactoryDeliveryMetadataSuite extends AnyFunSuite { 9 | 10 | val fac = new TestFactory 11 | 12 | test("Test FactoryDeliveryMetadata Builder") { 13 | 14 | val setters = FactoryDeliveryMetadata.builder().setFactory(fac).getOrCreate() 15 | 16 | setters.foreach(println) 17 | 18 | assert(setters.size === 4) 19 | assert(setters.map(_.factoryUUID).toSet.size === 1) 20 | assert(setters.find(_.name == "inputInt").get.producer === classOf[External]) 21 | assert(setters.find(_.name == "setInputs").get.argTypes.size === 2) 22 | assert(setters.find(_.isDataset.contains(true)).size === 0) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/repository/RepositoryAdapter.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.repository 2 | 3 | import io.github.setl.Converter 4 | import io.github.setl.annotation.InterfaceStability 5 | import io.github.setl.storage.Condition 6 | 7 | /** 8 | * RepositoryAdapter could be used when one wants to save a `Dataset[A]` to a data store of type `B`. 
9 | * 10 | * A `Repository[A]` and a `DatasetConverter[A, B]` must be provided (either explicitly or implicitly) 11 | * 12 | * @tparam A Type of the Repository 13 | * @tparam B Target data store type 14 | */ 15 | @InterfaceStability.Evolving 16 | trait RepositoryAdapter[A, B] { 17 | 18 | val repository: Repository[A] 19 | 20 | val converter: Converter 21 | 22 | def findAllAndConvert(): A 23 | 24 | def findByAndConvert(conditions: Set[Condition]): A 25 | 26 | def findByAndConvert(condition: Condition): A 27 | 28 | def convertAndSave(data: A, suffix: Option[String]): this.type 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/HasBenchmark.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.BenchmarkResult 4 | 5 | /** 6 | * HasBenchmark should be used for object having an aggregated benchmark. Typically a Pipeline or a Stage 7 | */ 8 | trait HasBenchmark { 9 | 10 | protected var _benchmark: Option[Boolean] = None 11 | 12 | /** 13 | * True if the benchmark will be measured, otherwise false 14 | * 15 | * @return boolean 16 | */ 17 | def benchmark: Option[Boolean] = _benchmark 18 | 19 | /** 20 | * Set to true to enable the benchmarking 21 | * 22 | * @param boo true to enable benchmarking 23 | * @return this object 24 | */ 25 | def benchmark(boo: Boolean): this.type = { 26 | _benchmark = Option(boo) 27 | this 28 | } 29 | 30 | /** 31 | * Get the aggregated benchmark result. 32 | * 33 | * @return an array of BenchmarkResult 34 | */ 35 | def getBenchmarkResult: Array[BenchmarkResult] 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=warn, stdout 3 | # Captures all logs inside jcdecaux airport package 4 | log4j.logger.com.jcdecaux=DEBUG, stdout 5 | log4j.additivity.com.jcdecaux=false 6 | # Decrease the verbosity of external libraries logging 7 | log4j.logger.org.apache=WARN, stdout 8 | log4j.additivity.org.apache=false 9 | log4j.logger.com.datastax=INFO, stdout 10 | log4j.additivity.com.datastax=false 11 | log4j.logger.io.netty=WARN, stdout 12 | log4j.additivity.io.netty=false 13 | log4j.logger.org.apache.spark.sql=WARN, stdout 14 | log4j.additivity.org.apache.spark.sql=false 15 | log4j.logger.org.apache.spark.core=WARN, stdout 16 | log4j.additivity.org.apache.spark.core=false 17 | # Direct log messages to stdout 18 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 19 | log4j.appender.stdout.Target=System.out 20 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 21 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{3}:%L - %m%n 22 | -------------------------------------------------------------------------------- /docs/Transformer.md: -------------------------------------------------------------------------------- 1 | The notion of the transformer is preliminary. 2 | 3 | # Definition 4 | **Transformer** is the atomic class for data transformation. A `transformer[T]` will transform some input data into an object of type **T**. 5 | 6 | 7 | ## When should I use a transformer 8 | The original idea of the transformer is to decouple a complex data processing procedure of a **Factory**. Generally, a transformer should be placed inside a **Factory**. 
A factory can have multiple transformers. 9 | 10 | A transformer should be simple (in terms of task, for example, transform an object of type A to type B) and stateless (which means it should minimize its dependence on the application context). 11 | 12 | Another use case would be to implement several different data transformation logic for one factory (for example, there may be several different ML models for one single prediction job). In this case, there should be a way to select the most appropriate transformer according to their performance in a specific environment. 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/UnknownException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | /** 4 | * UnknownException 5 | */ 6 | public class UnknownException extends BaseException { 7 | 8 | public UnknownException(String errorMessage) { 9 | super(errorMessage); 10 | } 11 | 12 | public static class Storage extends UnknownException { 13 | public Storage(String errorMessage) { 14 | super(errorMessage); 15 | } 16 | } 17 | 18 | public static class Format extends UnknownException { 19 | public Format(String errorMessage) { 20 | super(errorMessage); 21 | } 22 | } 23 | 24 | public static class Environment extends UnknownException { 25 | public Environment(String errorMessage) { 26 | super(errorMessage); 27 | } 28 | } 29 | 30 | public static class ValueType extends UnknownException { 31 | public ValueType(String errorMessage) { 32 | super(errorMessage); 33 | } 34 | } 35 | } 36 | 37 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/connector/ConnectorInterface.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import io.github.setl.config.Conf 4 | import io.github.setl.enums.Storage 5 | import com.typesafe.config.Config 6 | 7 | /** 8 | * ConnectorInterface provides the abstraction of a pluggable connector that could be used by [[io.github.setl.storage.ConnectorBuilder]]. 9 | * Users can implement their customized data source connector by extending this trait. 10 | */ 11 | trait ConnectorInterface extends Connector { 12 | 13 | /** 14 | * By default, the custom connector's storage type should be OTHER. 15 | */ 16 | override val storage: Storage = Storage.OTHER 17 | 18 | /** 19 | * Configure the connector with the given [[Conf]] 20 | * @param conf an object of [[Conf]] 21 | */ 22 | def setConf(conf: Conf): Unit 23 | 24 | /** 25 | * Configure the connector with the given [[Config]] 26 | * @param config an object of [[Config]] 27 | */ 28 | def setConfig(config: Config): Unit = this.setConf(Conf.fromConfig(config)) 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/transformation/FactoryInput.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.transformation 2 | 3 | import io.github.setl.internal.HasType 4 | 5 | import scala.language.existentials 6 | import scala.reflect.runtime 7 | 8 | /** 9 | * Metadata of an input of a Factory. 10 | * 11 | * If a `FactoryDeliveryMetadata` represents a method, then it may be converted to multiple FactoryInputs as each of its 12 | * arguments will be abstracted as a `FactoryInput`. 
13 | * 14 | * @param runtimeType runtime type of the input 15 | * @param producer producer of the input 16 | * @param deliveryId delivery id of the input 17 | */ 18 | private[setl] case class FactoryInput(override val runtimeType: runtime.universe.Type, 19 | producer: Class[_], 20 | deliveryId: String = Deliverable.DEFAULT_ID, 21 | autoLoad: Boolean, 22 | optional: Boolean, 23 | consumer: Class[_ <: Factory[_]]) extends HasType 24 | -------------------------------------------------------------------------------- /docs/data_access_layer/SparkRepositoryBuilder.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | Based on the same idea of [**ConnectorBuilder**](ConnectorBuilder), [**SparkRepositoryBuilder**](https://github.com/SETL-Developers/setl/tree/master/src/main/scala/com/jcdecaux/setl/storage/SparkRepositoryBuilder.scala) helps you create your **SparkRepository** :ok_hand: 4 | 5 | ## Usage 6 | Firstly, you should create a configuration file in your project's resources directory. 7 | 8 | In this case, let's call it `application.conf`. 9 | 10 | ```text 11 | csvConfiguration { 12 | storage = "CSV" 13 | path = "your/path/to/file.csv" 14 | inferSchema = "true" 15 | delimiter = ";" 16 | header = "true" 17 | saveMode = "Append" 18 | } 19 | ``` 20 | 21 | Then you can use **ConfigLoader** to load your configuration file. By default it loads `application.conf`. 22 | ```scala 23 | val repo = new SparkRepositoryBuilder[MyClass](setl.configLoader.getConfig("csvConfiguration")).getOrCreate() 24 | 25 | repo.findAll() 26 | repo.save(dataset) 27 | ``` 28 | 29 | ## Parameters 30 | Please refer to [Connector documentation](Connector) -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/connector/ParquetConnector.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import io.github.setl.config.{Conf, FileConnectorConf} 5 | import io.github.setl.enums.Storage 6 | import io.github.setl.util.TypesafeConfigUtils 7 | import com.typesafe.config.Config 8 | import org.apache.spark.sql._ 9 | 10 | /** 11 | * ParquetConnector contains functionality for transforming [[DataFrame]] into parquet files 12 | */ 13 | @InterfaceStability.Evolving 14 | class ParquetConnector(override val options: FileConnectorConf) extends FileConnector(options) { 15 | 16 | def this(options: Map[String, String]) = this(FileConnectorConf.fromMap(options)) 17 | 18 | def this(path: String, saveMode: SaveMode) = this(Map("path" -> path, "saveMode" -> saveMode.toString)) 19 | 20 | def this(config: Config) = this(TypesafeConfigUtils.getMap(config)) 21 | 22 | def this(conf: Conf) = this(conf.toMap) 23 | 24 | override val storage: Storage = Storage.PARQUET 25 | 26 | this.options.setStorage(storage) 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/CanWait.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.storage.connector.Connector 4 | import io.github.setl.storage.connector.Connector 5 | 6 | /** 7 | * Connectors that inherit CanWait should be able to wait for the execution to stop 8 | */ 9 | trait CanWait { 10 | self: Connector => 11 | 12 | /** 13 | * Wait for the execution to stop. 
Any exceptions that occurs during the execution 14 | * will be thrown in this thread. 15 | */ 16 | def awaitTermination(): Unit 17 | 18 | /** 19 | * Wait for the execution to stop. Any exceptions that occurs during the execution 20 | * will be thrown in this thread. 21 | * 22 | * @param timeout time to wait in milliseconds 23 | * @return `true` if it's stopped; or throw the reported error during the execution; or `false` 24 | * if the waiting time elapsed before returning from the method. 25 | */ 26 | def awaitTerminationOrTimeout(timeout: Long): Boolean 27 | 28 | /** 29 | * Stops the execution of this query if it is running. 30 | */ 31 | def stop(): Unit 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/enums/Storage.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.enums; 2 | 3 | /** 4 | * StorageType 5 | */ 6 | public enum Storage { 7 | CSV("io.github.setl.storage.connector.CSVConnector"), 8 | EXCEL("io.github.setl.storage.connector.ExcelConnector"), 9 | PARQUET("io.github.setl.storage.connector.ParquetConnector"), 10 | DELTA("io.github.setl.storage.connector.DeltaConnector"), 11 | CASSANDRA("io.github.setl.storage.connector.CassandraConnector"), 12 | DYNAMODB("io.github.setl.storage.connector.DynamoDBConnector"), 13 | JSON("io.github.setl.storage.connector.JSONConnector"), 14 | JDBC("io.github.setl.storage.connector.JDBCConnector"), 15 | STRUCTURED_STREAMING("io.github.setl.storage.connector.StructuredStreamingConnector"), 16 | HUDI("io.github.setl.storage.connector.HudiConnector"), 17 | SPARK_SQL("io.github.setl.storage.connector.SparkSQLConnector"), 18 | OTHER(null); 19 | 20 | private String connectorName; 21 | 22 | Storage(String cls) { 23 | this.connectorName = cls; 24 | } 25 | 26 | public String connectorName() { 27 | return connectorName; 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/annotation/InterfaceStability.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.annotation; 2 | 3 | import java.lang.annotation.Documented; 4 | 5 | /** 6 | * Annotation to inform users of how much to rely on a particular package, 7 | * class or method not changing over time. 8 | */ 9 | public class InterfaceStability { 10 | 11 | /** 12 | * Stable APIs that retain source and binary compatibility within a major release. 13 | * These interfaces can change from one major release to another major release 14 | * (e.g. from 1.0 to 2.0). 15 | */ 16 | @Documented 17 | public @interface Stable { 18 | } 19 | 20 | /** 21 | * APIs that are meant to evolve towards becoming stable APIs, but are not stable APIs yet. 22 | * Evolving interfaces can change from one feature release to another release (i.e. 2.1 to 2.2). 23 | */ 24 | @Documented 25 | public @interface Evolving { 26 | } 27 | 28 | /** 29 | * Unstable APIs, with no guarantee on stability. 30 | * Classes that are unannotated are considered Unstable. 
31 | */ 32 | @Documented 33 | public @interface Unstable { 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/test/resources/streaming_test_resources/streaming.conf: -------------------------------------------------------------------------------- 1 | structured_streaming_connector_input { 2 | storage = "STRUCTURED_STREAMING" 3 | format = "text" 4 | path = "src/test/resources/streaming_test_resources/input" 5 | } 6 | 7 | structured_streaming_connector_output { 8 | storage = "STRUCTURED_STREAMING" 9 | format = "csv" 10 | header = "false" 11 | outputMode = "append" 12 | checkpointLocation = "src/test/resources/streaming_test_resources/output/checkpoint_2" 13 | path = "src/test/resources/streaming_test_resources/output/2" 14 | } 15 | 16 | structured_streaming_connector_input_repository { 17 | storage = "STRUCTURED_STREAMING" 18 | format = "csv" 19 | schema = "text STRING" // must be provided for streaming 20 | header = "true" 21 | path = "src/test/resources/streaming_test_resources/input2" 22 | } 23 | 24 | structured_streaming_connector_output_repository { 25 | storage = "STRUCTURED_STREAMING" 26 | format = "csv" 27 | header = "true" 28 | outputMode = "append" 29 | checkpointLocation = "src/test/resources/streaming_test_resources/output/checkpoint_3" 30 | path = "src/test/resources/streaming_test_resources/output/3" 31 | } 32 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/TestObject.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl 2 | 3 | import java.sql.{Date, Timestamp} 4 | 5 | import io.github.setl.config.Conf 6 | import io.github.setl.internal.CanDrop 7 | import io.github.setl.storage.connector.ConnectorInterface 8 | import com.typesafe.config.Config 9 | import org.apache.spark.sql.DataFrame 10 | 11 | 12 | case class TestObject(partition1: Int, partition2: String, clustering1: String, value: Long) 13 | 14 | case class TestObject3(partition1: Int, partition2: String, clustering1: String, value: Long, value2: String) 15 | 16 | case class TestObject2(col1: String, col2: Int, col3: Double, col4: Timestamp, col5: Date, col6: Long) 17 | 18 | class CustomConnector extends ConnectorInterface with CanDrop { 19 | override def setConf(conf: Conf): Unit = null 20 | 21 | override def read(): DataFrame = { 22 | import spark.implicits._ 23 | Seq(1, 2, 3).toDF("id") 24 | } 25 | 26 | override def write(t: DataFrame, suffix: Option[String]): Unit = logDebug("Write with suffix") 27 | 28 | override def write(t: DataFrame): Unit = logDebug("Write") 29 | 30 | /** 31 | * Drop the entire table. 
32 | */ 33 | override def drop(): Unit = logDebug("drop") 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/annotation/Compress.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.annotation; 2 | 3 | import io.github.setl.internal.SchemaConverter; 4 | import io.github.setl.internal.StructAnalyser; 5 | import io.github.setl.storage.Compressor; 6 | import io.github.setl.storage.XZCompressor; 7 | 8 | import java.lang.annotation.ElementType; 9 | import java.lang.annotation.Retention; 10 | import java.lang.annotation.RetentionPolicy; 11 | import java.lang.annotation.Target; 12 | 13 | /** 14 | *15 | * The annotation Compress indicates {@link StructAnalyser} to save the metadata of corresponding fields 16 | * into the output StructType object. All annotated columns will be compressed by {@link SchemaConverter} 17 | * during the saving process in SparkRepository 18 | *
19 | * 20 | *
21 | * By default, the compression algorithm is XZ with the default compression level (=6). You can define other compressor
22 | * by implementing com.jcdecaux.datacorp.storage.Compressor interface.
23 | *
14 | * If multiple {@link io.github.setl.transformation.Deliverable}s of the same type are found in the delivery pool of DispatchManager, then
15 | * it will try to compare the producers of the Deliverables to select the right one
16 | */
17 | @InterfaceStability.Evolving
18 | @Retention(RetentionPolicy.RUNTIME)
19 | @Target({ElementType.FIELD, ElementType.METHOD})
20 | public @interface Delivery {
21 |
22 | /**
23 | * Producer of the current delivery that will be used by DispatchManager in order to find the corresponding delivery
24 | */
25 | Class<?> producer() default External.class;
26 |
27 | /**
28 | * Indicates whether the current Delivery is optional or not
29 | */
30 | boolean optional() default false;
31 |
32 | boolean autoLoad() default false;
33 |
34 | String condition() default "";
35 |
36 | String id() default "";
37 | }
38 |
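39 | // Usage sketch (hypothetical, mirroring docs/Factory.md): in a Scala factory, a delivery field can be annotated as
40 | //
41 | //   @Delivery(producer = classOf[UpstreamFactory], optional = true)
42 | //   var input: String = _
43 | //
44 | // where UpstreamFactory is a hypothetical factory producing this String; the Pipeline then injects the matching Deliverable.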
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/storage/SnappyCompressorSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage
2 |
3 | import org.scalatest.funsuite.AnyFunSuite
4 |
5 | class SnappyCompressorSuite extends AnyFunSuite {
6 |
7 | val compressor = new SnappyCompressor
8 |
9 | test("SnappyCompressor should be able to compress a string to a Byte[]") {
10 | println(s"String1: ${str.getBytes().length} -> ${compressor.compress(str).length}")
11 | println(s"String2: ${str2.getBytes().length} -> ${compressor.compress(str2).length}")
12 | println(s"String3: ${str3.getBytes().length} -> ${compressor.compress(str3).length}")
13 | println(s"String4: ${str4.getBytes().length} -> ${compressor.compress(str4).length}")
14 |
15 | assert(str.getBytes().length >= compressor.compress(str).length)
16 | assert(str2.getBytes().length >= compressor.compress(str2).length)
17 | assert(str3.getBytes().length >= compressor.compress(str3).length)
18 | assert(str4.getBytes().length >= compressor.compress(str4).length)
19 |
20 | }
21 |
22 | test("SnappyCompressor should be able to decompress a Byte array to string") {
23 | assert(compressor.decompress(compressor.compress(str)) === str)
24 | assert(compressor.decompress(compressor.compress(str2)) === str2)
25 | assert(compressor.decompress(compressor.compress(str3)) === str3)
26 | assert(compressor.decompress(compressor.compress(str4)) === str4)
27 | }
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/setl/internal/Logging.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.internal
2 |
3 | import io.github.setl.annotation.InterfaceStability
4 | import org.apache.log4j.{LogManager, Logger}
5 |
6 | /**
7 | * Logging provides logging features for the class that extends this trait
8 | */
9 | @InterfaceStability.Evolving
10 | private[setl] trait Logging {
11 |
12 | // Make the log field transient so that objects with Logging can
13 | // be serialized and used on another machine
14 | @transient private var logger: Logger = _
15 |
16 | // Method to get or create the logger for this object
17 | protected def log: Logger = {
18 | if (logger == null) {
19 | logger = LogManager.getLogger(logName)
20 | }
21 | logger
22 | }
23 |
24 | // Method to get the logger name for this object
25 | protected def logName: String = {
26 | // Ignore trailing $'s in the class names for Scala objects
27 | this.getClass.getName.stripSuffix("$")
28 | }
29 |
30 | protected def logInfo(msg: => String): Unit = {
31 | if (log.isInfoEnabled) log.info(msg)
32 | }
33 |
34 | protected def logDebug(msg: => String): Unit = {
35 | if (log.isDebugEnabled) log.debug(msg)
36 | }
37 |
38 | protected def logTrace(msg: => String): Unit = {
39 | if (log.isTraceEnabled) log.trace(msg)
40 | }
41 |
42 | protected def logWarning(msg: => String): Unit = log.warn(msg)
43 |
44 | protected def logError(msg: => String): Unit = log.error(msg)
45 |
46 | }
47 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/storage/GZIPCompressorSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage
2 |
3 | import org.scalatest.funsuite.AnyFunSuite
4 |
5 | class GZIPCompressorSuite extends AnyFunSuite {
6 |
7 | val compressor = new GZIPCompressor
8 |
9 | test("GZIPCompressor should be able to compress a string to a Byte[]") {
10 | println(s"String1: ${str.getBytes().length} -> ${compressor.compress(str).length}")
11 | println(s"String2: ${str2.getBytes().length} -> ${compressor.compress(str2).length}")
12 | println(s"String3: ${str3.getBytes().length} -> ${compressor.compress(str3).length}")
13 | println(s"String4: ${str4.getBytes().length} -> ${compressor.compress(str4).length}")
14 | assert(str.getBytes().length >= compressor.compress(str).length)
15 | assert(str2.getBytes().length >= compressor.compress(str2).length)
16 | assert(str3.getBytes().length >= compressor.compress(str3).length)
17 | assert(str4.getBytes().length >= compressor.compress(str4).length)
18 |
19 | }
20 |
21 | test("GZIPCompressor should be able to decompress a Byte array to string") {
22 | assert(compressor.decompress(compressor.compress(str)) === str)
23 | assert(compressor.decompress(compressor.compress(str2)) === str2)
24 | assert(compressor.decompress(compressor.compress(str3)) === str3)
25 | assert(compressor.decompress(compressor.compress(str4)) === str4)
26 | assert(compressor.decompress("testtesttest".getBytes()) === "testtesttest")
27 |
28 | }
29 |
30 | }
31 |
--------------------------------------------------------------------------------
/docs/data_access_layer/CustomConnector.md:
--------------------------------------------------------------------------------
1 | ## Custom Connector
2 |
3 | You can implement your own data source connector by implementing the `ConnectorInterface`.
4 |
5 | ```scala
6 | import io.github.setl.storage.connector.ConnectorInterface
7 | import io.github.setl.internal.CanDrop
8 | import io.github.setl.config.Conf
9 | import org.apache.spark.sql.DataFrame
10 |
11 | class CustomConnector extends ConnectorInterface with CanDrop {
12 | override def setConf(conf: Conf): Unit = {
13 | // configuration
14 | }
15 |
16 | override def read(): DataFrame = {
17 | import spark.implicits._
18 | Seq(1, 2, 3).toDF("id")
19 | }
20 |
21 | override def write(t: DataFrame, suffix: Option[String]): Unit = logDebug("Write with suffix")
22 |
23 | override def write(t: DataFrame): Unit = logDebug("Write")
24 |
25 | override def drop(): Unit = logDebug("drop")
26 | }
27 | ```
28 |
29 | ### Functionalities
30 |
31 | As in the example above, extend your connector class with functionality traits (such as `CanDrop`)
32 | and implement their abstract methods; SparkRepository will then be able to use these specific
33 | functionalities.
34 |
35 | ### Use the custom connector
36 |
37 | To use this connector, set the storage to **OTHER** and provide the class reference of your connector:
38 |
39 | ```txt
40 | myConnector {
41 | storage = "OTHER"
42 | class = "com.example.CustomConnector" // class reference of your connector
43 | yourParam = "some parameter" // put your parameters here
44 | }
45 | ```
46 |
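47 | A minimal usage sketch (assuming the `myConnector` block above lives in `application.conf`, and following the **ConfigLoader**/**ConnectorBuilder** pattern described in [ConnectorBuilder](ConnectorBuilder)):
48 | 
49 | ```scala
50 | import io.github.setl.config.ConfigLoader
51 | import io.github.setl.storage.ConnectorBuilder
52 | 
53 | object Properties extends ConfigLoader // loads application.conf by default
54 | 
55 | // `spark` is an active SparkSession; with storage = "OTHER" the builder
56 | // instantiates the class referenced by the `class` key
57 | val connector = new ConnectorBuilder(spark, Properties.getConfig("myConnector")).getOrCreate()
58 | 
59 | connector.read().show()
60 | ```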
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/config/PropertiesSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.config
2 |
3 | import io.github.setl.util.TypesafeConfigUtils
4 | import org.scalatest.BeforeAndAfterAll
5 | import org.scalatest.funsuite.AnyFunSuite
6 |
7 | class PropertiesSuite extends AnyFunSuite with BeforeAndAfterAll {
8 |
9 | override protected def beforeAll(): Unit = {
10 | System.setProperty("myvalue", "test-my-value")
11 | }
12 |
13 | System.setProperty("myvalue", "test-my-value")
14 |
15 | override protected def afterAll(): Unit = {
16 | System.clearProperty("myvalue")
17 | }
18 |
19 | // test("ConfigLoader beforeAll") {
20 | // assert(Properties.cl.get("myValue") === "test-my-value")
21 | // assert(Properties.cl.get("test.myValue2") === "test-my-value-loaded")
22 | // }
23 |
24 | test("Cassandra config") {
25 | assert(TypesafeConfigUtils.getAs[String](Properties.cassandraConfig, "storage").get === "CASSANDRA")
26 | assert(TypesafeConfigUtils.getAs[String](Properties.cassandraConfig, "keyspace").get === "test_space")
27 | assert(TypesafeConfigUtils.getAs[String](Properties.cassandraConfig, "table").get === "test_spark_connector2")
28 | assert(TypesafeConfigUtils.getList(Properties.cassandraConfig, "partitionKeyColumns").get === Array("partition1", "partition2"))
29 | assert(TypesafeConfigUtils.getList(Properties.cassandraConfig, "clusteringKeyColumns").get === Array("clustering1"))
30 | assert(TypesafeConfigUtils.getList(Properties.cassandraConfig, "doesntExist") === None)
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/docs/data_access_layer/Structured-Streaming-Connector.md:
--------------------------------------------------------------------------------
1 | **StructuredStreamingConnector** is a connector added in version 0.4.3. It brings the Spark Structured Streaming API together with the Connector API, allowing users to manipulate streaming data in the same way as any other static connector.
2 |
3 | Here is an implementation of the [word count program](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#quick-example) from the Spark structured streaming documentation:
4 |
5 | ```scala
6 | // Configuration
7 | val input = Map(
8 | "storage" -> "STRUCTURED_STREAMING",
9 | "format" -> "socket",
10 | "host" -> "localhost",
11 | "port" -> "9999"
12 | )
13 |
14 | val output = Map(
15 | "storage" -> "STRUCTURED_STREAMING",
16 | "outputMode" -> "complete",
17 | "format" -> "console"
18 | )
19 |
20 | val spark = SparkSession
21 | .builder
22 | .appName("StructuredNetworkWordCount")
23 | .master("local")
24 | .getOrCreate()
25 |
26 | import spark.implicits._
27 |
28 | val inputConnector = new ConnectorBuilder(Conf.fromMap(input)).getOrCreate()
29 | val outputConnector = new ConnectorBuilder(Conf.fromMap(output)).getOrCreate().asInstanceOf[StructuredStreamingConnector]
30 |
31 | // read lines
32 | val lines = inputConnector.read()
33 | // Split the lines into words
34 | val words = lines.as[String].flatMap(_.split(" "))
35 | // Generate running word count
36 | val wordCounts = words.groupBy("value").count()
37 | // Show the output
38 | outputConnector.write(wordCounts)
39 | outputConnector.awaitTermination()
40 | ```
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: test
2 |
3 | on:
4 | pull_request:
5 | branches: [ master ]
6 | paths-ignore:
7 | - 'README.md'
8 | - 'docs/**'
9 | - '.github/ISSUE_TEMPLATE/**'
10 |
11 | jobs:
12 | test_setl:
13 | runs-on: ubuntu-latest
14 | strategy:
15 | fail-fast: false
16 | matrix:
17 | SCALA_VER: ["2.12", "2.11"]
18 | SPARK_VER: ["3.2", "3.0", "2.4", "2.3"]
19 | exclude:
20 | - SCALA_VER: 2.12
21 | SPARK_VER: 2.3
22 | - SCALA_VER: 2.11
23 | SPARK_VER: 3.0
24 | - SCALA_VER: 2.11
25 | SPARK_VER: 3.2
26 | steps:
27 | - name: Checkout
28 | uses: actions/checkout@v2
29 |
30 | - name: Set up JDK 1.8
31 | uses: actions/setup-java@v1
32 | with:
33 | java-version: 1.8
34 |
35 | - name: Before all
36 | run: |
37 | chmod +x ./dev/change-scala-version.sh
38 | ./dev/change-scala-version.sh ${{ matrix.SCALA_VER }}
39 | docker-compose -f ./dev/docker-compose.yml up -d
40 |
41 | - name: Run tests
42 | run: |
43 | set -e
44 | export AWS_ACCESS_KEY_ID="fakeAccess"
45 | export AWS_SECRET_ACCESS_KEY="fakeSecret"
46 | export AWS_REGION="eu-west-1"
47 | mvn -B -ntp clean:clean scoverage:report -P snapshot,spark_${{ matrix.SPARK_VER }}
48 |
49 | - name: Upload coverage to Codecov
50 | uses: codecov/codecov-action@v1
51 | with:
52 | flags: pr_${{ matrix.SCALA_VER }}_${{ matrix.SPARK_VER }}
53 | name: codecov-pull-request
54 |
55 |
--------------------------------------------------------------------------------
/docs/Factory.md:
--------------------------------------------------------------------------------
1 | ## Description
2 | A **Factory[A]** is a complete data transformation job to produce an object of type A.
3 |
4 | ## Difference with *Transformer*
5 | A **Factory** is more complex than a **Transformer**. In addition to data transformation, a **Factory** also contains the logic for reading and writing data.
6 |
7 | ## Demo
8 | You could implement your own factory by extending the class **Factory[A]**.
9 |
10 | ```scala
11 | case class MyProduct()
12 |
13 | // MyFactory will produce MyProduct
14 | class MyFactory extends Factory[MyProduct] {
15 | override def read(): this.type = ...
16 | override def process(): this.type = ...
17 | override def write(): this.type = ...
18 | override def get(): MyProduct = ...
19 | }
20 | ```
21 |
22 | To run **MyFactory**:
23 | ```scala
24 | new MyFactory().read().process().write().get()
25 | ```
26 |
27 | ## Dependency Handling
28 | The dependencies of a **Factory** can be handled by a **Pipeline** if the corresponding fields carry the **Delivery** annotation.
29 | For the previous **MyFactory** class:
30 |
31 | ```scala
32 | case class MyProduct()
33 |
34 | // MyFactory will produce MyProduct
35 | class MyFactory extends Factory[MyProduct] {
36 |
37 | @Delivery
38 | var input: String = _
39 |
40 | override def read(): this.type = ...
41 | override def process(): this.type = ...
42 | override def write(): this.type = ...
43 | override def get(): MyProduct = ...
44 | }
45 | ```
46 |
47 | By adding `@Delivery` to the variable **input**, the value of **input** will be automatically injected by **Pipeline**.
48 |
49 | For more information about dependency handling, read the [doc of **Pipeline**](Pipeline).
50 |
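51 | Below is a hypothetical sketch of such an injection. The `Pipeline` class and its `setInput`, `addStage` and `run` methods are assumptions here; see the [Pipeline](Pipeline) documentation for the exact API:
52 | 
53 | ```scala
54 | import io.github.setl.workflow.Pipeline // assumed package, see the Pipeline doc
55 | 
56 | new Pipeline()
57 |   .setInput[String]("some input") // injected into the @Delivery field `input` of MyFactory
58 |   .addStage(new MyFactory())
59 |   .run()
60 | ```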
--------------------------------------------------------------------------------
/src/test/resources/local.conf:
--------------------------------------------------------------------------------
1 | include "application.conf"
2 |
3 | test.string = "foo"
4 | test.variable = ${?myJvmProperty}
5 |
6 | setl.config {
7 | spark {
8 | spark.app.name = "my_app"
9 | spark.sql.shuffle.partitions = "1000"
10 | }
11 | }
12 |
13 | setl.config_2 {
14 | spark.app.name = "my_app_2"
15 | spark.sql.shuffle.partitions = "2000"
16 | }
17 |
18 | usages.config {
19 | spark {
20 | spark.app.name = "usages_app"
21 | spark.cassandra.connection.host = "cassandraHost"
22 | }
23 | usages = ["cassandra"]
24 | }
25 |
26 | context.spark.spark.sql.shuffle.partitions = 600
27 |
28 | csv_dc_context2 {
29 | storage = "CSV"
30 | path = "src/test/resources/test_config_csv_dc_context2"
31 | inferSchema = "true"
32 | delimiter = ";"
33 | header = "true"
34 | saveMode = "Append"
35 | }
36 |
37 | csv_dc_context {
38 | storage = "CSV"
39 | path = "src/test/resources/test_config_csv_dc_context"
40 | inferSchema = "true"
41 | delimiter = ";"
42 | header = "true"
43 | saveMode = "Append"
44 | }
45 |
46 | parquet_dc_context {
47 | storage = "PARQUET"
48 | path = "src/test/resources/test_parquet_dc_context" // must be absolute path
49 | table = "test_config2222"
50 | saveMode = "Append"
51 | }
52 |
53 | csv_dc_context_consumer {
54 | storage = "CSV"
55 | path = "src/test/resources/test_config_csv_dc_context_consumer"
56 | inferSchema = "true"
57 | delimiter = ";"
58 | header = "true"
59 | saveMode = "Overwrite"
60 | }
61 |
62 | parquet_dc_context_consumer {
63 | storage = "PARQUET"
64 | path = "src/test/resources/test_parquet_dc_context_consumer" // must be absolute path
65 | saveMode = "Append"
66 | }
67 |
--------------------------------------------------------------------------------
/src/main/java/io/github/setl/storage/XZCompressor.java:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage;
2 |
3 | import org.tukaani.xz.LZMA2Options;
4 | import org.tukaani.xz.XZInputStream;
5 | import org.tukaani.xz.XZOutputStream;
6 |
7 | import java.io.ByteArrayInputStream;
8 | import java.io.ByteArrayOutputStream;
9 | import java.io.IOException;
10 | import java.nio.charset.StandardCharsets;
11 |
12 | /**
13 | * XZCompressor implement {@link Compressor}'s interface with the XZ compression algorithm
14 | */
15 | public class XZCompressor implements Compressor {
16 |
17 | @Override
18 | public byte[] compress(String input) throws IOException {
19 | if ((input == null) || (input.length() == 0)) {
20 | return null;
21 | }
22 | ByteArrayOutputStream xzOutput = new ByteArrayOutputStream();
23 | XZOutputStream xzStream = new XZOutputStream(xzOutput, new LZMA2Options(LZMA2Options.PRESET_DEFAULT));
24 | xzStream.write(input.getBytes(StandardCharsets.UTF_8));
25 | xzStream.close();
26 | return xzOutput.toByteArray();
27 | }
28 |
29 | @Override
30 | public String decompress(byte[] bytes) throws IOException {
31 | if ((bytes == null) || (bytes.length == 0)) {
32 | return "";
33 | }
34 | XZInputStream xzInputStream = new XZInputStream(new ByteArrayInputStream(bytes));
35 | byte firstByte = (byte) xzInputStream.read();
36 | byte[] buffer = new byte[xzInputStream.available() + 1];
37 | buffer[0] = firstByte;
38 | xzInputStream.read(buffer, 1, buffer.length - 1);
39 | xzInputStream.close();
40 | return new String(buffer);
41 | }
42 |
43 | }
44 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/setl/storage/connector/SparkSQLConnector.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage.connector
2 |
3 | import com.typesafe.config.Config
4 | import io.github.setl.config.Conf
5 | import io.github.setl.enums.Storage
6 | import io.github.setl.util.TypesafeConfigUtils
7 | import org.apache.spark.sql.DataFrame
8 |
9 | class SparkSQLConnector(val query: String) extends Connector {
10 | override val storage: Storage = Storage.SPARK_SQL
11 |
12 | def this(conf: Conf) = this(conf.get("query", ""))
13 | def this(config: Config) = this(
14 | query = TypesafeConfigUtils.getAs[String](config, "query").getOrElse("")
15 | )
16 |
17 | require(query.nonEmpty, "query is not defined")
18 |
19 | /**
20 | * Read data from the data source
21 | *
22 | * @return a [[DataFrame]]
23 | */
24 | @throws[org.apache.spark.sql.AnalysisException](s"$query is invalid")
25 | override def read(): DataFrame = spark.sql(query)
26 |
27 | /**
28 | * Write a [[DataFrame]] into the data storage
29 | *
30 | * @param t a [[DataFrame]] to be saved
31 | * @param suffix for data connectors that support suffix (e.g. [[FileConnector]]),
32 | * add the given suffix to the save path
33 | */
34 | override def write(t: DataFrame, suffix: Option[String]): Unit = {
35 | if (suffix.isDefined) logWarning("suffix is not supported in SparkSQLConnector")
36 | write(t)
37 | }
38 |
39 | /**
40 | * Write a [[DataFrame]] into the data storage
41 | *
42 | * @param t a [[DataFrame]] to be saved
43 | */
44 | override def write(t: DataFrame): Unit = {
45 | logWarning("write is not supported in SparkSQLConnector")
46 | }
47 | }
48 |
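49 | // Usage sketch: wrap a Spark SQL query and read its result as a DataFrame.
50 | // val connector = new SparkSQLConnector("SELECT 1 AS id")
51 | // val df = connector.read()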
--------------------------------------------------------------------------------
/docs/data_access_layer/ConnectorBuilder.md:
--------------------------------------------------------------------------------
1 | ## Definition
2 | [**ConnectorBuilder**](https://github.com/SETL-Developers/setl/tree/master/src/main/scala/com/jcdecaux/setl/storage/ConnectorBuilder.scala) provides a simplified way to create a **Connector**.
3 |
4 | ## Usage
5 | You have two ways to instantiate a **ConnectorBuilder**:
6 | - with a *Typesafe* [**Config**](https://github.com/lightbend/config) object from a configuration file
7 | - with a [**Conf**](https://github.com/SETL-Developers/setl/tree/master/src/main/scala/com/jcdecaux/setl/config/Conf.scala) object from a `Map[String, String]`.
8 |
9 | ### With Typesafe Config
10 | Firstly, you should create a configuration file in your project's resources directory.
11 |
12 | In this case, let's call it `application.conf`.
13 |
14 | ```text
15 | csvConfiguration {
16 | storage = "CSV"
17 | path = "your/path/to/file.csv"
18 | inferSchema = "true"
19 | delimiter = ";"
20 | header = "true"
21 | saveMode = "Append"
22 | }
23 | ```
24 |
25 | Then you can use **ConfigLoader** to load your configuration file. By default, it loads `application.conf`.
26 | ```scala
27 | object Properties extends ConfigLoader
28 |
29 | val connector = new ConnectorBuilder(spark, Properties.getConfig("csvConfiguration")).getOrCreate()
30 |
31 | connector.read()
32 | connector.write(df)
33 | ```
34 |
35 | ### With Conf
36 | You can create a **Conf** object from a **Map**.
37 | ```scala
38 | val conf = Conf.fromMap(
39 | Map(
40 | "storage" -> "PARQUET",
41 | "path" -> "path/to/your/file",
42 | ...
43 | )
44 | )
45 |
46 | val connector = new ConnectorBuilder(spark, conf).getOrCreate()
47 |
48 | connector.read()
49 | connector.write(df)
50 |
51 | ```
52 |
53 | ## Parameters
54 | Please refer to [Connector documentation](Connector)
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/config/HudiConnectorConfSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.config
2 |
3 | import io.github.setl.exception.ConfException
4 | import org.scalatest.funsuite.AnyFunSuite
5 | import org.apache.spark.sql.SaveMode
6 |
7 | class HudiConnectorConfSuite extends AnyFunSuite {
8 | val conf = new HudiConnectorConf
9 |
10 | test("Get/Set HudiConnectorConf") {
11 | assert(conf.get("saveMode") === None)
12 | conf.setSaveMode("Append")
13 | assert(conf.getSaveMode === SaveMode.Append)
14 | conf.setSaveMode("Overwrite")
15 | assert(conf.getSaveMode === SaveMode.Overwrite)
16 | conf.setSaveMode(SaveMode.Overwrite)
17 | assert(conf.getSaveMode === SaveMode.Overwrite)
18 |
19 | assert(conf.get("path") === None)
20 | assertThrows[ConfException](conf.getPath)
21 |
22 | conf.setPath("path")
23 | assert(conf.getPath === "path")
24 | }
25 |
26 | test("Init HudiConnectorConf from options") {
27 | val options : Map[String, String] = Map(
28 | "path" -> "path",
29 | "saveMode" -> "Append",
30 | "hoodie.table.name" -> "test_object",
31 | "hoodie.datasource.write.recordkey.field" -> "col1",
32 | "hoodie.datasource.write.precombine.field" -> "col4",
33 | "hoodie.datasource.write.table.type" -> "MERGE_ON_READ"
34 | )
35 |
36 | val confFromOpts: HudiConnectorConf = HudiConnectorConf.fromMap(options)
37 | assert(confFromOpts.getPath === "path")
38 | assert(confFromOpts.getSaveMode === SaveMode.Append)
39 |
40 | val readerOpts = confFromOpts.getReaderConf
41 | val writerOpts = confFromOpts.getWriterConf
42 |
43 | // Config should not contain path & save mode
44 | assert(!readerOpts.contains("path"))
45 | assert(!writerOpts.contains("path"))
46 | assert(!writerOpts.contains("saveMode"))
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/src/test/resources/streaming_test_resources/input/text.txt:
--------------------------------------------------------------------------------
1 | Structured Streaming is a scalable and fault-tolerant stream processing engine built on the Spark SQL engine. You can express your streaming computation the same way you would express a batch computation on static data. The Spark SQL engine will take care of running it incrementally and continuously and updating the final result as streaming data continues to arrive. You can use the Dataset/DataFrame API in Scala, Java, Python or R to express streaming aggregations, event-time windows, stream-to-batch joins, etc. The computation is executed on the same optimized Spark SQL engine. Finally, the system ensures end-to-end exactly-once fault-tolerance guarantees through checkpointing and Write-Ahead Logs. In short, Structured Streaming provides fast, scalable, fault-tolerant, end-to-end exactly-once stream processing without the user having to reason about streaming.
2 | Internally, by default, Structured Streaming queries are processed using a micro-batch processing engine, which processes data streams as a series of small batch jobs thereby achieving end-to-end latencies as low as 100 milliseconds and exactly-once fault-tolerance guarantees. However, since Spark 2.3, we have introduced a new low-latency processing mode called Continuous Processing, which can achieve end-to-end latencies as low as 1 millisecond with at-least-once guarantees. Without changing the Dataset/DataFrame operations in your queries, you will be able to choose the mode based on your application requirements.
3 | In this guide, we are going to walk you through the programming model and the APIs. We are going to explain the concepts mostly using the default micro-batch processing model, and then later discuss Continuous Processing model. First, let’s start with a simple example of a Structured Streaming query - a streaming word count.
4 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/setl/config/StructuredStreamingConnectorConf.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.config
2 |
3 | /**
4 | * Configuration parameters:
5 | * see the Spark Structured Streaming documentation
6 | */
7 | class StructuredStreamingConnectorConf extends ConnectorConf {
8 |
9 | import StructuredStreamingConnectorConf._
10 |
11 | def setFormat(format: String): this.type = set(FORMAT.toLowerCase(), format)
12 |
13 | def getFormat: String = getWithException(FORMAT).toLowerCase()
14 |
15 | def setSchema(schema: String): this.type = set(SCHEMA, schema)
16 |
17 | def getSchema: String = getWithException(SCHEMA)
18 |
19 | def setOutputMode(mode: String): this.type = set(OUTPUT_MODE, mode)
20 |
21 | def getOutputMode: String = getWithException(OUTPUT_MODE)
22 |
23 | def setPath(path: String): this.type = set(PATH, path)
24 |
25 | def getPath: String = getWithException(PATH)
26 |
27 | override def getReaderConf: Map[String, String] = removePrivateConf()
28 |
29 | override def getWriterConf: Map[String, String] = removePrivateConf()
30 |
31 | private[this] def getWithException(key: String): String = {
32 | get(key).getOrElse(throw new IllegalArgumentException(s"Can't find $key"))
33 | }
34 |
35 | private[this] def removePrivateConf(): Map[String, String] = {
36 | import scala.collection.JavaConverters._
37 | settings.asScala.toMap - FORMAT - SCHEMA - OUTPUT_MODE
38 | }
39 | }
40 |
41 | object StructuredStreamingConnectorConf {
42 | def fromMap(options: Map[String, String]): StructuredStreamingConnectorConf =
43 | new StructuredStreamingConnectorConf().set(options)
44 |
45 | val FORMAT: String = "format"
46 | val SCHEMA: String = "schema"
47 | val OUTPUT_MODE: String = "outputMode"
48 | val PATH: String = "path"
49 |
50 | }
51 |
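52 | // Usage sketch: build a conf from a Map, using the same keys as streaming_test_resources/streaming.conf
53 | // val conf = StructuredStreamingConnectorConf.fromMap(Map(
54 | //   "format" -> "csv",
55 | //   "schema" -> "text STRING", // must be provided for streaming reads
56 | //   "outputMode" -> "append",
57 | //   "path" -> "src/test/resources/streaming_test_resources/input2"
58 | // ))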
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/util/TypesafeConfigUtilsSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.util
2 |
3 | import com.typesafe.config.ConfigFactory
4 | import org.scalatest.funsuite.AnyFunSuite
5 | import org.scalatest.matchers.should.Matchers
6 |
7 | class TypesafeConfigUtilsSuite extends AnyFunSuite with Matchers {
8 |
9 | val config = ConfigFactory.load("test_priority.conf")
10 | import TypesafeConfigUtils._
11 |
12 | test("TypesafeConfigUtils should handle implicit type conversion") {
13 | assert(getAs[String](config, "test.string") === Option("abc"))
14 | assert(getAs[Int](config, "test.int") === Option(1))
15 | assert(getAs[Long](config, "test.long") === Option(2L))
16 | assert(getAs[Float](config, "test.float") === Option(3.1F))
17 | assert(getAs[Float](config, "test.float2") === Option(3.1F))
18 | assert(getAs[Double](config, "test.double") === Option(4.4D))
19 | assert(getAs[Boolean](config, "test.boolean") === Option(false))
20 | assert(getAs[Boolean](config, "test.boolean2") === Option(true))
21 | assert(getAs[Int](config, "test.non_existing") === None)
22 | assert(isDefined(config, "test.non_existing") === false)
23 | assert(isDefined(config, "test.string"))
24 | }
25 |
26 | test("TypesafeConfigUtils should handle list") {
27 | getList(config, "test.list").get should equal (Array(1, 2, 3))
28 | val expected = Array(1.2, 2, 3)
29 | getList(config, "test.listFloat").get should equal (expected)
30 | getList(config, "test.listString").get should equal (Array("1.2", "2", "3"))
31 | }
32 |
33 | test("TypesafeConfigUtils should handle map") {
34 | getMap(config.getConfig("test.map")) should equal (Map("v1" -> "a", "v2" -> "b"))
35 |
36 | }
37 |
38 | test("TypesafeConfigUtils exceptions") {
39 | assertThrows[com.typesafe.config.ConfigException.WrongType](getAs[Int](config, "test.string"))
40 | }
41 |
42 | }
43 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/setl/storage/connector/Connector.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage.connector
2 |
3 | import io.github.setl.annotation.InterfaceStability
4 | import io.github.setl.enums.Storage
5 | import io.github.setl.internal.Logging
6 | import io.github.setl.util.HasSparkSession
7 | import org.apache.spark.sql._
8 |
9 | /**
10 | * Connector is a non-typed data access layer (DAL) abstraction that provides read/write functionalities.
11 | *
12 | *
13 | * A basic data storage connector has two main functionalities:
14 | *
addConnector(connector, Some("new_name")), the structure in the compressed zip file will be:
24 | *
25 | * {{{
26 | * outputPath.zip // outputPath.zip is given during the instantiation of FileConsolidator
27 | * |--new_name
28 | * |-- dir_1
29 | * | |-- file1
30 | * |-- dir_2
31 | * |-- file2
32 | * }}}
33 | *
34 | * @param repository Repository that will be used to load data
35 | * @param name name of the directory in the zip output. default is the name of the base directory of the connector
36 | * @return
37 | */
38 | @throws[InvalidConnectorException]
39 | def addRepository(repository: SparkRepository[_], name: Option[String] = None): this.type
40 |
41 | /**
42 | * Add the connector's data to the consolidator. For a directory with the following structure:
43 | *
44 | * {{{
45 | * base_path
46 | * |-- dir_1
47 | * | |-- file1
48 | * |-- dir_2
49 | * |-- file2
50 | * }}}
51 | *
52 | * After calling addConnector(connector, Some("new_name")), the structure in the compressed zip file will be:
53 | *
54 | * {{{
55 | * outputPath.zip // outputPath.zip is given during the instantiation of FileConsolidator
56 | * |--new_name
57 | * |-- dir_1
58 | * | |-- file1
59 | * |-- dir_2
60 | * |-- file2
61 | * }}}
62 | *
63 | * @param connector FileConnector that will be used to load data
64 | * @param name name of the directory in the zip output. default is the name of the base directory of the connector
65 | * @return
66 | */
67 | def addConnector(connector: FileConnector, name: Option[String] = None): this.type
68 |
69 | def archive(outputPath: Path): this.type
70 |
71 | }
72 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/config/Properties.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.config
2 |
3 | import com.typesafe.config.Config
4 |
5 | object Properties {
6 |
7 | // override def beforeAll(): Unit = {
8 | // System.setProperty("myvalue", "test-my-value")
9 | // }
10 | //
11 | val cl: ConfigLoader = ConfigLoader
12 | .builder()
13 | .setProperty("myvalue", "test-my-value")
14 | .setConfigPath("application.conf").getOrCreate()
15 |
16 | val excelConfig: Config = cl.getConfig("test.excel")
17 | val excelConfigWithoutSchema: Config = cl.getConfig("test.excelWithoutSchema")
18 | val cassandraConfig: Config = cl.getConfig("test.cassandra")
19 | val cassandraConfigWithoutClustering: Config = cl.getConfig("test.cassandraWithoutClustering")
20 |
21 | val csvConfig: Config = cl.getConfig("test.csv")
22 | val parquetConfig: Config = cl.getConfig("test.parquet")
23 |
24 | val jsonConfig: Config = cl.getConfig("test.json")
25 |
26 | val jdbcConfig: Config = cl.getConfig("psql.test")
27 |
28 | val hudiConfig : Config = cl.getConfig("hudi.test")
29 | val sparkSQLConfig : Config = cl.getConfig("sparkSQL.test")
30 |
31 | val excelConfigConnector: Config = cl.getConfig("connector.excel")
32 | val cassandraConfigConnector: Config = cl.getConfig("connector.cassandra")
33 | val csvConfigConnector: Config = cl.getConfig("connector.csv")
34 | val parquetConfigConnector: Config = cl.getConfig("connector.parquet")
35 | val dynamoDbConfigConnector: Config = cl.getConfig("connector.dynamo")
36 |
37 | val excelConfigConnectorBuilder: Config = cl.getConfig("connectorBuilder.excel")
38 | val cassandraConfigConnectorBuilder: Config = cl.getConfig("connectorBuilder.cassandra")
39 | val csvConfigConnectorBuilder: Config = cl.getConfig("connectorBuilder.csv")
40 | val jsonConfigConnectorBuilder: Config = cl.getConfig("connectorBuilder.json")
41 | val deltaConfigConnectorBuilder: Config = cl.getConfig("connectorBuilder.delta")
42 |
43 |
44 | val wrongCsvConfigConnectorBuilder: Config = cl.getConfig("connectorBuilder.wrong_csv")
45 | val customConnectorWithoutRef: Config = cl.getConfig("connectorBuilder.wrong_csv2")
46 | val parquetConfigConnectorBuilder: Config = cl.getConfig("connectorBuilder.parquet")
47 |
48 |
49 | val excelConfigRepoBuilder: Config = cl.getConfig("repoBuilder.excel")
50 | val cassandraConfigRepoBuilder: Config = cl.getConfig("repoBuilder.cassandra")
51 | val csvConfigRepoBuilder: Config = cl.getConfig("repoBuilder.csv")
52 | val parquetConfigRepoBuilder: Config = cl.getConfig("repoBuilder.parquet")
53 | val deltaConfigRepoBuilder: Config = cl.getConfig("repoBuilder.delta")
54 |
55 | }
56 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/storage/ConditionSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage
2 |
3 | import java.time.{LocalDate, LocalDateTime}
4 |
5 | import io.github.setl.enums.ValueType
6 | import org.scalatest.funsuite.AnyFunSuite
7 |
8 | class ConditionSuite extends AnyFunSuite {
9 |
10 | test("Condition could be converted to sql request") {
11 |
12 | val strCond = Condition("col1", "=", "haha")
13 | assert(strCond.toSqlRequest === "(`col1` = 'haha')")
14 |
15 | val intCond = Condition("col1", "=", 1)
16 | assert(intCond.toSqlRequest === "(`col1` = 1)")
17 |
18 | val floatCond = Condition("col1", "=", 1F)
19 | assert(floatCond.toSqlRequest === "(`col1` = 1.0)")
20 |
21 | val date = LocalDate.parse("1990-01-01")
22 | val dateCond = Condition("date", "=", date)
23 | assert(dateCond.toSqlRequest === "(`date` = cast('1990-01-01' as date))")
24 |
25 | val datetime = LocalDateTime.parse("1990-01-01T00:00:00")
26 | val datetimeCond = Condition("datetime", "=", datetime)
27 | assert(datetimeCond.toSqlRequest === "(`datetime` = cast('1990-01-01 00:00:00' as timestamp))")
28 |
29 | val strSetCond = Condition("str_set", "in", Set("a", "b"))
30 | assert(strSetCond.toSqlRequest === "(`str_set` IN ('a', 'b'))")
31 |
32 | val floatSetCond = Condition("float_set", "in", Set(1.343F, 2.445F))
33 | assert(floatSetCond.toSqlRequest === "(`float_set` IN (1.343, 2.445))")
34 |
35 | val strCondWithType = Condition("col1", "=", "hehe", ValueType.STRING)
36 | assert(strCondWithType.toSqlRequest === "(`col1` = 'hehe')")
37 | }
38 |
39 | test("Condition should return null if value is not defined") {
40 | val cond = Condition("a", "=", None, ValueType.STRING)
41 | assert(cond.toSqlRequest === null)
42 | }
43 |
44 | test("Null sql request should be ignored in a condition set") {
45 |
46 | val conds = Set(
47 | Condition("a", "=", None, ValueType.STRING),
48 | Condition("b", "=", 1.5),
49 | Condition("c", "in", Set("x", "y"))
50 | )
51 |
52 | import io.github.setl.util.FilterImplicits._
53 | assert(conds.toSqlRequest === "(`b` = 1.5) AND (`c` IN ('x', 'y'))")
54 |
55 | }
56 |
57 | test("Condition should handle Column") {
58 | import org.apache.spark.sql.functions._
59 | val condition = Condition(
60 | col("test col").isin(1, 2, 3)
61 | )
62 |
63 | assert(condition.toSqlRequest === Condition("test col", "IN", Set(1, 2, 3)).toSqlRequest)
64 |
65 | val condition2 = Condition(
66 | col("test col").isin(1, 2, 3) && col("test col 2") === "A"
67 | )
68 | assert(condition2.toSqlRequest === "((`test col` IN (1, 2, 3)) AND (`test col 2` = 'A'))")
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/setl/storage/repository/ImplicitRepositoryAdapter.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage.repository
2 |
3 | import io.github.setl.internal.{SchemaConverter, StructAnalyser}
4 | import io.github.setl.storage.{Condition, DatasetConverter}
5 | import org.apache.spark.sql.Dataset
6 | import org.apache.spark.sql.types.StructType
7 |
8 | import scala.reflect.runtime.universe.TypeTag
9 |
10 | object ImplicitRepositoryAdapter {
11 |
12 | /**
13 |  * SparkRepositoryAdapter is an implicit RepositoryAdapter implementation that adds four methods to an
14 |  * existing `SparkRepository[A]`.
15 | *
16 | * {{{
17 | * // Example:
18 | *
19 | * implicit val converter = new DatasetConverter[A, B] {
20 | * // implementation
21 | * }
22 | *
23 | * val defaultRepository: SparkRepository[A] // a default repository that can save a Dataset[A]
24 | *
25 | * import io.github.setl.storage.repository.ImplicitRepositoryAdapter._
26 | *
27 | * // This will convert dsOfTypeA (a Dataset[A]) to a Dataset[B] by using the previous implicit converter, then
28 | * // save the converted dataset into the data store
29 | * defaultRepository.convertAndSave(dsOfTypeA)
30 | *
31 | * defaultRepository.findAllAndConvert()
32 | * }}}
33 | *
34 | * @param repository an existing repository
35 | * @param converter a DatasetConverter (should be implemented by user)
36 | * @tparam A source type
37 | * @tparam B target type
38 | */
39 | implicit class SparkRepositoryAdapter[A: TypeTag, B: TypeTag]
40 | (override val repository: SparkRepository[A])
41 | (override implicit val converter: DatasetConverter[A, B]) extends RepositoryAdapter[Dataset[A], Dataset[B]] {
42 |
43 | private[this] val DBTypeSchema: StructType = StructAnalyser.analyseSchema[B]
44 |
45 | def findAllAndConvert(): Dataset[A] = {
46 | val data = repository.readDataFrame()
47 | converter.convertFrom(SchemaConverter.fromDF[B](data))
48 | }
49 |
50 | def findByAndConvert(conditions: Set[Condition]): Dataset[A] = {
51 | val data = repository.readDataFrame(SparkRepository.handleConditions(conditions, DBTypeSchema))
52 | converter.convertFrom(SchemaConverter.fromDF[B](data))
53 | }
54 |
55 | def findByAndConvert(condition: Condition): Dataset[A] = {
56 | findByAndConvert(Set(condition))
57 | }
58 |
59 | def convertAndSave(data: Dataset[A], suffix: Option[String] = None): SparkRepositoryAdapter.this.type = {
60 | val dsToSave = converter.convertTo(data)
61 | repository.configureConnector(dsToSave.toDF(), suffix)
62 | repository.writeDataFrame(SchemaConverter.toDF[B](dsToSave))
63 | this
64 | }
65 | }
66 |
67 | }
68 |
--------------------------------------------------------------------------------
/docs/data_access_layer/SparkRepositoryAdapter.md:
--------------------------------------------------------------------------------
1 | # RepositoryAdapter
2 |
3 | In some situations, the data format defined in the data source doesn't match the case class defined in our project, and we want to hide
4 | the conversion details (which may be irrelevant to the business logic). We can achieve this by using the
5 | [SparkRepositoryAdapter](https://github.com/SETL-Developers/setl/blob/master/src/main/scala/com/jcdecaux/setl/storage/repository/ImplicitRepositoryAdapter.scala)
6 | and [DatasetConverter](https://github.com/SETL-Developers/setl/blob/master/src/main/scala/com/jcdecaux/setl/storage/DatasetConverter.scala).
7 |
8 | ## Example
9 |
10 | Imagine our data source has a format that matches the following case class:
11 |
12 | ```scala
13 | case class DataSourceFormat(col1: String, col2: Int, col3: String)
14 |
15 | // col1, col2, col3
16 | // r1, 1, r1-1
17 | // r2, 2, r1-2
18 | ```
19 |
20 | The column `col3` is not necessary (it's only a concatenation of `col1` and `col2`), so we can ignore it and use this
21 | case class in our project:
22 |
23 | ```scala
24 | case class ProjectFormat(col1: String, col2: Int)
25 | ```
26 |
27 | So the data conversions that we want to hide are:
28 | - during reading, implicitly drop `col3`
29 | - during writing, implicitly create `col3` by concatenating `col1` and `col2`
30 |
31 | Let's implement our dataset converter (the snippet assumes that `org.apache.spark.sql.Dataset` and `org.apache.spark.sql.catalyst.encoders.ExpressionEncoder` are imported):
32 | ```scala
33 | import io.github.setl.storage.DatasetConverter
34 |
35 | implicit val myConverter = new DatasetConverter[ProjectFormat, DataSourceFormat] {
36 | override def convertFrom(t2: Dataset[DataSourceFormat]): Dataset[ProjectFormat] = {
37 | t2.drop("col3")
38 | .as[ProjectFormat](ExpressionEncoder[ProjectFormat])
39 | }
40 |
41 | override def convertTo(t1: Dataset[ProjectFormat]): Dataset[DataSourceFormat] = {
42 | import org.apache.spark.sql.functions._
43 |
44 | t1.withColumn("col3", concat(col("col1"), lit("-"), col("col2")))
45 | .as[DataSourceFormat](ExpressionEncoder[DataSourceFormat])
46 | }
47 | }
48 | ```
49 |
50 | To use this converter:
51 | ```scala
52 | import io.github.setl.storage.repository.ImplicitRepositoryAdapter._
53 |
54 | // Suppose that we have a repository of type ProjectFormat.
55 | // After the import, several new methods will be added to the SparkRepository
56 | // For example: convertAndSave and findAllAndConvert
57 | val projectFormatRepo = SparkRepository[ProjectFormat]
58 |
59 | // This will convert a Dataset[ProjectFormat] to a Dataset[DataSourceFormat] and save it
60 | projectFormatRepo.convertAndSave(projectFormatDataset)
61 |
62 | // This will load a Dataset[DataSourceFormat] and automatically convert it to a Dataset[ProjectFormat]
63 | val loaded = projectFormatRepo.findAllAndConvert()
64 | ```
65 |
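The adapter also adds `findByAndConvert`, which applies a `Condition` to the stored data before converting it back.
A minimal sketch, assuming the `projectFormatRepo` and the implicit `myConverter` defined above:

```scala
import io.github.setl.storage.Condition

// Load only the rows whose col1 equals "r1", then convert the result
// from Dataset[DataSourceFormat] back to Dataset[ProjectFormat]
val filtered = projectFormatRepo.findByAndConvert(Condition("col1", "=", "r1"))
```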
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/storage/connector/SparkSQLConnectorSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage.connector
2 |
3 | import io.github.setl.config.{Conf, Properties}
4 | import io.github.setl.{SparkSessionBuilder, TestObject}
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.sql.SparkSession
7 | import org.scalatest.funsuite.AnyFunSuite
8 |
9 | class SparkSQLConnectorSuite extends AnyFunSuite {
10 |
11 | val query : String =
12 | """
13 | | SELECT (ones.n1 + tens.n2 * 10) as user_id
14 | | FROM (
15 | | SELECT 0 AS n1
16 | | UNION SELECT 1 AS n1
17 | | UNION SELECT 2 AS n1
18 | | UNION SELECT 3 AS n1
19 | | UNION SELECT 4 AS n1
20 | | UNION SELECT 5 AS n1
21 | | UNION SELECT 6 AS n1
22 | | UNION SELECT 7 AS n1
23 | | UNION SELECT 8 AS n1
24 | | UNION SELECT 9 AS n1
25 | | ) ones
26 | | CROSS JOIN
27 | | (
28 | | SELECT 0 AS n2
29 | | UNION SELECT 1 AS n2
30 | | UNION SELECT 2 AS n2
31 | | UNION SELECT 3 AS n2
32 | | UNION SELECT 4 AS n2
33 | | UNION SELECT 5 AS n2
34 | | UNION SELECT 6 AS n2
35 | | UNION SELECT 7 AS n2
36 | | UNION SELECT 8 AS n2
37 | | UNION SELECT 9 AS n2
38 | | ) tens
39 | |""".stripMargin
40 |
41 | val testTable: Seq[TestObject] = Seq(
42 | TestObject(1, "p1", "c1", 1L),
43 | TestObject(2, "p2", "c2", 2L),
44 | TestObject(3, "p3", "c3", 3L)
45 | )
46 |
47 | val options : Map[String, String] = Map(
48 | "query" -> query
49 | )
50 |
51 |
52 | test("Instantiation of constructors") {
53 | val connector = new SparkSQLConnector(query)
54 | assert(connector.query === query)
55 |
56 | val testConfig = Properties.sparkSQLConfig
57 | val connector2 = new SparkSQLConnector(testConfig)
58 | assert(connector2.query === "SELECT * FROM schema.table")
59 |
60 | val connector3 = new SparkSQLConnector(Conf.fromMap(options))
61 | assert(connector3.query === query)
62 |
63 | assertThrows[IllegalArgumentException](new SparkSQLConnector(""))
64 | assertThrows[IllegalArgumentException](new SparkSQLConnector(Conf.fromMap(Map.empty)))
65 | assertThrows[IllegalArgumentException](new SparkSQLConnector(testConfig.withoutPath("query")))
66 | }
67 |
68 | test("Read/Write of SparkSQLConnector") {
69 | val spark: SparkSession = SparkSession.builder().config(new SparkConf()).master("local[*]").getOrCreate()
70 | import spark.implicits._
71 |
72 | val connector = new SparkSQLConnector(query)
73 | assert(connector.read().collect().length == 100)
74 |
75 | // Should log warning & do nothing
76 | val testDF = testTable.toDF()
77 | connector.write(testDF)
78 | connector.write(testDF, Some("any_"))
79 | }
80 | }
81 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/internal/BenchmarkInvocationHandlerSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.internal
2 |
3 | import io.github.setl.annotation.Benchmark
4 | import io.github.setl.transformation.{AbstractFactory, Factory}
5 | import io.github.setl.workflow.Pipeline
6 | import org.scalatest.funsuite.AnyFunSuite
7 |
8 | class BenchmarkInvocationHandlerSuite extends AnyFunSuite {
9 |
10 | import BenchmarkInvocationHandlerSuite._
11 |
12 | test("BenchmarkInvocationHandler should log execution time") {
13 | val factory = new BenchmarkFactory
14 | val benchmarkHandler = new BenchmarkInvocationHandler(factory)
15 |
16 | val proxyFactory = java.lang.reflect.Proxy.newProxyInstance(
17 | getClass.getClassLoader,
18 | Array(classOf[AbstractFactory[_]]),
19 | benchmarkHandler
20 | ).asInstanceOf[AbstractFactory[_]]
21 |
22 | proxyFactory.read()
23 | proxyFactory.process()
24 | proxyFactory.write()
25 |
26 | assert(classOf[BenchmarkFactory].isAnnotationPresent(classOf[Benchmark]))
27 | assert(factory.get() === proxyFactory.get())
28 |
29 | import scala.collection.JavaConverters._
30 | benchmarkHandler.getBenchmarkResult.asScala.foreach {
31 | x => assert(x._2 >=0)
32 | }
33 |
34 | assert(benchmarkHandler.getBenchmarkResult.size() === 2)
35 |
36 | }
37 |
38 | test("Benchmark should be handled in pipeline") {
39 |
40 | val pipeline = new Pipeline()
41 |
42 | val result = pipeline
43 | .addStage[BenchmarkFactory]()
44 | .benchmark(true)
45 | .run()
46 | .getBenchmarkResult
47 |
48 | assert(result.length === 1)
49 |
50 | val result2 = new Pipeline()
51 | .addStage[BenchmarkFactory]()
52 | .run()
53 | .getBenchmarkResult
54 |
55 | assert(result2.isEmpty)
56 |
57 | val result3 = new Pipeline()
58 | .addStage[BenchmarkFactory]()
59 | .benchmark(false)
60 | .run()
61 | .getBenchmarkResult
62 |
63 | assert(result3.isEmpty)
64 | }
65 |
66 | }
67 |
68 | object BenchmarkInvocationHandlerSuite {
69 |
70 | @Benchmark
71 | class BenchmarkFactory extends Factory[String] {
72 |
73 | private[this] var data = ""
74 |
75 | override def read(): BenchmarkFactory.this.type = {
76 | data = s"testing ${this.getClass.getSimpleName}... "
77 | this
78 | }
79 |
80 | @Benchmark
81 | override def process(): BenchmarkFactory.this.type = {
82 | data = data + data
83 | this
84 | }
85 |
86 | @Benchmark
87 | override def write(): BenchmarkFactory.this.type = {
88 | println(data)
89 | sleep()
90 | this
91 | }
92 |
93 | override def get(): String = data
94 |
95 | def sleep(): Unit = Thread.sleep(1000L)
96 |
97 | }
98 |
99 | }
100 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/internal/TestClasses.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.internal
2 |
3 | import io.github.setl.annotation.{ColumnName, CompoundKey, Compress, Delivery}
4 | import io.github.setl.storage.Compressor
5 | import io.github.setl.transformation.Factory
6 |
7 | object TestClasses {
8 |
9 | case class WrongClass(@ColumnName("1") @ColumnName("2") col1: String)
10 |
11 | case class MultipleCompoundKeyTest(@CompoundKey("sort", "1") @CompoundKey("part", "1") col1: String,
12 | @CompoundKey("sort", "2") col2: String,
13 | @CompoundKey("part", "2") @ColumnName("COLUMN_3") col3: String)
14 |
15 | case class InnerClass(innerCol1: String, innerCol2: String)
16 |
17 | case class TestCompression(@ColumnName("dqsf") col1: String,
18 | @CompoundKey("test", "1") col2: String,
19 | @Compress col3: Seq[InnerClass],
20 | @Compress col4: Seq[String]) {
21 | }
22 |
23 | case class TestStructAnalyser(@ColumnName("alias1") col1: String,
24 | @CompoundKey("test", "1") col2: String,
25 | @CompoundKey("test", "2") col22: String,
26 | @Compress col3: Seq[InnerClass],
27 | @Compress(compressor = classOf[Compressor]) col4: Seq[String]) {
28 | }
29 |
30 | class Producer1
31 |
32 | class Producer2
33 |
34 | class TestFactory extends Factory[String] {
35 |
36 | var input3: Double = _
37 | var input4: Boolean = _
38 |
39 | @Delivery(producer = classOf[Producer1])
40 | var inputString1: String = _
41 |
42 | @Delivery(producer = classOf[Producer2])
43 | var inputString2: String = _
44 |
45 | @Delivery(optional = true)
46 | var inputInt: Int = _
47 |
48 | @Delivery
49 | def setInputs(d: Double, boo: Boolean): this.type = {
50 | input3 = d
51 | input4 = boo
52 | this
53 | }
54 |
55 | /**
56 | * Read data
57 | */
58 | override def read(): TestFactory.this.type = this
59 |
60 | /**
61 | * Process data
62 | */
63 | override def process(): TestFactory.this.type = this
64 |
65 | /**
66 | * Write data
67 | */
68 | override def write(): TestFactory.this.type = this
69 |
70 | /**
71 | * Get the processed data
72 | */
73 | override def get(): String = "Product of TestFactory " + inputString1 + inputString2
74 | }
75 |
76 |
77 | case class MyObject(@ColumnName("col1") column1: String, column2: String)
78 |
79 | case class TestCompoundKey(@CompoundKey("primary", "1") a: String, @CompoundKey("primary", "2") b: Int, @CompoundKey("sort", "1") c: String)
80 |
81 | case class TestNullableColumn(@CompoundKey("primary", "1") col1: String, col2: String, col3: Option[Int], col4: Double)
82 |
83 | }
84 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/storage/repository/RepositoryAdapterSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage.repository
2 |
3 | import io.github.setl.SparkSessionBuilder
4 | import io.github.setl.storage.Condition
5 | import io.github.setl.storage.connector.CSVConnector
6 | import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
7 | import org.apache.spark.sql.{Dataset, SparkSession}
8 | import org.scalatest.funsuite.AnyFunSuite
9 |
10 | class RepositoryAdapterSuite extends AnyFunSuite {
11 |
12 | val path: String = "src/test/resources/test_repository_adapter"
13 |
14 | val data: Seq[RepoAdapterTesterA] = Seq(
15 | RepoAdapterTesterA("a", "A"),
16 | RepoAdapterTesterA("b", "B")
17 | )
18 |
19 | test("RepositoryAdapter should implicitly convert two dataset") {
20 | val spark: SparkSession = new SparkSessionBuilder().setEnv("local").build().get()
21 | val ds: Dataset[RepoAdapterTesterA] = spark.createDataset(data)(ExpressionEncoder[RepoAdapterTesterA])
22 |
23 | import io.github.setl.storage.repository.ImplicitConverter.a2b
24 | import io.github.setl.storage.repository.ImplicitRepositoryAdapter._
25 |
26 | val options: Map[String, String] = Map[String, String](
27 | "path" -> path,
28 | "inferSchema" -> "true",
29 | "delimiter" -> ",",
30 | "header" -> "true",
31 | "saveMode" -> "Overwrite"
32 | )
33 |
34 | val csvConnector = new CSVConnector(options)
35 |
36 | val repo: SparkRepository[RepoAdapterTesterA] =
37 | new SparkRepository[RepoAdapterTesterA]().setConnector(csvConnector)
38 |
39 | repo.convertAndSave(ds)
40 | val ds2 = repo.findAllAndConvert()
41 | val df = csvConnector.read()
42 |
43 | assert(ds2.columns === ds.columns)
44 | assert(df.columns === Array("column1", "col2", "col3"))
45 | csvConnector.delete()
46 | }
47 |
48 | test("RepositoryAdapter should be able to handle filter") {
49 | val spark: SparkSession = new SparkSessionBuilder().setEnv("local").build().get()
50 | val ds: Dataset[RepoAdapterTesterA] = spark.createDataset(data)(ExpressionEncoder[RepoAdapterTesterA])
51 |
52 | import io.github.setl.storage.repository.ImplicitConverter.a2b
53 | import io.github.setl.storage.repository.ImplicitRepositoryAdapter._
54 |
55 | val options: Map[String, String] = Map[String, String](
56 | "path" -> (path + "_filter"),
57 | "inferSchema" -> "true",
58 | "delimiter" -> ",",
59 | "header" -> "true",
60 | "saveMode" -> "Overwrite"
61 | )
62 |
63 | val csvConnector = new CSVConnector(options)
64 |
65 | val repo: SparkRepository[RepoAdapterTesterA] =
66 | new SparkRepository[RepoAdapterTesterA]().setConnector(csvConnector)
67 |
68 | repo.convertAndSave(ds)
69 |
70 | val conditions = Condition("column1", "=", "a")
71 |
72 | assert(repo.findByAndConvert(conditions).count() === 1)
73 | csvConnector.delete()
74 | }
75 |
76 | }
77 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/config/ConfLoaderSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.config
2 |
3 | import com.typesafe.config.ConfigFactory
4 | import org.scalatest.funsuite.AnyFunSuite
5 |
6 | class ConfLoaderSuite extends AnyFunSuite {
7 |
8 | test("ConfigLoader builder should build ConfigLoader") {
9 | System.setProperty("app.environment", "test")
10 | System.setProperty("myvalue", "test-my-value")
11 |
12 | val cl = ConfigLoader.builder()
13 | .setAppEnv("local")
14 | .setAppName("TestConfigLoaderBuilder")
15 | .setProperty("myJvmProperty", "myJvmPropertyValue")
16 | .getOrCreate()
17 |
18 | assert(cl.get("test.string") === "foo")
19 | assert(cl.get("test.variable") === "myJvmPropertyValue")
20 | assert(cl.appName === "TestConfigLoaderBuilder")
21 |
22 | System.clearProperty("app.environment")
23 | System.clearProperty("myvalue")
24 | }
25 |
26 | test("Getters of ConfigLoader") {
27 | System.setProperty("app.environment", "test")
28 | val cl = ConfigLoader.builder()
29 | .setAppEnv("local")
30 | .setConfigPath("test_priority.conf")
31 | .getOrCreate()
32 |
33 | assert(cl.get("my.value") === "haha")
34 | assert(cl.getOption("my.value") === Some("haha"))
35 | assert(cl.getOption("notExisting") === None)
36 | assert(cl.getArray("test.list") === Array("1","2","3"))
37 | assert(cl.getObject("setl.config") === cl.config.getObject("setl.config"))
38 | }
39 |
40 | test("ConfigLoader builder should prioritize setConfigPath than setAppEnv and jvm property and pom") {
41 | System.setProperty("app.environment", "test")
42 | val cl = ConfigLoader.builder()
43 | .setAppEnv("local")
44 | .setConfigPath("test_priority.conf")
45 | .getOrCreate()
46 |
47 | assert(cl.get("my.value") === "haha")
48 | System.clearProperty("app.environment")
49 | }
50 |
51 | test("ConfigLoader builder should take into account the app.environment property in pom") {
52 | System.clearProperty("app.environment")
53 | val configLoader = ConfigLoader.builder().getOrCreate()
54 | assert(configLoader.appEnv === ConfigFactory.load().getString("setl.environment"))
55 | System.clearProperty("app.environment")
56 | }
57 |
58 | test("ConfigLoader builder should prioritize setAppEnv than jvm property and pom") {
59 | System.setProperty("app.environment", "test")
60 |
61 | val cl = ConfigLoader.builder()
62 | .setAppEnv("test_priority")
63 | .getOrCreate()
64 |
65 | assert(cl.get("my.value") === "haha")
66 | System.clearProperty("app.environment")
67 | }
68 |
69 | test("ConfigLoader builder should prioritize jvm property than pom") {
70 | System.setProperty("app.environment", "test_priority")
71 |
72 | val cl = ConfigLoader.builder()
73 | .getOrCreate()
74 |
75 | assert(cl.get("my.value") === "haha")
76 | System.clearProperty("app.environment")
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/storage/connector/HudiConnectorSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage.connector
2 |
3 | import io.github.setl.config.{Conf, HudiConnectorConf, Properties}
4 | import io.github.setl.{SparkSessionBuilder, SparkTestUtils, TestObject2}
5 | import org.apache.spark.sql.{SaveMode, SparkSession}
6 | import org.scalatest.funsuite.AnyFunSuite
7 |
8 | import java.nio.file.Paths
9 | import java.sql.{Date, Timestamp}
10 |
11 | class HudiConnectorSuite extends AnyFunSuite {
12 |
13 | val path: String = Paths.get("src", "test", "resources", "test_hudi").toFile.getAbsolutePath
14 | val saveMode = SaveMode.Overwrite
15 |
16 | val options: Map[String, String] = Map[String, String](
17 | "path" -> path,
18 | "saveMode" -> saveMode.toString,
19 | "hoodie.table.name" -> "test_object",
20 | "hoodie.datasource.write.recordkey.field" -> "col1",
21 | "hoodie.datasource.write.precombine.field" -> "col4",
22 | "hoodie.datasource.write.table.type" -> "MERGE_ON_READ"
23 | )
24 |
25 | val testTable: Seq[TestObject2] = Seq(
26 | TestObject2("string", 5, 0.000000001685400132103450D, new Timestamp(1557153268000L), new Date(1557100800000L), 999999999999999999L),
27 | TestObject2("string2", 5, 0.000000001685400132103450D, new Timestamp(1557153268000L), new Date(1557100800000L), 999999999999999999L),
28 | TestObject2("string3", 5, 0.000000001685400132103450D, new Timestamp(1557153268000L), new Date(1557100800000L), 999999999999999999L)
29 | )
30 |
31 | test("Instantiation of constructors") {
32 |
33 | // New spark session here since Hudi only supports KryoSerializer
34 | val spark: SparkSession = new SparkSessionBuilder().setEnv("local")
35 | .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
36 | .build()
37 | .get()
38 | assume(SparkTestUtils.checkSparkVersion("2.4"))
39 |
40 | import spark.implicits._
41 |
42 | val connector = new HudiConnector(HudiConnectorConf.fromMap(options))
43 | connector.write(testTable.toDF)
44 | assert(connector.read().collect().length == testTable.length)
45 |
46 | val path2: String = Paths.get("src", "test", "resources", "test_hudi_2").toFile.getAbsolutePath
47 | val options2 = options + ("path" -> path2)
48 | val connector2 = new HudiConnector(options2)
49 | connector2.write(testTable.toDF)
50 | assert(connector2.read().collect().length == testTable.length)
51 |
52 | val path3: String = Paths.get("src", "test", "resources", "test_hudi_3").toFile.getAbsolutePath
53 | val options3 = options + ("path" -> path3)
54 | val connector3 = new HudiConnector(Conf.fromMap(options3))
55 | connector3.write(testTable.toDF, Some("any_"))
56 | assert(connector3.read().collect().length == testTable.length)
57 |
58 | val connector7 = new HudiConnector(Properties.hudiConfig)
59 | connector7.write(testTable.toDF)
60 | assert(connector7.read().collect().length == testTable.length)
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/config/FileConnectorConfSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.config
2 |
3 | import io.github.setl.enums.{PathFormat, Storage}
4 | import io.github.setl.exception.ConfException
5 | import org.apache.spark.sql.SaveMode
6 | import org.scalatest.funsuite.AnyFunSuite
7 |
8 | class FileConnectorConfSuite extends AnyFunSuite {
9 |
10 | val conf = new FileConnectorConf()
11 |
12 |
13 | test("Set FileConnectorConf") {
14 | assert(conf.get("storage") === None)
15 | conf.setStorage("CSV")
16 | assert(conf.get("storage").get === "CSV")
17 | conf.setStorage(Storage.EXCEL)
18 | assert(conf.get("storage").get === "EXCEL")
19 |
20 | assert(conf.get("encoding") === None)
21 | conf.setEncoding("latin-1")
22 | assert(conf.get("encoding").get === "latin-1")
23 |
24 | assert(conf.get("saveMode") === None)
25 | conf.setSaveMode("Append")
26 | assert(conf.get("saveMode").get === "Append")
27 | conf.setSaveMode(SaveMode.Overwrite)
28 | assert(conf.get("saveMode").get === "Overwrite")
29 |
30 | assert(conf.get("path") === None)
31 | conf.setPath("path")
32 | assert(conf.get("path").get === "path")
33 |
34 | assert(conf.get("pathFormat") === None)
35 | conf.setPathFormat(PathFormat.WILDCARD)
36 | assert(conf.get("pathFormat").get === "WILDCARD")
37 |
38 | assert(conf.get("credentialsProvider") === None)
39 | conf.setS3CredentialsProvider("credentialsProvider")
40 | assert(conf.get("fs.s3a.aws.credentials.provider").get === "credentialsProvider")
41 |
42 | assert(conf.get("accessKey") === None)
43 | conf.setS3AccessKey("accessKey")
44 | assert(conf.get("fs.s3a.access.key").get === "accessKey")
45 |
46 | assert(conf.get("secretKey") === None)
47 | conf.setS3SecretKey("secretKey")
48 | assert(conf.get("fs.s3a.secret.key").get === "secretKey")
49 |
50 | assert(conf.get("sessionToken") === None)
51 | conf.setS3SessionToken("sessionToken")
52 | assert(conf.get("fs.s3a.session.token").get === "sessionToken")
53 |
54 | assert(conf.get("filenamePattern") === None)
55 | conf.setFilenamePattern("(file)(.*)(\\.csv)")
56 | assert(conf.get("filenamePattern").get === "(file)(.*)(\\.csv)")
57 | }
58 |
59 | test("Getters FileConnectorConf") {
60 | assert(conf.getEncoding === "latin-1")
61 | assert(conf.getSaveMode === SaveMode.Overwrite)
62 | assert(conf.getStorage === Storage.EXCEL)
63 | assert(conf.getPath === "path")
64 | assert(conf.getPathFormat === "WILDCARD")
65 | assert(conf.getSchema === None)
66 | assert(conf.getS3CredentialsProvider === Some("credentialsProvider"))
67 | assert(conf.getS3AccessKey === Some("accessKey"))
68 | assert(conf.getS3SecretKey === Some("secretKey"))
69 | assert(conf.getS3SessionToken === Some("sessionToken"))
70 | assert(conf.getFilenamePattern === Some("(file)(.*)(\\.csv)"))
71 |
72 | val newConf = new FileConnectorConf()
73 | assertThrows[ConfException](newConf.getStorage)
74 | assertThrows[ConfException](newConf.getPath)
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/setl/internal/HasRegistry.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.internal
2 |
3 | import java.util.UUID
4 |
5 | import io.github.setl.annotation.InterfaceStability
6 | import io.github.setl.exception.AlreadyExistsException
7 |
8 | import scala.collection.immutable.ListMap
9 |
10 | /**
11 |  * HasRegistry provides a UUID registry and methods to check whether an
12 | * [[io.github.setl.internal.Identifiable]] object already
13 | * exists in its registry
14 | */
15 | @InterfaceStability.Evolving
16 | trait HasRegistry[T <: Identifiable] {
17 |
18 | /**
19 |  * Registry is a ListMap that keeps registered objects indexed by their UUID
20 | */
21 | private[this] var registry: ListMap[UUID, T] = ListMap.empty
22 |
23 | /**
24 |  * Register a new [[io.github.setl.internal.Identifiable]] in the registry
25 |  *
26 |  * @param item an object that inherits [[io.github.setl.internal.Identifiable]]
27 |  * @throws AlreadyExistsException if the given item has already been registered
28 | */
29 | @throws[AlreadyExistsException]
30 | protected def registerNewItem(item: T): Unit = {
31 | if (hasRegisteredItem(item)) {
32 | throw new AlreadyExistsException(s"The current item ${item.getUUID} of type ${item.getCanonicalName} already exists")
33 | } else {
34 | registry += (item.getUUID -> item)
35 | }
36 | }
37 |
38 | /** Clear the registry */
39 | protected def clearRegistry(): Unit = {
40 | registry = ListMap.empty
41 | }
42 |
43 | /**
44 | * Register multiple items
45 | *
46 |  * @param items a collection of [[io.github.setl.internal.Identifiable]] objects
47 | */
48 | protected def registerNewItems(items: Iterable[T]): Unit = items.foreach(this.registerNewItem)
49 |
50 | /**
51 | * Check if the Identifiable exists in the registry
52 | *
53 |  * @param item an object that inherits [[io.github.setl.internal.Identifiable]]
54 | * @return true if it already exists in the registry, false otherwise
55 | */
56 | def hasRegisteredItem(item: Identifiable): Boolean = this.hasRegisteredItem(item.getUUID)
57 |
58 | /**
59 | * Check if the UUID exists in the registry
60 | *
61 |  * @param uuid a UUID
62 | * @return true if it already exists in the registry, false otherwise
63 | */
64 | def hasRegisteredItem(uuid: UUID): Boolean = registry.contains(uuid)
65 |
66 | /** Return the registry */
67 | def getRegistry: ListMap[UUID, T] = this.registry
68 |
69 | /**
70 | * For a given UUID, return the corresponding registered item
71 | *
72 | * @param uuid uuid
73 | * @return
74 | */
75 | def getRegisteredItem(uuid: UUID): Option[T] = registry.get(uuid)
76 |
77 | /** Return the number of items in the current registry */
78 | def getRegistryLength: Long = registry.size
79 |
80 | /** Return true if the registry is empty, false otherwise */
81 | def isRegistryEmpty: Boolean = registry.isEmpty
82 |
83 | /**
84 | * Return the last registered item
85 | *
86 | * @return if the registry is empty, None will be returned
87 | */
88 | def lastRegisteredItem: Option[T] = if (isRegistryEmpty) {
89 | None
90 | } else {
91 | Option(registry.last._2)
92 | }
93 |
94 | }
95 |
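// Illustrative sketch (hypothetical subclass; `T` stands for any type extending Identifiable in this project):
//
//   class MyRegistry[T <: Identifiable] extends HasRegistry[T] {
//     def add(item: T): Unit = registerNewItem(item) // throws AlreadyExistsException on duplicates
//   }
//
//   // After add(item):
//   //   hasRegisteredItem(item)          -> true
//   //   lastRegisteredItem               -> Some(item)
//   //   getRegisteredItem(item.getUUID)  -> Some(item)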
--------------------------------------------------------------------------------
/src/main/scala/io/github/setl/storage/connector/StructuredStreamingConnector.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage.connector
2 |
3 | import io.github.setl.annotation.{Experimental, InterfaceStability}
4 | import io.github.setl.config.{Conf, StructuredStreamingConnectorConf}
5 | import io.github.setl.enums.Storage
6 | import io.github.setl.util.TypesafeConfigUtils
7 | import com.typesafe.config.Config
8 | import org.apache.spark.sql.streaming.{DataStreamReader, DataStreamWriter, StreamingQuery}
9 | import org.apache.spark.sql.{DataFrame, Row}
10 |
11 | /**
12 | * :: Experimental ::
13 | *
14 | * Spark Structured Streaming connector
15 | *
16 | * @param conf configuration, see
17 | * Spark structured streaming documentation for details
18 | */
19 | @Experimental
20 | @InterfaceStability.Unstable
21 | class StructuredStreamingConnector(val conf: StructuredStreamingConnectorConf) extends StreamingConnector {
22 |
23 | private[this] var streamingQuery: StreamingQuery = _
24 |
25 | def this(options: Map[String, String]) = this(StructuredStreamingConnectorConf.fromMap(options))
26 |
27 | def this(config: Config) = this(TypesafeConfigUtils.getMap(config))
28 |
29 | def this(config: Conf) = this(config.toMap)
30 |
31 | override val storage: Storage = Storage.STRUCTURED_STREAMING
32 |
33 | @inline protected val streamReader: DataStreamReader = spark.readStream
34 | .format(conf.getFormat)
35 | .options(conf.getReaderConf)
36 |
37 | protected val streamWriter: DataFrame => DataStreamWriter[Row] = (df: DataFrame) => {
38 | df.writeStream
39 | .outputMode(conf.getOutputMode)
40 | .format(conf.getFormat)
41 | .options(conf.getWriterConf)
42 | }
43 |
44 | override def read(): DataFrame = {
45 | if (conf.has(StructuredStreamingConnectorConf.SCHEMA)) {
46 | logInfo("Apply user-defined schema")
47 | streamReader
48 | .schema(conf.getSchema)
49 | .load()
50 | } else {
51 | streamReader.load()
52 | }
53 | }
54 |
55 | override def write(t: DataFrame, suffix: Option[String]): Unit = {
56 | logWarning("Suffix will be ignored by StructuredStreamingConnector")
57 | write(t)
58 | }
59 |
60 | override def write(t: DataFrame): Unit = {
61 | streamingQuery = streamWriter(t).start()
62 | }
63 |
64 | /**
65 |  * Wait for the execution to stop. Any exceptions that occur during the execution
66 | * will be thrown in this thread.
67 | */
68 | override def awaitTermination(): Unit = streamingQuery.awaitTermination()
69 |
70 | /**
71 |  * Wait for the execution to stop. Any exceptions that occur during the execution
72 | * will be thrown in this thread.
73 | *
74 | * @param timeout time to wait in milliseconds
75 |  * @return `true` if it's stopped; or throws the reported error during the execution; or `false`
76 | * if the waiting time elapsed before returning from the method.
77 | */
78 | override def awaitTerminationOrTimeout(timeout: Long): Boolean = streamingQuery.awaitTermination(timeout)
79 |
80 | /**
81 | * Stops the execution of this query if it is running.
82 | */
83 | override def stop(): Unit = streamingQuery.stop()
84 | }
85 |
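// A minimal configuration sketch mirroring the options used in StructuredStreamingConnectorSuite
// (the paths below are placeholders, not values from the original project):
//
//   val input  = new StructuredStreamingConnector(Map("format" -> "text", "path" -> "data/input"))
//   val output = new StructuredStreamingConnector(Map(
//     "format" -> "parquet",
//     "outputMode" -> "append",
//     "checkpointLocation" -> "data/checkpoint",
//     "path" -> "data/output"
//   ))
//   output.write(input.read())              // starts the streaming query
//   output.awaitTerminationOrTimeout(10000) // wait up to 10 seconds for the query to stop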
--------------------------------------------------------------------------------
/src/main/scala/io/github/setl/workflow/SimplePipelineOptimizer.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.workflow
2 |
3 | import io.github.setl.annotation.InterfaceStability
4 | import io.github.setl.internal.Logging
5 |
6 | import scala.annotation.tailrec
7 |
8 | @InterfaceStability.Unstable
9 | class SimplePipelineOptimizer(val parallelism: Int = 4) extends PipelineOptimizer with Logging {
10 |
11 | private[this] var _executionPlan: DAG = _
12 | lazy val optExecutionPlan: DAG = optimize()
13 |
14 | override def getOptimizedExecutionPlan: DAG = optExecutionPlan
15 |
16 | override def setExecutionPlan(dag: DAG): SimplePipelineOptimizer.this.type = {
17 | this._executionPlan = dag
18 | this
19 | }
20 |
21 | private[this] def optimize(): DAG = {
22 | val nodes = _executionPlan.nodes.toList.sortBy(_.stage)
23 | val oldDag = _executionPlan.copy()
24 | nodes.foldLeft[DAG](oldDag) {
25 | case (dag, node) => updateNode(node, dag)
26 | }
27 | }
28 |
29 | override def optimize(stages: Iterable[Stage]): Array[Stage] = {
30 | val factories = stages.flatMap(_.factories)
31 |
32 | optExecutionPlan.nodes.groupBy(_.stage).map {
33 | case (id, nodes) =>
34 | val stage = new Stage().setStageId(id)
35 |
36 | val factoryUUIDs = nodes.map(_.factoryUUID)
37 |
38 | factories
39 | .filter(f => factoryUUIDs.contains(f.getUUID))
40 | .foreach(stage.addFactory)
41 |
42 | stage
43 | }.toArray.sortBy(_.stageId)
44 | }
45 |
46 | private[this] def flowsOf(node: Node, dag: DAG): Set[Flow] = {
47 | dag.flows.filter(_.to.factoryUUID == node.factoryUUID)
48 | }
49 |
50 | private[this] def updateDag(newNode: Node, dag: DAG): DAG = {
51 | logDebug(s"Update DAG for node ${newNode.getPrettyName}")
52 | val oldNode = dag.nodes.find(_.factoryUUID == newNode.factoryUUID).get
53 |
54 | val startingFlows = dag.flows
55 | .filter(_.from == oldNode)
56 | .map(_.copy(from = newNode))
57 |
58 | val endingFlows = dag.flows
59 | .filter(_.to == oldNode)
60 | .map(_.copy(to = newNode))
61 |
62 | val otherFlows = dag.flows.filter(_.from != oldNode).filter(_.to != oldNode)
63 |
64 | val otherNodes = dag.nodes.filter(_ != oldNode)
65 |
66 | DAG(otherNodes + newNode, startingFlows ++ endingFlows ++ otherFlows)
67 | }
68 |
69 | @tailrec
70 | private[this] def validateStage(newStageID: Int, dag: DAG): Int = {
71 | val nodeCount = dag.nodes.count(_.stage == newStageID)
72 | if (nodeCount < parallelism) {
73 | logDebug(s"Valid stage ID: $newStageID")
74 | newStageID
75 | } else {
76 | validateStage(newStageID + 1, dag)
77 | }
78 | }
79 |
80 | private[this] def updateNode(oldNode: Node, dag: DAG): DAG = {
81 | logDebug(s"Optimize node: ${oldNode.getPrettyName} of stage ${oldNode.stage}")
82 | val currentDag = dag.copy()
83 | val flows = flowsOf(oldNode, dag)
84 |
85 | val maxInputStage = flows.size match {
86 | case 0 => 0
87 | case _ => flows.map(_.stage).max + 1
88 | }
89 |
90 | logDebug(s"Max input stage of ${oldNode.getPrettyName}: $maxInputStage")
91 |
92 | val validStage = validateStage(maxInputStage, dag)
93 |
94 | val newNode = oldNode.copy(stage = validStage)
95 |
96 | updateDag(newNode, currentDag)
97 | }
98 | }
99 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/setl/util/TypesafeConfigUtils.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.util
2 |
3 | import io.github.setl.enums.Storage
4 | import com.typesafe.config.{Config, ConfigException}
5 |
6 | object TypesafeConfigUtils {
7 |
8 | @throws[com.typesafe.config.ConfigException]
9 | def getAs[T](config: Config, path: String)(implicit getter: ConfigGetter[T]): Option[T] = getter.get(config, path)
10 |
11 | private[this] def _get[T](path: String): (String => T) => Option[T] = (fun: String => T) => {
12 | try {
13 | Option(fun(path))
14 | } catch {
15 | case _: ConfigException.Missing => None
16 | case e: ConfigException.WrongType => throw e
17 | }
18 | }
19 |
20 | private[setl] implicit val stringGetter: ConfigGetter[String] = new ConfigGetter[String] {
21 | override def get(config: Config, path: String): Option[String] = {
22 | _get[String](path)(config.getString)
23 | }
24 | }
25 |
26 | private[setl] implicit val intGetter: ConfigGetter[Int] = new ConfigGetter[Int] {
27 | override def get(config: Config, path: String): Option[Int] = {
28 | _get[Int](path)(config.getInt)
29 | }
30 | }
31 |
32 | private[setl] implicit val longGetter: ConfigGetter[Long] = new ConfigGetter[Long] {
33 | override def get(config: Config, path: String): Option[Long] = {
34 | _get[Long](path)(config.getLong)
35 | }
36 | }
37 |
38 | private[setl] implicit val floatGetter: ConfigGetter[Float] = new ConfigGetter[Float] {
39 | override def get(config: Config, path: String): Option[Float] = {
40 | _get[Float](path)(x => config.getString(x).toFloat)
41 | }
42 | }
43 |
44 | private[setl] implicit val doubleGetter: ConfigGetter[Double] = new ConfigGetter[Double] {
45 | override def get(config: Config, path: String): Option[Double] = {
46 | _get[Double](path)(config.getDouble)
47 | }
48 | }
49 |
50 | private[setl] implicit val booleanGetter: ConfigGetter[Boolean] = new ConfigGetter[Boolean] {
51 | override def get(config: Config, path: String): Option[Boolean] = {
52 | _get[Boolean](path)(config.getBoolean)
53 | }
54 | }
55 |
56 | private[setl] implicit val listGetter: ConfigGetter[Array[AnyRef]] = new ConfigGetter[Array[AnyRef]] {
57 | override def get(config: Config, path: String): Option[Array[AnyRef]] = {
58 | _get[Array[AnyRef]](path)(x => config.getList(x).unwrapped().toArray())
59 | }
60 | }
61 |
62 | private[setl] implicit val StorageGetter: ConfigGetter[Storage] = new ConfigGetter[Storage] {
63 | override def get(config: Config, path: String): Option[Storage] = {
64 | _get[Storage](path)(x => Storage.valueOf(config.getString(x)))
65 | }
66 | }
67 |
68 | def getList(config: Config, path: String): Option[Array[AnyRef]] = {
69 | listGetter.get(config, path)
70 | }
71 |
72 | def getMap(config: Config): Map[String, String] = {
73 | import scala.collection.JavaConverters._
74 | config.entrySet().asScala.map(x => x.getKey -> x.getValue.unwrapped().toString).toMap
75 | }
76 |
77 | def isDefined(config: Config, path: String): Boolean = {
78 | try {
79 | config.getAnyRef(path) != null
80 | } catch {
81 | case _: ConfigException => false
82 | }
83 | }
84 |
85 | private[setl] trait ConfigGetter[T] {
86 | def get(config: Config, path: String): Option[T]
87 | }
88 |
89 | }
90 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/config/JDBCConnectorConfSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.config
2 |
3 | import org.apache.spark.sql.SaveMode
4 | import org.scalatest.funsuite.AnyFunSuite
5 |
6 | class JDBCConnectorConfSuite extends AnyFunSuite {
7 |
8 | val conf = new JDBCConnectorConf()
9 | val url = "url"
10 | val dbTable = "dbtable"
11 | val user = "user"
12 | val password = "password"
13 | val numPartitions = "numPartitions"
14 | val partitionColumn = "partitionColumn"
15 | val lowerBound = "lowerBound"
16 | val upperBound = "upperBound"
17 | val fetchSize = "fetchsize"
18 | val batchSize = "batchsize"
19 | val truncate = "truncate"
20 | val driver = "driver"
21 |
22 | test("Set JDBCConnectorConf") {
23 | assert(conf.get(url) === None)
24 | conf.setUrl(url)
25 | assert(conf.get(url).get === url)
26 |
27 | assert(conf.get(dbTable) === None)
28 | conf.setDbTable(dbTable)
29 | assert(conf.get(dbTable).get === dbTable)
30 |
31 | assert(conf.get(user) === None)
32 | conf.setUser(user)
33 | assert(conf.get(user).get === user)
34 |
35 | assert(conf.get(password) === None)
36 | conf.setPassword(password)
37 | assert(conf.get(password).get === password)
38 |
39 | assert(conf.get("saveMode") === None)
40 | conf.setSaveMode("Overwrite")
41 | assert(conf.get("saveMode").get === "Overwrite")
42 |
43 | conf.setSaveMode(SaveMode.Append)
44 | assert(conf.get("saveMode").get === "Append")
45 |
46 | assert(conf.get(numPartitions) === None)
47 | conf.setNumPartitions(numPartitions)
48 | assert(conf.get(numPartitions).get === numPartitions)
49 |
50 | assert(conf.get(partitionColumn) === None)
51 | conf.setPartitionColumn(partitionColumn)
52 | assert(conf.get(partitionColumn).get === partitionColumn)
53 |
54 | assert(conf.get(lowerBound) === None)
55 | conf.setLowerBound(lowerBound)
56 | assert(conf.get(lowerBound).get === lowerBound)
57 |
58 | assert(conf.get(upperBound) === None)
59 | conf.setUpperBound(upperBound)
60 | assert(conf.get(upperBound).get === upperBound)
61 |
62 | assert(conf.get(fetchSize) === None)
63 | conf.setFetchSize(fetchSize)
64 | assert(conf.get(fetchSize).get === fetchSize)
65 |
66 | assert(conf.get(batchSize) === None)
67 | conf.setBatchSize(batchSize)
68 | assert(conf.get(batchSize).get === batchSize)
69 |
70 | assert(conf.get(truncate) === None)
71 | conf.setTruncate(truncate)
72 | assert(conf.get(truncate).get === truncate)
73 |
74 | assert(conf.get(driver) === None)
75 | conf.setDriver(driver)
76 | assert(conf.get(driver).get === driver)
77 | }
78 |
79 | test("Getters of JDBCConnectorConf") {
80 | assert(conf.getUrl === Some(url))
81 | assert(conf.getDbTable === Some(dbTable))
82 | assert(conf.getUser === Some(user))
83 | assert(conf.getPassword === Some(password))
84 | assert(conf.getSaveMode === Some("Append"))
85 | assert(conf.getNumPartitions === Some(numPartitions))
86 | assert(conf.getPartitionColumn === Some(partitionColumn))
87 | assert(conf.getLowerBound === Some(lowerBound))
88 | assert(conf.getUpperBound === Some(upperBound))
89 | assert(conf.getFetchSize === Some(fetchSize))
90 | assert(conf.getBatchSize === Some(batchSize))
91 | assert(conf.getTruncate === Some(truncate))
92 | assert(conf.getDriver === Some(driver))
93 | }
94 | }
95 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/internal/StructAnalyserSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.internal
2 |
3 | import io.github.setl.annotation.{ColumnName, CompoundKey, Compress}
4 | import io.github.setl.internal.TestClasses.TestStructAnalyser
5 | import io.github.setl.storage.{Compressor, XZCompressor}
6 | import org.apache.spark.sql.types.StructType
7 | import org.scalatest.funsuite.AnyFunSuite
8 |
9 | class StructAnalyserSuite extends AnyFunSuite {
10 |
11 | val schema: StructType = StructAnalyser.analyseSchema[TestStructAnalyser]
12 |
13 | test("StructAnalyser should be able to handle @ColumnName") {
14 | val fields = schema.filter(_.metadata.contains(classOf[ColumnName].getCanonicalName))
15 |
16 | assert(fields.length === 1)
17 | assert(fields.head.name === "col1")
18 | assert(fields.head.metadata.getStringArray(classOf[ColumnName].getCanonicalName) === Array("alias1"))
19 |
20 | }
21 |
22 | test("StructAnalyser should be able to handle @CompoundKey") {
23 | val fields = schema.filter(_.metadata.contains(classOf[CompoundKey].getCanonicalName))
24 |
25 | assert(fields.length === 2)
26 | assert(fields.map(_.name) === Array("col2", "col22"))
27 | assert(fields.map(_.metadata.getStringArray(classOf[CompoundKey].getCanonicalName)).map(_ (0)) === List("test!@1", "test!@2"))
28 | }
29 |
30 | test("StructAnalyser should be able to handle @Compress") {
31 | val fields = schema.filter(_.metadata.contains(classOf[Compress].getCanonicalName))
32 |
33 | assert(fields.length === 2)
34 | assert(fields.map(_.name) === Array("col3", "col4"))
35 |
36 | assert(
37 | fields
38 | .find(_.name == "col3")
39 | .get.metadata
40 | .getStringArray(classOf[Compress].getCanonicalName)(0) === classOf[XZCompressor].getCanonicalName
41 | )
42 |
43 | assert(
44 | fields
45 | .find(_.name == "col4")
46 | .get.metadata
47 | .getStringArray(classOf[Compress].getCanonicalName)(0) === classOf[Compressor].getCanonicalName
48 | )
49 | }
50 |
51 | test("[SETL-34] StructAnalyser should handle multiple @CompoundKey annotations") {
52 | val structType = StructAnalyser.analyseSchema[TestClasses.MultipleCompoundKeyTest]
53 | structType.foreach { x =>
54 | println(s"name: ${x.name}, type: ${x.dataType}, meta: ${x.metadata}")
55 | }
56 |
57 | assert(structType.find(_.name == "col1").get.metadata.getStringArray(classOf[CompoundKey].getCanonicalName) === Array("sort!@1","part!@1"))
58 | assert(structType.find(_.name == "col2").get.metadata.getStringArray(classOf[CompoundKey].getCanonicalName) === Array("sort!@2"))
59 | assert(structType.find(_.name == "col3").get.metadata.getStringArray(classOf[CompoundKey].getCanonicalName) === Array("part!@2"))
60 | }
61 |
62 |
63 | test("StructAnalyser should be able to find columns with @CompoundKey") {
64 | val primaryColumns1 = StructAnalyser.findCompoundColumns[TestClasses.MultipleCompoundKeyTest]
65 | val primaryColumns2 = StructAnalyser.findCompoundColumns[TestClasses.MyObject]
66 |
67 | assert(primaryColumns1.length == 3)
68 | assert(primaryColumns1 === Array("col1", "col2", "COLUMN_3"))
69 | assert(primaryColumns2.isEmpty)
70 | assert(primaryColumns2 === Array())
71 | }
72 |
73 | test("[SETL-34] StructAnalyser should throw exception when there are more than one ColumnName annotation") {
74 | assertThrows[IllegalArgumentException](StructAnalyser.analyseSchema[TestClasses.WrongClass])
75 | }
76 |
77 | }
78 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/setl/storage/repository/Repository.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage.repository
2 |
3 | import io.github.setl.annotation.InterfaceStability
4 | import io.github.setl.storage.Condition
5 | import org.apache.spark.sql.{Column, DataFrame, Dataset}
6 |
7 | /**
8 | * The goal of Repository is to significantly reduce the amount of boilerplate code required to
9 | * implement data access layers for various persistence stores.
10 | *
11 | * @tparam DT data type
12 | */
13 | @InterfaceStability.Evolving
14 | trait Repository[DT] {
15 |
16 | /**
17 | * Find data by giving a set of conditions
18 | *
19 | * @param conditions Set of [[Condition]]
20 | * @return
21 | */
22 | def findBy(conditions: Set[Condition]): DT
23 |
24 | /**
25 | * Find data by giving a single condition
26 | *
27 | * @param condition a [[Condition]]
28 | * @return
29 | */
30 | def findBy(condition: Condition): DT = this.findBy(Set(condition))
31 |
32 | /**
33 | * Find data by giving a Spark sql column
34 | *
35 | * @param column a column object (could be chained)
36 | * @return
37 | */
38 | def findBy(column: Column): DT = this.findBy(Condition(column))
39 |
40 | /**
41 | * Retrieve all data
42 | *
43 | * @return
44 | */
45 | def findAll(): DT
46 |
47 | /**
48 | * Save a [[Dataset]] into a data persistence store
49 | *
50 | * @param data data to be saved
51 | * @param suffix an optional string to separate data
52 | * @return this repository instance
53 | */
54 | def save(data: DT, suffix: Option[String]): this.type
55 |
56 |
57 | /**
58 | * Update/Insert a [[Dataset]] into a data persistence store
59 | *
60 | * @param data data to be saved
61 | * @return this repository instance
62 | */
63 | def update(data: DT): this.type
64 |
65 | /**
66 | * Drop the entire table/file/directory
67 | * @return this repository instance
68 | */
69 | def drop(): this.type
70 |
71 | def delete(query: String): this.type
72 |
73 | /**
74 | * Create a data storage (e.g. table in a database or file/folder in a file system) with a suffix
75 | *
76 | * @param t data frame to be written
77 | * @param suffix suffix to be appended at the end of the data storage name
78 | */
79 | def create(t: DataFrame, suffix: Option[String]): this.type
80 |
81 | /**
82 | * Create a data storage (e.g. table in a database or file/folder in a file system)
83 | *
84 | * @param t data frame to be written
85 | */
86 | def create(t: DataFrame): this.type
87 |
88 | def vacuum(retentionHours: Double): this.type
89 |
90 | def vacuum(): this.type
91 |
92 | /**
93 |  * Wait for the execution to stop. Any exceptions that occur during the execution
94 | * will be thrown in this thread.
95 | */
96 | def awaitTermination(): Unit
97 |
98 | /**
99 |  * Wait for the execution to stop. Any exceptions that occur during the execution
100 | * will be thrown in this thread.
101 | *
102 | * @param timeout time to wait in milliseconds
103 |  * @return `true` if it's stopped; or throws the reported error during the execution; or `false`
104 | * if the waiting time elapsed before returning from the method.
105 | */
106 | def awaitTerminationOrTimeout(timeout: Long): Boolean
107 |
108 | /**
109 | * Stops the execution of this query if it is running.
110 | */
111 | def stopStreaming(): this.type
112 |
113 | }
114 |
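// Illustrative usage sketch (assuming a concrete implementation such as SparkRepository[T] and the
// Condition class from io.github.setl.storage; the variable names are hypothetical):
//
//   val all      = repository.findAll()                           // load everything
//   val filtered = repository.findBy(Condition("col1", "=", "a")) // load rows matching a condition
//   repository.save(filtered, suffix = None)                      // persist the data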
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at setl@qinxuzhou.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 |
73 | [homepage]: https://www.contributor-covenant.org
74 |
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/storage/connector/StructuredStreamingConnectorSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage.connector
2 |
3 | import io.github.setl.SparkSessionBuilder
4 | import io.github.setl.config.Conf
5 | import org.apache.spark.sql.SparkSession
6 | import org.scalatest.funsuite.AnyFunSuite
7 |
8 | class StructuredStreamingConnectorSuite extends AnyFunSuite {
9 |
10 | val inputConf: Map[String, String] = Map(
11 | "format" -> "text",
12 | "path" -> "src/test/resources/streaming_test_resources/input"
13 | )
14 |
15 | val consoleOutputConf: Map[String, String] = Map(
16 | "format" -> "console",
17 | "outputMode" -> "append"
18 | )
19 |
20 | val parquetOutputConf: Map[String, String] = Map(
21 | "format" -> "PARQUET",
22 | "outputMode" -> "append",
23 | "checkpointLocation" -> "src/test/resources/streaming_test_resources/output/checkpoint_1",
24 | "path" -> "src/test/resources/streaming_test_resources/output/1"
25 | )
26 |
27 | test("StructuredStreamingConnector instantiation") {
28 | val spark: SparkSession = new SparkSessionBuilder().setEnv("local").build().get()
29 | import spark.implicits._
30 |
31 | val _conf = Conf.fromMap(parquetOutputConf) // same sink options wrapped in a Conf, to exercise the Conf-based constructor
32 |
33 | val connector = new StructuredStreamingConnector(inputConf) // streaming source built from a Map: reads the text input directory
34 | val outputConnector = new StructuredStreamingConnector(_conf) // streaming sink built from a Conf: writes parquet with a checkpoint location
35 | val parquetConnector = new ParquetConnector(parquetOutputConf) // batch connector on the same output path, used to verify the result
36 |
37 | val input = connector.read() // returns a streaming DataFrame over the text source
38 |
39 | outputConnector.write(input, Option("suffix_should_be_ignored")) // the suffix argument does not apply to a streaming sink and is ignored
40 | outputConnector.awaitTerminationOrTimeout(10000) // wait up to 10000 ms for the streaming query to process the input
41 |
42 | parquetConnector.read().show() // re-read the streamed output in batch mode before asserting on its content
43 | assert(parquetConnector.read().as[String].collect().mkString(" ") === StructuredStreamingConnectorSuite.text)
44 | }
45 |
46 | }
47 |
48 | object StructuredStreamingConnectorSuite {
49 | val text = "Structured Streaming is a scalable and fault-tolerant stream processing engine built on the Spark SQL engine. You can express your streaming computation the same way you would express a batch computation on static data. The Spark SQL engine will take care of running it incrementally and continuously and updating the final result as streaming data continues to arrive. You can use the Dataset/DataFrame API in Scala, Java, Python or R to express streaming aggregations, event-time windows, stream-to-batch joins, etc. The computation is executed on the same optimized Spark SQL engine. Finally, the system ensures end-to-end exactly-once fault-tolerance guarantees through checkpointing and Write-Ahead Logs. In short, Structured Streaming provides fast, scalable, fault-tolerant, end-to-end exactly-once stream processing without the user having to reason about streaming. Internally, by default, Structured Streaming queries are processed using a micro-batch processing engine, which processes data streams as a series of small batch jobs thereby achieving end-to-end latencies as low as 100 milliseconds and exactly-once fault-tolerance guarantees. However, since Spark 2.3, we have introduced a new low-latency processing mode called Continuous Processing, which can achieve end-to-end latencies as low as 1 millisecond with at-least-once guarantees. Without changing the Dataset/DataFrame operations in your queries, you will be able to choose the mode based on your application requirements. In this guide, we are going to walk you through the programming model and the APIs. We are going to explain the concepts mostly using the default micro-batch processing model, and then later discuss Continuous Processing model. First, let’s start with a simple example of a Structured Streaming query - a streaming word count."
50 | }
51 |
--------------------------------------------------------------------------------