├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── dependabot.yml ├── stale.yml └── workflows │ ├── release.yml │ ├── snapshot.yml │ └── test.yml ├── .gitignore ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── dev ├── change-scala-version.sh ├── deploy-release.sh ├── deploy-snapshot.sh ├── docker-compose.yml └── test.sh ├── docs ├── Annotations.md ├── Architecture.md ├── Condition.md ├── Conf.md ├── ConfigLoader.md ├── Deliverable.md ├── Factory.md ├── Logging.md ├── Pipeline.md ├── Quick-Start.md ├── SchemaConverter.md ├── Setl.md ├── SparkRepository-caching.md ├── SparkSessionBuilder.md ├── Stage.md ├── StructAnalyser.md ├── Transformer.md ├── _config.yml ├── data_access_layer │ ├── Connector.md │ ├── ConnectorBuilder.md │ ├── CustomConnector.md │ ├── Repository.md │ ├── SparkRepositoryAdapter.md │ ├── SparkRepositoryBuilder.md │ ├── Structured-Streaming-Connector.md │ └── configuration_example.md ├── img │ ├── logo_setl.png │ ├── logo_setl_1280_640.png │ └── old_logo │ │ ├── logo_setl.png │ │ └── logo_setl_1280_640.png ├── index.md ├── utils │ └── Compressor_Archiver.md └── vocabulary.md ├── pom.xml └── src ├── main ├── java │ └── io │ │ └── github │ │ └── setl │ │ ├── annotation │ │ ├── Benchmark.java │ │ ├── Compress.java │ │ ├── Delivery.java │ │ ├── Experimental.java │ │ └── InterfaceStability.java │ │ ├── enums │ │ ├── PathFormat.java │ │ ├── Storage.java │ │ └── ValueType.java │ │ ├── exception │ │ ├── AlreadyExistsException.java │ │ ├── BaseException.java │ │ ├── ConfException.java │ │ ├── ConnectorException.java │ │ ├── InvalidConnectorException.java │ │ ├── InvalidDeliveryException.java │ │ ├── InvalidSchemaException.java │ │ ├── RepositoryException.java │ │ └── UnknownException.java │ │ ├── internal │ │ └── BenchmarkInvocationHandler.java │ │ └── storage │ │ ├── GZIPCompressor.java │ │ ├── SnappyCompressor.java │ │ └── XZCompressor.java └── scala │ └── io │ └── github │ └── setl │ ├── BenchmarkResult.scala │ ├── Builder.scala │ ├── Converter.scala │ ├── Setl.scala │ ├── SparkSessionBuilder.scala │ ├── annotation │ ├── ColumnName.scala │ └── CompoundKey.scala │ ├── config │ ├── Conf.scala │ ├── ConfigLoader.scala │ ├── ConnectorConf.scala │ ├── DeltaConnectorConf.scala │ ├── DynamoDBConnectorConf.scala │ ├── FileConnectorConf.scala │ ├── HudiConnectorConf.scala │ ├── JDBCConnectorConf.scala │ └── StructuredStreamingConnectorConf.scala │ ├── internal │ ├── CanCreate.scala │ ├── CanDelete.scala │ ├── CanDrop.scala │ ├── CanPartition.scala │ ├── CanUpdate.scala │ ├── CanVacuum.scala │ ├── CanWait.scala │ ├── Configurable.scala │ ├── HasBenchmark.scala │ ├── HasDescription.scala │ ├── HasDiagram.scala │ ├── HasReader.scala │ ├── HasReaderWriter.scala │ ├── HasRegistry.scala │ ├── HasType.scala │ ├── HasWriter.scala │ ├── Identifiable.scala │ ├── Logging.scala │ ├── SchemaConverter.scala │ ├── StructAnalyser.scala │ └── Writable.scala │ ├── storage │ ├── Archiver.scala │ ├── Compressor.scala │ ├── Condition.scala │ ├── ConnectorBuilder.scala │ ├── DatasetConverter.scala │ ├── SparkRepositoryBuilder.scala │ ├── ZipArchiver.scala │ ├── connector │ │ ├── ACIDConnector.scala │ │ ├── CSVConnector.scala │ │ ├── CassandraConnector.scala │ │ ├── Connector.scala │ │ ├── ConnectorInterface.scala │ │ ├── DBConnector.scala │ │ ├── DeltaConnector.scala │ │ ├── DynamoDBConnector.scala │ │ ├── ExcelConnector.scala │ │ ├── FileConnector.scala │ │ ├── HudiConnector.scala │ │ ├── JDBCConnector.scala │ │ ├── JSONConnector.scala │ │ 
├── ParquetConnector.scala │ │ ├── SparkSQLConnector.scala │ │ ├── StreamingConnector.scala │ │ └── StructuredStreamingConnector.scala │ └── repository │ │ ├── ImplicitRepositoryAdapter.scala │ │ ├── Repository.scala │ │ ├── RepositoryAdapter.scala │ │ └── SparkRepository.scala │ ├── transformation │ ├── AbstractFactory.scala │ ├── Deliverable.scala │ ├── Factory.scala │ ├── FactoryDeliveryMetadata.scala │ ├── FactoryInput.scala │ ├── FactoryOutput.scala │ ├── MLTransformer.scala │ └── Transformer.scala │ ├── util │ ├── DateUtils.scala │ ├── ExpectedDeliverable.scala │ ├── FilterImplicits.scala │ ├── HasSparkSession.scala │ ├── MermaidUtils.scala │ ├── ReflectUtils.scala │ ├── SparkUtils.scala │ └── TypesafeConfigUtils.scala │ └── workflow │ ├── DAG.scala │ ├── DeliverableDispatcher.scala │ ├── External.scala │ ├── Flow.scala │ ├── Node.scala │ ├── Pipeline.scala │ ├── PipelineInspector.scala │ ├── PipelineOptimizer.scala │ ├── SimplePipelineOptimizer.scala │ └── Stage.scala └── test ├── resources ├── application.conf ├── dynamodb.conf ├── local.conf ├── log4j.properties ├── myconf.conf ├── streaming_test_resources │ ├── input │ │ └── text.txt │ ├── input2 │ │ └── input2.csv │ └── streaming.conf ├── test-archiver │ ├── test-input-file.txt │ └── test-input │ │ ├── col3=c │ │ ├── file1-1-1.csv │ │ ├── file1-2-1.csv │ │ └── file1-2-2.csv │ │ └── col3=cc │ │ └── file2-1.csv ├── test-json.json ├── test-list-files │ ├── file1.csv │ ├── subdir1 │ │ ├── subsubdir1 │ │ │ ├── file1-1-1.csv │ │ │ └── wrongfile1-1-1.csv │ │ └── subsubdir2 │ │ │ ├── file1-2-1.csv │ │ │ └── file1-2-2.csv │ └── subdir2 │ │ └── file2-1.csv ├── test-list-files2 │ ├── col3=c │ │ ├── file1-1-1.csv │ │ ├── file1-2-1.csv │ │ └── file1-2-2.csv │ └── col3=cc │ │ └── file2-1.csv ├── test_base_path.csv ├── test_connector_builder.conf ├── test_priority.conf └── test_schema_converter.csv └── scala └── io └── github └── setl ├── MockCassandra.scala ├── SetlSuite.scala ├── SparkSessionBuilderSuite.scala ├── SparkTestUtils.scala ├── TestObject.scala ├── config ├── ConfLoaderSuite.scala ├── ConfSuite.scala ├── DeltaConnectorConfSuite.scala ├── DynamoDBConnectorConfSuite.scala ├── FileConnectorConfSuite.scala ├── HudiConnectorConfSuite.scala ├── JDBCConnectorConfSuite.scala ├── Properties.scala ├── PropertiesSuite.scala └── StructuredStreamingConnectorConfSuite.scala ├── factory └── FactoryDeliveryMetadataSuite.scala ├── internal ├── BenchmarkInvocationHandlerSuite.scala ├── HasRegistrySuite.scala ├── SchemaConverterSuite.scala ├── StructAnalyserSuite.scala └── TestClasses.scala ├── storage ├── ConditionSuite.scala ├── ConnectorBuilderSuite.scala ├── GZIPCompressorSuite.scala ├── SnappyCompressorSuite.scala ├── SparkRepositoryBuilderSuite.scala ├── XZCompressorSuite.scala ├── ZipArchiverSuite.scala ├── connector │ ├── CSVConnectorSuite.scala │ ├── CassandraConnectorSuite.scala │ ├── ConnectorSuite.scala │ ├── DeltaConnectorSuite.scala │ ├── DynamoDBConnectorSuite.scala │ ├── ExcelConnectorSuite.scala │ ├── FileConnectorSuite.scala │ ├── HudiConnectorSuite.scala │ ├── JDBCConnectorSuite.scala │ ├── JSONConnectorSuite.scala │ ├── ParquetConnectorSuite.scala │ ├── SparkSQLConnectorSuite.scala │ └── StructuredStreamingConnectorSuite.scala ├── package.scala └── repository │ ├── RepositoryAdapterSuite.scala │ ├── SparkRepositorySuite.scala │ ├── package.scala │ └── streaming │ └── StreamingRepositorySuite.scala ├── transformation └── DeliverableSuite.scala ├── util ├── DateUtilsSuite.scala ├── FilterImplicitsSuite.scala ├── IOUtils.scala ├── 
MermaidUtilsSuite.scala └── TypesafeConfigUtilsSuite.scala └── workflow ├── DeliverableDispatcherSuite.scala ├── FlowSuite.scala ├── NodeSuite.scala ├── PipelineInspectorSuite.scala ├── PipelineSuite.scala ├── SimplePipelineOptimizerSuite.scala ├── StageSuite.scala └── package.scala /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "Issue title" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Screenshots** 20 | If applicable, add screenshots to help explain your problem. 21 | 22 | **Environment (please complete the following information):** 23 | - OS: [e.g. iOS] 24 | - Version [e.g. 22] 25 | - Dependencies: 26 | 27 | **Additional context** 28 | Add any other context about the problem here. 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "Feature request title" 5 | labels: feature 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "maven" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "daily" 12 | ignore: 13 | - dependency-name: "scala*" 14 | -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 60 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 7 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - pinned 8 | - security 9 | # Label to use when marking an issue as stale 10 | staleLabel: stale 11 | # Comment to post when marking an issue as stale. Set to `false` to disable 12 | markComment: > 13 | This issue has been automatically marked as stale because it has not had 14 | recent activity. 
It will be closed if no further activity occurs. Thank you 15 | for your contributions. 16 | # Comment to post when closing a stale issue. Set to `false` to disable 17 | closeComment: false 18 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: release 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | release_deployment: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | matrix: 12 | include: 13 | - SCALA_VER: "2.12" 14 | SPARK_VER: "3.2" 15 | - SCALA_VER: "2.11" 16 | SPARK_VER: "2.4" 17 | 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v2 21 | 22 | - name: Set up JDK 1.8 23 | uses: actions/setup-java@v1 24 | with: 25 | java-version: 1.8 26 | 27 | - name: Before all 28 | run: | 29 | chmod +x ./dev/change-scala-version.sh 30 | ./dev/change-scala-version.sh ${{ matrix.SCALA_VER }} 31 | docker-compose -f ./dev/docker-compose.yml up -d 32 | 33 | - name: Prepare maven 34 | env: 35 | MVN_SETTINGS: ${{ secrets.MVN_SETTINGS }} 36 | MVN_SECURITY: ${{ secrets.MVN_SECURITY_SETTINGS }} 37 | GPG_KEY: ${{ secrets.GPG_KEY }} 38 | run: | 39 | echo "$MVN_SETTINGS" | base64 -d > "$HOME"/.m2/settings.xml 40 | echo "$MVN_SECURITY" | base64 -d > "$HOME"/.m2/settings-security.xml 41 | echo "$GPG_KEY" | base64 -d | gpg --import --batch > /dev/null 2>&1 42 | 43 | - name: Run tests 44 | run: | 45 | set -e 46 | export AWS_ACCESS_KEY_ID="fakeAccess" 47 | export AWS_SECRET_ACCESS_KEY="fakeSecret" 48 | export AWS_REGION="eu-west-1" 49 | mvn -B -ntp clean:clean scoverage:report -P snapshot,spark_${{ matrix.SPARK_VER }} 50 | 51 | # If the tag follows the format SETL-X.Y.Z-RC*, then release the RC version 52 | - name: RC version deployment 53 | if: ${{ startsWith( github.ref, 'refs/tags/SETL-' ) && contains( github.ref, '-RC' ) }} 54 | run: | 55 | RC_VER=-$(echo ${{ github.ref }} | cut -d'-' -f 3) 56 | mvn clean deploy scala:doc -ntp -B -DskipTests -P release,spark_${{ matrix.SPARK_VER }} -Dchangelist=$RC_VER 57 | 58 | # If the tag follows the format SETL-X.Y.Z, then release the stable version 59 | - name: Deployment 60 | if: ${{ startsWith( github.ref, 'refs/tags/SETL-' ) && !contains( github.ref, '-RC' ) }} 61 | run: mvn clean deploy scala:doc -ntp -B -DskipTests -P release,spark_${{ matrix.SPARK_VER }} 62 | -------------------------------------------------------------------------------- /.github/workflows/snapshot.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | paths-ignore: 7 | - 'README.md' 8 | - 'docs/**' 9 | - '.github/ISSUE_TEMPLATE/**' 10 | 11 | jobs: 12 | snapshot_deployment: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | include: 17 | - SCALA_VER: "2.12" 18 | SPARK_VER: "3.2" 19 | - SCALA_VER: "2.11" 20 | SPARK_VER: "2.4" 21 | 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v2 25 | 26 | - name: Set up JDK 1.8 27 | uses: actions/setup-java@v1 28 | with: 29 | java-version: 1.8 30 | 31 | - name: Before all 32 | run: | 33 | chmod +x ./dev/change-scala-version.sh 34 | ./dev/change-scala-version.sh ${{ matrix.SCALA_VER }} 35 | docker-compose -f ./dev/docker-compose.yml up -d 36 | 37 | - name: Prepare maven 38 | env: 39 | MVN_SETTINGS: ${{ secrets.MVN_SETTINGS }} 40 | MVN_SECURITY: ${{ secrets.MVN_SECURITY_SETTINGS }} 41 | run: | 42 | echo "$MVN_SETTINGS" | base64 -d > "$HOME"/.m2/settings.xml 43 | echo 
"$MVN_SECURITY" | base64 -d > "$HOME"/.m2/settings-security.xml 44 | 45 | - name: Run tests 46 | run: | 47 | set -e 48 | export AWS_ACCESS_KEY_ID="fakeAccess" 49 | export AWS_SECRET_ACCESS_KEY="fakeSecret" 50 | export AWS_REGION="eu-west-1" 51 | mvn -B -ntp clean:clean scoverage:report -P snapshot,spark_${{ matrix.SPARK_VER }} 52 | 53 | - name: Upload coverage report 54 | uses: codecov/codecov-action@v1 55 | with: 56 | flags: master_${{ matrix.SCALA_VER }}_${{ matrix.SPARK_VER }} 57 | name: codecov-master-branch 58 | 59 | - name: Deployment 60 | run: mvn clean deploy scala:doc -ntp -B -DskipTests -P snapshot,spark_${{ matrix.SPARK_VER }} 61 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: 4 | pull_request: 5 | branches: [ master ] 6 | paths-ignore: 7 | - 'README.md' 8 | - 'docs/**' 9 | - '.github/ISSUE_TEMPLATE/**' 10 | 11 | jobs: 12 | test_setl: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | SCALA_VER: ["2.12", "2.11"] 18 | SPARK_VER: ["3.2", "3.0", "2.4", "2.3"] 19 | exclude: 20 | - SCALA_VER: 2.12 21 | SPARK_VER: 2.3 22 | - SCALA_VER: 2.11 23 | SPARK_VER: 3.0 24 | - SCALA_VER: 2.11 25 | SPARK_VER: 3.2 26 | steps: 27 | - name: Checkout 28 | uses: actions/checkout@v2 29 | 30 | - name: Set up JDK 1.8 31 | uses: actions/setup-java@v1 32 | with: 33 | java-version: 1.8 34 | 35 | - name: Before all 36 | run: | 37 | chmod +x ./dev/change-scala-version.sh 38 | ./dev/change-scala-version.sh ${{ matrix.SCALA_VER }} 39 | docker-compose -f ./dev/docker-compose.yml up -d 40 | 41 | - name: Run tests 42 | run: | 43 | set -e 44 | export AWS_ACCESS_KEY_ID="fakeAccess" 45 | export AWS_SECRET_ACCESS_KEY="fakeSecret" 46 | export AWS_REGION="eu-west-1" 47 | mvn -B -ntp clean:clean scoverage:report -P snapshot,spark_${{ matrix.SPARK_VER }} 48 | 49 | - name: Upload coverage to Codecov 50 | uses: codecov/codecov-action@v1 51 | with: 52 | flags: pr_${{ matrix.SCALA_VER }}_${{ matrix.SPARK_VER }} 53 | name: codecov-pull-request 54 | 55 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/intellij 2 | 3 | ### Intellij ### 4 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 5 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 6 | 7 | # User-specific stuff: 8 | .idea/workspace.xml 9 | .idea/tasks.xml 10 | .idea/dictionaries 11 | .idea/vcs.xml 12 | .idea/jsLibraryMappings.xml 13 | 14 | # Sensitive or high-churn files: 15 | .idea/dataSources.ids 16 | .idea/dataSources.xml 17 | .idea/dataSources.local.xml 18 | .idea/sqlDataSources.xml 19 | .idea/dynamic.xml 20 | .idea/uiDesigner.xml 21 | 22 | # Gradle: 23 | .idea/gradle.xml 24 | .idea/libraries 25 | 26 | # Mongo Explorer plugin: 27 | .idea/mongoSettings.xml 28 | 29 | ## File-based project format: 30 | *.iws 31 | 32 | ## Plugin-specific files: 33 | 34 | # IntelliJ 35 | /out/ 36 | 37 | # mpeltonen/sbt-idea plugin 38 | .idea_modules/ 39 | 40 | # JIRA plugin 41 | atlassian-ide-plugin.xml 42 | 43 | # Crashlytics plugin (for Android Studio and IntelliJ) 44 | com_crashlytics_export_strings.xml 45 | crashlytics.properties 46 | crashlytics-build.properties 47 | fabric.properties 48 | 49 | ### Intellij Patch ### 50 | # 
Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 51 | 52 | # *.iml 53 | # modules.xml 54 | 55 | /target/ 56 | 57 | ### STS ### 58 | .apt_generated 59 | .classpath 60 | .factorypath 61 | .project 62 | .settings 63 | .springBeans 64 | .sts4-cache 65 | 66 | ### IntelliJ IDEA ### 67 | .idea 68 | *.iml 69 | *.ipr 70 | 71 | ### NetBeans ### 72 | /nbproject/private/ 73 | /build/ 74 | /nbbuild/ 75 | /dist/ 76 | /nbdist/ 77 | /.nb-gradle/ 78 | 79 | */target/ 80 | null/ 81 | */null/ 82 | /data/ 83 | .toDelete 84 | *.log 85 | 86 | # OS generated files # 87 | ###################### 88 | .DS_Store 89 | .DS_Store? 90 | ._* 91 | .Spotlight-V100 92 | .Trashes 93 | ehthumbs.db 94 | Thumbs.db -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 
54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at setl@qinxuzhou.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to SETL 2 | 3 | Thanks sooooo much for taking time to contribute :+1: 4 | 5 | ## Bug report 6 | 7 | When you are creating a bug report, please include as many details as possible. 8 | Fill out the required template, the information it asks for helps us resolve issues faster. 9 | 10 | ## Feature request 11 | 12 | When you are creating an enhancement suggestion, please include as many details as possible. 13 | Fill in the template, including the steps that you imagine you would take if the feature you're requesting existed. 14 | 15 | ## Development 16 | 17 | ### Quick guide 18 | - Fork the project & clone locally. 19 | - Create an upstream remote and sync your local copy before you branch. 20 | - Branch for each separate piece of work. 21 | - Push to the origin repository (the fork). 22 | - Create a new Pull Request in GitHub. 23 | 24 | ### Build 25 | 26 | Use pre-created profiles to change version. 27 | 28 | ```shell 29 | # Build SNAPSHOT with Scala 2.11 30 | mvn clean package -Psnapshot -Pscala_2.11 -Pspark_2.4 31 | 32 | # Build RELEASE with Scala 2.11 33 | mvn clean package -Prelease -Pscala_2.11 -Pspark_2.4 34 | 35 | # Build SNAPSHOT with Scala 2.12 36 | ./dev/change-scala-version.sh 2.12 37 | mvn clean package -Psnapshot -Pscala_2.12 -Pspark_2.4 38 | 39 | # Build RELEASE with Scala 2.12 40 | ./dev/change-scala-version.sh 2.12 41 | mvn clean package -Prelease -Pscala_2.12 -Pspark_2.4 42 | ``` 43 | 44 | ### Unit tests 45 | 46 | We use docker to provide services for the unit test. 
Run the following command before the unit test: 47 | ```shell 48 | docker-compose -f ./dev/docker-compose.yml up 49 | ``` 50 | 51 | To start the test with cli: 52 | ```shell 53 | export SCALA_VER=2.11 54 | export SPARK_VER=2.4 55 | ./dev/test.sh 56 | ``` 57 | 58 | Note: in some case you get the following error 59 | ``` 60 | java.net.BindException: Can't assign requested address: Service 'sparkDriver' 61 | ``` 62 | then you have to bind the spark to local ip like this 63 | ```shell 64 | export SPARK_LOCAL_IP=127.0.0.1 65 | ``` 66 | 67 | ## Styleguide 68 | 69 | ### Commit styleguide 70 | 71 | Please refer to [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0-beta.2/) 72 | 73 | ### Scala styleguide 74 | 75 | Please refer to [Databricks Scala Guide](https://github.com/databricks/scala-style-guide) 76 | -------------------------------------------------------------------------------- /dev/change-scala-version.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | set -e 21 | 22 | VALID_VERSIONS=( 2.11 2.12 ) 23 | 24 | usage() { 25 | echo "Usage: $(basename $0) [-h|--help] 26 | where : 27 | -h| --help Display this help text 28 | valid version values : ${VALID_VERSIONS[*]} 29 | " 1>&2 30 | exit 1 31 | } 32 | 33 | if [[ ($# -ne 1) || ( $1 == "--help") || $1 == "-h" ]]; then 34 | usage 35 | fi 36 | 37 | TO_COMP_VERSION=$1 38 | 39 | check_scala_version() { 40 | for i in ${VALID_VERSIONS[*]}; do [ $i = "$1" ] && return 0; done 41 | echo "Invalid Scala version: $1. Valid versions: ${VALID_VERSIONS[*]}" 1>&2 42 | exit 1 43 | } 44 | 45 | check_scala_version "$TO_COMP_VERSION" 46 | 47 | if [ $TO_COMP_VERSION = "2.11" ]; then 48 | FROM_COMP_VERSION="2.12" 49 | FROM_VERSION="2.12.10" 50 | TO_VERSION="2.11.12" 51 | else 52 | FROM_COMP_VERSION="2.11" 53 | FROM_VERSION="2.11.12" 54 | TO_VERSION="2.12.10" 55 | fi 56 | 57 | sed_i() { 58 | sed -e "$1" "$2" > "$2.tmp" && mv "$2.tmp" "$2" 59 | } 60 | 61 | export -f sed_i 62 | 63 | BASEDIR=$(dirname $0)/.. 
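# Rewrite the setl artifact suffix, scala.compat.version and scala.version in every pom.xml (target directories excluded)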
64 | 65 | find "$BASEDIR" -name 'pom.xml' -not -path '*target*' -print \ 66 | -exec bash -c "sed_i 's/\(artifactId>setl\)_'$FROM_COMP_VERSION'/\1_'$TO_COMP_VERSION'/g' {}" \; 67 | 68 | find "$BASEDIR" -name 'pom.xml' -not -path '*target*' -print \ 69 | -exec bash -c "sed_i 's/\(scala.compat.version>\)'$FROM_COMP_VERSION'/\1'$TO_COMP_VERSION'/g' {}" \; 70 | 71 | find "$BASEDIR" -name 'pom.xml' -not -path '*target*' -print \ 72 | -exec bash -c "sed_i 's/\(scala.version>\)'$FROM_VERSION'/\1'$TO_VERSION'/g' {}" \; 73 | 74 | -------------------------------------------------------------------------------- /dev/deploy-release.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | echo ${MVN_SETTINGS} | base64 -d > ${HOME}/.m2/settings.xml 6 | echo ${MVN_SECURITY} | base64 -d > ${HOME}/.m2/settings-security.xml 7 | echo ${GPG_KEY} | base64 -d | gpg --import --batch > /dev/null 2>&1 8 | 9 | mvn clean deploy scala:doc -ntp -B -DskipTests -P release,spark_${SPARK_VER} 10 | -------------------------------------------------------------------------------- /dev/deploy-snapshot.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | echo ${MVN_SETTINGS} | base64 -d > ${HOME}/.m2/settings.xml 6 | echo ${MVN_SECURITY} | base64 -d > ${HOME}/.m2/settings-security.xml 7 | 8 | mvn clean deploy scala:doc -ntp -B -DskipTests -P snapshot,spark_${SPARK_VER} 9 | -------------------------------------------------------------------------------- /dev/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.2' 2 | services: 3 | psql: 4 | image: "postgres" 5 | container_name: "postgres-unit-test" 6 | environment: 7 | - POSTGRES_USER=postgres 8 | - POSTGRES_PASSWORD=postgres 9 | - POSTGRES_DB=framework_dev 10 | ports: 11 | - "5432:5432" 12 | 13 | cassandra: 14 | image: "cassandra" 15 | container_name: "cassandra-unit-test" 16 | ports: 17 | - "9042:9042" 18 | 19 | dynamodb: 20 | image: "amazon/dynamodb-local" 21 | container_name: "dynamodb-unit-test" 22 | ports: 23 | - "8000:8000" 24 | -------------------------------------------------------------------------------- /dev/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | export AWS_ACCESS_KEY_ID="fakeAccess" 6 | export AWS_SECRET_ACCESS_KEY="fakeSecret" 7 | export AWS_REGION="eu-west-1" 8 | 9 | mvn -B -ntp clean:clean scoverage:report -P snapshot,spark_${SPARK_VER} 10 | -------------------------------------------------------------------------------- /docs/Architecture.md: -------------------------------------------------------------------------------- 1 | ![image](uploads/8c6071c49e88dcb3ead283edcebd4927/image.png) 2 | 3 | ![image](uploads/1488d95144a09bd5bb0543bde5a1f193/image.png) 4 | 5 | ![image](uploads/7e800d5ff258cfd2bccb38a4deca6f5e/image.png) -------------------------------------------------------------------------------- /docs/Condition.md: -------------------------------------------------------------------------------- 1 | ## Definition 2 | 3 | **Condition** is used by the `findBy` method of a **Repository** 4 | 5 | ```scala 6 | val cond = Set( 7 | Condition("column1", ">", 100), 8 | Condition("column2", "=", "value2") 9 | ) 10 | 11 | myRepository.findBy(cond) 12 | ``` 13 | 14 | ## Operation 15 | - `>` 16 | - `<` 17 | - `>=` 18 | - `<=` 19 | - `=` 
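These operators can be combined in a single `findBy` call. A short sketch following the example above (the column names and `myRepository` are illustrative):

```scala
import io.github.setl.storage.Condition

// Keep rows where 10 <= price < 100 and status = "ACTIVE"
val conditions = Set(
  Condition("price", ">=", 10),
  Condition("price", "<", 100),
  Condition("status", "=", "ACTIVE")
)

val filtered = myRepository.findBy(conditions)
```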
-------------------------------------------------------------------------------- /docs/Conf.md: -------------------------------------------------------------------------------- 1 | ## Definition -------------------------------------------------------------------------------- /docs/ConfigLoader.md: -------------------------------------------------------------------------------- 1 | ## Definition -------------------------------------------------------------------------------- /docs/Deliverable.md: -------------------------------------------------------------------------------- 1 | ## Definition -------------------------------------------------------------------------------- /docs/Factory.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | A **Factory[A]** is a complete data transformation job that produces an object of type A. 3 | 4 | ## Difference with *Transformer* 5 | A **Factory** is more complex than a **Transformer**. In addition to data transformation, a **Factory** also contains the logic for reading and writing data. 6 | 7 | ## Demo 8 | You can implement your own factory by extending the class **Factory[A]**. 9 | 10 | ```scala 11 | case class MyProduct() 12 | 13 | // MyFactory will produce MyProduct 14 | class MyFactory extends Factory[MyProduct] { 15 | override def read(): this.type = ... 16 | override def process(): this.type = ... 17 | override def write(): this.type = ... 18 | override def get(): MyProduct = ... 19 | } 20 | ``` 21 | 22 | To run **MyFactory**: 23 | ```scala 24 | new MyFactory().read().process().write().get() 25 | ``` 26 | 27 | ## Dependency Handling 28 | Dependencies of a **Factory** can be handled by a **Pipeline** if the field has the **Delivery** annotation. 29 | For the previous **MyFactory** class: 30 | 31 | ```scala 32 | case class MyProduct() 33 | 34 | // MyFactory will produce MyProduct 35 | class MyFactory extends Factory[MyProduct] { 36 | 37 | @Delivery 38 | var input: String = _ 39 | 40 | override def read(): this.type = ... 41 | override def process(): this.type = ... 42 | override def write(): this.type = ... 43 | override def get(): MyProduct = ... 44 | } 45 | ``` 46 | 47 | By adding `@Delivery` to the variable **input**, the value of **input** will be automatically injected by the **Pipeline**. 48 | 49 | For more information about dependency handling, read the [doc of **Pipeline**](Pipeline). 50 | -------------------------------------------------------------------------------- /docs/Logging.md: -------------------------------------------------------------------------------- 1 | ## Definition 2 | Logging module -------------------------------------------------------------------------------- /docs/SchemaConverter.md: -------------------------------------------------------------------------------- 1 | ## Definition 2 | 3 | **SchemaConverter** can: 4 | - Convert a Dataset[A] to a DataFrame with the metadata of class **A** (extracted by **StructAnalyser**) 5 | - Convert a DataFrame to a Dataset[A] 6 | 7 | For each of the three annotations (ColumnName, CompoundKey and Compress), SchemaConverter will: 8 | - rename the column 9 | - create/drop the compound key column(s) 10 | - compress/decompress the column(s) carrying the Compress annotation. 11 | 12 | ## Demo 13 | 14 | ### Dataset to DataFrame 15 | ```scala 16 | val ds: Dataset[MyClass] = ... 17 | SchemaConverter.toDF(ds) 18 | ``` 19 | 20 | ### DataFrame to Dataset 21 | ```scala 22 | val df: DataFrame = ... 
23 | SchemaConverter.fromDF[MyClass](df) 24 | ``` -------------------------------------------------------------------------------- /docs/SparkRepository-caching.md: -------------------------------------------------------------------------------- 1 | [![](https://mermaid.ink/img/eyJjb2RlIjoiZ3JhcGggVERcbiAgICByZWFkW2ludm9rZSBgZmluZEFsbGAgb3IgYGZpbmRCeWBdICAtLT4gY2hlY2tfcGVyc2lzdGVuY2VcblxuICAgIGNoZWNrX3BlcnNpc3RlbmNle3BlcnNpc3RSZWFkRGF0YT99XG5cbiAgICBjaGVja19wZXJzaXN0ZW5jZSAtLiBmYWxzZSAuLT4gbG9hZF9kaXNrW2xvYWQgZnJvbSBkYXRhIHN0b3JhZ2VdXG4gICAgY2hlY2tfcGVyc2lzdGVuY2UgLS4gdHJ1ZSAuLT4gY2hlY2tfZmx1c2h7Zmx1c2hSZWFkQ2FjaGU_fVxuXG4gICAgY2hlY2tfZmx1c2ggLS4gdHJ1ZSAuLT4gdW5wZXJzaXN0W3VucGVyc2lzdCBsYXN0IHJlYWQgY2FjaGVdXG4gICAgdW5wZXJzaXN0IC0tPiB1cGRhdGVfY29uZGl0aW9uX2hhc2hbc2F2ZSByZXF1ZXN0IGhpc3RvcnldXG4gICAgdXBkYXRlX2NvbmRpdGlvbl9oYXNoIC0tPiB1cGRhdGVfcmVhZF9jYWNoZVtvdmVyd3JpdGUgbGFzdCByZWFkIGNhY2hlXVxuICAgIHVwZGF0ZV9yZWFkX2NhY2hlIC0tPiBwZXJzaXN0W3BlcnNpc3QgbmV3IHJlYWQgY2FjaGVdXG4gICAgcGVyc2lzdCAtLT4gcmVhZF9jYWNoZVxuXG4gICAgY2hlY2tfZmx1c2ggLS4gZmFsc2UgLi0-IGNoZWNrX2NvbmRpdGlvbl9oYXNoe3NhbWUgcmVhZCByZXF1ZXN0IGFzIGxhc3QgdGltZT99XG5cbiAgICBjaGVja19jb25kaXRpb25faGFzaCAtLiB0cnVlIC4tPiByZWFkX2NhY2hlW3JldHVybiByZWFkIGNhY2hlXVxuICAgIGNoZWNrX2NvbmRpdGlvbl9oYXNoIC0uIGZhbHNlIC4tPiB1bnBlcnNpc3RcbiIsIm1lcm1haWQiOnsidGhlbWUiOiJkZWZhdWx0In0sInVwZGF0ZUVkaXRvciI6ZmFsc2V9)](https://mermaid-js.github.io/mermaid-live-editor/#/edit/eyJjb2RlIjoiZ3JhcGggVERcbiAgICByZWFkW2ludm9rZSBgZmluZEFsbGAgb3IgYGZpbmRCeWBdICAtLT4gY2hlY2tfcGVyc2lzdGVuY2VcblxuICAgIGNoZWNrX3BlcnNpc3RlbmNle3BlcnNpc3RSZWFkRGF0YT99XG5cbiAgICBjaGVja19wZXJzaXN0ZW5jZSAtLiBmYWxzZSAuLT4gbG9hZF9kaXNrW2xvYWQgZnJvbSBkYXRhIHN0b3JhZ2VdXG4gICAgY2hlY2tfcGVyc2lzdGVuY2UgLS4gdHJ1ZSAuLT4gY2hlY2tfZmx1c2h7Zmx1c2hSZWFkQ2FjaGU_fVxuXG4gICAgY2hlY2tfZmx1c2ggLS4gdHJ1ZSAuLT4gdW5wZXJzaXN0W3VucGVyc2lzdCBsYXN0IHJlYWQgY2FjaGVdXG4gICAgdW5wZXJzaXN0IC0tPiB1cGRhdGVfY29uZGl0aW9uX2hhc2hbc2F2ZSByZXF1ZXN0IGhpc3RvcnldXG4gICAgdXBkYXRlX2NvbmRpdGlvbl9oYXNoIC0tPiB1cGRhdGVfcmVhZF9jYWNoZVtvdmVyd3JpdGUgbGFzdCByZWFkIGNhY2hlXVxuICAgIHVwZGF0ZV9yZWFkX2NhY2hlIC0tPiBwZXJzaXN0W3BlcnNpc3QgbmV3IHJlYWQgY2FjaGVdXG4gICAgcGVyc2lzdCAtLT4gcmVhZF9jYWNoZVxuXG4gICAgY2hlY2tfZmx1c2ggLS4gZmFsc2UgLi0-IGNoZWNrX2NvbmRpdGlvbl9oYXNoe3NhbWUgcmVhZCByZXF1ZXN0IGFzIGxhc3QgdGltZT99XG5cbiAgICBjaGVja19jb25kaXRpb25faGFzaCAtLiB0cnVlIC4tPiByZWFkX2NhY2hlW3JldHVybiByZWFkIGNhY2hlXVxuICAgIGNoZWNrX2NvbmRpdGlvbl9oYXNoIC0uIGZhbHNlIC4tPiB1bnBlcnNpc3RcbiIsIm1lcm1haWQiOnsidGhlbWUiOiJkZWZhdWx0In0sInVwZGF0ZUVkaXRvciI6ZmFsc2V9) -------------------------------------------------------------------------------- /docs/SparkSessionBuilder.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | The class `SparkSessionBuilder` is used to configure and build new spark session for the given usage(s). 
4 | 5 | ## Code Example 6 | 7 | ```scala 8 | import io.github.setl.SparkSessionBuilder 9 | 10 | // Auto-configure 11 | val spark1: SparkSession = new SparkSessionBuilder("cassandra") 12 | .setAppName("myApp") 13 | .setEnv("dev") // or AppEnv.DEV 14 | .setCassandraHost("localhost") 15 | .build() 16 | .get() 17 | 18 | // Build with your own SparkConf 19 | val spark2: SparkSession = new SparkSessionBuilder() 20 | .configure(yourSparkConf) 21 | .build() 22 | .get() 23 | 24 | ``` 25 | 26 | -------------------------------------------------------------------------------- /docs/Stage.md: -------------------------------------------------------------------------------- 1 | ## Definition 2 | A **Stage** is a collection of independent **Factories**. All the stages of a pipeline will be executed sequentially at runtime. Within a stage, all factories can be executed in parallel or sequentially. 3 | 4 | ## Demo 5 | 6 | You can instantiate a stage as follows: 7 | ```scala 8 | val stage = new Stage() 9 | ``` 10 | 11 | Run in sequential mode: 12 | ```scala 13 | stage.parallel(false) 14 | ``` 15 | 16 | Add a factory into this stage: 17 | ```scala 18 | // Add an existing instance of a factory 19 | val myFactory = new MyFactory() 20 | stage.addFactory(myFactory) 21 | 22 | // Or let the framework handle the instantiation 23 | stage.addFactory(classOf[MyFactory], constructorArguments...) 24 | ``` 25 | 26 | Describe the current stage: 27 | ```scala 28 | stage.describe() 29 | ``` 30 | 31 | Run the current stage: 32 | ```scala 33 | stage.run() 34 | ``` -------------------------------------------------------------------------------- /docs/StructAnalyser.md: -------------------------------------------------------------------------------- 1 | ## Definition 2 | 3 | **StructAnalyser** provides functionalities to retrieve annotation information from a class. 4 | 5 | It scans the class' metadata and returns a **StructType** that the **SchemaConverter** can use to transform the schema of a DataFrame/Dataset. 6 | 7 | You can access the metadata of your class through the metadata of each **StructField**. 8 | 9 | ### Demo 10 | 11 | ```scala 12 | case class MyClass(col1: String, @ColumnName("column_2") col2: String) 13 | 14 | // analyseSchema will return a StructType of MyClass 15 | val structType = StructAnalyser.analyseSchema[MyClass] 16 | ``` 17 | -------------------------------------------------------------------------------- /docs/Transformer.md: -------------------------------------------------------------------------------- 1 | The notion of the transformer is preliminary. 2 | 3 | # Definition 4 | **Transformer** is the atomic class for data transformation. A `Transformer[T]` will transform some input data into an object of type **T**. 5 | 6 | 7 | ## When should I use a transformer 8 | The original idea of the transformer is to decouple a complex data processing procedure of a **Factory**. Generally, a transformer should be placed inside a **Factory**. A factory can have multiple transformers. 9 | 10 | A transformer should be simple (in terms of task, for example, transforming an object of type A into type B) and stateless (which means it should minimize its dependence on the application context). 11 | 12 | Another use case would be to implement several different data transformation logics for one factory (for example, there may be several different ML models for one single prediction job). 
In this case, there should be a way to select the most appropriate transformer according to their performance in a specific environment. 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /docs/data_access_layer/ConnectorBuilder.md: -------------------------------------------------------------------------------- 1 | ## Definition 2 | [**ConnectorBuilder**](https://github.com/SETL-Developers/setl/tree/master/src/main/scala/com/jcdecaux/setl/storage/ConnectorBuilder.scala) provides a simplified way to create **Connector**. 3 | 4 | ## Usage 5 | You have two ways to instantiate a **ConnectorBuilder**: 6 | - with a *Typesafe* [**Config**](https://github.com/lightbend/config) object from a configuration file 7 | - with a [**Conf**](https://github.com/SETL-Developers/setl/tree/master/src/main/scala/com/jcdecaux/setl/config/Conf.scala) object from a `Map[String, String]`. 8 | 9 | ### With Typesafe Config 10 | Firstly, you should create a configuration file in your project's resources directory. 11 | 12 | In this case, let's call it `application.conf`. 13 | 14 | ```text 15 | csvConfiguration { 16 | storage = "CSV" 17 | path = "your/path/to/file.csv" 18 | inferSchema = "true" 19 | delimiter = ";" 20 | header = "true" 21 | saveMode = "Append" 22 | } 23 | ``` 24 | 25 | Then you can use **ConfigLoader** to load your configuration file. By default, it loads `application.conf`. 26 | ```scala 27 | object Properties extends ConfigLoader 28 | 29 | val connector = new ConnectorBuilder(spark, Properties.getConfig("csvConfiguration")).getOrCreate() 30 | 31 | connector.read() 32 | connector.write(df) 33 | ``` 34 | 35 | ### With Conf 36 | You can create a **Conf** object from a **Map**. 37 | ```scala 38 | val conf = Conf.fromMap( 39 | Map( 40 | "storage" -> "PARQUET", 41 | "path" -> "path/to/your/file", 42 | ... 
43 | ) 44 | ) 45 | 46 | val connector = new ConnectorBuilder(spark, conf).getOrCreate() 47 | 48 | connector.read() 49 | connector.write(df) 50 | 51 | ``` 52 | 53 | ## Parameters 54 | Please refer to [Connector documentation](Connector) -------------------------------------------------------------------------------- /docs/data_access_layer/CustomConnector.md: -------------------------------------------------------------------------------- 1 | ## Custom Connector 2 | 3 | You can implement your own data source connector by implementing the `ConnectorInterface`: 4 | 5 | ```scala 6 | import io.github.setl.storage.connector.ConnectorInterface 7 | import io.github.setl.internal.CanDrop 8 | import io.github.setl.config.Conf 9 | import org.apache.spark.sql.DataFrame 10 | 11 | class CustomConnector extends ConnectorInterface with CanDrop { 12 | override def setConf(conf: Conf): Unit = { 13 | // configuration 14 | } 15 | 16 | override def read(): DataFrame = { 17 | import spark.implicits._ 18 | Seq(1, 2, 3).toDF("id") 19 | } 20 | 21 | override def write(t: DataFrame, suffix: Option[String]): Unit = logDebug("Write with suffix") 22 | 23 | override def write(t: DataFrame): Unit = logDebug("Write") 24 | 25 | override def drop(): Unit = logDebug("drop") 26 | } 27 | ``` 28 | 29 | ### Functionalities 30 | 31 | As in the previous example, by extending your connector class with functionality traits (like `CanDrop`) 32 | and implementing their abstract methods, you allow the SparkRepository to use these specific 33 | functionalities. 34 | 35 | ### Use the custom connector 36 | 37 | To use this connector, set the storage to **OTHER** and provide the class reference of your connector: 38 | 39 | ```txt 40 | myConnector { 41 | storage = "OTHER" 42 | class = "com.example.CustomConnector" // class reference of your connector 43 | yourParam = "some parameter" // put your parameters here 44 | } 45 | ``` 46 | -------------------------------------------------------------------------------- /docs/data_access_layer/SparkRepositoryAdapter.md: -------------------------------------------------------------------------------- 1 | # RepositoryAdapter 2 | 3 | In some situations, the data format defined in the data source doesn't match the case class defined in our project, and we want to hide 4 | the conversion detail (which may be irrelevant to the business logic). We can achieve this by using the 5 | [SparkRepositoryAdapter](https://github.com/SETL-Developers/setl/blob/master/src/main/scala/com/jcdecaux/setl/storage/repository/ImplicitRepositoryAdapter.scala) 
and [DatasetConverter](https://github.com/SETL-Developers/setl/blob/master/src/main/scala/com/jcdecaux/setl/storage/DatasetConverter.scala) 7 | 8 | ## Example 9 | 10 | Imagine our datasource has a format that matches the following case class: 11 | 12 | ```scala 13 | case class DataSourceFormat(col1: String, col2: Int, col3: String) 14 | 15 | // col1, col2, col3 16 | // r1, 1, r1-1 17 | // r2, 2, r1-2 18 | ``` 19 | 20 | The column `col3` is not necessary (as it's only a concatenation of `col1` and `col2`), so we can ignore it and use this 21 | case class in our project: 22 | 23 | ```scala 24 | case class ProjectFormat(col1: String, col2: Int) 25 | ``` 26 | 27 | So the data conversions that we want to hide are: 28 | - when reading, we want to implicitly drop `col3` 29 | - when writing, we want to implicitly create `col3` by concatenating `col1` and `col2` 30 | 31 | Let's implement our dataset converter: 32 | ```scala 33 | import io.github.setl.storage.DatasetConverter 34 | 35 | implicit val myConverter = new DatasetConverter[ProjectFormat, DataSourceFormat] { 36 | override def convertFrom(t2: Dataset[DataSourceFormat]): Dataset[ProjectFormat] = { 37 | t2.drop("col3") 38 | .as[ProjectFormat](ExpressionEncoder[ProjectFormat]) 39 | } 40 | 41 | override def convertTo(t1: Dataset[ProjectFormat]): Dataset[DataSourceFormat] = { 42 | import org.apache.spark.sql.functions._ 43 | 44 | t1.withColumn("col3", concat(col("col1"), lit("-"), col("col2"))) 45 | .as[DataSourceFormat](ExpressionEncoder[DataSourceFormat]) 46 | } 47 | } 48 | ``` 49 | 50 | To use this converter: 51 | ```scala 52 | import io.github.setl.storage.repository.ImplicitRepositoryAdapter._ 53 | 54 | // Suppose that we have a repository of type ProjectFormat. 55 | // After the import, several new methods will be added to the SparkRepository, 56 | // for example: convertAndSave and findAllAndConvert 57 | val projectFormatRepo = SparkRepository[ProjectFormat] 58 | 59 | // This will convert a Dataset[ProjectFormat] to a Dataset[DataSourceFormat] and save it 60 | projectFormatRepo.convertAndSave(projectFormatDataset) 61 | 62 | // This will load a Dataset[DataSourceFormat] and automatically convert it to a Dataset[ProjectFormat] 63 | val loaded = projectFormatRepo.findAllAndConvert() 64 | ``` 65 | -------------------------------------------------------------------------------- /docs/data_access_layer/SparkRepositoryBuilder.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | Based on the same idea as [**ConnectorBuilder**](ConnectorBuilder), [**SparkRepositoryBuilder**](https://github.com/SETL-Developers/setl/tree/master/src/main/scala/com/jcdecaux/setl/storage/SparkRepositoryBuilder.scala) helps you create your **SparkRepository** :ok_hand: 4 | 5 | ## Usage 6 | Firstly, you should create a configuration file in your project's resources directory. 7 | 8 | In this case, let's call it `application.conf`. 9 | 10 | ```text 11 | csvConfiguration { 12 | storage = "CSV" 13 | path = "your/path/to/file.csv" 14 | inferSchema = "true" 15 | delimiter = ";" 16 | header = "true" 17 | saveMode = "Append" 18 | } 19 | ``` 20 | 21 | Then you can use **ConfigLoader** to load your configuration file. By default, it loads `application.conf`. 
22 | ```scala 23 | val repo = new SparkRepositoryBuilder[MyClass](setl.configLoader.getConfig("csvConfiguration")).getOrCreate() 24 | 25 | repo.findAll() 26 | repo.save(dataset) 27 | ``` 28 | 29 | ## Parameters 30 | Please refer to [Connector documentation](Connector) -------------------------------------------------------------------------------- /docs/data_access_layer/Structured-Streaming-Connector.md: -------------------------------------------------------------------------------- 1 | **StructuredStreamingConnector** is a new connector added since the version 0.4.3. It brings the Spark Structured Streaming API together with the Connector API. It allows users to manipulate streaming data like any other static connectors. 2 | 3 | Here is an implementation of the [word count program](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#quick-example) from the Spark structured streaming documentation: 4 | 5 | ```scala 6 | // Configuration 7 | val input = Map( 8 | "storage" -> "STRUCTURED_STREAMING", 9 | "format" -> "socket", 10 | "host" -> "localhost", 11 | "port" -> "9999" 12 | ) 13 | 14 | val output = Map( 15 | "storage" -> "STRUCTURED_STREAMING", 16 | "outputMode" -> "complete", 17 | "format" -> "console" 18 | ) 19 | 20 | val spark = SparkSession 21 | .builder 22 | .appName("StructuredNetworkWordCount") 23 | .master("local") 24 | .getOrCreate() 25 | 26 | import spark.implicits._ 27 | 28 | val inputConnector = new ConnectorBuilder(Conf.fromMap(input)).getOrCreate() 29 | val outputConnector = new ConnectorBuilder(Conf.fromMap(output)).getOrCreate().asInstanceOf[StructuredStreamingConnector] 30 | 31 | // read lines 32 | val lines = inputConnector.read() 33 | // Split the lines into words 34 | val words = lines.as[String].flatMap(_.split(" ")) 35 | // Generate running word count 36 | val wordCounts = words.groupBy("value").count() 37 | // Show the output 38 | outputConnector.write(wordCounts) 39 | outputConnector.awaitTermination() 40 | ``` -------------------------------------------------------------------------------- /docs/img/logo_setl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SETL-Framework/setl/68f4e0213f5b6793acb96b2d9e08c102439565c4/docs/img/logo_setl.png -------------------------------------------------------------------------------- /docs/img/logo_setl_1280_640.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SETL-Framework/setl/68f4e0213f5b6793acb96b2d9e08c102439565c4/docs/img/logo_setl_1280_640.png -------------------------------------------------------------------------------- /docs/img/old_logo/logo_setl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SETL-Framework/setl/68f4e0213f5b6793acb96b2d9e08c102439565c4/docs/img/old_logo/logo_setl.png -------------------------------------------------------------------------------- /docs/img/old_logo/logo_setl_1280_640.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SETL-Framework/setl/68f4e0213f5b6793acb96b2d9e08c102439565c4/docs/img/old_logo/logo_setl_1280_640.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | ![logo](img/logo_setl.png) 2 | ----------- 3 | If you’re a **data scientist** 
or **data engineer**, this might sound familiar while working on **ETL** projects: 4 | 5 | - Switching between multiple projects is a hassle 6 | - Debugging others’ code is a nightmare 7 | - Spending a lot of time solving non-business-related issues 8 | 9 | **SETL** (Spark ETL, pronounced "settle") is a Scala framework that helps you structure your Spark ETL projects, modularize your data transformation logic and speed up your development. 10 | 11 | ## Table of contents 12 | 13 | - [Quick start](Quick-Start) 14 | - [Setl](Setl) 15 | - Data Access Layer 16 | - [Access data with Connector](data_access_layer/Connector) 17 | - [FileConnector](data_access_layer/Connector#fileconnector) 18 | - [DBConnector](data_access_layer/Connector#dbconnector) 19 | - [StructuredStreamingConnector](data_access_layer/Structured-Streaming-Connector) 20 | - [Use your own connector](data_access_layer/CustomConnector) 21 | - [Access data with Repository](data_access_layer/Repository) 22 | - [Create a connector](data_access_layer/ConnectorBuilder) 23 | - [Create a repository](data_access_layer/SparkRepositoryBuilder) 24 | - [Hide implicit data conversion with SparkRepositoryAdapter](data_access_layer/SparkRepositoryAdapter) 25 | - [Configuration Example](data_access_layer/configuration_example) 26 | - Data Transformation API 27 | - [Transformer](Transformer) 28 | - [Factory](Factory) 29 | - Workflow Management 30 | - [Stage](Stage) 31 | - [Pipeline](Pipeline) 32 | - [Pipeline execution optimization (preliminary feature)](PipelineOptimizer) 33 | - Utilities 34 | - [Annotations](Annotations) 35 | - [SparkSession builder](SparkSessionBuilder) 36 | - [ConfigLoader](ConfigLoader) 37 | - [DateUtils](DateUtils) 38 | - [Condition](Condition) 39 | - Developer 40 | - [StructAnalyser](StructAnalyser) 41 | - [SchemaConverter](SchemaConverter) 42 | - [PipelineInspector](PipelineInspector) 43 | - [PipelineOptimizer](PipelineOptimizer) 44 | - [DeliverableDispatcher](DeliverableDispatcher) 45 | - [Read cache strategy](SparkRepository-caching) 46 | - [Logging](Logging) 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /docs/utils/Compressor_Archiver.md: -------------------------------------------------------------------------------- 1 | # Compressor 2 | 3 | A [compressor](https://github.com/SETL-Developers/setl/blob/master/src/main/scala/com/jcdecaux/setl/storage/Compressor.scala) 4 | can: 5 | - compress a string to a byte array 6 | - decompress a byte array to a string 7 | 8 | ## Example: 9 | 10 | ```scala 11 | import io.github.setl.storage.GZIPCompressor 12 | 13 | val compressor = new GZIPCompressor() 14 | 15 | val compressed = compressor.compress("data to be compressed") 16 | val data = compressor.decompress(compressed) 17 | ``` 18 | 19 | # Archiver 20 | 21 | An [Archiver](https://github.com/SETL-Developers/setl/blob/master/src/main/scala/com/jcdecaux/setl/storage/Archiver.scala) can 22 | package files and directories into a single data archive file. 23 | 24 | -------------------------------------------------------------------------------- /docs/vocabulary.md: -------------------------------------------------------------------------------- 1 | #### Data access layer 2 | Data access layer is a layer of a computer program which provides simplified access (saving and retrieving) data to data stored in persistent storage. 
3 | 4 | #### Business logic layer 5 | Business logic layer contains code which works with the data, processing it according to the rules of the business logic. 6 | 7 | #### Persistence storage 8 | A storage of data, *e.g* a database, a distributed filesystem, etc. 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/annotation/Benchmark.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.annotation; 2 | 3 | import java.lang.annotation.ElementType; 4 | import java.lang.annotation.Retention; 5 | import java.lang.annotation.RetentionPolicy; 6 | import java.lang.annotation.Target; 7 | 8 | 9 | /** 10 | *
<p>The Benchmark annotation should be put on any class of Factory[T] to enable the benchmark process. 11 | * The total elapsed time of the factory will then be recorded.</p> 12 | * 13 | * <p>In addition, users can also put it on any of the "read", "process" or "write" methods that are defined 14 | * in AbstractFactory[T], and the elapsed time of each method will be recorded as well.</p> 15 | */ 16 | @InterfaceStability.Evolving 17 | @Retention(RetentionPolicy.RUNTIME) 18 | @Target({ElementType.METHOD, ElementType.TYPE}) 19 | public @interface Benchmark { 20 | 21 | } 22 | 
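For illustration, a minimal sketch (not taken from the repository) of how this annotation can be applied, assuming the `Factory[T]` API described in docs/Factory.md — the factory itself is hypothetical:

```scala
import io.github.setl.annotation.Benchmark
import io.github.setl.transformation.Factory

@Benchmark                 // class-level: record the total elapsed time of the factory
class WordCountFactory extends Factory[Long] {
  private var count: Long = 0L

  @Benchmark               // method-level: also record the elapsed time of read()
  override def read(): this.type = this

  @Benchmark               // method-level: also record the elapsed time of process()
  override def process(): this.type = { count = 42L; this }

  override def write(): this.type = this

  override def get(): Long = count
}
```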

15 | * The annotation Compress instructs {@link StructAnalyser} to save the metadata of the corresponding fields 16 | * into the output StructType object. All annotated columns will be compressed by {@link SchemaConverter} 17 | * during the saving process of SparkRepository. 18 | *

19 | * 20 | *

21 | * By default, the compression algorithm is XZ with the default compression level (=6). You can define another compressor 22 | * by implementing the io.github.setl.storage.Compressor interface. 23 | *
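 *
 * A usage sketch (the case class and field names are hypothetical; the annotation is typically placed on the constructor parameters of a Scala case class that is persisted through a SparkRepository):
 * <pre>{@code
 *   case class CompressedRecord(id: String,
 *                               @Compress payload: Seq[String],                                       // compressed with the default XZCompressor
 *                               @Compress(compressor = classOf[GZIPCompressor]) history: Seq[String]) // compressed with GZIPCompressor
 * }</pre>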

24 | */ 25 | @InterfaceStability.Stable 26 | @Retention(RetentionPolicy.RUNTIME) 27 | @Target({ElementType.PARAMETER}) 28 | public @interface Compress { 29 | 30 | Class compressor() default XZCompressor.class; 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/annotation/Delivery.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.annotation; 2 | 3 | import io.github.setl.workflow.External; 4 | 5 | import java.lang.annotation.ElementType; 6 | import java.lang.annotation.Retention; 7 | import java.lang.annotation.RetentionPolicy; 8 | import java.lang.annotation.Target; 9 | 10 | /** 11 | * The annotation @Delivery indicates {@link io.github.setl.workflow.DeliverableDispatcher} that the current field 12 | * or method is marked as an input and it will be injected during the runtime by the DispatchManager. 13 | *
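 * A usage sketch (the factory and dataset types below are hypothetical; it only illustrates how a field can be declared as a delivery):
 * <pre>{@code
 *   class OrderFactory extends Factory[Dataset[Order]] {
 *     @Delivery(producer = classOf[ClientFactory], optional = false)
 *     var clients: Dataset[Client] = _
 *
 *     // read/process/write/get implementations omitted
 *   }
 * }</pre>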

14 | * If multiple {@link io.github.setl.transformation.Deliverable} of the same type were found in the delivery pool of DispatchManager, then 15 | * it will try to compare the producer of the Deliverable 16 | */ 17 | @InterfaceStability.Evolving 18 | @Retention(RetentionPolicy.RUNTIME) 19 | @Target({ElementType.FIELD, ElementType.METHOD}) 20 | public @interface Delivery { 21 | 22 | /** 23 | * Producer of the current delivery that will be use by DispatchManager in order to find the corresponding delivery 24 | */ 25 | Class producer() default External.class; 26 | 27 | /** 28 | * Indicates whether the current Delivery is optional or not 29 | */ 30 | boolean optional() default false; 31 | 32 | boolean autoLoad() default false; 33 | 34 | String condition() default ""; 35 | 36 | String id() default ""; 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/annotation/Experimental.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.annotation; 2 | 3 | import java.lang.annotation.ElementType; 4 | import java.lang.annotation.Retention; 5 | import java.lang.annotation.RetentionPolicy; 6 | import java.lang.annotation.Target; 7 | 8 | /** 9 | * The Experimental annotation indicate that the annotated class/method/field is supposed to be an experimental feature, 10 | * thus the stability can't be guaranteed. 11 | */ 12 | @Retention(RetentionPolicy.CLASS) 13 | @Target({ElementType.FIELD, ElementType.METHOD, ElementType.TYPE}) 14 | public @interface Experimental { 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/annotation/InterfaceStability.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.annotation; 2 | 3 | import java.lang.annotation.Documented; 4 | 5 | /** 6 | * Annotation to inform users of how much to rely on a particular package, 7 | * class or method not changing over time. 8 | */ 9 | public class InterfaceStability { 10 | 11 | /** 12 | * Stable APIs that retain source and binary compatibility within a major release. 13 | * These interfaces can change from one major release to another major release 14 | * (e.g. from 1.0 to 2.0). 15 | */ 16 | @Documented 17 | public @interface Stable { 18 | } 19 | 20 | /** 21 | * APIs that are meant to evolve towards becoming stable APIs, but are not stable APIs yet. 22 | * Evolving interfaces can change from one feature release to another release (i.e. 2.1 to 2.2). 23 | */ 24 | @Documented 25 | public @interface Evolving { 26 | } 27 | 28 | /** 29 | * Unstable APIs, with no guarantee on stability. 30 | * Classes that are unannotated are considered Unstable. 
31 | */ 32 | @Documented 33 | public @interface Unstable { 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/enums/PathFormat.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.enums; 2 | 3 | public enum PathFormat { 4 | WILDCARD, 5 | REGEX; 6 | } 7 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/enums/Storage.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.enums; 2 | 3 | /** 4 | * StorageType 5 | */ 6 | public enum Storage { 7 | CSV("io.github.setl.storage.connector.CSVConnector"), 8 | EXCEL("io.github.setl.storage.connector.ExcelConnector"), 9 | PARQUET("io.github.setl.storage.connector.ParquetConnector"), 10 | DELTA("io.github.setl.storage.connector.DeltaConnector"), 11 | CASSANDRA("io.github.setl.storage.connector.CassandraConnector"), 12 | DYNAMODB("io.github.setl.storage.connector.DynamoDBConnector"), 13 | JSON("io.github.setl.storage.connector.JSONConnector"), 14 | JDBC("io.github.setl.storage.connector.JDBCConnector"), 15 | STRUCTURED_STREAMING("io.github.setl.storage.connector.StructuredStreamingConnector"), 16 | HUDI("io.github.setl.storage.connector.HudiConnector"), 17 | SPARK_SQL("io.github.setl.storage.connector.SparkSQLConnector"), 18 | OTHER(null); 19 | 20 | private String connectorName; 21 | 22 | Storage(String cls) { 23 | this.connectorName = cls; 24 | } 25 | 26 | public String connectorName() { 27 | return connectorName; 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/enums/ValueType.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.enums; 2 | 3 | public enum ValueType { 4 | STRING("string"), 5 | DATETIME("timestamp"), 6 | DATE("date"), 7 | NUMBER("number"), 8 | SET("set"), 9 | COLUMN("column"); 10 | 11 | private final String value; 12 | 13 | ValueType(String value) { 14 | this.value = value; 15 | } 16 | 17 | public String value() { 18 | return value; 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/AlreadyExistsException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class AlreadyExistsException extends BaseException { 4 | public AlreadyExistsException() { 5 | } 6 | 7 | public AlreadyExistsException(String message) { 8 | super(message); 9 | } 10 | 11 | public AlreadyExistsException(String message, Throwable cause) { 12 | super(message, cause); 13 | } 14 | 15 | public AlreadyExistsException(Throwable cause) { 16 | super(cause); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/BaseException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class BaseException extends RuntimeException { 4 | 5 | public BaseException() { 6 | } 7 | 8 | public BaseException(String message) { 9 | super(message); 10 | } 11 | 12 | public BaseException(String message, Throwable cause) { 13 | super(message, cause); 14 | } 15 | 16 | public BaseException(Throwable cause) { 17 | super(cause); 18 | } 19 | 20 | public 
BaseException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { 21 | super(message, cause, enableSuppression, writableStackTrace); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/ConfException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class ConfException extends BaseException { 4 | 5 | public ConfException(String errorMessage) { 6 | super(errorMessage); 7 | } 8 | 9 | public static class Format extends ConfException { 10 | /** 11 | * @param errorMessage error message 12 | */ 13 | public Format(String errorMessage) { 14 | super(errorMessage); 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/ConnectorException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class ConnectorException extends BaseException { 4 | public ConnectorException() { 5 | } 6 | 7 | public ConnectorException(String message) { 8 | super(message); 9 | } 10 | 11 | public ConnectorException(String message, Throwable cause) { 12 | super(message, cause); 13 | } 14 | 15 | public ConnectorException(Throwable cause) { 16 | super(cause); 17 | } 18 | 19 | public ConnectorException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { 20 | super(message, cause, enableSuppression, writableStackTrace); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/InvalidConnectorException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class InvalidConnectorException extends BaseException { 4 | public InvalidConnectorException() { 5 | } 6 | 7 | public InvalidConnectorException(String message) { 8 | super(message); 9 | } 10 | 11 | public InvalidConnectorException(String message, Throwable cause) { 12 | super(message, cause); 13 | } 14 | 15 | public InvalidConnectorException(Throwable cause) { 16 | super(cause); 17 | } 18 | 19 | public InvalidConnectorException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { 20 | super(message, cause, enableSuppression, writableStackTrace); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/InvalidDeliveryException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class InvalidDeliveryException extends BaseException { 4 | 5 | public InvalidDeliveryException() { 6 | } 7 | 8 | public InvalidDeliveryException(String message) { 9 | super(message); 10 | } 11 | 12 | public InvalidDeliveryException(String message, Throwable cause) { 13 | super(message, cause); 14 | } 15 | 16 | public InvalidDeliveryException(Throwable cause) { 17 | super(cause); 18 | } 19 | 20 | public InvalidDeliveryException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { 21 | super(message, cause, enableSuppression, writableStackTrace); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- 
/src/main/java/io/github/setl/exception/InvalidSchemaException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class InvalidSchemaException extends BaseException { 4 | public InvalidSchemaException() { 5 | } 6 | 7 | public InvalidSchemaException(String message) { 8 | super(message); 9 | } 10 | 11 | public InvalidSchemaException(String message, Throwable cause) { 12 | super(message, cause); 13 | } 14 | 15 | public InvalidSchemaException(Throwable cause) { 16 | super(cause); 17 | } 18 | 19 | public InvalidSchemaException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { 20 | super(message, cause, enableSuppression, writableStackTrace); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/RepositoryException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class RepositoryException extends BaseException { 4 | public RepositoryException() { 5 | } 6 | 7 | public RepositoryException(String message) { 8 | super(message); 9 | } 10 | 11 | public RepositoryException(String message, Throwable cause) { 12 | super(message, cause); 13 | } 14 | 15 | public RepositoryException(Throwable cause) { 16 | super(cause); 17 | } 18 | 19 | public RepositoryException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { 20 | super(message, cause, enableSuppression, writableStackTrace); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/UnknownException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | /** 4 | * UnknownException 5 | */ 6 | public class UnknownException extends BaseException { 7 | 8 | public UnknownException(String errorMessage) { 9 | super(errorMessage); 10 | } 11 | 12 | public static class Storage extends UnknownException { 13 | public Storage(String errorMessage) { 14 | super(errorMessage); 15 | } 16 | } 17 | 18 | public static class Format extends UnknownException { 19 | public Format(String errorMessage) { 20 | super(errorMessage); 21 | } 22 | } 23 | 24 | public static class Environment extends UnknownException { 25 | public Environment(String errorMessage) { 26 | super(errorMessage); 27 | } 28 | } 29 | 30 | public static class ValueType extends UnknownException { 31 | public ValueType(String errorMessage) { 32 | super(errorMessage); 33 | } 34 | } 35 | } 36 | 37 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/internal/BenchmarkInvocationHandler.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal; 2 | 3 | import io.github.setl.annotation.Benchmark; 4 | import org.apache.log4j.LogManager; 5 | import org.apache.log4j.Logger; 6 | 7 | import java.lang.reflect.InvocationHandler; 8 | import java.lang.reflect.Method; 9 | import java.util.HashMap; 10 | import java.util.Map; 11 | 12 | /** 13 | * BenchmarkInvocationHandler is used to handle the `@Benchmark` annotation. It measure the elapsed time of the method 14 | * having the annotation. 
15 | */ 16 | public class BenchmarkInvocationHandler implements InvocationHandler { 17 | 18 | private Object target; 19 | 20 | private final Map methods = new HashMap<>(); 21 | 22 | private Map benchmarkResult = new HashMap<>(); 23 | 24 | private static Logger logger = LogManager.getLogger(BenchmarkInvocationHandler.class); 25 | 26 | public BenchmarkInvocationHandler(Object target) { 27 | this.target = target; 28 | for (Method method : target.getClass().getDeclaredMethods()) { 29 | // Exclude all the bridge methods 30 | if (!method.isBridge()) { 31 | this.methods.put(method.getName(), method); 32 | } 33 | } 34 | } 35 | 36 | public Map getBenchmarkResult() { 37 | return benchmarkResult; 38 | } 39 | 40 | @Override 41 | public Object invoke(Object proxy, Method method, Object[] args) throws Throwable { 42 | 43 | Method targetMethod = methods.get(method.getName()); 44 | Object result; 45 | 46 | if (targetMethod.isAnnotationPresent(Benchmark.class)) { 47 | // Measure the elapsed time if the method has @Benchmark annotation 48 | long start = System.nanoTime(); 49 | result = targetMethod.invoke(target, args); 50 | long elapsed = System.nanoTime() - start; 51 | double seconds = (double)elapsed / 1_000_000_000.0; 52 | 53 | this.benchmarkResult.put(targetMethod.getName(), seconds); 54 | 55 | logger.info("Executing " + target.getClass().getSimpleName() + "." + 56 | method.getName() + " finished in " + seconds + " s"); 57 | } else { 58 | // if the method doesn't have the Benchmark annotation, run it without measuring the elapsed time 59 | result = targetMethod.invoke(target, args); 60 | } 61 | 62 | return result; 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/storage/GZIPCompressor.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage; 2 | 3 | import java.io.*; 4 | import java.nio.charset.StandardCharsets; 5 | import java.util.stream.Collectors; 6 | import java.util.zip.GZIPInputStream; 7 | import java.util.zip.GZIPOutputStream; 8 | 9 | /** 10 | * XZCompressor implement {@link Compressor}'s interface with the GZIP compression algorithm 11 | */ 12 | public class GZIPCompressor implements Compressor { 13 | 14 | @Override 15 | public byte[] compress(String input) throws IOException { 16 | if ((input == null) || (input.length() == 0)) { 17 | return null; 18 | } 19 | ByteArrayOutputStream compressedBytes = new ByteArrayOutputStream(); 20 | GZIPOutputStream gzipOutput = new GZIPOutputStream(compressedBytes); 21 | gzipOutput.write(input.getBytes(StandardCharsets.UTF_8)); 22 | gzipOutput.flush(); 23 | gzipOutput.close(); 24 | return compressedBytes.toByteArray(); 25 | } 26 | 27 | @Override 28 | public String decompress(byte[] bytes) throws IOException { 29 | if ((bytes == null) || (bytes.length == 0)) { 30 | return ""; 31 | } 32 | 33 | InputStreamReader inputStreamReader; 34 | if (isCompressed(bytes)) { 35 | GZIPInputStream gzipInputStream = new GZIPInputStream(new ByteArrayInputStream(bytes)); 36 | inputStreamReader = new InputStreamReader(gzipInputStream, StandardCharsets.UTF_8); 37 | } else { 38 | inputStreamReader = new InputStreamReader(new ByteArrayInputStream(bytes), StandardCharsets.UTF_8); 39 | } 40 | 41 | return new BufferedReader(inputStreamReader).lines().collect(Collectors.joining("\n")); 42 | } 43 | 44 | private static boolean isCompressed(final byte[] compressed) { 45 | return (compressed[0] == (byte) (GZIPInputStream.GZIP_MAGIC)) && 46 | 
(compressed[1] == (byte) (GZIPInputStream.GZIP_MAGIC >> 8)); 47 | } 48 | } 49 | 50 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/storage/SnappyCompressor.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage; 2 | 3 | import org.xerial.snappy.Snappy; 4 | 5 | import java.io.IOException; 6 | import java.nio.charset.StandardCharsets; 7 | 8 | public class SnappyCompressor implements Compressor { 9 | 10 | @Override 11 | public byte[] compress(String input) throws IOException { 12 | return Snappy.compress(input, StandardCharsets.UTF_8); 13 | } 14 | 15 | @Override 16 | public String decompress(byte[] bytes) throws IOException { 17 | return Snappy.uncompressString(bytes, StandardCharsets.UTF_8); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/storage/XZCompressor.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage; 2 | 3 | import org.tukaani.xz.LZMA2Options; 4 | import org.tukaani.xz.XZInputStream; 5 | import org.tukaani.xz.XZOutputStream; 6 | 7 | import java.io.ByteArrayInputStream; 8 | import java.io.ByteArrayOutputStream; 9 | import java.io.IOException; 10 | import java.nio.charset.StandardCharsets; 11 | 12 | /** 13 | * XZCompressor implement {@link Compressor}'s interface with the XZ compression algorithm 14 | */ 15 | public class XZCompressor implements Compressor { 16 | 17 | @Override 18 | public byte[] compress(String input) throws IOException { 19 | if ((input == null) || (input.length() == 0)) { 20 | return null; 21 | } 22 | ByteArrayOutputStream xzOutput = new ByteArrayOutputStream(); 23 | XZOutputStream xzStream = new XZOutputStream(xzOutput, new LZMA2Options(LZMA2Options.PRESET_DEFAULT)); 24 | xzStream.write(input.getBytes(StandardCharsets.UTF_8)); 25 | xzStream.close(); 26 | return xzOutput.toByteArray(); 27 | } 28 | 29 | @Override 30 | public String decompress(byte[] bytes) throws IOException { 31 | if ((bytes == null) || (bytes.length == 0)) { 32 | return ""; 33 | } 34 | XZInputStream xzInputStream = new XZInputStream(new ByteArrayInputStream(bytes)); 35 | byte firstByte = (byte) xzInputStream.read(); 36 | byte[] buffer = new byte[xzInputStream.available() + 1]; 37 | buffer[0] = firstByte; 38 | xzInputStream.read(buffer, 1, buffer.length - 1); 39 | xzInputStream.close(); 40 | return new String(buffer); 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/BenchmarkResult.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl 2 | 3 | case class BenchmarkResult(cls: String, read: Double, process: Double, write: Double, get: Double, total: Double) { 4 | 5 | override def toString: String = { 6 | 7 | val formatter = java.text.NumberFormat.getNumberInstance 8 | 9 | s"Benchmark class: $cls\n" + 10 | s"Total elapsed time: ${formatter.format(total)} s\n" + 11 | s"read: ${formatter.format(read)} s\n" + 12 | s"process: ${formatter.format(process)} s\n" + 13 | s"write: ${formatter.format(write)} s\n" + 14 | "=================" 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/Builder.scala: -------------------------------------------------------------------------------- 1 | package 
io.github.setl 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import io.github.setl.internal.Logging 5 | 6 | /** 7 | * Builder could be used to build or initialize objects 8 | * 9 | * @tparam A the type of object that the builder is supposed to produce 10 | */ 11 | @InterfaceStability.Evolving 12 | trait Builder[+A] extends Logging { 13 | 14 | /** 15 | * Build an object 16 | * 17 | * @return 18 | */ 19 | def build(): this.type 20 | 21 | def get(): A 22 | 23 | def getOrCreate(): A = this.build().get() 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/Converter.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | 5 | /** 6 | * A converter should be able to convert between two types T1 and T2. 7 | */ 8 | @InterfaceStability.Evolving 9 | trait Converter { 10 | type T1 11 | type T2 12 | 13 | /** 14 | * Convert from an object of type T2 to an object of type T1 15 | * 16 | * @param t2 object of type T2 17 | * @return an object of type T1 18 | */ 19 | def convertFrom(t2: T2): T1 20 | 21 | /** 22 | * Convert an object of type T1 to an object of type T2 23 | * 24 | * @param t1 object of type T1 to be convert to T2 25 | * @return an object of type T2 26 | */ 27 | def convertTo(t1: T1): T2 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/annotation/ColumnName.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.annotation 2 | 3 | import scala.annotation.StaticAnnotation 4 | 5 | /** 6 | * Define an alias for the current field in the table 7 | * 8 | * @param name alias of the current field name 9 | */ 10 | @InterfaceStability.Stable 11 | final case class ColumnName(name: String) extends StaticAnnotation 12 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/annotation/CompoundKey.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.annotation 2 | 3 | import scala.annotation.StaticAnnotation 4 | 5 | /** 6 | * Mark current field as a part of a compound key. All the compound keys will be concatenated with a separator 7 | * The position of the current field could be set with the position argument 8 | * 9 | * @param id String, "primary", "sort", etc. 10 | * @param position String, "1", "2", etc. 
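 * Example (a minimal sketch; the Order case class below is hypothetical):
 * {{{
 *   case class Order(@CompoundKey("primary", "1") shopId: String,
 *                    @CompoundKey("primary", "2") orderId: String,
 *                    amount: Double)
 * }}}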
11 | */ 12 | @InterfaceStability.Stable 13 | final case class CompoundKey(id: String, position: String) extends StaticAnnotation 14 | 15 | private[setl] object CompoundKey { 16 | 17 | private[this] val separator: String = "!@" 18 | 19 | import scala.reflect.runtime.{universe => ru} 20 | 21 | /** To be used to handle the scala reflect annotation api of compound key */ 22 | def serialize(compoundKey: ru.AnnotationApi): String = { 23 | val attributes = compoundKey.tree.children.tail.collect { 24 | case ru.Literal(ru.Constant(attribute)) => attribute.toString 25 | } 26 | attributes.mkString(separator) 27 | } 28 | 29 | /** Deserialize the string of compound key into an object of CompoundKey */ 30 | def deserialize(str: String): CompoundKey = { 31 | val data = str.split(separator) 32 | CompoundKey(data(0), data(1)) 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/config/ConnectorConf.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.config 2 | 3 | abstract class ConnectorConf extends Conf { 4 | 5 | def getReaderConf: Map[String, String] 6 | 7 | def getWriterConf: Map[String, String] 8 | 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/config/DeltaConnectorConf.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.config 2 | 3 | import io.github.setl.exception.ConfException 4 | import org.apache.spark.sql.SaveMode 5 | 6 | class DeltaConnectorConf extends ConnectorConf { 7 | 8 | import DeltaConnectorConf._ 9 | 10 | def setPath(path: String): this.type = set("path", path) 11 | 12 | def setSaveMode(saveMode: String): this.type = set("saveMode", saveMode) 13 | 14 | def setSaveMode(saveMode: SaveMode): this.type = set("saveMode", saveMode.toString) 15 | 16 | def getPath: String = get("path") match { 17 | case Some(path) => path 18 | case _ => throw new ConfException("The value of path is not set") 19 | } 20 | 21 | def getSaveMode: SaveMode = SaveMode.valueOf(get("saveMode", SaveMode.Append.toString)) 22 | 23 | override def getReaderConf: Map[String, String] = { 24 | import scala.collection.JavaConverters._ 25 | settings.asScala.toMap - PATH 26 | } 27 | 28 | override def getWriterConf: Map[String, String] = { 29 | import scala.collection.JavaConverters._ 30 | settings.asScala.toMap - SAVEMODE - PATH 31 | } 32 | } 33 | 34 | object DeltaConnectorConf { 35 | 36 | def fromMap(options: Map[String, String]): DeltaConnectorConf = new DeltaConnectorConf().set(options) 37 | 38 | val SAVEMODE: String = "saveMode" 39 | val PATH: String = "path" 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/config/DynamoDBConnectorConf.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.config 2 | 3 | import org.apache.spark.sql.SaveMode 4 | 5 | class DynamoDBConnectorConf extends ConnectorConf { 6 | 7 | import DynamoDBConnectorConf._ 8 | 9 | def setTable(table: String): this.type = set(TABLE, table) 10 | 11 | def getTable: Option[String] = get(TABLE) 12 | 13 | def setReadPartitions(readPartitions: String): this.type = set(Reader.READ_PARTITIONS, readPartitions) 14 | 15 | def getReadPartitions: Option[String] = get(Reader.READ_PARTITIONS) 16 | 17 | def getSaveMode: SaveMode = SaveMode.valueOf(get("saveMode", 
"ErrorIfExists")) 18 | 19 | override def getReaderConf: Map[String, String] = { 20 | import scala.collection.JavaConverters._ 21 | settings.asScala.toMap - 22 | Writer.WRITE_BATCH_SIZE - 23 | Writer.UPDATE - 24 | TABLE - 25 | Writer.SAVE_MODE 26 | } 27 | 28 | override def getWriterConf: Map[String, String] = { 29 | import scala.collection.JavaConverters._ 30 | settings.asScala.toMap - 31 | Reader.READ_PARTITIONS - 32 | Reader.MAX_PARTITION_BYTES - 33 | Reader.DEFAULT_PARALLELISM - 34 | Reader.STRONGLY_CONSISTENT_READS - 35 | Reader.BYTES_PER_RCU - 36 | Reader.FILTER_PUSHDOWN - 37 | TABLE - 38 | Writer.SAVE_MODE 39 | } 40 | 41 | def getRegion: Option[String] = get(REGION) 42 | 43 | } 44 | 45 | object DynamoDBConnectorConf { 46 | 47 | object Reader { 48 | val READ_PARTITIONS: String = "readPartitions" 49 | val MAX_PARTITION_BYTES: String = "maxPartitionBytes" 50 | val DEFAULT_PARALLELISM: String = "defaultParallelism" 51 | val STRONGLY_CONSISTENT_READS: String = "stronglyConsistentReads" 52 | val BYTES_PER_RCU: String = "bytesPerRCU" 53 | val FILTER_PUSHDOWN: String = "filterPushdown" 54 | } 55 | 56 | object Writer { 57 | val WRITE_BATCH_SIZE = "writeBatchSize" 58 | val UPDATE = "update" 59 | val SAVE_MODE = "saveMode" 60 | } 61 | 62 | val REGION: String = "region" 63 | val TABLE: String = "table" 64 | val THROUGHPUT: String = "throughput" 65 | val TARGET_CAPACITY: String = "targetCapacity" 66 | } 67 | 68 | 69 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/config/HudiConnectorConf.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.config 2 | 3 | import io.github.setl.exception.ConfException 4 | import org.apache.spark.sql.SaveMode 5 | 6 | class HudiConnectorConf extends ConnectorConf { 7 | 8 | import HudiConnectorConf._ 9 | 10 | override def getReaderConf: Map[String, String] = { 11 | import scala.collection.JavaConverters._ 12 | settings.asScala.toMap - PATH 13 | } 14 | 15 | override def getWriterConf: Map[String, String] = { 16 | import scala.collection.JavaConverters._ 17 | settings.asScala.toMap - SAVEMODE - PATH 18 | } 19 | 20 | def setPath(path: String): this.type = set("path", path) 21 | 22 | def setSaveMode(saveMode: String): this.type = set("saveMode", saveMode) 23 | 24 | def setSaveMode(saveMode: SaveMode): this.type = set("saveMode", saveMode.toString) 25 | 26 | def getPath: String = get("path") match { 27 | case Some(path) => path 28 | case _ => throw new ConfException("The value of path is not set") 29 | } 30 | 31 | def getSaveMode: SaveMode = SaveMode.valueOf(get("saveMode", SaveMode.Append.toString)) 32 | 33 | } 34 | 35 | object HudiConnectorConf { 36 | def fromMap(options: Map[String, String]): HudiConnectorConf = new HudiConnectorConf().set(options) 37 | 38 | val SAVEMODE: String = "saveMode" 39 | val PATH: String = "path" 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/config/StructuredStreamingConnectorConf.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.config 2 | 3 | /** 4 | * Configuration parameters: 5 | * Spark documentation 6 | */ 7 | class StructuredStreamingConnectorConf extends ConnectorConf { 8 | 9 | import StructuredStreamingConnectorConf._ 10 | 11 | def setFormat(format: String): this.type = set(FORMAT.toLowerCase(), format) 12 | 13 | def getFormat: String = 
getWithException(FORMAT).toLowerCase() 14 | 15 | def setSchema(schema: String): this.type = set(SCHEMA, schema) 16 | 17 | def getSchema: String = getWithException(SCHEMA) 18 | 19 | def setOutputMode(mode: String): this.type = set(OUTPUT_MODE, mode) 20 | 21 | def getOutputMode: String = getWithException(OUTPUT_MODE) 22 | 23 | def setPath(path: String): this.type = set(PATH, path) 24 | 25 | def getPath: String = getWithException(PATH) 26 | 27 | override def getReaderConf: Map[String, String] = removePrivateConf() 28 | 29 | override def getWriterConf: Map[String, String] = removePrivateConf() 30 | 31 | private[this] def getWithException(key: String): String = { 32 | get(key).getOrElse(throw new IllegalArgumentException(s"Can't find $key")) 33 | } 34 | 35 | private[this] def removePrivateConf(): Map[String, String] = { 36 | import scala.collection.JavaConverters._ 37 | settings.asScala.toMap - FORMAT - SCHEMA - OUTPUT_MODE 38 | } 39 | } 40 | 41 | object StructuredStreamingConnectorConf { 42 | def fromMap(options: Map[String, String]): StructuredStreamingConnectorConf = 43 | new StructuredStreamingConnectorConf().set(options) 44 | 45 | val FORMAT: String = "format" 46 | val SCHEMA: String = "schema" 47 | val OUTPUT_MODE: String = "outputMode" 48 | val PATH: String = "path" 49 | 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/CanCreate.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.storage.connector.Connector 4 | import org.apache.spark.sql.DataFrame 5 | 6 | /** 7 | * Connectors that inherit CanCreate should be able to create a table in a database or a file/folder in a file system 8 | */ 9 | trait CanCreate { 10 | self: Connector => 11 | 12 | /** 13 | * Create a data storage (e.g. table in a database or file/folder in a file system) with a suffix 14 | * 15 | * @param t data frame to be written 16 | * @param suffix suffix to be appended at the end of the data storage name 17 | */ 18 | def create(t: DataFrame, suffix: Option[String]): Unit 19 | 20 | /** 21 | * Create a data storage (e.g. table in a database or file/folder in a file system) 22 | * 23 | * @param t data frame to be written 24 | */ 25 | def create(t: DataFrame): Unit 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/CanDelete.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.storage.connector.Connector 4 | 5 | /** 6 | * Connectors that inherit CanDelete should be able to delete records for a given query string 7 | */ 8 | trait CanDelete { 9 | self: Connector => 10 | 11 | /** 12 | * Delete rows according to the query 13 | * 14 | * @param query a query string 15 | */ 16 | def delete(query: String): Unit 17 | 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/CanDrop.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.storage.connector.Connector 4 | 5 | /** 6 | * Connectors that inherit CanDrop should be able to drop the entire data table 7 | */ 8 | trait CanDrop { 9 | self: Connector => 10 | 11 | /** 12 | * Drop the entire table. 
13 | */ 14 | def drop(): Unit 15 | 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/CanPartition.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.storage.connector.Connector 4 | 5 | /** 6 | * Connectors that inherit CanPartition should be able to partition the output by the given columns on the file system 7 | */ 8 | trait CanPartition { 9 | self: Connector => 10 | 11 | /** 12 | * Partitions the output by the given columns on the file system. If specified, the output is 13 | * laid out on the file system similar to Hive's partitioning scheme. As an example, when we 14 | * partition a dataset by year and then month, the directory layout would look like: 15 | *
16 | *   - year=2016/month=01/ 17 | *   - year=2016/month=02/ 18 | *
19 | * 20 | * Partitioning is one of the most widely used techniques to optimize physical data layout. 21 | * It provides a coarse-grained index for skipping unnecessary data reads when queries have 22 | * predicates on the partitioned columns. In order for partitioning to work well, the number 23 | * of distinct values in each column should typically be less than tens of thousands. 24 | * 25 | * This is applicable for all file-based data sources (e.g. Parquet, JSON) 26 | */ 27 | def partitionBy(columns: String*): this.type 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/CanUpdate.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.storage.connector.Connector 4 | import org.apache.spark.sql.DataFrame 5 | 6 | /** 7 | * Connectors that inherit CanUpdate should be able to update the data store with a new data frame and a given matching 8 | * columns. 9 | */ 10 | trait CanUpdate { 11 | self: Connector => 12 | 13 | /** 14 | * Update the data store with a new data frame and the given matching columns. 15 | * 16 | * All the matched data will be updated, the non-matched data will be inserted 17 | * 18 | * @param df new data 19 | * @param columns other columns to be matched 20 | */ 21 | def update(df: DataFrame, columns: String*): Unit 22 | 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/CanVacuum.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.storage.connector.Connector 4 | 5 | /** 6 | * Connectors that inherit CanVacuum should be able to recursively delete files and directories in the table that are 7 | * not needed by the table for maintaining older versions up to the given retention threshold 8 | */ 9 | trait CanVacuum { 10 | self: Connector => 11 | 12 | /** 13 | * Recursively delete files and directories in the table that are not needed by the table for 14 | * maintaining older versions up to the given retention threshold. This method will return an 15 | * empty DataFrame on successful completion. 16 | * 17 | * @param retentionHours The retention threshold in hours. Files required by the table for 18 | * reading versions earlier than this will be preserved and the 19 | * rest of them will be deleted. 20 | */ 21 | def vacuum(retentionHours: Double): Unit 22 | 23 | /** 24 | * Recursively delete files and directories in the table that are not needed by the table for 25 | * maintaining older versions up to the given retention threshold. This method will return an 26 | * empty DataFrame on successful completion. 27 | * 28 | * note: This will use the default retention period of 7 days. 29 | */ 30 | def vacuum(): Unit 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/CanWait.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.storage.connector.Connector 4 | import io.github.setl.storage.connector.Connector 5 | 6 | /** 7 | * Connectors that inherit CanWait should be able to wait for the execution to stop 8 | */ 9 | trait CanWait { 10 | self: Connector => 11 | 12 | /** 13 | * Wait for the execution to stop. 
Any exceptions that occurs during the execution 14 | * will be thrown in this thread. 15 | */ 16 | def awaitTermination(): Unit 17 | 18 | /** 19 | * Wait for the execution to stop. Any exceptions that occurs during the execution 20 | * will be thrown in this thread. 21 | * 22 | * @param timeout time to wait in milliseconds 23 | * @return `true` if it's stopped; or throw the reported error during the execution; or `false` 24 | * if the waiting time elapsed before returning from the method. 25 | */ 26 | def awaitTerminationOrTimeout(timeout: Long): Boolean 27 | 28 | /** 29 | * Stops the execution of this query if it is running. 30 | */ 31 | def stop(): Unit 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/Configurable.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | 5 | @InterfaceStability.Evolving 6 | trait Configurable { 7 | 8 | def set(key: String, value: String): this.type 9 | 10 | def get(key: String): Option[String] 11 | 12 | } 13 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/HasBenchmark.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.BenchmarkResult 4 | 5 | /** 6 | * HasBenchmark should be used for object having an aggregated benchmark. Typically a Pipeline or a Stage 7 | */ 8 | trait HasBenchmark { 9 | 10 | protected var _benchmark: Option[Boolean] = None 11 | 12 | /** 13 | * True if the benchmark will be measured, otherwise false 14 | * 15 | * @return boolean 16 | */ 17 | def benchmark: Option[Boolean] = _benchmark 18 | 19 | /** 20 | * Set to true to enable the benchmarking 21 | * 22 | * @param boo true to enable benchmarking 23 | * @return this object 24 | */ 25 | def benchmark(boo: Boolean): this.type = { 26 | _benchmark = Option(boo) 27 | this 28 | } 29 | 30 | /** 31 | * Get the aggregated benchmark result. 
32 | * 33 | * @return an array of BenchmarkResult 34 | */ 35 | def getBenchmarkResult: Array[BenchmarkResult] 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/HasDescription.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import io.github.setl.util.ReflectUtils 5 | 6 | 7 | @InterfaceStability.Evolving 8 | trait HasDescription { 9 | 10 | def getPrettyName: String = ReflectUtils.getPrettyName(this.getClass) 11 | 12 | /** Describe the current class */ 13 | def describe(): this.type 14 | 15 | } 16 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/HasDiagram.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import scala.reflect.runtime 4 | 5 | trait HasDiagram { 6 | 7 | /** Generate the diagram */ 8 | def toDiagram: String 9 | 10 | /** Get the diagram ID */ 11 | def diagramId: String 12 | 13 | protected def getTypeArgList(tpe: runtime.universe.Type): List[runtime.universe.Symbol] = { 14 | tpe 15 | .baseClasses.head 16 | .asClass 17 | .primaryConstructor 18 | .typeSignature 19 | .paramLists 20 | .head 21 | } 22 | 23 | protected def formatDiagramId(prettyName: String, 24 | deliveryId: String, 25 | suffix: String): String = { 26 | prettyName.replaceAll("[\\[\\]]", "") + deliveryId.capitalize + suffix 27 | } 28 | 29 | /** Display the diagram */ 30 | def showDiagram(): Unit = println(toDiagram) 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/HasReader.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import org.apache.spark.sql.DataFrameReader 4 | 5 | trait HasReader { Connector => 6 | 7 | protected val reader: DataFrameReader 8 | 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/HasReaderWriter.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | trait HasReaderWriter extends HasReader with HasWriter { Connector => 4 | 5 | } 6 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/HasRegistry.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import java.util.UUID 4 | 5 | import io.github.setl.annotation.InterfaceStability 6 | import io.github.setl.exception.AlreadyExistsException 7 | 8 | import scala.collection.immutable.ListMap 9 | 10 | /** 11 | * HasUUIDRegistry provide a UUID registry and methods to check if an 12 | * [[io.github.setl.internal.Identifiable]] object already 13 | * exists in its registry 14 | */ 15 | @InterfaceStability.Evolving 16 | trait HasRegistry[T <: Identifiable] { 17 | 18 | /** 19 | * Registry is a HashSet that keeps the UUID of identifiable objects 20 | */ 21 | private[this] var registry: ListMap[UUID, T] = ListMap.empty 22 | 23 | /** 24 | * Register a new [[io.github.setl.internal.Identifiable]] in registry 25 | * 26 | * @param item an object that inherit [[io.github.setl.internal.Identifiable]] 27 | * @return true if 
the given item is registered, false otherwise 28 | */ 29 | @throws[AlreadyExistsException] 30 | protected def registerNewItem(item: T): Unit = { 31 | if (hasRegisteredItem(item)) { 32 | throw new AlreadyExistsException(s"The current item ${item.getUUID} of type ${item.getCanonicalName} already exists") 33 | } else { 34 | registry += (item.getUUID -> item) 35 | } 36 | } 37 | 38 | /** Clear the registry */ 39 | protected def clearRegistry(): Unit = { 40 | registry = ListMap.empty 41 | } 42 | 43 | /** 44 | * Register multiple items 45 | * 46 | * @param items an [[io.github.setl.internal.Identifiable]] object 47 | */ 48 | protected def registerNewItems(items: Iterable[T]): Unit = items.foreach(this.registerNewItem) 49 | 50 | /** 51 | * Check if the Identifiable exists in the registry 52 | * 53 | * @param item an object that inherit [[io.github.setl.internal.Identifiable]] 54 | * @return true if it already exists in the registry, false otherwise 55 | */ 56 | def hasRegisteredItem(item: Identifiable): Boolean = this.hasRegisteredItem(item.getUUID) 57 | 58 | /** 59 | * Check if the UUID exists in the registry 60 | * 61 | * @param uuid an UUID 62 | * @return true if it already exists in the registry, false otherwise 63 | */ 64 | def hasRegisteredItem(uuid: UUID): Boolean = registry.contains(uuid) 65 | 66 | /** Return the registry */ 67 | def getRegistry: ListMap[UUID, T] = this.registry 68 | 69 | /** 70 | * For a given UUID, return the corresponding registered item 71 | * 72 | * @param uuid uuid 73 | * @return 74 | */ 75 | def getRegisteredItem(uuid: UUID): Option[T] = registry.get(uuid) 76 | 77 | /** Return the number of items in the current registry */ 78 | def getRegistryLength: Long = registry.size 79 | 80 | /** Return true if the registry is empty, false otherwise */ 81 | def isRegistryEmpty: Boolean = registry.isEmpty 82 | 83 | /** 84 | * Return the last registered item 85 | * 86 | * @return if the registry is empty, None will be returned 87 | */ 88 | def lastRegisteredItem: Option[T] = if (isRegistryEmpty) { 89 | None 90 | } else { 91 | Option(registry.last._2) 92 | } 93 | 94 | } 95 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/HasType.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | 5 | import scala.reflect.runtime 6 | 7 | /** 8 | * HasType should be used on classed having a payload 9 | */ 10 | @InterfaceStability.Evolving 11 | trait HasType { 12 | 13 | val runtimeType: runtime.universe.Type 14 | 15 | } 16 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/HasWriter.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import org.apache.spark.sql.{DataFrame, DataFrameWriter, Row} 4 | 5 | trait HasWriter { Connector => 6 | 7 | protected val writer: DataFrame => DataFrameWriter[Row] 8 | 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/Identifiable.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import java.util.UUID 4 | 5 | import io.github.setl.annotation.InterfaceStability 6 | 7 | /** 8 | * Identifiable generates an UUID for any object that implement the trait 9 | */ 
10 | @InterfaceStability.Evolving 11 | trait Identifiable { 12 | 13 | private[this] val _uuid: UUID = UUID.randomUUID 14 | 15 | private[this] val _name: String = getClass.getCanonicalName 16 | 17 | def getUUID: UUID = _uuid 18 | 19 | def getCanonicalName: String = _name 20 | 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/Logging.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import org.apache.log4j.{LogManager, Logger} 5 | 6 | /** 7 | * Logging provide logging features for the class that extends this trait 8 | */ 9 | @InterfaceStability.Evolving 10 | private[setl] trait Logging { 11 | 12 | // Make the log field transient so that objects with Logging can 13 | // be serialized and used on another machine 14 | @transient private var logger: Logger = _ 15 | 16 | // Method to get or create the logger for this object 17 | protected def log: Logger = { 18 | if (logger == null) { 19 | logger = LogManager.getLogger(logName) 20 | } 21 | logger 22 | } 23 | 24 | // Method to get the logger name for this object 25 | protected def logName: String = { 26 | // Ignore trailing $'s in the class names for Scala objects 27 | this.getClass.getName.stripSuffix("$") 28 | } 29 | 30 | protected def logInfo(msg: => String): Unit = { 31 | if (log.isInfoEnabled) log.info(msg) 32 | } 33 | 34 | protected def logDebug(msg: => String): Unit = { 35 | if (log.isDebugEnabled) log.debug(msg) 36 | } 37 | 38 | protected def logTrace(msg: => String): Unit = { 39 | if (log.isTraceEnabled) log.trace(msg) 40 | } 41 | 42 | protected def logWarning(msg: => String): Unit = log.warn(msg) 43 | 44 | protected def logError(msg: => String): Unit = log.error(msg) 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/Writable.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | /** 4 | * Indicate that users can activate or deactivate the write of the class 5 | */ 6 | trait Writable { 7 | 8 | protected var _write: Boolean = true 9 | 10 | /** 11 | * Whether invoke the write method or not 12 | * 13 | * @param write if set to true, then the write method of the factory will be invoked 14 | * @return 15 | */ 16 | def writable(write: Boolean): this.type = { 17 | this._write = write 18 | this 19 | } 20 | 21 | /** Return true if the write method will be invoked by the pipeline */ 22 | def writable: Boolean = this._write 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/Archiver.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage 2 | 3 | import io.github.setl.exception.InvalidConnectorException 4 | import io.github.setl.storage.connector.FileConnector 5 | import io.github.setl.storage.repository.SparkRepository 6 | import org.apache.hadoop.fs.Path 7 | 8 | trait Archiver { 9 | 10 | def addFile(file: Path, name: Option[String] = None): this.type 11 | 12 | /** 13 | * Add the connector's data to the consolidator. 
For a directory with the following structure: 14 | * 15 | * {{{ 16 | * base_path 17 | * |-- dir_1 18 | * | |-- file1 19 | * |-- dir_2 20 | * |-- file2 21 | * }}} 22 | * 23 | * After calling addConnector(connector, Some("new_name")), the structure in the compressed zip file will be: 24 | * 25 | * {{{ 26 | * outputPath.zip // outputPath.zip is given during the instantiation of FileConsolidator 27 | * |--new_name 28 | * |-- dir_1 29 | * | |-- file1 30 | * |-- dir_2 31 | * |-- file2 32 | * }}} 33 | * 34 | * @param repository Repository that will be used to load data 35 | * @param name name of the directory in the zip output. default is the name of the base directory of the connector 36 | * @return 37 | */ 38 | @throws[InvalidConnectorException] 39 | def addRepository(repository: SparkRepository[_], name: Option[String] = None): this.type 40 | 41 | /** 42 | * Add the connector's data to the consolidator. For a directory with the following structure: 43 | * 44 | * {{{ 45 | * base_path 46 | * |-- dir_1 47 | * | |-- file1 48 | * |-- dir_2 49 | * |-- file2 50 | * }}} 51 | * 52 | * After calling addConnector(connector, Some("new_name")), the structure in the compressed zip file will be: 53 | * 54 | * {{{ 55 | * outputPath.zip // outputPath.zip is given during the instantiation of FileConsolidator 56 | * |--new_name 57 | * |-- dir_1 58 | * | |-- file1 59 | * |-- dir_2 60 | * |-- file2 61 | * }}} 62 | * 63 | * @param connector FileConnector that will be used to load data 64 | * @param name name of the directory in the zip output. default is the name of the base directory of the connector 65 | * @return 66 | */ 67 | def addConnector(connector: FileConnector, name: Option[String] = None): this.type 68 | 69 | def archive(outputPath: Path): this.type 70 | 71 | } 72 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/Compressor.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage 2 | 3 | import java.io.IOException 4 | 5 | import io.github.setl.annotation.InterfaceStability 6 | 7 | /** 8 | * A Compressor is able to compress an input string into a byte array and vice versa. 9 | */ 10 | @InterfaceStability.Evolving 11 | trait Compressor extends Serializable { 12 | 13 | /** 14 | * Compress an input string into a byte array 15 | */ 16 | @throws[IOException] 17 | def compress(input: String): Array[Byte] 18 | 19 | /** 20 | * Decompress a byte array into an input string 21 | */ 22 | @throws[IOException] 23 | def decompress(bytes: Array[Byte]): String 24 | 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/DatasetConverter.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage 2 | 3 | import io.github.setl.Converter 4 | import io.github.setl.annotation.InterfaceStability 5 | import org.apache.spark.sql.Dataset 6 | 7 | /** 8 | * DatasetConverter inherits from a Converter. 
It can convert between two Dataset: Dataset[A] and Dataset[B] 9 | * 10 | * @tparam A Type of Dataset[A] 11 | * @tparam B Type of Dataset[B] 12 | */ 13 | @InterfaceStability.Evolving 14 | abstract class DatasetConverter[A, B] extends Converter { 15 | 16 | override type T1 = Dataset[A] 17 | override type T2 = Dataset[B] 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/connector/ACIDConnector.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import io.github.setl.internal.{CanDelete, CanDrop, CanUpdate, CanVacuum} 5 | 6 | @InterfaceStability.Evolving 7 | abstract class ACIDConnector extends Connector 8 | with CanUpdate 9 | with CanDrop 10 | with CanDelete 11 | with CanVacuum { 12 | 13 | } 14 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/connector/Connector.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import io.github.setl.enums.Storage 5 | import io.github.setl.internal.Logging 6 | import io.github.setl.util.HasSparkSession 7 | import org.apache.spark.sql._ 8 | 9 | /** 10 | * Connector is a non-typed data access layer (DAL) abstraction that provides read/write functionalities. 11 | * 12 | *
13 | * A basic data storage connector has two main functionalities: 14 | * 18 | * 19 | */ 20 | @InterfaceStability.Evolving 21 | trait Connector extends HasSparkSession with Logging { 22 | 23 | val storage: Storage 24 | 25 | /** 26 | * Read data from the data source 27 | * @return a [[DataFrame]] 28 | */ 29 | def read(): DataFrame 30 | 31 | /** 32 | * Write a [[DataFrame]] into the data storage 33 | * @param t a [[DataFrame]] to be saved 34 | * @param suffix for data connectors that support suffix (e.g. [[FileConnector]], 35 | * add the given suffix to the save path 36 | */ 37 | def write(t: DataFrame, suffix: Option[String]): Unit 38 | 39 | /** 40 | * Write a [[DataFrame]] into the data storage 41 | * @param t a [[DataFrame]] to be saved 42 | */ 43 | def write(t: DataFrame): Unit 44 | 45 | } 46 | 47 | object Connector { 48 | 49 | /** 50 | * Create an empty Connector 51 | * @return an empty Connector 52 | */ 53 | def empty: Connector = new Connector { 54 | override val spark: SparkSession = null 55 | override val storage: Storage = null 56 | override def read(): DataFrame = null 57 | override def write(t: DataFrame, suffix: Option[String]): Unit = {} 58 | override def write(t: DataFrame): Unit = {} 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/connector/ConnectorInterface.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import io.github.setl.config.Conf 4 | import io.github.setl.enums.Storage 5 | import com.typesafe.config.Config 6 | 7 | /** 8 | * ConnectorInterface provides the abstraction of a pluggable connector that could be used by [[io.github.setl.storage.ConnectorBuilder]]. 9 | * Users can implement their customized data source connector by extending this trait. 10 | */ 11 | trait ConnectorInterface extends Connector { 12 | 13 | /** 14 | * By default, the custom connector's storage type should be OTHER. 
15 | */ 16 | override val storage: Storage = Storage.OTHER 17 | 18 | /** 19 | * Configure the connector with the given [[Conf]] 20 | * @param conf an object of [[Conf]] 21 | */ 22 | def setConf(conf: Conf): Unit 23 | 24 | /** 25 | * Configure the connector with the given [[Config]] 26 | * @param config an object of [[Config]] 27 | */ 28 | def setConfig(config: Config): Unit = this.setConf(Conf.fromConfig(config)) 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/connector/DBConnector.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import io.github.setl.internal.{CanCreate, CanDelete, CanDrop} 5 | 6 | @InterfaceStability.Evolving 7 | abstract class DBConnector extends Connector 8 | with CanCreate 9 | with CanDrop 10 | with CanDelete { 11 | 12 | } 13 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/connector/HudiConnector.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import com.typesafe.config.Config 4 | import io.github.setl.config.{Conf, HudiConnectorConf} 5 | import io.github.setl.enums.Storage 6 | import io.github.setl.internal.HasReaderWriter 7 | import io.github.setl.util.TypesafeConfigUtils 8 | import org.apache.spark.sql._ 9 | 10 | class HudiConnector(val options: HudiConnectorConf) extends Connector with HasReaderWriter { 11 | override val storage: Storage = Storage.HUDI 12 | 13 | def this(options: Map[String, String]) = this(HudiConnectorConf.fromMap(options)) 14 | 15 | def this(config: Config) = this(TypesafeConfigUtils.getMap(config)) 16 | 17 | def this(conf: Conf) = this(conf.toMap) 18 | 19 | override val reader: DataFrameReader = { 20 | spark.read 21 | .format("hudi") 22 | .options(options.getReaderConf) 23 | } 24 | 25 | override val writer: DataFrame => DataFrameWriter[Row] = (df: DataFrame) => { 26 | df.write 27 | .format("hudi") 28 | .mode(options.getSaveMode) 29 | .options(options.getWriterConf) 30 | } 31 | 32 | /** 33 | * Read data from the data source 34 | * 35 | * @return a [[DataFrame]] 36 | */ 37 | @throws[java.io.FileNotFoundException](s"${options.getPath} doesn't exist") 38 | @throws[org.apache.spark.sql.AnalysisException](s"${options.getPath} doesn't exist") 39 | override def read(): DataFrame = { 40 | logDebug(s"Reading ${storage.toString} file in: '${options.getPath}'") 41 | this.setJobDescription(s"Read file(s) from '${options.getPath}'") 42 | reader.load(options.getPath) 43 | } 44 | 45 | /** 46 | * Write a [[DataFrame]] into the data storage 47 | * 48 | * @param t a [[DataFrame]] to be saved 49 | * @param suffix for data connectors that support suffix (e.g. 
[[FileConnector]], 50 | * add the given suffix to the save path 51 | */ 52 | override def write(t: DataFrame, suffix: Option[String]): Unit = { 53 | if (suffix.isDefined) logWarning("Suffix is not supported in HudiConnector") 54 | write(t) 55 | } 56 | 57 | /** 58 | * Write a [[DataFrame]] into the data storage 59 | * 60 | * @param t a [[DataFrame]] to be saved 61 | */ 62 | override def write(t: DataFrame): Unit = { 63 | this.setJobDescription(s"Write file to ${options.getPath}") 64 | writer(t).save(options.getPath) 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/connector/ParquetConnector.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import io.github.setl.config.{Conf, FileConnectorConf} 5 | import io.github.setl.enums.Storage 6 | import io.github.setl.util.TypesafeConfigUtils 7 | import com.typesafe.config.Config 8 | import org.apache.spark.sql._ 9 | 10 | /** 11 | * ParquetConnector contains functionality for transforming [[DataFrame]] into parquet files 12 | */ 13 | @InterfaceStability.Evolving 14 | class ParquetConnector(override val options: FileConnectorConf) extends FileConnector(options) { 15 | 16 | def this(options: Map[String, String]) = this(FileConnectorConf.fromMap(options)) 17 | 18 | def this(path: String, saveMode: SaveMode) = this(Map("path" -> path, "saveMode" -> saveMode.toString)) 19 | 20 | def this(config: Config) = this(TypesafeConfigUtils.getMap(config)) 21 | 22 | def this(conf: Conf) = this(conf.toMap) 23 | 24 | override val storage: Storage = Storage.PARQUET 25 | 26 | this.options.setStorage(storage) 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/connector/SparkSQLConnector.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import com.typesafe.config.Config 4 | import io.github.setl.config.Conf 5 | import io.github.setl.enums.Storage 6 | import io.github.setl.util.TypesafeConfigUtils 7 | import org.apache.spark.sql.DataFrame 8 | 9 | class SparkSQLConnector(val query: String) extends Connector { 10 | override val storage: Storage = Storage.SPARK_SQL 11 | 12 | def this(conf: Conf) = this(conf.get("query", "")) 13 | def this(config: Config) = this( 14 | query = TypesafeConfigUtils.getAs[String](config, "query").getOrElse("") 15 | ) 16 | 17 | require(query.nonEmpty, "query is not defined") 18 | 19 | /** 20 | * Read data from the data source 21 | * 22 | * @return a [[DataFrame]] 23 | */ 24 | @throws[org.apache.spark.sql.AnalysisException](s"$query is invalid") 25 | override def read(): DataFrame = spark.sql(query) 26 | 27 | /** 28 | * Write a [[DataFrame]] into the data storage 29 | * 30 | * @param t a [[DataFrame]] to be saved 31 | * @param suffix for data connectors that support suffix (e.g. 
[[FileConnector]], 32 | * add the given suffix to the save path 33 | */ 34 | override def write(t: DataFrame, suffix: Option[String]): Unit = { 35 | if (suffix.isDefined) logWarning("suffix is not supported in SparkSQLConnector") 36 | write(t) 37 | } 38 | 39 | /** 40 | * Write a [[DataFrame]] into the data storage 41 | * 42 | * @param t a [[DataFrame]] to be saved 43 | */ 44 | override def write(t: DataFrame): Unit = { 45 | logWarning("write is not supported in SparkSQLConnector") 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/connector/StreamingConnector.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import io.github.setl.internal.CanWait 4 | 5 | abstract class StreamingConnector extends Connector 6 | with CanWait { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/connector/StructuredStreamingConnector.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import io.github.setl.annotation.{Experimental, InterfaceStability} 4 | import io.github.setl.config.{Conf, StructuredStreamingConnectorConf} 5 | import io.github.setl.enums.Storage 6 | import io.github.setl.util.TypesafeConfigUtils 7 | import com.typesafe.config.Config 8 | import org.apache.spark.sql.streaming.{DataStreamReader, DataStreamWriter, StreamingQuery} 9 | import org.apache.spark.sql.{DataFrame, Row} 10 | 11 | /** 12 | * :: Experimental :: 13 | * 14 | * Spark Structured Streaming connector 15 | * 16 | * @param conf configuration, see 17 | * Spark structured streaming documentation for details 18 | */ 19 | @Experimental 20 | @InterfaceStability.Unstable 21 | class StructuredStreamingConnector(val conf: StructuredStreamingConnectorConf) extends StreamingConnector { 22 | 23 | private[this] var streamingQuery: StreamingQuery = _ 24 | 25 | def this(options: Map[String, String]) = this(StructuredStreamingConnectorConf.fromMap(options)) 26 | 27 | def this(config: Config) = this(TypesafeConfigUtils.getMap(config)) 28 | 29 | def this(config: Conf) = this(config.toMap) 30 | 31 | override val storage: Storage = Storage.STRUCTURED_STREAMING 32 | 33 | @inline protected val streamReader: DataStreamReader = spark.readStream 34 | .format(conf.getFormat) 35 | .options(conf.getReaderConf) 36 | 37 | protected val streamWriter: DataFrame => DataStreamWriter[Row] = (df: DataFrame) => { 38 | df.writeStream 39 | .outputMode(conf.getOutputMode) 40 | .format(conf.getFormat) 41 | .options(conf.getWriterConf) 42 | } 43 | 44 | override def read(): DataFrame = { 45 | if (conf.has(StructuredStreamingConnectorConf.SCHEMA)) { 46 | logInfo("Apply user-defined schema") 47 | streamReader 48 | .schema(conf.getSchema) 49 | .load() 50 | } else { 51 | streamReader.load() 52 | } 53 | } 54 | 55 | override def write(t: DataFrame, suffix: Option[String]): Unit = { 56 | logWarning("Suffix will be ignored by StructuredStreamingConnector") 57 | write(t) 58 | } 59 | 60 | override def write(t: DataFrame): Unit = { 61 | streamingQuery = streamWriter(t).start() 62 | } 63 | 64 | /** 65 | * Wait for the execution to stop. Any exceptions that occurs during the execution 66 | * will be thrown in this thread. 
67 | */ 68 | override def awaitTermination(): Unit = streamingQuery.awaitTermination() 69 | 70 | /** 71 | * Wait for the execution to stop. Any exceptions that occurs during the execution 72 | * will be thrown in this thread. 73 | * 74 | * @param timeout time to wait in milliseconds 75 | * @return `true` if it's stopped; or throw the reported error during the execution; or `false` 76 | * if the waiting time elapsed before returning from the method. 77 | */ 78 | override def awaitTerminationOrTimeout(timeout: Long): Boolean = streamingQuery.awaitTermination(timeout) 79 | 80 | /** 81 | * Stops the execution of this query if it is running. 82 | */ 83 | override def stop(): Unit = streamingQuery.stop() 84 | } 85 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/repository/ImplicitRepositoryAdapter.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.repository 2 | 3 | import io.github.setl.internal.{SchemaConverter, StructAnalyser} 4 | import io.github.setl.storage.{Condition, DatasetConverter} 5 | import org.apache.spark.sql.Dataset 6 | import org.apache.spark.sql.types.StructType 7 | 8 | import scala.reflect.runtime.universe.TypeTag 9 | 10 | object ImplicitRepositoryAdapter { 11 | 12 | /** 13 | * SparkRepositoryAdapter is an implemented implicit RepositoryAdapter that provides 4 additional methods to an 14 | * existing `SparkRepository[A]`. 15 | * 16 | * {{{ 17 | * // Example: 18 | * 19 | * implicit val converter = new DatasetConverter[A, B] { 20 | * // implementation 21 | * } 22 | * 23 | * val defaultRepository: SparkRepository[A] // a default repository that can save a Dataset[A] 24 | * 25 | * import io.github.setl.storage.repository.ImplicitRepositoryAdapter._ 26 | * 27 | * // This will convert dsOfTypeA (a Dataset[A]) to a Dataset[B] by using the previous implicit converter, then 28 | * // save the converted dataset into the data store 29 | * defaultRepository.convertAndSave(dsOfTypeA) 30 | * 31 | * defaultRepository.findAllAndConvert() 32 | * }}} 33 | * 34 | * @param repository an existing repository 35 | * @param converter a DatasetConverter (should be implemented by user) 36 | * @tparam A source type 37 | * @tparam B target type 38 | */ 39 | implicit class SparkRepositoryAdapter[A: TypeTag, B: TypeTag] 40 | (override val repository: SparkRepository[A]) 41 | (override implicit val converter: DatasetConverter[A, B]) extends RepositoryAdapter[Dataset[A], Dataset[B]] { 42 | 43 | private[this] val DBTypeSchema: StructType = StructAnalyser.analyseSchema[B] 44 | 45 | def findAllAndConvert(): Dataset[A] = { 46 | val data = repository.readDataFrame() 47 | converter.convertFrom(SchemaConverter.fromDF[B](data)) 48 | } 49 | 50 | def findByAndConvert(conditions: Set[Condition]): Dataset[A] = { 51 | val data = repository.readDataFrame(SparkRepository.handleConditions(conditions, DBTypeSchema)) 52 | converter.convertFrom(SchemaConverter.fromDF[B](data)) 53 | } 54 | 55 | def findByAndConvert(condition: Condition): Dataset[A] = { 56 | findByAndConvert(Set(condition)) 57 | } 58 | 59 | def convertAndSave(data: Dataset[A], suffix: Option[String] = None): SparkRepositoryAdapter.this.type = { 60 | val dsToSave = converter.convertTo(data) 61 | repository.configureConnector(dsToSave.toDF(), suffix) 62 | repository.writeDataFrame(SchemaConverter.toDF[B](dsToSave)) 63 | this 64 | } 65 | } 66 | 67 | } 68 | 
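// For reference, the `// implementation` placeholder in the Scaladoc example above could be filled in as
// follows. This is only a sketch: the case classes `A` and `B` and the extra `suffix` field are illustrative,
// and `spark.implicits._` must be in scope to provide the encoders used by `map`.
//
//   case class A(value: String)
//   case class B(value: String, suffix: String)
//
//   implicit val converter: DatasetConverter[A, B] = new DatasetConverter[A, B] {
//     // B -> A: drop the storage-only field
//     override def convertFrom(b: Dataset[B]): Dataset[A] = b.map(x => A(x.value))
//     // A -> B: add the storage-only field before saving
//     override def convertTo(a: Dataset[A]): Dataset[B] = a.map(x => B(x.value, "archived"))
//   }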
-------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/repository/Repository.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.repository 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import io.github.setl.storage.Condition 5 | import org.apache.spark.sql.{Column, DataFrame, Dataset} 6 | 7 | /** 8 | * The goal of Repository is to significantly reduce the amount of boilerplate code required to 9 | * implement data access layers for various persistence stores. 10 | * 11 | * @tparam DT data type 12 | */ 13 | @InterfaceStability.Evolving 14 | trait Repository[DT] { 15 | 16 | /** 17 | * Find data by giving a set of conditions 18 | * 19 | * @param conditions Set of [[Condition]] 20 | * @return 21 | */ 22 | def findBy(conditions: Set[Condition]): DT 23 | 24 | /** 25 | * Find data by giving a single condition 26 | * 27 | * @param condition a [[Condition]] 28 | * @return 29 | */ 30 | def findBy(condition: Condition): DT = this.findBy(Set(condition)) 31 | 32 | /** 33 | * Find data by giving a Spark sql column 34 | * 35 | * @param column a column object (could be chained) 36 | * @return 37 | */ 38 | def findBy(column: Column): DT = this.findBy(Condition(column)) 39 | 40 | /** 41 | * Retrieve all data 42 | * 43 | * @return 44 | */ 45 | def findAll(): DT 46 | 47 | /** 48 | * Save a [[Dataset]] into a data persistence store 49 | * 50 | * @param data data to be saved 51 | * @param suffix an optional string to separate data 52 | * @return this repository instance 53 | */ 54 | def save(data: DT, suffix: Option[String]): this.type 55 | 56 | 57 | /** 58 | * Update/Insert a [[Dataset]] into a data persistence store 59 | * 60 | * @param data data to be saved 61 | * @return this repository instance 62 | */ 63 | def update(data: DT): this.type 64 | 65 | /** 66 | * Drop the entire table/file/directory 67 | * @return this repository instance 68 | */ 69 | def drop(): this.type 70 | 71 | def delete(query: String): this.type 72 | 73 | /** 74 | * Create a data storage (e.g. table in a database or file/folder in a file system) with a suffix 75 | * 76 | * @param t data frame to be written 77 | * @param suffix suffix to be appended at the end of the data storage name 78 | */ 79 | def create(t: DataFrame, suffix: Option[String]): this.type 80 | 81 | /** 82 | * Create a data storage (e.g. table in a database or file/folder in a file system) 83 | * 84 | * @param t data frame to be written 85 | */ 86 | def create(t: DataFrame): this.type 87 | 88 | def vacuum(retentionHours: Double): this.type 89 | 90 | def vacuum(): this.type 91 | 92 | /** 93 | * Wait for the execution to stop. Any exceptions that occurs during the execution 94 | * will be thrown in this thread. 95 | */ 96 | def awaitTermination(): Unit 97 | 98 | /** 99 | * Wait for the execution to stop. Any exceptions that occurs during the execution 100 | * will be thrown in this thread. 101 | * 102 | * @param timeout time to wait in milliseconds 103 | * @return `true` if it's stopped; or throw the reported error during the execution; or `false` 104 | * if the waiting time elapsed before returning from the method. 105 | */ 106 | def awaitTerminationOrTimeout(timeout: Long): Boolean 107 | 108 | /** 109 | * Stops the execution of this query if it is running. 
110 | */ 111 | def stopStreaming(): this.type 112 | 113 | } 114 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/repository/RepositoryAdapter.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.repository 2 | 3 | import io.github.setl.Converter 4 | import io.github.setl.annotation.InterfaceStability 5 | import io.github.setl.storage.Condition 6 | 7 | /** 8 | * RepositoryAdapter could be used when one wants to save a `Dataset[A]` to a data store of type `B`. 9 | * 10 | * A `Repository[A]` and a `DatasetConverter[A, B]` must be provided (either explicitly or implicitly) 11 | * 12 | * @tparam A Type of the Repository 13 | * @tparam B Target data store type 14 | */ 15 | @InterfaceStability.Evolving 16 | trait RepositoryAdapter[A, B] { 17 | 18 | val repository: Repository[A] 19 | 20 | val converter: Converter 21 | 22 | def findAllAndConvert(): A 23 | 24 | def findByAndConvert(conditions: Set[Condition]): A 25 | 26 | def findByAndConvert(condition: Condition): A 27 | 28 | def convertAndSave(data: A, suffix: Option[String]): this.type 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/transformation/AbstractFactory.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.transformation 2 | 3 | trait AbstractFactory[A] { 4 | 5 | def read(): this.type 6 | 7 | def process(): this.type 8 | 9 | def write(): this.type 10 | 11 | def get(): A 12 | 13 | } 14 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/transformation/Factory.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.transformation 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import io.github.setl.internal.{HasDescription, Identifiable, Logging, Writable} 5 | import io.github.setl.util.ReflectUtils 6 | 7 | import scala.reflect.runtime.{universe => ru} 8 | 9 | /** 10 | * Factory could be used to manipulate data. 11 | * 12 | * A Factory is able to read data from a data source, process/transform it 13 | * and write it back to the storage.
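 * A minimal sketch of a concrete factory, for illustration only (the class name, the constructor parameter
 * and the word-count logic below are examples, not part of this API; in a real pipeline the input would
 * usually be injected through the `@Delivery` annotation rather than a constructor argument):
 * {{{
 *   class CountFactory(words: Dataset[String]) extends Factory[Long] {
 *
 *     private[this] var count: Long = 0L
 *
 *     override def read(): this.type = this                               // nothing to load here
 *     override def process(): this.type = { count = words.count(); this } // transform the input
 *     override def write(): this.type = this                              // nothing to persist here
 *     override def get(): Long = count                                    // expose the produced value
 *   }
 * }}}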
14 | * 15 | * @tparam A the type of object that the factory is supposed to produce 16 | */ 17 | @InterfaceStability.Evolving 18 | abstract class Factory[A: ru.TypeTag] extends AbstractFactory[A] 19 | with Logging 20 | with Identifiable 21 | with HasDescription 22 | with Writable { 23 | 24 | private[this] val _consumers: Seq[Class[_ <: Factory[_]]] = Seq.empty 25 | private[this] val _deliveryId: String = Deliverable.DEFAULT_ID 26 | 27 | /** Return the list of consumer class */ 28 | def consumers: Seq[Class[_ <: Factory[_]]] = this._consumers 29 | 30 | /** Return the delivery id of this factory */ 31 | def deliveryId: String = this._deliveryId 32 | 33 | /** Read data */ 34 | override def read(): this.type 35 | 36 | /** Process data */ 37 | override def process(): this.type 38 | 39 | /** Write data */ 40 | override def write(): this.type 41 | 42 | /** Get the processed data */ 43 | override def get(): A 44 | 45 | /** Create a new Deliverable object */ 46 | def getDelivery: Deliverable[A] = { 47 | new Deliverable[A](this.get()) 48 | .setProducer(this.getClass) 49 | .setConsumers(consumers) 50 | .setDeliveryId(deliveryId) 51 | } 52 | 53 | /** Get the type of deliverable payload */ 54 | def deliveryType(): ru.Type = ru.typeTag[A].tpe 55 | 56 | /** Describe the */ 57 | override def describe(): this.type = { 58 | logInfo(s"$getPrettyName will produce a ${ReflectUtils.getPrettyName(deliveryType())}") 59 | this 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/transformation/FactoryInput.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.transformation 2 | 3 | import io.github.setl.internal.HasType 4 | 5 | import scala.language.existentials 6 | import scala.reflect.runtime 7 | 8 | /** 9 | * Metadata of an input of a Factory. 10 | * 11 | * If a `FactoryDeliveryMetadata` represents a method, then it may be converted to multiple FactoryInputs as each of its 12 | * arguments will be abstracted as a `FactoryInput`. 
13 | * 14 | * @param runtimeType runtime type of the input 15 | * @param producer producer of the input 16 | * @param deliveryId delivery id of the input 17 | */ 18 | private[setl] case class FactoryInput(override val runtimeType: runtime.universe.Type, 19 | producer: Class[_], 20 | deliveryId: String = Deliverable.DEFAULT_ID, 21 | autoLoad: Boolean, 22 | optional: Boolean, 23 | consumer: Class[_ <: Factory[_]]) extends HasType 24 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/transformation/FactoryOutput.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.transformation 2 | 3 | import io.github.setl.internal.{HasDiagram, HasType} 4 | import io.github.setl.util.ReflectUtils 5 | import org.apache.spark.sql.Dataset 6 | 7 | import scala.reflect.runtime 8 | 9 | private[setl] case class FactoryOutput(override val runtimeType: runtime.universe.Type, 10 | consumer: Seq[Class[_ <: Factory[_]]], 11 | deliveryId: String = Deliverable.DEFAULT_ID, 12 | finalOutput: Boolean = false, 13 | external: Boolean = false) extends HasType with HasDiagram { 14 | 15 | override def diagramId: String = { 16 | val finalSuffix = if (finalOutput) "Final" else "" 17 | 18 | val externalSuffix = if (external) "External" else "" 19 | 20 | super.formatDiagramId(ReflectUtils.getPrettyName(runtimeType), deliveryId, finalSuffix + externalSuffix) 21 | } 22 | 23 | private[this] val typeToExclude = List( 24 | "String", "Double", "Int", "Float", "Long" 25 | ) 26 | 27 | private[this] def payloadField: List[String] = { 28 | val isDataset = this.runtimeType.baseClasses.head.asClass == runtime.universe.symbolOf[Dataset[_]].asClass 29 | 30 | val isCaseClass = { 31 | runtimeType.baseClasses.head.asClass.isCaseClass 32 | } 33 | 34 | if (isDataset) { 35 | if (this.runtimeType.typeArgs.isEmpty) { 36 | // DataFrame 37 | List.empty 38 | } else { 39 | // Dataset 40 | val datasetTypeArgFields = super.getTypeArgList(this.runtimeType.typeArgs.head) 41 | datasetTypeArgFields.map { 42 | i => s">${i.name}: ${ReflectUtils.getPrettyName(i.typeSignature)}" 43 | } 44 | } 45 | 46 | } else if (isCaseClass) { 47 | val typeArgFields = super.getTypeArgList(this.runtimeType) 48 | typeArgFields.map { 49 | i => s"-${i.name}: ${ReflectUtils.getPrettyName(i.typeSignature)}" 50 | } 51 | 52 | } else { 53 | List.empty 54 | } 55 | } 56 | 57 | override def toDiagram: String = { 58 | 59 | val fields = this.payloadField.mkString("\n ") 60 | 61 | s"""class ${this.diagramId} { 62 | | <<${ReflectUtils.getPrettyName(this.runtimeType)}>> 63 | | $fields 64 | |} 65 | |""".stripMargin 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/transformation/MLTransformer.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.transformation 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import org.apache.hadoop.fs.Path 5 | import org.apache.spark.ml.Model 6 | 7 | /** 8 | * A MLTransformer is a basic transformer with a ML model and ML-related functionality. 
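 * A possible implementation, shown only as a sketch (it assumes Spark ML's `KMeans`/`KMeansModel` from
 * `org.apache.spark.ml.clustering` and a `DataFrame` input; the class name and model path are illustrative):
 * {{{
 *   class ClusteringTransformer(data: DataFrame) extends MLTransformer[DataFrame, KMeansModel] {
 *     override val modelPath: Path = new Path("/tmp/models/kmeans")
 *     private[this] var result: DataFrame = _
 *
 *     override def fit(): this.type = { model = new KMeans().setK(3).fit(data); this }
 *     override def saveModel(): this.type = { model.write.overwrite().save(modelPath.toString); this }
 *     override def loadModel(): this.type = { model = KMeansModel.load(modelPath.toString); this }
 *     override def transform(): this.type = { result = model.transform(data); this }
 *     override def transformed: DataFrame = result
 *   }
 * }}}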
9 | * 10 | * @tparam T Data type of the transformer 11 | */ 12 | @InterfaceStability.Evolving 13 | trait MLTransformer[T, M <: Model[_]] extends Transformer[T] { 14 | 15 | var model: M = _ 16 | val modelPath: Path 17 | var overwriteModel: Boolean = false 18 | 19 | /** Fit a model with the current data */ 20 | def fit(): MLTransformer.this.type 21 | 22 | /** Load a model from a given path */ 23 | def loadModel(): MLTransformer.this.type 24 | 25 | /** Save the current model */ 26 | def saveModel(): MLTransformer.this.type 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/transformation/Transformer.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.transformation 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import io.github.setl.internal.{Identifiable, Logging} 5 | 6 | /** 7 | * A transformer can transform data into a type A 8 | * 9 | * @tparam T : Type of output data 10 | */ 11 | @InterfaceStability.Evolving 12 | trait Transformer[T] extends Logging with Identifiable { 13 | 14 | /** 15 | * Get the transformed data 16 | * 17 | * @return 18 | */ 19 | def transformed: T 20 | 21 | /** 22 | * Transform the current data 23 | */ 24 | def transform(): this.type 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/util/ExpectedDeliverable.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.util 2 | 3 | import io.github.setl.transformation.Factory 4 | 5 | case class ExpectedDeliverable(deliverableType: String, 6 | deliveryId: String, 7 | producer: Class[_], 8 | consumer: Class[_ <: Factory[_]]) { 9 | 10 | } 11 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/util/FilterImplicits.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.util 2 | 3 | import io.github.setl.internal.Logging 4 | import io.github.setl.storage.Condition 5 | import org.apache.spark.sql.Dataset 6 | 7 | object FilterImplicits extends Logging { 8 | 9 | implicit class DatasetFilterByCondition[T](dataset: Dataset[T]) { 10 | 11 | def filter(conditions: Set[Condition]): Dataset[T] = { 12 | dataset.filter(conditions.toSqlRequest) 13 | } 14 | 15 | def filter(condition: Condition): Dataset[T] = { 16 | dataset.filter(condition.toSqlRequest) 17 | } 18 | } 19 | 20 | implicit class ConditionsToRequest(conditions: Set[Condition]) { 21 | 22 | /** 23 | * Convert a [[Set]] of [[io.github.setl.storage.Condition]] objects to a spark SQL query string 24 | * 25 | * @throws IllegalArgumentException if a datetime/date filter doesn't have a value with correct format, 26 | * an illegal argument exception will be thrown 27 | * @return String 28 | */ 29 | @throws[IllegalArgumentException] 30 | def toSqlRequest: String = { 31 | val query = conditions 32 | .filter(row => row.value.isDefined) 33 | .map(_.toSqlRequest) 34 | .filter(_ != null) 35 | .mkString(" AND ") 36 | query 37 | } 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/util/HasSparkSession.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.util 2 | 3 | import org.apache.spark.SparkException 4 | import org.apache.spark.sql.SparkSession 5 | 6 
| trait HasSparkSession { 7 | 8 | val spark: SparkSession = SparkSession.getActiveSession match { 9 | case Some(ss) => ss 10 | case _ => throw new SparkException("No active Spark session") 11 | } 12 | 13 | def setJobDescription(desc: String): Unit = spark.sparkContext.setJobDescription(desc) 14 | 15 | def setJobGroup(group: String): Unit = spark.sparkContext.setJobGroup(group, null) 16 | 17 | def setJobGroup(group: String, description: String): Unit = spark.sparkContext.setJobGroup(group, description) 18 | 19 | def clearJobGroup(): Unit = spark.sparkContext.clearJobGroup() 20 | 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/util/MermaidUtils.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.util 2 | 3 | import java.util.Base64 4 | 5 | import org.json4s.jackson.Serialization 6 | 7 | private[setl] object MermaidUtils { 8 | 9 | /** 10 | * Needed to convert a Map object into JSON String 11 | */ 12 | implicit val formats: org.json4s.DefaultFormats.type = org.json4s.DefaultFormats 13 | 14 | /** 15 | * Mermaid diagram code header for pretty print 16 | */ 17 | val mermaidHeader = "--------- START OF MERMAID DIAGRAM ---------" 18 | 19 | /** 20 | * Mermaid diagram code footer for pretty print 21 | */ 22 | val mermaidFooter = "---------- END OF MERMAID DIAGRAM ----------" 23 | 24 | val summaryString = "You can copy the previous code to a markdown viewer that supports Mermaid." 25 | 26 | val liveEditorMessage = "Otherwise you can try the live editor: " 27 | 28 | val linkPrefix = "https://mermaid-js.github.io/mermaid-live-editor/#/edit/" 29 | 30 | /** 31 | * Encode the Mermaid diagram to Base64 32 | * 33 | * @param mermaidDiagram Mermaid diagram code 34 | * @return the Base64 of the diagram code 35 | */ 36 | def encodeMermaid(mermaidDiagram: String): String = { 37 | val mermaidMap = Map("code" -> mermaidDiagram, "mermaid" -> Map("theme" -> "default")) 38 | val jsonString = Serialization.write(mermaidMap).replace("\\r", "") 39 | val encoded = Base64.getUrlEncoder.encode(jsonString.getBytes()) 40 | new String(encoded) 41 | } 42 | 43 | /** 44 | * Message to be printed for live editor preview 45 | * 46 | * @param code diagram base64 code 47 | * @return Full message for live editor preview 48 | */ 49 | def mermaidDiagramLink(code: String): String = { 50 | this.liveEditorMessage + this.linkPrefix + code 51 | } 52 | 53 | /** 54 | * Format output of Mermaid diagram 55 | * 56 | * @param mermaidDiagram Mermaid diagram code 57 | * @return Pretty formatted output of Mermaid diagram with direct link 58 | */ 59 | def printMermaid(mermaidDiagram: String): Unit = { 60 | val encoded = this.encodeMermaid(mermaidDiagram) 61 | val linkMessage = this.mermaidDiagramLink(encoded) 62 | 63 | 64 | 65 | println( 66 | s"""$mermaidHeader 67 | |$mermaidDiagram 68 | |$mermaidFooter 69 | | 70 | |$summaryString 71 | | 72 | |$linkMessage 73 | |""".stripMargin 74 | ) 75 | } 76 | 77 | } 78 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/util/ReflectUtils.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.util 2 | 3 | import scala.reflect.runtime 4 | 5 | object ReflectUtils { 6 | 7 | def getPrettyName(tpe: runtime.universe.Type): String = tpe.toString.split("\\[").map(getPrettyName).mkString("[") 8 | 9 | def getPrettyName(cls: Class[_]): String = 
getPrettyName(cls.getCanonicalName) 10 | 11 | def getPrettyName(canonicalName: String): String = canonicalName.split("\\.").last 12 | 13 | } 14 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/util/SparkUtils.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.util 2 | 3 | import org.apache.spark.SparkException 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 6 | import org.apache.spark.sql.execution.command.ExplainCommand 7 | 8 | private[setl] object SparkUtils { 9 | 10 | /** 11 | * Check if the current spark version is superior than the required version 12 | * @param requiredVersion minimum version of spark 13 | * @return true if the current spark is newer than the required version 14 | */ 15 | def checkSparkVersion(requiredVersion: String): Boolean = { 16 | val currentVersion = SparkSession.getActiveSession match { 17 | case Some(ss) => ss.version 18 | case _ => throw new SparkException("No active Spark Session") 19 | } 20 | val targetVer = requiredVersion.replace(".", "") + "000" 21 | val thisVer = currentVersion.replace(".", "") + "000" 22 | thisVer.take(3).toInt >= targetVer.take(3).toInt 23 | } 24 | 25 | def withSparkVersion[T](minVersion: String)(fun: Boolean => T): T = { 26 | try { 27 | fun(checkSparkVersion(minVersion)) 28 | } catch { 29 | case e: NoSuchMethodException => 30 | throw new NoSuchMethodException("Cannot instantiate ExplainCommand. " + 31 | s"Please check the implementation of its constructor in Spark $minVersion") 32 | } 33 | } 34 | 35 | def explainCommandWithExtendedMode(logicalPlan: LogicalPlan): ExplainCommand = { 36 | withSparkVersion("3.0.0") { newer => 37 | if (newer) { 38 | val extendedMode = Class.forName("org.apache.spark.sql.execution.ExtendedMode$") 39 | .getField("MODULE$") 40 | .get(Class.forName("org.apache.spark.sql.execution.ExtendedMode$")) 41 | .asInstanceOf[Object] 42 | 43 | classOf[ExplainCommand] 44 | .getConstructor(classOf[LogicalPlan], Class.forName("org.apache.spark.sql.execution.ExplainMode")) 45 | .newInstance(logicalPlan, extendedMode) 46 | } else { 47 | classOf[ExplainCommand] 48 | .getConstructor(classOf[LogicalPlan], classOf[Boolean], classOf[Boolean], classOf[Boolean]) 49 | .newInstance(logicalPlan, true.asInstanceOf[java.lang.Boolean], false.asInstanceOf[java.lang.Boolean], false.asInstanceOf[java.lang.Boolean]) 50 | } 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/util/TypesafeConfigUtils.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.util 2 | 3 | import io.github.setl.enums.Storage 4 | import com.typesafe.config.{Config, ConfigException} 5 | 6 | object TypesafeConfigUtils { 7 | 8 | @throws[com.typesafe.config.ConfigException] 9 | def getAs[T](config: Config, path: String)(implicit getter: ConfigGetter[T]): Option[T] = getter.get(config, path) 10 | 11 | private[this] def _get[T](path: String): (String => T) => Option[T] = (fun: String => T) => { 12 | try { 13 | Option(fun(path)) 14 | } catch { 15 | case _: ConfigException.Missing => None 16 | case e: ConfigException.WrongType => throw e 17 | } 18 | } 19 | 20 | private[setl] implicit val stringGetter: ConfigGetter[String] = new ConfigGetter[String] { 21 | override def get(config: Config, path: String): Option[String] = { 22 | 
_get[String](path)(config.getString) 23 | } 24 | } 25 | 26 | private[setl] implicit val intGetter: ConfigGetter[Int] = new ConfigGetter[Int] { 27 | override def get(config: Config, path: String): Option[Int] = { 28 | _get[Int](path)(config.getInt) 29 | } 30 | } 31 | 32 | private[setl] implicit val longGetter: ConfigGetter[Long] = new ConfigGetter[Long] { 33 | override def get(config: Config, path: String): Option[Long] = { 34 | _get[Long](path)(config.getLong) 35 | } 36 | } 37 | 38 | private[setl] implicit val floatGetter: ConfigGetter[Float] = new ConfigGetter[Float] { 39 | override def get(config: Config, path: String): Option[Float] = { 40 | _get[Float](path)(x => config.getString(x).toFloat) 41 | } 42 | } 43 | 44 | private[setl] implicit val doubleGetter: ConfigGetter[Double] = new ConfigGetter[Double] { 45 | override def get(config: Config, path: String): Option[Double] = { 46 | _get[Double](path)(config.getDouble) 47 | } 48 | } 49 | 50 | private[setl] implicit val booleanGetter: ConfigGetter[Boolean] = new ConfigGetter[Boolean] { 51 | override def get(config: Config, path: String): Option[Boolean] = { 52 | _get[Boolean](path)(config.getBoolean) 53 | } 54 | } 55 | 56 | private[setl] implicit val listGetter: ConfigGetter[Array[AnyRef]] = new ConfigGetter[Array[AnyRef]] { 57 | override def get(config: Config, path: String): Option[Array[AnyRef]] = { 58 | _get[Array[AnyRef]](path)(x => config.getList(x).unwrapped().toArray()) 59 | } 60 | } 61 | 62 | private[setl] implicit val StorageGetter: ConfigGetter[Storage] = new ConfigGetter[Storage] { 63 | override def get(config: Config, path: String): Option[Storage] = { 64 | _get[Storage](path)(x => Storage.valueOf(config.getString(x))) 65 | } 66 | } 67 | 68 | def getList(config: Config, path: String): Option[Array[AnyRef]] = { 69 | listGetter.get(config, path) 70 | } 71 | 72 | def getMap(config: Config): Map[String, String] = { 73 | import scala.collection.JavaConverters._ 74 | config.entrySet().asScala.map(x => x.getKey -> x.getValue.unwrapped().toString).toMap 75 | } 76 | 77 | def isDefined(config: Config, path: String): Boolean = { 78 | try { 79 | config.getAnyRef(path) != null 80 | } catch { 81 | case _: ConfigException => false 82 | } 83 | } 84 | 85 | private[setl] trait ConfigGetter[T] { 86 | def get(config: Config, path: String): Option[T] 87 | } 88 | 89 | } 90 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/workflow/DAG.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.workflow 2 | 3 | import io.github.setl.internal.HasDiagram 4 | import io.github.setl.transformation.{Factory, FactoryDeliveryMetadata} 5 | import io.github.setl.util.MermaidUtils 6 | 7 | private[workflow] case class DAG(nodes: Set[Node], flows: Set[Flow]) extends HasDiagram { 8 | 9 | def describe(): Unit = { 10 | println("------------- Data Transformation Summary -------------") 11 | checkEmpty(nodes) 12 | nodes.toList.sortBy(_.stage).foreach(_.describe()) 13 | 14 | println("------------------ Data Flow Summary ------------------") 15 | checkEmpty(flows) 16 | flows.toList.sortBy(_.stage).foreach(_.describe()) 17 | } 18 | 19 | private[this] def checkEmpty(input: Set[_]): Unit = { 20 | if (input.isEmpty) println("Empty\n") 21 | } 22 | 23 | /** 24 | * Find all the setter methods of the given Factory 25 | * 26 | * @param factory an instantiated Factory 27 | * @return a list of [[io.github.setl.transformation.FactoryDeliveryMetadata]] 28 | */ 29 | 
def findDeliveryMetadata(factory: Factory[_]): List[FactoryDeliveryMetadata] = { 30 | nodes.find(n => n.factoryUUID == factory.getUUID).get.setters 31 | } 32 | 33 | /** Generate the diagram */ 34 | override def toDiagram: String = { 35 | val nodeDiagrams = nodes.map(_.toDiagram).mkString("\n") 36 | val flowDiagrams = flows.map(_.toDiagram).mkString("\n") 37 | val externalNodeDiagrams = flows.filter(_.from.factoryClass == classOf[External]) 38 | .map(_.from.output.toDiagram).mkString("\n") 39 | 40 | s"""classDiagram 41 | |$nodeDiagrams 42 | |$externalNodeDiagrams 43 | |$flowDiagrams 44 | |""".stripMargin 45 | } 46 | 47 | /** Display the diagram */ 48 | override def showDiagram(): Unit = MermaidUtils.printMermaid(this.toDiagram) 49 | 50 | /** Get the diagram ID */ 51 | override def diagramId: String = throw new NotImplementedError("DAG doesn't have diagram id") 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/workflow/External.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.workflow 2 | 3 | import java.util.UUID 4 | 5 | import io.github.setl.transformation.Factory 6 | 7 | sealed abstract class External private extends Factory[External] 8 | 9 | /** 10 | * Singleton for external data source 11 | */ 12 | object External { 13 | val NODE: Node = Node( 14 | classOf[External], 15 | UUID.fromString("00000000-0000-0000-0000-000000000000"), 16 | -1, 17 | List(), 18 | null 19 | ) 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/workflow/Flow.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.workflow 2 | 3 | import io.github.setl.internal.{HasDescription, HasDiagram} 4 | import io.github.setl.transformation.Deliverable 5 | import io.github.setl.util.ReflectUtils 6 | 7 | import scala.reflect.runtime 8 | 9 | /** 10 | * Flow is a representation of the data transfer in a Pipeline. 
11 | * 12 | * @param from origin node of the transfer 13 | * @param to destination node of the transfer 14 | */ 15 | private[workflow] case class Flow(from: Node, to: Node) extends HasDescription with HasDiagram { 16 | 17 | def payload: runtime.universe.Type = from.output.runtimeType 18 | 19 | def stage: Int = from.stage 20 | 21 | def deliveryId: String = from.output.deliveryId 22 | 23 | override def describe(): this.type = { 24 | if (deliveryId != Deliverable.DEFAULT_ID) { 25 | println(s"Delivery id : $deliveryId") 26 | } 27 | println(s"Stage : $stage") 28 | println(s"Direction : ${from.getPrettyName} ==> ${to.getPrettyName}") 29 | println(s"PayLoad : ${ReflectUtils.getPrettyName(payload)}") 30 | println("----------------------------------------------------------") 31 | this 32 | } 33 | 34 | override def toDiagram: String = { 35 | s"${to.diagramId} <|-- ${from.output.diagramId} : Input".stripMargin 36 | } 37 | 38 | override def diagramId: String = "" 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/workflow/PipelineOptimizer.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.workflow 2 | 3 | trait PipelineOptimizer { 4 | 5 | def setExecutionPlan(dag: DAG): this.type 6 | 7 | def optimize(stages: Iterable[Stage]): Array[Stage] 8 | 9 | def getOptimizedExecutionPlan: DAG 10 | 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/workflow/SimplePipelineOptimizer.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.workflow 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import io.github.setl.internal.Logging 5 | 6 | import scala.annotation.tailrec 7 | 8 | @InterfaceStability.Unstable 9 | class SimplePipelineOptimizer(val parallelism: Int = 4) extends PipelineOptimizer with Logging { 10 | 11 | private[this] var _executionPlan: DAG = _ 12 | lazy val optExecutionPlan: DAG = optimize() 13 | 14 | override def getOptimizedExecutionPlan: DAG = optExecutionPlan 15 | 16 | override def setExecutionPlan(dag: DAG): SimplePipelineOptimizer.this.type = { 17 | this._executionPlan = dag 18 | this 19 | } 20 | 21 | private[this] def optimize(): DAG = { 22 | val nodes = _executionPlan.nodes.toList.sortBy(_.stage) 23 | val oldDag = _executionPlan.copy() 24 | nodes.foldLeft[DAG](oldDag) { 25 | case (dag, node) => updateNode(node, dag) 26 | } 27 | } 28 | 29 | override def optimize(stages: Iterable[Stage]): Array[Stage] = { 30 | val factories = stages.flatMap(_.factories) 31 | 32 | optExecutionPlan.nodes.groupBy(_.stage).map { 33 | case (id, nodes) => 34 | val stage = new Stage().setStageId(id) 35 | 36 | val factoryUUIDs = nodes.map(_.factoryUUID) 37 | 38 | factories 39 | .filter(f => factoryUUIDs.contains(f.getUUID)) 40 | .foreach(stage.addFactory) 41 | 42 | stage 43 | }.toArray.sortBy(_.stageId) 44 | } 45 | 46 | private[this] def flowsOf(node: Node, dag: DAG): Set[Flow] = { 47 | dag.flows.filter(_.to.factoryUUID == node.factoryUUID) 48 | } 49 | 50 | private[this] def updateDag(newNode: Node, dag: DAG): DAG = { 51 | logDebug(s"Update DAG for node ${newNode.getPrettyName}") 52 | val oldNode = dag.nodes.find(_.factoryUUID == newNode.factoryUUID).get 53 | 54 | val startingFlows = dag.flows 55 | .filter(_.from == oldNode) 56 | .map(_.copy(from = newNode)) 57 | 58 | val endingFlows = dag.flows 59 | .filter(_.to == oldNode) 60 | 
.map(_.copy(to = newNode)) 61 | 62 | val otherFlows = dag.flows.filter(_.from != oldNode).filter(_.to != oldNode) 63 | 64 | val otherNodes = dag.nodes.filter(_ != oldNode) 65 | 66 | DAG(otherNodes + newNode, startingFlows ++ endingFlows ++ otherFlows) 67 | } 68 | 69 | @tailrec 70 | private[this] def validateStage(newStageID: Int, dag: DAG): Int = { 71 | val nodeCount = dag.nodes.count(_.stage == newStageID) 72 | if (nodeCount < parallelism) { 73 | logDebug(s"Valid stage ID: $newStageID") 74 | newStageID 75 | } else { 76 | validateStage(newStageID + 1, dag) 77 | } 78 | } 79 | 80 | private[this] def updateNode(oldNode: Node, dag: DAG): DAG = { 81 | logDebug(s"Optimize node: ${oldNode.getPrettyName} of stage ${oldNode.stage}") 82 | val currentDag = dag.copy() 83 | val flows = flowsOf(oldNode, dag) 84 | 85 | val maxInputStage = flows.size match { 86 | case 0 => 0 87 | case _ => flows.map(_.stage).max + 1 88 | } 89 | 90 | logDebug(s"Max input stage of ${oldNode.getPrettyName}: $maxInputStage") 91 | 92 | val validStage = validateStage(maxInputStage, dag) 93 | 94 | val newNode = oldNode.copy(stage = validStage) 95 | 96 | updateDag(newNode, currentDag) 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/test/resources/dynamodb.conf: -------------------------------------------------------------------------------- 1 | dynamodb { 2 | connector { 3 | storage = "DYNAMODB" 4 | region = "eu-west-1" 5 | table = "test-table" 6 | saveMode = "Overwrite" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /src/test/resources/local.conf: -------------------------------------------------------------------------------- 1 | include "application.conf" 2 | 3 | test.string = "foo" 4 | test.variable = ${?myJvmProperty} 5 | 6 | setl.config { 7 | spark { 8 | spark.app.name = "my_app" 9 | spark.sql.shuffle.partitions = "1000" 10 | } 11 | } 12 | 13 | setl.config_2 { 14 | spark.app.name = "my_app_2" 15 | spark.sql.shuffle.partitions = "2000" 16 | } 17 | 18 | usages.config { 19 | spark { 20 | spark.app.name = "usages_app" 21 | spark.cassandra.connection.host = "cassandraHost" 22 | } 23 | usages = ["cassandra"] 24 | } 25 | 26 | context.spark.spark.sql.shuffle.partitions = 600 27 | 28 | csv_dc_context2 { 29 | storage = "CSV" 30 | path = "src/test/resources/test_config_csv_dc_context2" 31 | inferSchema = "true" 32 | delimiter = ";" 33 | header = "true" 34 | saveMode = "Append" 35 | } 36 | 37 | csv_dc_context { 38 | storage = "CSV" 39 | path = "src/test/resources/test_config_csv_dc_context" 40 | inferSchema = "true" 41 | delimiter = ";" 42 | header = "true" 43 | saveMode = "Append" 44 | } 45 | 46 | parquet_dc_context { 47 | storage = "PARQUET" 48 | path = "src/test/resources/test_parquet_dc_context" // must be absolute path 49 | table = "test_config2222" 50 | saveMode = "Append" 51 | } 52 | 53 | csv_dc_context_consumer { 54 | storage = "CSV" 55 | path = "src/test/resources/test_config_csv_dc_context_consumer" 56 | inferSchema = "true" 57 | delimiter = ";" 58 | header = "true" 59 | saveMode = "Overwrite" 60 | } 61 | 62 | parquet_dc_context_consumer { 63 | storage = "PARQUET" 64 | path = "src/test/resources/test_parquet_dc_context_consumer" // must be absolute path 65 | saveMode = "Append" 66 | } 67 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | 
log4j.rootLogger=warn, stdout 3 | # Captures all logs inside jcdecaux airport package 4 | log4j.logger.com.jcdecaux=DEBUG, stdout 5 | log4j.additivity.com.jcdecaux=false 6 | # Decrease the verbosity of external libraries logging 7 | log4j.logger.org.apache=WARN, stdout 8 | log4j.additivity.org.apache=false 9 | log4j.logger.com.datastax=INFO, stdout 10 | log4j.additivity.com.datastax=false 11 | log4j.logger.io.netty=WARN, stdout 12 | log4j.additivity.io.netty=false 13 | log4j.logger.org.apache.spark.sql=WARN, stdout 14 | log4j.additivity.org.apache.spark.sql=false 15 | log4j.logger.org.apache.spark.core=WARN, stdout 16 | log4j.additivity.org.apache.spark.core=false 17 | # Direct log messages to stdout 18 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 19 | log4j.appender.stdout.Target=System.out 20 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 21 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{3}:%L - %m%n 22 | -------------------------------------------------------------------------------- /src/test/resources/myconf.conf: -------------------------------------------------------------------------------- 1 | my_test_variable = "haha" -------------------------------------------------------------------------------- /src/test/resources/streaming_test_resources/input/text.txt: -------------------------------------------------------------------------------- 1 | Structured Streaming is a scalable and fault-tolerant stream processing engine built on the Spark SQL engine. You can express your streaming computation the same way you would express a batch computation on static data. The Spark SQL engine will take care of running it incrementally and continuously and updating the final result as streaming data continues to arrive. You can use the Dataset/DataFrame API in Scala, Java, Python or R to express streaming aggregations, event-time windows, stream-to-batch joins, etc. The computation is executed on the same optimized Spark SQL engine. Finally, the system ensures end-to-end exactly-once fault-tolerance guarantees through checkpointing and Write-Ahead Logs. In short, Structured Streaming provides fast, scalable, fault-tolerant, end-to-end exactly-once stream processing without the user having to reason about streaming. 2 | Internally, by default, Structured Streaming queries are processed using a micro-batch processing engine, which processes data streams as a series of small batch jobs thereby achieving end-to-end latencies as low as 100 milliseconds and exactly-once fault-tolerance guarantees. However, since Spark 2.3, we have introduced a new low-latency processing mode called Continuous Processing, which can achieve end-to-end latencies as low as 1 millisecond with at-least-once guarantees. Without changing the Dataset/DataFrame operations in your queries, you will be able to choose the mode based on your application requirements. 3 | In this guide, we are going to walk you through the programming model and the APIs. We are going to explain the concepts mostly using the default micro-batch processing model, and then later discuss Continuous Processing model. First, let’s start with a simple example of a Structured Streaming query - a streaming word count. 
4 | -------------------------------------------------------------------------------- /src/test/resources/streaming_test_resources/input2/input2.csv: -------------------------------------------------------------------------------- 1 | text 2 | "hello" 3 | "world" 4 | -------------------------------------------------------------------------------- /src/test/resources/streaming_test_resources/streaming.conf: -------------------------------------------------------------------------------- 1 | structured_streaming_connector_input { 2 | storage = "STRUCTURED_STREAMING" 3 | format = "text" 4 | path = "src/test/resources/streaming_test_resources/input" 5 | } 6 | 7 | structured_streaming_connector_output { 8 | storage = "STRUCTURED_STREAMING" 9 | format = "csv" 10 | header = "false" 11 | outputMode = "append" 12 | checkpointLocation = "src/test/resources/streaming_test_resources/output/checkpoint_2" 13 | path = "src/test/resources/streaming_test_resources/output/2" 14 | } 15 | 16 | structured_streaming_connector_input_repository { 17 | storage = "STRUCTURED_STREAMING" 18 | format = "csv" 19 | schema = "text STRING" // must be provided for streaming 20 | header = "true" 21 | path = "src/test/resources/streaming_test_resources/input2" 22 | } 23 | 24 | structured_streaming_connector_output_repository { 25 | storage = "STRUCTURED_STREAMING" 26 | format = "csv" 27 | header = "true" 28 | outputMode = "append" 29 | checkpointLocation = "src/test/resources/streaming_test_resources/output/checkpoint_3" 30 | path = "src/test/resources/streaming_test_resources/output/3" 31 | } 32 | -------------------------------------------------------------------------------- /src/test/resources/test-archiver/test-input-file.txt: -------------------------------------------------------------------------------- 1 | Hello, world! 
-------------------------------------------------------------------------------- /src/test/resources/test-archiver/test-input/col3=c/file1-1-1.csv: -------------------------------------------------------------------------------- 1 | col1,col2 2 | "A","a" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-archiver/test-input/col3=c/file1-2-1.csv: -------------------------------------------------------------------------------- 1 | col1,col2 2 | "B","b" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-archiver/test-input/col3=c/file1-2-2.csv: -------------------------------------------------------------------------------- 1 | col1,col2 2 | "C","c" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-archiver/test-input/col3=cc/file2-1.csv: -------------------------------------------------------------------------------- 1 | col1,col2 2 | "D","d" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-json.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "col1": "a", 4 | "col2": 1, 5 | "col3": { 6 | "col3-1": "haha", 7 | "col3-2": "hehe" 8 | }, 9 | "col4": true, 10 | "col5": 1.1 11 | }, 12 | { 13 | "col1": "b", 14 | "col2": 2, 15 | "col3": { 16 | "col3-1": "hahahaha", 17 | "col3-2": "hehehehe" 18 | }, 19 | "col4": true, 20 | "col5": 1.2 21 | }, 22 | { 23 | "col1": "c", 24 | "col2": 3, 25 | "col3": { 26 | "col3-1": "hahahahahaha", 27 | "col3-2": "hehehehehehe" 28 | }, 29 | "col4": false, 30 | "col5": 1.3 31 | }, 32 | { 33 | "col1": "d", 34 | "col2": 4, 35 | "col3": { 36 | "col3-1": "hahahahahahahaha", 37 | "col3-2": "hehehehehehehehe" 38 | }, 39 | "col4": false, 40 | "col5": 1.4 41 | } 42 | ] -------------------------------------------------------------------------------- /src/test/resources/test-list-files/file1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "D", "d" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-list-files/subdir1/subsubdir1/file1-1-1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "A", "a" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-list-files/subdir1/subsubdir1/wrongfile1-1-1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "A", "a" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-list-files/subdir1/subsubdir2/file1-2-1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "B", "b" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-list-files/subdir1/subsubdir2/file1-2-2.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "C", "c" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-list-files/subdir2/file2-1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "D", "d" 3 | -------------------------------------------------------------------------------- 
/src/test/resources/test-list-files2/col3=c/file1-1-1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "A", "a" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-list-files2/col3=c/file1-2-1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "B", "b" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-list-files2/col3=c/file1-2-2.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "C", "c" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-list-files2/col3=cc/file2-1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "D", "d" 3 | -------------------------------------------------------------------------------- /src/test/resources/test_base_path.csv: -------------------------------------------------------------------------------- 1 | col1,col2 2 | a,b 3 | c,d -------------------------------------------------------------------------------- /src/test/resources/test_connector_builder.conf: -------------------------------------------------------------------------------- 1 | customConnector { 2 | storage = "OTHER" 3 | class = "io.github.setl.CustomConnector" 4 | } 5 | -------------------------------------------------------------------------------- /src/test/resources/test_priority.conf: -------------------------------------------------------------------------------- 1 | my.value = "haha" 2 | 3 | setl.config { 4 | spark { 5 | spark.master = "local" 6 | spark.app.name = "my_app_2" 7 | spark.sql.shuffle.partitions = "1000" 8 | } 9 | } 10 | 11 | setl.config_2 { 12 | spark { 13 | spark.master = "local" 14 | spark.app.name = "my_app_context_2" 15 | spark.sql.shuffle.partitions = "2000" 16 | } 17 | } 18 | 19 | test { 20 | string = "abc" 21 | int = 1 22 | long = 2 23 | float = 3.1 24 | float2 = "3.1" 25 | double = 4.4 26 | boolean = false 27 | boolean2 = "true" 28 | list = [1,2,3] 29 | listFloat = [1.2,2,3] 30 | listString = ["1.2","2","3"] 31 | 32 | map { 33 | v1 = "a" 34 | v2 = "b" 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/test/resources/test_schema_converter.csv: -------------------------------------------------------------------------------- 1 | col1,col2,col3,col4 2 | 1,"1","A","a" 3 | 2,"2","B","b" -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/SparkTestUtils.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl 2 | 3 | import io.github.setl.util.SparkUtils 4 | import org.apache.spark.SparkContext 5 | 6 | private[setl] object SparkTestUtils { 7 | 8 | def getActiveSparkContext: Option[SparkContext] = { 9 | val method = SparkContext.getClass.getDeclaredMethod("getActive") 10 | method.setAccessible(true) 11 | method.invoke(SparkContext).asInstanceOf[Option[SparkContext]] 12 | } 13 | 14 | def checkSparkVersion(requiredVersion: String): Boolean = SparkUtils.checkSparkVersion(requiredVersion) 15 | 16 | def testConsolePrint(test: => Any, expected: String): Boolean = { 17 | val stream = new java.io.ByteArrayOutputStream() 18 | Console.withOut(stream)(test) 19 | val result = stream.toString().trim() 20 | result == expected 21 | } 22 | 
} 22 | 23 | } 24 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/TestObject.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl 2 | 3 | import java.sql.{Date, Timestamp} 4 | 5 | import io.github.setl.config.Conf 6 | import io.github.setl.internal.CanDrop 7 | import io.github.setl.storage.connector.ConnectorInterface 8 | import com.typesafe.config.Config 9 | import org.apache.spark.sql.DataFrame 10 | 11 | 12 | case class TestObject(partition1: Int, partition2: String, clustering1: String, value: Long) 13 | 14 | case class TestObject3(partition1: Int, partition2: String, clustering1: String, value: Long, value2: String) 15 | 16 | case class TestObject2(col1: String, col2: Int, col3: Double, col4: Timestamp, col5: Date, col6: Long) 17 | 18 | class CustomConnector extends ConnectorInterface with CanDrop { 19 | override def setConf(conf: Conf): Unit = () 20 | 21 | override def read(): DataFrame = { 22 | import spark.implicits._ 23 | Seq(1, 2, 3).toDF("id") 24 | } 25 | 26 | override def write(t: DataFrame, suffix: Option[String]): Unit = logDebug("Write with suffix") 27 | 28 | override def write(t: DataFrame): Unit = logDebug("Write") 29 | 30 | /** 31 | * Drop the entire table. 32 | */ 33 | override def drop(): Unit = logDebug("drop") 34 | } 35 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/config/ConfLoaderSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.config 2 | 3 | import com.typesafe.config.ConfigFactory 4 | import org.scalatest.funsuite.AnyFunSuite 5 | 6 | class ConfLoaderSuite extends AnyFunSuite { 7 | 8 | test("ConfigLoader builder should build ConfigLoader") { 9 | System.setProperty("app.environment", "test") 10 | System.setProperty("myvalue", "test-my-value") 11 | 12 | val cl = ConfigLoader.builder() 13 | .setAppEnv("local") 14 | .setAppName("TestConfigLoaderBuilder") 15 | .setProperty("myJvmProperty", "myJvmPropertyValue") 16 | .getOrCreate() 17 | 18 | assert(cl.get("test.string") === "foo") 19 | assert(cl.get("test.variable") === "myJvmPropertyValue") 20 | assert(cl.appName === "TestConfigLoaderBuilder") 21 | 22 | System.clearProperty("app.environment") 23 | System.clearProperty("myvalue") 24 | } 25 | 26 | test("Getters of ConfigLoader") { 27 | System.setProperty("app.environment", "test") 28 | val cl = ConfigLoader.builder() 29 | .setAppEnv("local") 30 | .setConfigPath("test_priority.conf") 31 | .getOrCreate() 32 | 33 | assert(cl.get("my.value") === "haha") 34 | assert(cl.getOption("my.value") === Some("haha")) 35 | assert(cl.getOption("notExisting") === None) 36 | assert(cl.getArray("test.list") === Array("1","2","3")) 37 | assert(cl.getObject("setl.config") === cl.config.getObject("setl.config")) 38 | } 39 | 40 | test("ConfigLoader builder should prioritize setConfigPath over setAppEnv, jvm property and pom") { 41 | System.setProperty("app.environment", "test") 42 | val cl = ConfigLoader.builder() 43 | .setAppEnv("local") 44 | .setConfigPath("test_priority.conf") 45 | .getOrCreate() 46 | 47 | assert(cl.get("my.value") === "haha") 48 | System.clearProperty("app.environment") 49 | } 50 | 51 | test("ConfigLoader builder should take into account the app.environment property in pom") { 52 | System.clearProperty("app.environment") 53 | val configLoader = ConfigLoader.builder().getOrCreate() 54 | assert(configLoader.appEnv 
=== ConfigFactory.load().getString("setl.environment")) 55 | System.clearProperty("app.environment") 56 | } 57 | 58 | test("ConfigLoader builder should prioritize setAppEnv over jvm property and pom") { 59 | System.setProperty("app.environment", "test") 60 | 61 | val cl = ConfigLoader.builder() 62 | .setAppEnv("test_priority") 63 | .getOrCreate() 64 | 65 | assert(cl.get("my.value") === "haha") 66 | System.clearProperty("app.environment") 67 | } 68 | 69 | test("ConfigLoader builder should prioritize jvm property over pom") { 70 | System.setProperty("app.environment", "test_priority") 71 | 72 | val cl = ConfigLoader.builder() 73 | .getOrCreate() 74 | 75 | assert(cl.get("my.value") === "haha") 76 | System.clearProperty("app.environment") 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/config/DeltaConnectorConfSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.config 2 | 3 | import org.apache.spark.sql.SaveMode 4 | import org.scalatest.funsuite.AnyFunSuite 5 | 6 | class DeltaConnectorConfSuite extends AnyFunSuite { 7 | 8 | val conf = new DeltaConnectorConf() 9 | 10 | test("Set DeltaConnectorConf") { 11 | assert(conf.get("path") === None) 12 | assert(conf.get("saveMode") === None) 13 | conf.setPath("./path") 14 | conf.setSaveMode(SaveMode.Overwrite) 15 | 16 | assert(conf.get("path").get === "./path") 17 | assert(conf.get("saveMode").get === "Overwrite") 18 | } 19 | 20 | test("Getters of DeltaConnectorConf") { 21 | assert(conf.getPath === "./path") 22 | assert(conf.getSaveMode === SaveMode.Overwrite) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/config/DynamoDBConnectorConfSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.config 2 | 3 | import org.scalatest.funsuite.AnyFunSuite 4 | 5 | class DynamoDBConnectorConfSuite extends AnyFunSuite { 6 | 7 | val conf = new DynamoDBConnectorConf() 8 | 9 | test("Set DynamoDBConnectorConf") { 10 | assert(conf.get("table") === None) 11 | assert(conf.get("readPartitions") === None) 12 | conf.setTable("realTable") 13 | conf.setReadPartitions("realReadPartitions") 14 | 15 | assert(conf.get("table").get === "realTable") 16 | assert(conf.get("readPartitions").get === "realReadPartitions") 17 | } 18 | 19 | test("Getters of DynamoDBConnectorConf") { 20 | assert(conf.getTable === Some("realTable")) 21 | assert(conf.getReadPartitions === Some("realReadPartitions")) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/config/FileConnectorConfSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.config 2 | 3 | import io.github.setl.enums.{PathFormat, Storage} 4 | import io.github.setl.exception.ConfException 5 | import org.apache.spark.sql.SaveMode 6 | import org.scalatest.funsuite.AnyFunSuite 7 | 8 | class FileConnectorConfSuite extends AnyFunSuite { 9 | 10 | val conf = new FileConnectorConf() 11 | 12 | 13 | test("Set FileConnectorConf") { 14 | assert(conf.get("storage") === None) 15 | conf.setStorage("CSV") 16 | assert(conf.get("storage").get === "CSV") 17 | conf.setStorage(Storage.EXCEL) 18 | assert(conf.get("storage").get === "EXCEL") 19 | 20 | assert(conf.get("encoding") === None) 21 | conf.setEncoding("latin-1") 22 | 
assert(conf.get("encoding").get === "latin-1") 23 | 24 | assert(conf.get("saveMode") === None) 25 | conf.setSaveMode("Append") 26 | assert(conf.get("saveMode").get === "Append") 27 | conf.setSaveMode(SaveMode.Overwrite) 28 | assert(conf.get("saveMode").get === "Overwrite") 29 | 30 | assert(conf.get("path") === None) 31 | conf.setPath("path") 32 | assert(conf.get("path").get === "path") 33 | 34 | assert(conf.get("pathFormat") === None) 35 | conf.setPathFormat(PathFormat.WILDCARD) 36 | assert(conf.get("pathFormat").get === "WILDCARD") 37 | 38 | assert(conf.get("credentialsProvider") === None) 39 | conf.setS3CredentialsProvider("credentialsProvider") 40 | assert(conf.get("fs.s3a.aws.credentials.provider").get === "credentialsProvider") 41 | 42 | assert(conf.get("accessKey") === None) 43 | conf.setS3AccessKey("accessKey") 44 | assert(conf.get("fs.s3a.access.key").get === "accessKey") 45 | 46 | assert(conf.get("secretKey") === None) 47 | conf.setS3SecretKey("secretKey") 48 | assert(conf.get("fs.s3a.secret.key").get === "secretKey") 49 | 50 | assert(conf.get("sessionToken") === None) 51 | conf.setS3SessionToken("sessionToken") 52 | assert(conf.get("fs.s3a.session.token").get === "sessionToken") 53 | 54 | assert(conf.get("filenamePattern") === None) 55 | conf.setFilenamePattern("(file)(.*)(\\.csv)") 56 | assert(conf.get("filenamePattern").get === "(file)(.*)(\\.csv)") 57 | } 58 | 59 | test("Getters FileConnectorConf") { 60 | assert(conf.getEncoding === "latin-1") 61 | assert(conf.getSaveMode === SaveMode.Overwrite) 62 | assert(conf.getStorage === Storage.EXCEL) 63 | assert(conf.getPath === "path") 64 | assert(conf.getPathFormat === "WILDCARD") 65 | assert(conf.getSchema === None) 66 | assert(conf.getS3CredentialsProvider === Some("credentialsProvider")) 67 | assert(conf.getS3AccessKey === Some("accessKey")) 68 | assert(conf.getS3SecretKey === Some("secretKey")) 69 | assert(conf.getS3SessionToken === Some("sessionToken")) 70 | assert(conf.getFilenamePattern === Some("(file)(.*)(\\.csv)")) 71 | 72 | val newConf = new FileConnectorConf() 73 | assertThrows[ConfException](newConf.getStorage) 74 | assertThrows[ConfException](newConf.getPath) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/config/HudiConnectorConfSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.config 2 | 3 | import io.github.setl.exception.ConfException 4 | import org.scalatest.funsuite.AnyFunSuite 5 | import org.apache.spark.sql.SaveMode 6 | 7 | class HudiConnectorConfSuite extends AnyFunSuite { 8 | val conf = new HudiConnectorConf 9 | 10 | test("Get/Set HudiConnectorConf") { 11 | assert(conf.get("saveMode") === None) 12 | conf.setSaveMode("Append") 13 | assert(conf.getSaveMode === SaveMode.Append) 14 | conf.setSaveMode("Overwrite") 15 | assert(conf.getSaveMode === SaveMode.Overwrite) 16 | conf.setSaveMode(SaveMode.Overwrite) 17 | assert(conf.getSaveMode === SaveMode.Overwrite) 18 | 19 | assert(conf.get("path") === None) 20 | assertThrows[ConfException](conf.getPath) 21 | 22 | conf.setPath("path") 23 | assert(conf.getPath === "path") 24 | } 25 | 26 | test("Init HudiConnectorConf from options") { 27 | val options : Map[String, String] = Map( 28 | "path" -> "path", 29 | "saveMode" -> "Append", 30 | "hoodie.table.name" -> "test_object", 31 | "hoodie.datasource.write.recordkey.field" -> "col1", 32 | "hoodie.datasource.write.precombine.field" -> "col4", 33 | 
"hoodie.datasource.write.table.type" -> "MERGE_ON_READ" 34 | ) 35 | 36 | val confFromOpts: HudiConnectorConf = HudiConnectorConf.fromMap(options) 37 | assert(confFromOpts.getPath === "path") 38 | assert(confFromOpts.getSaveMode === SaveMode.Append) 39 | 40 | val readerOpts = confFromOpts.getReaderConf 41 | val writerOpts = confFromOpts.getWriterConf 42 | 43 | // Config should not contains path & save mode 44 | assert(!readerOpts.contains("path")) 45 | assert(!writerOpts.contains("path")) 46 | assert(!writerOpts.contains("saveMode")) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/config/JDBCConnectorConfSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.config 2 | 3 | import org.apache.spark.sql.SaveMode 4 | import org.scalatest.funsuite.AnyFunSuite 5 | 6 | class JDBCConnectorConfSuite extends AnyFunSuite { 7 | 8 | val conf = new JDBCConnectorConf() 9 | val url = "url" 10 | val dbTable = "dbtable" 11 | val user = "user" 12 | val password = "password" 13 | val numPartitions = "numPartitions" 14 | val partitionColumn = "partitionColumn" 15 | val lowerBound = "lowerBound" 16 | val upperBound = "upperBound" 17 | val fetchSize = "fetchsize" 18 | val batchSize = "batchsize" 19 | val truncate = "truncate" 20 | val driver = "driver" 21 | 22 | test("Set JDBCConnectorConf") { 23 | assert(conf.get(url) === None) 24 | conf.setUrl(url) 25 | assert(conf.get(url).get === url) 26 | 27 | assert(conf.get(dbTable) === None) 28 | conf.setDbTable(dbTable) 29 | assert(conf.get(dbTable).get === dbTable) 30 | 31 | assert(conf.get(user) === None) 32 | conf.setUser(user) 33 | assert(conf.get(user).get === user) 34 | 35 | assert(conf.get(password) === None) 36 | conf.setPassword(password) 37 | assert(conf.get(password).get === password) 38 | 39 | assert(conf.get("saveMode") === None) 40 | conf.setSaveMode("Overwrite") 41 | assert(conf.get("saveMode").get === "Overwrite") 42 | 43 | conf.setSaveMode(SaveMode.Append) 44 | assert(conf.get("saveMode").get === "Append") 45 | 46 | assert(conf.get(numPartitions) === None) 47 | conf.setNumPartitions(numPartitions) 48 | assert(conf.get(numPartitions).get === numPartitions) 49 | 50 | assert(conf.get(partitionColumn) === None) 51 | conf.setPartitionColumn(partitionColumn) 52 | assert(conf.get(partitionColumn).get === partitionColumn) 53 | 54 | assert(conf.get(lowerBound) === None) 55 | conf.setLowerBound(lowerBound) 56 | assert(conf.get(lowerBound).get === lowerBound) 57 | 58 | assert(conf.get(upperBound) === None) 59 | conf.setUpperBound(upperBound) 60 | assert(conf.get(upperBound).get === upperBound) 61 | 62 | assert(conf.get(fetchSize) === None) 63 | conf.setFetchSize(fetchSize) 64 | assert(conf.get(fetchSize).get === fetchSize) 65 | 66 | assert(conf.get(batchSize) === None) 67 | conf.setBatchSize(batchSize) 68 | assert(conf.get(batchSize).get === batchSize) 69 | 70 | assert(conf.get(truncate) === None) 71 | conf.setTruncate(truncate) 72 | assert(conf.get(truncate).get === truncate) 73 | 74 | assert(conf.get(driver) === None) 75 | conf.setDriver(driver) 76 | assert(conf.get(driver).get === driver) 77 | } 78 | 79 | test("Getters of JDBCConnectorConf") { 80 | assert(conf.getUrl === Some(url)) 81 | assert(conf.getDbTable === Some(dbTable)) 82 | assert(conf.getUser === Some(user)) 83 | assert(conf.getPassword === Some(password)) 84 | assert(conf.getSaveMode === Some("Append")) 85 | assert(conf.getNumPartitions === 
Some(numPartitions)) 86 | assert(conf.getPartitionColumn === Some(partitionColumn)) 87 | assert(conf.getLowerBound === Some(lowerBound)) 88 | assert(conf.getUpperBound === Some(upperBound)) 89 | assert(conf.getFetchSize === Some(fetchSize)) 90 | assert(conf.getBatchSize === Some(batchSize)) 91 | assert(conf.getTruncate === Some(truncate)) 92 | assert(conf.getDriver === Some(driver)) 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/config/Properties.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.config 2 | 3 | import com.typesafe.config.Config 4 | 5 | object Properties { 6 | 7 | // override def beforeAll(): Unit = { 8 | // System.setProperty("myvalue", "test-my-value") 9 | // } 10 | // 11 | val cl: ConfigLoader = ConfigLoader 12 | .builder() 13 | .setProperty("myvalue", "test-my-value") 14 | .setConfigPath("application.conf").getOrCreate() 15 | 16 | val excelConfig: Config = cl.getConfig("test.excel") 17 | val excelConfigWithoutSchema: Config = cl.getConfig("test.excelWithoutSchema") 18 | val cassandraConfig: Config = cl.getConfig("test.cassandra") 19 | val cassandraConfigWithoutClustering: Config = cl.getConfig("test.cassandraWithoutClustering") 20 | 21 | val csvConfig: Config = cl.getConfig("test.csv") 22 | val parquetConfig: Config = cl.getConfig("test.parquet") 23 | 24 | val jsonConfig: Config = cl.getConfig("test.json") 25 | 26 | val jdbcConfig: Config = cl.getConfig("psql.test") 27 | 28 | val hudiConfig : Config = cl.getConfig("hudi.test") 29 | val sparkSQLConfig : Config = cl.getConfig("sparkSQL.test") 30 | 31 | val excelConfigConnector: Config = cl.getConfig("connector.excel") 32 | val cassandraConfigConnector: Config = cl.getConfig("connector.cassandra") 33 | val csvConfigConnector: Config = cl.getConfig("connector.csv") 34 | val parquetConfigConnector: Config = cl.getConfig("connector.parquet") 35 | val dynamoDbConfigConnector: Config = cl.getConfig("connector.dynamo") 36 | 37 | val excelConfigConnectorBuilder: Config = cl.getConfig("connectorBuilder.excel") 38 | val cassandraConfigConnectorBuilder: Config = cl.getConfig("connectorBuilder.cassandra") 39 | val csvConfigConnectorBuilder: Config = cl.getConfig("connectorBuilder.csv") 40 | val jsonConfigConnectorBuilder: Config = cl.getConfig("connectorBuilder.json") 41 | val deltaConfigConnectorBuilder: Config = cl.getConfig("connectorBuilder.delta") 42 | 43 | 44 | val wrongCsvConfigConnectorBuilder: Config = cl.getConfig("connectorBuilder.wrong_csv") 45 | val customConnectorWithoutRef: Config = cl.getConfig("connectorBuilder.wrong_csv2") 46 | val parquetConfigConnectorBuilder: Config = cl.getConfig("connectorBuilder.parquet") 47 | 48 | 49 | val excelConfigRepoBuilder: Config = cl.getConfig("repoBuilder.excel") 50 | val cassandraConfigRepoBuilder: Config = cl.getConfig("repoBuilder.cassandra") 51 | val csvConfigRepoBuilder: Config = cl.getConfig("repoBuilder.csv") 52 | val parquetConfigRepoBuilder: Config = cl.getConfig("repoBuilder.parquet") 53 | val deltaConfigRepoBuilder: Config = cl.getConfig("repoBuilder.delta") 54 | 55 | } 56 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/config/PropertiesSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.config 2 | 3 | import io.github.setl.util.TypesafeConfigUtils 4 | import org.scalatest.BeforeAndAfterAll 5 | 
import org.scalatest.funsuite.AnyFunSuite 6 | 7 | class PropertiesSuite extends AnyFunSuite with BeforeAndAfterAll { 8 | 9 | override protected def beforeAll(): Unit = { 10 | System.setProperty("myvalue", "test-my-value") 11 | } 12 | 13 | System.setProperty("myvalue", "test-my-value") 14 | 15 | override protected def afterAll(): Unit = { 16 | System.clearProperty("myvalue") 17 | } 18 | 19 | // test("ConfigLoader beforeAll") { 20 | // assert(Properties.cl.get("myValue") === "test-my-value") 21 | // assert(Properties.cl.get("test.myValue2") === "test-my-value-loaded") 22 | // } 23 | 24 | test("Cassandra config") { 25 | assert(TypesafeConfigUtils.getAs[String](Properties.cassandraConfig, "storage").get === "CASSANDRA") 26 | assert(TypesafeConfigUtils.getAs[String](Properties.cassandraConfig, "keyspace").get === "test_space") 27 | assert(TypesafeConfigUtils.getAs[String](Properties.cassandraConfig, "table").get === "test_spark_connector2") 28 | assert(TypesafeConfigUtils.getList(Properties.cassandraConfig, "partitionKeyColumns").get === Array("partition1", "partition2")) 29 | assert(TypesafeConfigUtils.getList(Properties.cassandraConfig, "clusteringKeyColumns").get === Array("clustering1")) 30 | assert(TypesafeConfigUtils.getList(Properties.cassandraConfig, "doesntExist") === None) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/config/StructuredStreamingConnectorConfSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.config 2 | 3 | import org.scalatest.funsuite.AnyFunSuite 4 | import org.scalatest.matchers.should.Matchers 5 | 6 | class StructuredStreamingConnectorConfSuite extends AnyFunSuite with Matchers { 7 | 8 | 9 | test("StructuredStreamingConnectorConf exceptions") { 10 | val conf = new StructuredStreamingConnectorConf() 11 | assertThrows[IllegalArgumentException](conf.getSchema) 12 | assertThrows[IllegalArgumentException](conf.getFormat) 13 | assertThrows[IllegalArgumentException](conf.getPath) 14 | assertThrows[IllegalArgumentException](conf.getOutputMode) 15 | 16 | assert(conf.getReaderConf === Map()) 17 | assert(conf.getWriterConf === Map()) 18 | 19 | } 20 | 21 | test("StructuredStreamingConnectorConf getFormat should always output lowercase") { 22 | val conf = new StructuredStreamingConnectorConf() 23 | conf.setFormat("PARQUET") 24 | assert(conf.getFormat === "parquet") 25 | } 26 | 27 | test("Setters and getters") { 28 | val conf = new StructuredStreamingConnectorConf() 29 | conf.setFormat("parquet") 30 | conf.setPath("test/path") 31 | conf.setOutputMode("outputmode") 32 | conf.setSchema("schema") 33 | 34 | conf.set("header", "header1") 35 | 36 | assert(conf.getFormat === "parquet") 37 | assert(conf.getPath === "test/path") 38 | assert(conf.getOutputMode === "outputmode") 39 | assert(conf.getSchema === "schema") 40 | assert(conf.get("header") === Option("header1")) 41 | 42 | conf.getReaderConf.keys should contain theSameElementsAs Array("header", "path") 43 | conf.getWriterConf.keys should contain theSameElementsAs Array("header", "path") 44 | } 45 | 46 | test("Construction from Map") { 47 | val map = Map( 48 | "format" -> "PARQUET", 49 | "outputMode" -> "append", 50 | "checkpointLocation" -> "test_checkpoint", 51 | "path" -> "test_path" 52 | ) 53 | 54 | val conf = StructuredStreamingConnectorConf.fromMap(map) 55 | 56 | assert(conf.getFormat === "parquet") 57 | assert(conf.getPath === "test_path") 58 | assert(conf.getOutputMode === 
"append") 59 | assert(conf.get("checkpointLocation") === Some("test_checkpoint")) 60 | assert(conf.get("none") === None) 61 | } 62 | 63 | 64 | } 65 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/factory/FactoryDeliveryMetadataSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.factory 2 | 3 | import io.github.setl.internal.TestClasses.TestFactory 4 | import io.github.setl.transformation.FactoryDeliveryMetadata 5 | import io.github.setl.workflow.External 6 | import org.scalatest.funsuite.AnyFunSuite 7 | 8 | class FactoryDeliveryMetadataSuite extends AnyFunSuite { 9 | 10 | val fac = new TestFactory 11 | 12 | test("Test FactoryDeliveryMetadata Builder") { 13 | 14 | val setters = FactoryDeliveryMetadata.builder().setFactory(fac).getOrCreate() 15 | 16 | setters.foreach(println) 17 | 18 | assert(setters.size === 4) 19 | assert(setters.map(_.factoryUUID).toSet.size === 1) 20 | assert(setters.find(_.name == "inputInt").get.producer === classOf[External]) 21 | assert(setters.find(_.name == "setInputs").get.argTypes.size === 2) 22 | assert(setters.find(_.isDataset.contains(true)).size === 0) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/internal/BenchmarkInvocationHandlerSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.annotation.Benchmark 4 | import io.github.setl.transformation.{AbstractFactory, Factory} 5 | import io.github.setl.workflow.Pipeline 6 | import org.scalatest.funsuite.AnyFunSuite 7 | 8 | class BenchmarkInvocationHandlerSuite extends AnyFunSuite { 9 | 10 | import BenchmarkInvocationHandlerSuite._ 11 | 12 | test("BenchmarkInvocationHandler should log execution time") { 13 | val factory = new BenchmarkFactory 14 | val benchmarkHandler = new BenchmarkInvocationHandler(factory) 15 | 16 | val proxyFactory = java.lang.reflect.Proxy.newProxyInstance( 17 | getClass.getClassLoader, 18 | Array(classOf[AbstractFactory[_]]), 19 | benchmarkHandler 20 | ).asInstanceOf[AbstractFactory[_]] 21 | 22 | proxyFactory.read() 23 | proxyFactory.process() 24 | proxyFactory.write() 25 | 26 | assert(classOf[BenchmarkFactory].isAnnotationPresent(classOf[Benchmark])) 27 | assert(factory.get() === proxyFactory.get()) 28 | 29 | import scala.collection.JavaConverters._ 30 | benchmarkHandler.getBenchmarkResult.asScala.foreach { 31 | x => assert(x._2 >=0) 32 | } 33 | 34 | assert(benchmarkHandler.getBenchmarkResult.size() === 2) 35 | 36 | } 37 | 38 | test("Benchmark should be handled in pipeline") { 39 | 40 | val pipeline = new Pipeline() 41 | 42 | val result = pipeline 43 | .addStage[BenchmarkFactory]() 44 | .benchmark(true) 45 | .run() 46 | .getBenchmarkResult 47 | 48 | assert(result.length === 1) 49 | 50 | val result2 = new Pipeline() 51 | .addStage[BenchmarkFactory]() 52 | .run() 53 | .getBenchmarkResult 54 | 55 | assert(result2.isEmpty) 56 | 57 | val result3 = new Pipeline() 58 | .addStage[BenchmarkFactory]() 59 | .benchmark(false) 60 | .run() 61 | .getBenchmarkResult 62 | 63 | assert(result3.isEmpty) 64 | } 65 | 66 | } 67 | 68 | object BenchmarkInvocationHandlerSuite { 69 | 70 | @Benchmark 71 | class BenchmarkFactory extends Factory[String] { 72 | 73 | private[this] var data = "" 74 | 75 | override def read(): BenchmarkFactory.this.type = { 76 | data = s"testing 
${this.getClass.getSimpleName}... " 77 | this 78 | } 79 | 80 | @Benchmark 81 | override def process(): BenchmarkFactory.this.type = { 82 | data = data + data 83 | this 84 | } 85 | 86 | @Benchmark 87 | override def write(): BenchmarkFactory.this.type = { 88 | println(data) 89 | sleep() 90 | this 91 | } 92 | 93 | override def get(): String = data 94 | 95 | def sleep(): Unit = Thread.sleep(1000L) 96 | 97 | } 98 | 99 | } 100 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/internal/StructAnalyserSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.annotation.{ColumnName, CompoundKey, Compress} 4 | import io.github.setl.internal.TestClasses.TestStructAnalyser 5 | import io.github.setl.storage.{Compressor, XZCompressor} 6 | import org.apache.spark.sql.types.StructType 7 | import org.scalatest.funsuite.AnyFunSuite 8 | 9 | class StructAnalyserSuite extends AnyFunSuite { 10 | 11 | val schema: StructType = StructAnalyser.analyseSchema[TestStructAnalyser] 12 | 13 | test("StructAnalyser should be able to handle @ColumnName") { 14 | val fields = schema.filter(_.metadata.contains(classOf[ColumnName].getCanonicalName)) 15 | 16 | assert(fields.length === 1) 17 | assert(fields.head.name === "col1") 18 | assert(fields.head.metadata.getStringArray(classOf[ColumnName].getCanonicalName) === Array("alias1")) 19 | 20 | } 21 | 22 | test("StructAnalyser should be able to handle @CompoundKey") { 23 | val fields = schema.filter(_.metadata.contains(classOf[CompoundKey].getCanonicalName)) 24 | 25 | assert(fields.length === 2) 26 | assert(fields.map(_.name) === Array("col2", "col22")) 27 | assert(fields.map(_.metadata.getStringArray(classOf[CompoundKey].getCanonicalName)).map(_ (0)) === List("test!@1", "test!@2")) 28 | } 29 | 30 | test("StructAnalyser should be able to handle @Compress") { 31 | val fields = schema.filter(_.metadata.contains(classOf[Compress].getCanonicalName)) 32 | 33 | assert(fields.length === 2) 34 | assert(fields.map(_.name) === Array("col3", "col4")) 35 | 36 | assert( 37 | fields 38 | .find(_.name == "col3") 39 | .get.metadata 40 | .getStringArray(classOf[Compress].getCanonicalName)(0) === classOf[XZCompressor].getCanonicalName 41 | ) 42 | 43 | assert( 44 | fields 45 | .find(_.name == "col4") 46 | .get.metadata 47 | .getStringArray(classOf[Compress].getCanonicalName)(0) === classOf[Compressor].getCanonicalName 48 | ) 49 | } 50 | 51 | test("[SETL-34] StructAnalyser should handle multiple @CompoundKey annotations") { 52 | val structType = StructAnalyser.analyseSchema[TestClasses.MultipleCompoundKeyTest] 53 | structType.foreach { x => 54 | println(s"name: ${x.name}, type: ${x.dataType}, meta: ${x.metadata}") 55 | } 56 | 57 | assert(structType.find(_.name == "col1").get.metadata.getStringArray(classOf[CompoundKey].getCanonicalName) === Array("sort!@1","part!@1")) 58 | assert(structType.find(_.name == "col2").get.metadata.getStringArray(classOf[CompoundKey].getCanonicalName) === Array("sort!@2")) 59 | assert(structType.find(_.name == "col3").get.metadata.getStringArray(classOf[CompoundKey].getCanonicalName) === Array("part!@2")) 60 | } 61 | 62 | 63 | test("StructAnalyser should be able to find columns with @CompoundKey") { 64 | val primaryColumns1 = StructAnalyser.findCompoundColumns[TestClasses.MultipleCompoundKeyTest] 65 | val primaryColumns2 = StructAnalyser.findCompoundColumns[TestClasses.MyObject] 66 | 67 | 
assert(primaryColumns1.length == 3) 68 | assert(primaryColumns1 === Array("col1", "col2", "COLUMN_3")) 69 | assert(primaryColumns2.isEmpty) 70 | assert(primaryColumns2 === Array()) 71 | } 72 | 73 | test("[SETL-34] StructAnalyser should throw exception when there are more than one ColumnName annotation") { 74 | assertThrows[IllegalArgumentException](StructAnalyser.analyseSchema[TestClasses.WrongClass]) 75 | } 76 | 77 | } 78 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/internal/TestClasses.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.annotation.{ColumnName, CompoundKey, Compress, Delivery} 4 | import io.github.setl.storage.Compressor 5 | import io.github.setl.transformation.Factory 6 | 7 | object TestClasses { 8 | 9 | case class WrongClass(@ColumnName("1") @ColumnName("2") col1: String) 10 | 11 | case class MultipleCompoundKeyTest(@CompoundKey("sort", "1") @CompoundKey("part", "1") col1: String, 12 | @CompoundKey("sort", "2") col2: String, 13 | @CompoundKey("part", "2") @ColumnName("COLUMN_3") col3: String) 14 | 15 | case class InnerClass(innerCol1: String, innerCol2: String) 16 | 17 | case class TestCompression(@ColumnName("dqsf") col1: String, 18 | @CompoundKey("test", "1") col2: String, 19 | @Compress col3: Seq[InnerClass], 20 | @Compress col4: Seq[String]) { 21 | } 22 | 23 | case class TestStructAnalyser(@ColumnName("alias1") col1: String, 24 | @CompoundKey("test", "1") col2: String, 25 | @CompoundKey("test", "2") col22: String, 26 | @Compress col3: Seq[InnerClass], 27 | @Compress(compressor = classOf[Compressor]) col4: Seq[String]) { 28 | } 29 | 30 | class Producer1 31 | 32 | class Producer2 33 | 34 | class TestFactory extends Factory[String] { 35 | 36 | var input3: Double = _ 37 | var input4: Boolean = _ 38 | 39 | @Delivery(producer = classOf[Producer1]) 40 | var inputString1: String = _ 41 | 42 | @Delivery(producer = classOf[Producer2]) 43 | var inputString2: String = _ 44 | 45 | @Delivery(optional = true) 46 | var inputInt: Int = _ 47 | 48 | @Delivery 49 | def setInputs(d: Double, boo: Boolean): this.type = { 50 | input3 = d 51 | input4 = boo 52 | this 53 | } 54 | 55 | /** 56 | * Read data 57 | */ 58 | override def read(): TestFactory.this.type = this 59 | 60 | /** 61 | * Process data 62 | */ 63 | override def process(): TestFactory.this.type = this 64 | 65 | /** 66 | * Write data 67 | */ 68 | override def write(): TestFactory.this.type = this 69 | 70 | /** 71 | * Get the processed data 72 | */ 73 | override def get(): String = "Product of TestFactory " + inputString1 + inputString2 74 | } 75 | 76 | 77 | case class MyObject(@ColumnName("col1") column1: String, column2: String) 78 | 79 | case class TestCompoundKey(@CompoundKey("primary", "1") a: String, @CompoundKey("primary", "2") b: Int, @CompoundKey("sort", "1") c: String) 80 | 81 | case class TestNullableColumn(@CompoundKey("primary", "1") col1: String, col2: String, col3: Option[Int], col4: Double) 82 | 83 | } 84 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/storage/ConditionSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage 2 | 3 | import java.time.{LocalDate, LocalDateTime} 4 | 5 | import io.github.setl.enums.ValueType 6 | import org.scalatest.funsuite.AnyFunSuite 7 | 8 | class ConditionSuite extends 
AnyFunSuite { 9 | 10 | test("Condition could be converted to sql request") { 11 | 12 | val strCond = Condition("col1", "=", "haha") 13 | assert(strCond.toSqlRequest === "(`col1` = 'haha')") 14 | 15 | val intCond = Condition("col1", "=", 1) 16 | assert(intCond.toSqlRequest === "(`col1` = 1)") 17 | 18 | val floatCond = Condition("col1", "=", 1F) 19 | assert(floatCond.toSqlRequest === "(`col1` = 1.0)") 20 | 21 | val date = LocalDate.parse("1990-01-01") 22 | val dateCond = Condition("date", "=", date) 23 | assert(dateCond.toSqlRequest === "(`date` = cast('1990-01-01' as date))") 24 | 25 | val datetime = LocalDateTime.parse("1990-01-01T00:00:00") 26 | val datetimeCond = Condition("datetime", "=", datetime) 27 | assert(datetimeCond.toSqlRequest === "(`datetime` = cast('1990-01-01 00:00:00' as timestamp))") 28 | 29 | val strSetCond = Condition("str_set", "in", Set("a", "b")) 30 | assert(strSetCond.toSqlRequest === "(`str_set` IN ('a', 'b'))") 31 | 32 | val floatSetCond = Condition("float_set", "in", Set(1.343F, 2.445F)) 33 | assert(floatSetCond.toSqlRequest === "(`float_set` IN (1.343, 2.445))") 34 | 35 | val strCondWithType = Condition("col1", "=", "hehe", ValueType.STRING) 36 | assert(strCondWithType.toSqlRequest === "(`col1` = 'hehe')") 37 | } 38 | 39 | test("Condition should return null if value is not defined") { 40 | val cond = Condition("a", "=", None, ValueType.STRING) 41 | assert(cond.toSqlRequest === null) 42 | } 43 | 44 | test("Null sql request should be ignored in a condition set") { 45 | 46 | val conds = Set( 47 | Condition("a", "=", None, ValueType.STRING), 48 | Condition("b", "=", 1.5), 49 | Condition("c", "in", Set("x", "y")) 50 | ) 51 | 52 | import io.github.setl.util.FilterImplicits._ 53 | assert(conds.toSqlRequest === "(`b` = 1.5) AND (`c` IN ('x', 'y'))") 54 | 55 | } 56 | 57 | test("Condition should handle Column") { 58 | import org.apache.spark.sql.functions._ 59 | val condition = Condition( 60 | col("test col").isin(1, 2, 3) 61 | ) 62 | 63 | assert(condition.toSqlRequest === Condition("test col", "IN", Set(1, 2, 3)).toSqlRequest) 64 | 65 | val condition2 = Condition( 66 | col("test col").isin(1, 2, 3) && col("test col 2") === "A" 67 | ) 68 | assert(condition2.toSqlRequest === "((`test col` IN (1, 2, 3)) AND (`test col 2` = 'A'))") 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/storage/GZIPCompressorSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage 2 | 3 | import org.scalatest.funsuite.AnyFunSuite 4 | 5 | class GZIPCompressorSuite extends AnyFunSuite { 6 | 7 | val compressor = new GZIPCompressor 8 | 9 | test("GZIPCompressor should be able to compress a string to a Byte[]") { 10 | println(s"String1: ${str.getBytes().length} -> ${compressor.compress(str).length}") 11 | println(s"String2: ${str2.getBytes().length} -> ${compressor.compress(str2).length}") 12 | println(s"String3: ${str3.getBytes().length} -> ${compressor.compress(str3).length}") 13 | println(s"String4: ${str4.getBytes().length} -> ${compressor.compress(str4).length}") 14 | assert(str.getBytes().length >= compressor.compress(str).length) 15 | assert(str2.getBytes().length >= compressor.compress(str2).length) 16 | assert(str3.getBytes().length >= compressor.compress(str3).length) 17 | assert(str4.getBytes().length >= compressor.compress(str4).length) 18 | 19 | } 20 | 21 | test("GZIPCompressor should be able to decompress a Byte array to string") { 22 | 
assert(compressor.decompress(compressor.compress(str)) === str) 23 | assert(compressor.decompress(compressor.compress(str2)) === str2) 24 | assert(compressor.decompress(compressor.compress(str3)) === str3) 25 | assert(compressor.decompress(compressor.compress(str4)) === str4) 26 | assert(compressor.decompress("testtesttest".getBytes()) === "testtesttest") 27 | 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/storage/SnappyCompressorSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage 2 | 3 | import org.scalatest.funsuite.AnyFunSuite 4 | 5 | class SnappyCompressorSuite extends AnyFunSuite { 6 | 7 | val compressor = new SnappyCompressor 8 | 9 | test("SnappyCompressor should be able to compress a string to a Byte[]") { 10 | println(s"String1: ${str.getBytes().length} -> ${compressor.compress(str).length}") 11 | println(s"String2: ${str2.getBytes().length} -> ${compressor.compress(str2).length}") 12 | println(s"String3: ${str3.getBytes().length} -> ${compressor.compress(str3).length}") 13 | println(s"String4: ${str4.getBytes().length} -> ${compressor.compress(str4).length}") 14 | 15 | assert(str.getBytes().length >= compressor.compress(str).length) 16 | assert(str2.getBytes().length >= compressor.compress(str2).length) 17 | assert(str3.getBytes().length >= compressor.compress(str3).length) 18 | assert(str4.getBytes().length >= compressor.compress(str4).length) 19 | 20 | } 21 | 22 | test("SnappyCompressor should be able to decompress a Byte array to string") { 23 | assert(compressor.decompress(compressor.compress(str)) === str) 24 | assert(compressor.decompress(compressor.compress(str2)) === str2) 25 | assert(compressor.decompress(compressor.compress(str3)) === str3) 26 | assert(compressor.decompress(compressor.compress(str4)) === str4) 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/storage/XZCompressorSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage 2 | 3 | import org.scalatest.funsuite.AnyFunSuite 4 | 5 | class XZCompressorSuite extends AnyFunSuite { 6 | 7 | val compressor = new XZCompressor 8 | 9 | test("XZCompressor should be able to compress a string to a Byte[]") { 10 | println(s"String1: ${str.getBytes().length} -> ${compressor.compress(str).length}") 11 | println(s"String2: ${str2.getBytes().length} -> ${compressor.compress(str2).length}") 12 | println(s"String3: ${str3.getBytes().length} -> ${compressor.compress(str3).length}") 13 | println(s"String4: ${str4.getBytes().length} -> ${compressor.compress(str4).length}") 14 | 15 | assert(str.getBytes().length >= compressor.compress(str).length) 16 | assert(str2.getBytes().length >= compressor.compress(str2).length) 17 | assert(str3.getBytes().length >= compressor.compress(str3).length) 18 | assert(str4.getBytes().length >= compressor.compress(str4).length) 19 | 20 | } 21 | 22 | test("XZCompressor should be able to decompress a Byte array to string") { 23 | assert(compressor.decompress(compressor.compress(str)) === str) 24 | assert(compressor.decompress(compressor.compress(str2)) === str2) 25 | assert(compressor.decompress(compressor.compress(str3)) === str3) 26 | assert(compressor.decompress(compressor.compress(str4)) === str4) 27 | } 28 | 29 | } 30 | 
-------------------------------------------------------------------------------- /src/test/scala/io/github/setl/storage/connector/ConnectorSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | import org.scalatest.BeforeAndAfterAll 6 | import org.scalatest.funsuite.AnyFunSuite 7 | 8 | class ConnectorSuite extends AnyFunSuite with BeforeAndAfterAll { 9 | 10 | test("Connector object") { 11 | val spark: SparkSession = SparkSession.builder().config(new SparkConf()).master("local[*]").getOrCreate() 12 | 13 | val df = spark.emptyDataFrame 14 | 15 | assert(Connector.empty.spark === null) 16 | assert(Connector.empty.storage === null) 17 | assert(Connector.empty.read() === null) 18 | Connector.empty.write(df) 19 | Connector.empty.write(df, Some("suffix")) 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/storage/connector/HudiConnectorSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import io.github.setl.config.{Conf, HudiConnectorConf, Properties} 4 | import io.github.setl.{SparkSessionBuilder, SparkTestUtils, TestObject2} 5 | import org.apache.spark.sql.{SaveMode, SparkSession} 6 | import org.scalatest.funsuite.AnyFunSuite 7 | 8 | import java.nio.file.Paths 9 | import java.sql.{Date, Timestamp} 10 | 11 | class HudiConnectorSuite extends AnyFunSuite { 12 | 13 | val path: String = Paths.get("src", "test", "resources", "test_hudi").toFile.getAbsolutePath 14 | val saveMode = SaveMode.Overwrite 15 | 16 | val options: Map[String, String] = Map[String, String]( 17 | "path" -> path, 18 | "saveMode" -> saveMode.toString, 19 | "hoodie.table.name" -> "test_object", 20 | "hoodie.datasource.write.recordkey.field" -> "col1", 21 | "hoodie.datasource.write.precombine.field" -> "col4", 22 | "hoodie.datasource.write.table.type" -> "MERGE_ON_READ" 23 | ) 24 | 25 | val testTable: Seq[TestObject2] = Seq( 26 | TestObject2("string", 5, 0.000000001685400132103450D, new Timestamp(1557153268000L), new Date(1557100800000L), 999999999999999999L), 27 | TestObject2("string2", 5, 0.000000001685400132103450D, new Timestamp(1557153268000L), new Date(1557100800000L), 999999999999999999L), 28 | TestObject2("string3", 5, 0.000000001685400132103450D, new Timestamp(1557153268000L), new Date(1557100800000L), 999999999999999999L) 29 | ) 30 | 31 | test("Instantiation of constructors") { 32 | 33 | // New spark session here since Hudi only supports KryoSerializer 34 | val spark: SparkSession = new SparkSessionBuilder().setEnv("local") 35 | .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 36 | .build() 37 | .get() 38 | assume(SparkTestUtils.checkSparkVersion("2.4")) 39 | 40 | import spark.implicits._ 41 | 42 | val connector = new HudiConnector(HudiConnectorConf.fromMap(options)) 43 | connector.write(testTable.toDF) 44 | assert(connector.read().collect().length == testTable.length) 45 | 46 | val path2: String = Paths.get("src", "test", "resources", "test_hudi_2").toFile.getAbsolutePath 47 | val options2 = options + ("path" -> path2) 48 | val connector2 = new HudiConnector(options2) 49 | connector2.write(testTable.toDF) 50 | assert(connector2.read().collect().length == testTable.length) 51 | 52 | val path3: String = Paths.get("src", "test", "resources", 
"test_hudi_3").toFile.getAbsolutePath 53 | val options3 = options + ("path" -> path3) 54 | val connector3 = new HudiConnector(Conf.fromMap(options3)) 55 | connector3.write(testTable.toDF, Some("any_")) 56 | assert(connector3.read().collect().length == testTable.length) 57 | 58 | val connector7 = new HudiConnector(Properties.hudiConfig) 59 | connector7.write(testTable.toDF) 60 | assert(connector7.read().collect().length == testTable.length) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/storage/connector/SparkSQLConnectorSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import io.github.setl.config.{Conf, Properties} 4 | import io.github.setl.{SparkSessionBuilder, TestObject} 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.sql.SparkSession 7 | import org.scalatest.funsuite.AnyFunSuite 8 | 9 | class SparkSQLConnectorSuite extends AnyFunSuite{ 10 | 11 | val query : String = 12 | """ 13 | | SELECT (ones.n1 + tens.n2 * 10) as user_id 14 | | FROM ( 15 | | SELECT 0 AS n1 16 | | UNION SELECT 1 AS n1 17 | | UNION SELECT 2 AS n1 18 | | UNION SELECT 3 AS n1 19 | | UNION SELECT 4 AS n1 20 | | UNION SELECT 5 AS n1 21 | | UNION SELECT 6 AS n1 22 | | UNION SELECT 7 AS n1 23 | | UNION SELECT 8 AS n1 24 | | UNION SELECT 9 AS n1 25 | | ) ones 26 | | CROSS JOIN 27 | | ( 28 | | SELECT 0 AS n2 29 | | UNION SELECT 1 AS n2 30 | | UNION SELECT 2 AS n2 31 | | UNION SELECT 3 AS n2 32 | | UNION SELECT 4 AS n2 33 | | UNION SELECT 5 AS n2 34 | | UNION SELECT 6 AS n2 35 | | UNION SELECT 7 AS n2 36 | | UNION SELECT 8 AS n2 37 | | UNION SELECT 9 AS n2 38 | | ) tens 39 | |""".stripMargin 40 | 41 | val testTable: Seq[TestObject] = Seq( 42 | TestObject(1, "p1", "c1", 1L), 43 | TestObject(2, "p2", "c2", 2L), 44 | TestObject(3, "p3", "c3", 3L) 45 | ) 46 | 47 | val options : Map[String, String] = Map( 48 | "query" -> query 49 | ) 50 | 51 | 52 | test("Instantiation of constructors") { 53 | val connector = new SparkSQLConnector(query) 54 | assert(connector.query === query) 55 | 56 | val testConfig = Properties.sparkSQLConfig 57 | val connector2 = new SparkSQLConnector(testConfig) 58 | assert(connector2.query === "SELECT * FROM schema.table") 59 | 60 | val connector3 = new SparkSQLConnector(Conf.fromMap(options)) 61 | assert(connector3.query === query) 62 | 63 | assertThrows[IllegalArgumentException](new SparkSQLConnector("")) 64 | assertThrows[IllegalArgumentException](new SparkSQLConnector(Conf.fromMap(Map.empty))) 65 | assertThrows[IllegalArgumentException](new SparkSQLConnector(testConfig.withoutPath("query"))) 66 | } 67 | 68 | test("Read/Write of SparkSQLConnector") { 69 | val spark: SparkSession = SparkSession.builder().config(new SparkConf()).master("local[*]").getOrCreate() 70 | import spark.implicits._ 71 | 72 | val connector = new SparkSQLConnector(query) 73 | assert(connector.read().collect().length == 100) 74 | 75 | // Should log warning & do nothing 76 | val testDF = testTable.toDF() 77 | connector.write(testDF) 78 | connector.write(testDF, Some("any_")) 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/storage/connector/StructuredStreamingConnectorSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import io.github.setl.SparkSessionBuilder 4 | 
import io.github.setl.config.Conf 5 | import org.apache.spark.sql.SparkSession 6 | import org.scalatest.funsuite.AnyFunSuite 7 | 8 | class StructuredStreamingConnectorSuite extends AnyFunSuite { 9 | 10 | val inputConf: Map[String, String] = Map( 11 | "format" -> "text", 12 | "path" -> "src/test/resources/streaming_test_resources/input" 13 | ) 14 | 15 | val consoleOutputConf: Map[String, String] = Map( 16 | "format" -> "console", 17 | "outputMode" -> "append" 18 | ) 19 | 20 | val parquetOutputConf: Map[String, String] = Map( 21 | "format" -> "PARQUET", 22 | "outputMode" -> "append", 23 | "checkpointLocation" -> "src/test/resources/streaming_test_resources/output/checkpoint_1", 24 | "path" -> "src/test/resources/streaming_test_resources/output/1" 25 | ) 26 | 27 | test("StructuredStreamingConnector instantiation") { 28 | val spark: SparkSession = new SparkSessionBuilder().setEnv("local").build().get() 29 | import spark.implicits._ 30 | 31 | val _conf = Conf.fromMap(parquetOutputConf) 32 | 33 | val connector = new StructuredStreamingConnector(inputConf) 34 | val outputConnector = new StructuredStreamingConnector(_conf) 35 | val parquetConnector = new ParquetConnector(parquetOutputConf) 36 | 37 | val input = connector.read() 38 | 39 | outputConnector.write(input, Option("suffix_should_be_ignored")) 40 | outputConnector.awaitTerminationOrTimeout(10000) 41 | 42 | parquetConnector.read().show() 43 | assert(parquetConnector.read().as[String].collect().mkString(" ") === StructuredStreamingConnectorSuite.text) 44 | } 45 | 46 | } 47 | 48 | object StructuredStreamingConnectorSuite { 49 | val text = "Structured Streaming is a scalable and fault-tolerant stream processing engine built on the Spark SQL engine. You can express your streaming computation the same way you would express a batch computation on static data. The Spark SQL engine will take care of running it incrementally and continuously and updating the final result as streaming data continues to arrive. You can use the Dataset/DataFrame API in Scala, Java, Python or R to express streaming aggregations, event-time windows, stream-to-batch joins, etc. The computation is executed on the same optimized Spark SQL engine. Finally, the system ensures end-to-end exactly-once fault-tolerance guarantees through checkpointing and Write-Ahead Logs. In short, Structured Streaming provides fast, scalable, fault-tolerant, end-to-end exactly-once stream processing without the user having to reason about streaming. Internally, by default, Structured Streaming queries are processed using a micro-batch processing engine, which processes data streams as a series of small batch jobs thereby achieving end-to-end latencies as low as 100 milliseconds and exactly-once fault-tolerance guarantees. However, since Spark 2.3, we have introduced a new low-latency processing mode called Continuous Processing, which can achieve end-to-end latencies as low as 1 millisecond with at-least-once guarantees. Without changing the Dataset/DataFrame operations in your queries, you will be able to choose the mode based on your application requirements. In this guide, we are going to walk you through the programming model and the APIs. We are going to explain the concepts mostly using the default micro-batch processing model, and then later discuss Continuous Processing model. First, let’s start with a simple example of a Structured Streaming query - a streaming word count." 
50 | } 51 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/storage/repository/RepositoryAdapterSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.repository 2 | 3 | import io.github.setl.SparkSessionBuilder 4 | import io.github.setl.storage.Condition 5 | import io.github.setl.storage.connector.CSVConnector 6 | import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder 7 | import org.apache.spark.sql.{Dataset, SparkSession} 8 | import org.scalatest.funsuite.AnyFunSuite 9 | 10 | class RepositoryAdapterSuite extends AnyFunSuite { 11 | 12 | val path: String = "src/test/resources/test_repository_adapter" 13 | 14 | val data: Seq[RepoAdapterTesterA] = Seq( 15 | RepoAdapterTesterA("a", "A"), 16 | RepoAdapterTesterA("b", "B") 17 | ) 18 | 19 | test("RepositoryAdapter should implicitly convert two dataset") { 20 | val spark: SparkSession = new SparkSessionBuilder().setEnv("local").build().get() 21 | val ds: Dataset[RepoAdapterTesterA] = spark.createDataset(data)(ExpressionEncoder[RepoAdapterTesterA]) 22 | 23 | import io.github.setl.storage.repository.ImplicitConverter.a2b 24 | import io.github.setl.storage.repository.ImplicitRepositoryAdapter._ 25 | 26 | val options: Map[String, String] = Map[String, String]( 27 | "path" -> path, 28 | "inferSchema" -> "true", 29 | "delimiter" -> ",", 30 | "header" -> "true", 31 | "saveMode" -> "Overwrite" 32 | ) 33 | 34 | val csvConnector = new CSVConnector(options) 35 | 36 | val repo: SparkRepository[RepoAdapterTesterA] = 37 | new SparkRepository[RepoAdapterTesterA]().setConnector(csvConnector) 38 | 39 | repo.convertAndSave(ds) 40 | val ds2 = repo.findAllAndConvert() 41 | val df = csvConnector.read() 42 | 43 | assert(ds2.columns === ds.columns) 44 | assert(df.columns === Array("column1", "col2", "col3")) 45 | csvConnector.delete() 46 | } 47 | 48 | test("RepositoryAdapter should be able to handle filter") { 49 | val spark: SparkSession = new SparkSessionBuilder().setEnv("local").build().get() 50 | val ds: Dataset[RepoAdapterTesterA] = spark.createDataset(data)(ExpressionEncoder[RepoAdapterTesterA]) 51 | 52 | import io.github.setl.storage.repository.ImplicitConverter.a2b 53 | import io.github.setl.storage.repository.ImplicitRepositoryAdapter._ 54 | 55 | val options: Map[String, String] = Map[String, String]( 56 | "path" -> (path + "_filter"), 57 | "inferSchema" -> "true", 58 | "delimiter" -> ",", 59 | "header" -> "true", 60 | "saveMode" -> "Overwrite" 61 | ) 62 | 63 | val csvConnector = new CSVConnector(options) 64 | 65 | val repo: SparkRepository[RepoAdapterTesterA] = 66 | new SparkRepository[RepoAdapterTesterA]().setConnector(csvConnector) 67 | 68 | repo.convertAndSave(ds) 69 | 70 | val conditions = Condition("column1", "=", "a") 71 | 72 | assert(repo.findByAndConvert(conditions).count() === 1) 73 | csvConnector.delete() 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/storage/repository/package.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage 2 | 3 | import io.github.setl.annotation.{ColumnName, CompoundKey, Compress} 4 | import io.github.setl.internal.TestClasses.InnerClass 5 | import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder 6 | import org.apache.spark.sql.{Dataset, Encoder} 7 | 8 | package object repository { 9 | 10 | case class 
RepoAdapterTesterA(col1: String, col2: String) 11 | 12 | case class RepoAdapterTesterB(@ColumnName("column1") col1: String, col2: String, col3: String) 13 | 14 | case class MyObject(@CompoundKey("sort", "2") @ColumnName("col1") column1: String, @CompoundKey("sort", "1") column2: String) 15 | 16 | case class TestDeltaUpdate(@ColumnName("col1") @CompoundKey("partition", "1") column1: Int, @CompoundKey("sort", "1") column2: String, value: Double) 17 | 18 | case class TestCompressionRepository(col1: String, 19 | col2: String, 20 | @Compress col3: Seq[InnerClass], 21 | @Compress col4: Seq[String]) { 22 | } 23 | 24 | case class TestCompressionRepositoryGZIP(col1: String, 25 | col2: String, 26 | @Compress(compressor = classOf[GZIPCompressor]) col3: Seq[InnerClass], 27 | @Compress(compressor = classOf[GZIPCompressor]) col4: Seq[String]) { 28 | } 29 | 30 | object ImplicitConverter { 31 | 32 | implicit val a2b: DatasetConverter[RepoAdapterTesterA, RepoAdapterTesterB] = new DatasetConverter[RepoAdapterTesterA, RepoAdapterTesterB] { 33 | 34 | implicit val encoderA: Encoder[RepoAdapterTesterA] = ExpressionEncoder[RepoAdapterTesterA] 35 | implicit val encoderB: Encoder[RepoAdapterTesterB] = ExpressionEncoder[RepoAdapterTesterB] 36 | 37 | override def convertFrom(ds: Dataset[RepoAdapterTesterB]): Dataset[RepoAdapterTesterA] = 38 | ds.drop("col3").as[RepoAdapterTesterA] 39 | 40 | override def convertTo(ds: Dataset[RepoAdapterTesterA]): Dataset[RepoAdapterTesterB] = { 41 | import org.apache.spark.sql.functions._ 42 | ds.withColumn("col3", concat(col("col1"), col("col2"))).as[RepoAdapterTesterB] 43 | } 44 | } 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/storage/repository/streaming/StreamingRepositorySuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.repository.streaming 2 | 3 | import io.github.setl.SparkSessionBuilder 4 | import io.github.setl.exception.InvalidConnectorException 5 | import io.github.setl.storage.connector.CSVConnector 6 | import io.github.setl.storage.repository.SparkRepository 7 | import org.apache.spark.sql.SparkSession 8 | import org.scalatest.funsuite.AnyFunSuite 9 | 10 | class StreamingRepositorySuite extends AnyFunSuite { 11 | 12 | test("StreamingRepository should throw exception") { 13 | import io.github.setl.storage.repository.streaming.StreamingRepositorySuite.TestClass 14 | 15 | val spark: SparkSession = new SparkSessionBuilder().setEnv("local").getOrCreate() 16 | 17 | val csvOutputConf: Map[String, String] = Map( 18 | "path" -> "src/test/resources/streaming_test_resources/output/3", 19 | "header" -> "true" 20 | ) 21 | val csvConnector = new CSVConnector(csvOutputConf) 22 | val repo = new SparkRepository[TestClass]().setConnector(csvConnector) 23 | 24 | assertThrows[InvalidConnectorException](repo.awaitTermination()) 25 | assertThrows[InvalidConnectorException](repo.awaitTerminationOrTimeout(1)) 26 | } 27 | 28 | } 29 | 30 | object StreamingRepositorySuite { 31 | 32 | case class TestClass(x: String) 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/util/IOUtils.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.util 2 | 3 | import java.io.{File, IOException} 4 | import java.util.UUID 5 | 6 | import io.github.setl.internal.Logging 7 | 8 | object IOUtils extends Logging { 9 | 
10 | def createDirectory(root: String, namePrefix: String = "setl"): File = { 11 | var attempts = 0 12 | val maxAttempts = 10 13 | var dir: File = null 14 | while (dir == null) { 15 | attempts += 1 16 | if (attempts > maxAttempts) { 17 | throw new IOException("Failed to create a temp directory (under " + root + ") after " + 18 | maxAttempts + " attempts!") 19 | } 20 | try { 21 | dir = new File(root, namePrefix + "-" + UUID.randomUUID.toString) 22 | if (dir.exists() || !dir.mkdirs()) { 23 | dir = null 24 | } 25 | } catch { 26 | case _: SecurityException => dir = null 27 | } 28 | } 29 | 30 | dir.getCanonicalFile 31 | } 32 | 33 | /** 34 | * Create a temporary directory inside the given parent directory. The directory is not deleted 35 | * automatically; use `withTempDir` or `deleteRecursively` to clean it up. 36 | */ 37 | def createTempDir(root: String = System.getProperty("java.io.tmpdir"), namePrefix: String = "setl"): File = { 38 | val dir = createDirectory(root, namePrefix) 39 | dir 40 | } 41 | 42 | /** 43 | * Creates a temporary directory, which is then passed to `f` and will be deleted after `f` 44 | * returns. 45 | * 46 | * @todo Probably this method should be moved to a more general place 47 | */ 48 | def withTempDir(f: File => Unit): Unit = { 49 | val dir = createTempDir().getCanonicalFile 50 | try f(dir) finally { 51 | deleteRecursively(dir) 52 | } 53 | } 54 | 55 | def deleteRecursively(file: File): Unit = { 56 | logDebug(s"Remove ${file.getCanonicalPath}") 57 | if (file.isDirectory) 58 | file.listFiles.foreach(deleteRecursively) 59 | if (file.exists && !file.delete) 60 | throw new Exception(s"Unable to delete ${file.getAbsolutePath}") 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/util/TypesafeConfigUtilsSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.util 2 | 3 | import com.typesafe.config.ConfigFactory 4 | import org.scalatest.funsuite.AnyFunSuite 5 | import org.scalatest.matchers.should.Matchers 6 | 7 | class TypesafeConfigUtilsSuite extends AnyFunSuite with Matchers { 8 | 9 | val config = ConfigFactory.load("test_priority.conf") 10 | import TypesafeConfigUtils._ 11 | 12 | test("TypesafeConfigUtils should handle implicit type conversion") { 13 | assert(getAs[String](config, "test.string") === Option("abc")) 14 | assert(getAs[Int](config, "test.int") === Option(1)) 15 | assert(getAs[Long](config, "test.long") === Option(2L)) 16 | assert(getAs[Float](config, "test.float") === Option(3.1F)) 17 | assert(getAs[Float](config, "test.float2") === Option(3.1F)) 18 | assert(getAs[Double](config, "test.double") === Option(4.4D)) 19 | assert(getAs[Boolean](config, "test.boolean") === Option(false)) 20 | assert(getAs[Boolean](config, "test.boolean2") === Option(true)) 21 | assert(getAs[Int](config, "test.non_existing") === None) 22 | assert(isDefined(config, "test.non_existing") === false) 23 | assert(isDefined(config, "test.string")) 24 | } 25 | 26 | test("TypesafeConfigUtils should handle list") { 27 | getList(config, "test.list").get should equal (Array(1, 2, 3)) 28 | val expected = Array(1.2, 2, 3) 29 | getList(config, "test.listFloat").get should equal (expected) 30 | getList(config, "test.listString").get should equal (Array("1.2", "2", "3")) 31 | } 32 | 33 | test("TypesafeConfigUtils should handle map") { 34 | getMap(config.getConfig("test.map")) should equal (Map("v1" -> "a", "v2" -> "b")) 35 | 36 | } 37 | 38 | test("TypesafeConfigUtils exceptions") { 39 | 
assertThrows[com.typesafe.config.ConfigException.WrongType](getAs[Int](config, "test.string")) 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/workflow/FlowSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.workflow 2 | 3 | import io.github.setl.annotation.Delivery 4 | import io.github.setl.transformation.{Factory, FactoryOutput} 5 | import org.scalatest.funsuite.AnyFunSuite 6 | 7 | import scala.reflect.runtime.universe 8 | 9 | class FlowSuite extends AnyFunSuite { 10 | 11 | import FlowSuite._ 12 | 13 | val nodeProductFactory = new Node(new ProductFactory, 0, false) 14 | val nodeProduct2Factory = new Node(new Product2Factory, 1, false) 15 | 16 | test("Flow constructor") { 17 | val flow = Flow(nodeProductFactory, nodeProduct2Factory) 18 | 19 | assert(flow.deliveryId === "") 20 | assert(flow.stage === 0) 21 | assert(flow.payload === universe.typeOf[Product1]) 22 | } 23 | 24 | test("Flow should generate diagram") { 25 | val flow = Flow(nodeProductFactory, nodeProduct2Factory) 26 | 27 | val externalNode = External.NODE.copy(output = FactoryOutput(universe.typeOf[String], Seq.empty, "", external = true)) 28 | val flowExternal = Flow(externalNode, nodeProductFactory) 29 | 30 | val expectedDiagram = 31 | """Product2Factory <|-- Product1 : Input""".stripMargin.replace(" ", "") 32 | 33 | val expectedExternalFlowDiagram = "ProductFactory <|-- StringExternal : Input".replace(" ", "") 34 | 35 | assert(flow.diagramId === "") 36 | assert(flow.toDiagram.replace(" ", "") === expectedDiagram) 37 | assert(flowExternal.toDiagram.replace(" ", "") === expectedExternalFlowDiagram) 38 | } 39 | 40 | } 41 | 42 | object FlowSuite { 43 | 44 | 45 | class ProductFactory extends Factory[Product1] { 46 | @Delivery 47 | private[this] val id: String = null 48 | private[this] var output: Product1 = _ 49 | 50 | override def read(): ProductFactory.this.type = this 51 | 52 | override def process(): ProductFactory.this.type = { 53 | output = Product1(id) 54 | this 55 | } 56 | 57 | override def write(): ProductFactory.this.type = this 58 | 59 | override def get(): Product1 = output 60 | } 61 | 62 | class Product2Factory extends Factory[Product2] { 63 | 64 | @Delivery 65 | val input: Product1 = null 66 | var output: Product2 = _ 67 | 68 | override def read(): this.type = this 69 | 70 | override def process(): this.type = { 71 | this 72 | } 73 | 74 | override def write(): this.type = this 75 | 76 | override def get(): Product2 = output 77 | } 78 | 79 | } 80 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/workflow/package.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl 2 | 3 | package object workflow { 4 | 5 | case class Product1(x: String) 6 | 7 | case class Product2(x: String, y: String) 8 | 9 | case class Product(x: String) 10 | 11 | case class Product23(x: String) 12 | 13 | case class Container[T](content: T) 14 | 15 | case class Container2[T](content: T) 16 | 17 | } 18 | --------------------------------------------------------------------------------