├── atum └── src │ ├── test │ ├── resources │ │ ├── mockito-extensions │ │ │ └── org.mockito.plugins.MockMaker │ │ └── example_input.info │ └── scala │ │ └── za │ │ └── co │ │ └── absa │ │ └── atum │ │ ├── utils │ │ ├── FileUtilsSpec.scala │ │ ├── BuildPropertiesSpec.scala │ │ ├── OperatingSystemSuite.scala │ │ ├── ExecutionPlanUtilsSuite.scala │ │ └── HdfsFileUtilsSpec.scala │ │ ├── persistence │ │ ├── TestResources.scala │ │ └── hdfs │ │ │ ├── ControlMeasuresHdfsLoaderJsonSpec.scala │ │ │ └── ControlMeasuresHdfsStorerJsonSpec.scala │ │ ├── CachingStorageLevelSpec.scala │ │ ├── ControlMeasureBaseTestSuite.scala │ │ ├── core │ │ └── AccumulatorSpec.scala │ │ └── ControlInfoToJsonSerializationSpec.scala │ └── main │ ├── resources │ └── atum_build.properties │ └── scala │ └── za │ └── co │ └── absa │ └── atum │ ├── plugins │ ├── PluginManager.scala │ └── EventListener.scala │ ├── persistence │ ├── ControlMeasuresLoader.scala │ ├── ControlMeasuresStorer.scala │ ├── hdfs │ │ ├── ControlMeasuresHdfsStorerJsonFile.scala │ │ └── ControlMeasuresHdfsLoaderJsonFile.scala │ └── ControlMeasuresParser.scala │ ├── model │ └── CheckpointImplicits.scala │ ├── utils │ ├── FileUtils.scala │ ├── SparkLocalMaster.scala │ ├── BuildProperties.scala │ ├── OperatingSystem.scala │ ├── ARMImplicits.scala │ ├── SparkTestBase.scala │ ├── InfoFile.scala │ ├── HdfsFileUtils.scala │ ├── ConfigurationImplicits.scala │ └── ExecutionPlanUtils.scala │ └── core │ ├── Constants.scala │ ├── SparkEventListener.scala │ ├── ControlType.scala │ ├── SparkQueryExecutionListener.scala │ ├── Accumulator.scala │ └── MeasurementProcessor.scala ├── atum-s3-sdk-extension └── src │ ├── test │ ├── resources │ │ ├── mockito-extensions │ │ │ └── org.mockito.plugins.MockMaker │ │ └── example_input.info │ └── scala │ │ └── za │ │ └── co │ │ └── absa │ │ └── atum │ │ └── persistence │ │ ├── TestResources.scala │ │ └── s3 │ │ ├── ControlMeasuresSdkS3LoaderJsonSpec.scala │ │ └── ControlMeasuresSdkS3StorerJsonSpec.scala │ └── main │ └── scala │ └── za │ └── co │ └── absa │ └── atum │ ├── persistence │ ├── S3ControlMeasuresStorer.scala │ └── s3 │ │ ├── Regional.scala │ │ ├── ControlMeasuresSdkS3LoaderJsonFile.scala │ │ └── ControlMeasuresSdkS3StorerJsonFile.scala │ ├── core │ ├── ControlFrameworkStateSdkS3.scala │ ├── AtumSdkS3.scala │ └── SparkQueryExecutionListenerSdkS3.scala │ ├── utils │ └── SdkS3ClientUtils.scala │ └── AtumImplicitsSdkS3.scala ├── project ├── build.properties ├── BuildInfoTemplateSettings.scala ├── plugins.sbt └── Dependencies.scala ├── version.sbt ├── model └── src │ ├── main │ └── scala │ │ └── za │ │ └── co │ │ └── absa │ │ └── atum │ │ ├── model │ │ ├── RunError.scala │ │ ├── RunState.scala │ │ ├── Measurement.scala │ │ ├── Checkpoint.scala │ │ ├── ControlMeasureMetadata.scala │ │ ├── RunStatus.scala │ │ └── ControlMeasure.scala │ │ └── utils │ │ └── SerializationUtils.scala │ └── test │ └── scala │ └── za │ └── co │ └── absa │ └── atum │ ├── util │ ├── JacksonJsonSerializer.scala │ ├── SerializatonUtilsBigDecimalToJsonSpec.scala │ └── SerializationUtilsJsonSpec.scala │ └── model │ └── ControlMeasureSpec.scala ├── .github └── workflows │ ├── license_check.yml │ ├── build-sbt.yml │ └── jacoco_check.yml ├── .editorconfig ├── examples-s3-sdk-extension └── src │ ├── test │ └── scala │ │ └── za │ │ └── co │ │ └── absa │ │ └── atum │ │ └── examples │ │ └── SampleMeasurementsS3RunnerExampleSpec.scala │ └── main │ └── scala │ └── za │ └── co │ └── absa │ └── atum │ └── examples │ ├── SampleSdkS3Measurements1.scala │ └── 
SampleSdkS3Measurements2.scala ├── .gitattributes ├── examples └── src │ ├── test │ ├── scala │ │ └── za │ │ │ └── co │ │ │ └── absa │ │ │ └── atum │ │ │ ├── examples │ │ │ └── SampleMeasurementsHdfsRunnerSpec.scala │ │ │ ├── LocalFsTestUtils.scala │ │ │ └── HdfsInfoIntegrationSuite.scala │ └── resources │ │ └── input │ │ └── wikidata.csv.info │ └── main │ └── scala │ └── za │ └── co │ └── absa │ └── atum │ ├── utils │ └── SparkJobRunnerMethods.scala │ └── examples │ ├── SampleMeasurements1.scala │ ├── SampleMeasurements2.scala │ ├── CreateInfoFileTool.scala │ └── CreateInfoFileToolCSV.scala ├── .gitignore ├── publish.sbt └── scalastyle-config.xml /atum/src/test/resources/mockito-extensions/org.mockito.plugins.MockMaker: -------------------------------------------------------------------------------- 1 | mock-maker-inline 2 | -------------------------------------------------------------------------------- /atum-s3-sdk-extension/src/test/resources/mockito-extensions/org.mockito.plugins.MockMaker: -------------------------------------------------------------------------------- 1 | mock-maker-inline 2 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2018 ABSA Group Limited 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | sbt.version = 1.5.5 17 | -------------------------------------------------------------------------------- /version.sbt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | ThisBuild / version := "3.10.1-SNAPSHOT" 17 | -------------------------------------------------------------------------------- /atum/src/main/resources/atum_build.properties: -------------------------------------------------------------------------------- 1 | # Copyright 2018 ABSA Group Limited 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | build.software=${project.artifactId} 15 | build.version=${project.version} 16 | -------------------------------------------------------------------------------- /atum/src/test/resources/example_input.info: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "sourceApplication": "AtumTest", 4 | "country": "CZ", 5 | "historyType": "Snapshot", 6 | "dataFilename": "example_input.csv", 7 | "sourceType": "public", 8 | "version": 1, 9 | "informationDate": "01-01-2020", 10 | "additionalInfo": { } 11 | }, 12 | "checkpoints": [ 13 | { 14 | "name": "checkpointA", 15 | "processStartTime": "01-01-2020 08:00:00", 16 | "processEndTime": "01-01-2020 08:00:10", 17 | "workflowName": "wf1", 18 | "order": 1, 19 | "controls": [ 20 | { 21 | "controlName": "control1", 22 | "controlType": "someControlType", 23 | "controlCol": "column1", 24 | "controlValue": "1234" 25 | } 26 | ] 27 | } 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /atum-s3-sdk-extension/src/test/resources/example_input.info: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "sourceApplication": "AtumTest", 4 | "country": "CZ", 5 | "historyType": "Snapshot", 6 | "dataFilename": "example_input.csv", 7 | "sourceType": "public", 8 | "version": 1, 9 | "informationDate": "01-01-2020", 10 | "additionalInfo": { } 11 | }, 12 | "checkpoints": [ 13 | { 14 | "name": "checkpointA", 15 | "processStartTime": "01-01-2020 08:00:00", 16 | "processEndTime": "01-01-2020 08:00:10", 17 | "workflowName": "wf1", 18 | "order": 1, 19 | "controls": [ 20 | { 21 | "controlName": "control1", 22 | "controlType": "someControlType", 23 | "controlCol": "column1", 24 | "controlValue": "1234" 25 | } 26 | ] 27 | } 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /model/src/main/scala/za/co/absa/atum/model/RunError.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.atum.model 18 | 19 | case class RunError 20 | ( 21 | job: String, 22 | step: String, 23 | description: String, 24 | technicalDetails: String 25 | ) 26 | -------------------------------------------------------------------------------- /model/src/main/scala/za/co/absa/atum/model/RunState.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.model 18 | 19 | object RunState extends Enumeration { 20 | type RunState = Value 21 | val allSucceeded, stageSucceeded, running, failed = Value 22 | } 23 | -------------------------------------------------------------------------------- /model/src/main/scala/za/co/absa/atum/model/Measurement.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.model 18 | 19 | case class Measurement 20 | ( 21 | controlName: String, 22 | controlType: String, 23 | controlCol: String, 24 | controlValue: Any 25 | ) 26 | -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/plugins/PluginManager.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.atum.plugins 18 | 19 | import za.co.absa.atum.core.Atum 20 | 21 | object PluginManager { 22 | def loadPlugin(eventListener: EventListener): Unit = { 23 | Atum.addEventListener(eventListener) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/persistence/ControlMeasuresLoader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.persistence 18 | 19 | import za.co.absa.atum.model.ControlMeasure 20 | 21 | /** Trait for loading control measurements from persistent storage */ 22 | trait ControlMeasuresLoader { 23 | def load(): ControlMeasure 24 | def getInfo: String 25 | } 26 | -------------------------------------------------------------------------------- /model/src/main/scala/za/co/absa/atum/model/Checkpoint.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.model 18 | 19 | case class Checkpoint 20 | ( 21 | name: String, 22 | software: Option[String] = None, 23 | version: Option[String] = None, 24 | processStartTime: String, 25 | processEndTime: String, 26 | workflowName: String, 27 | order: Int, 28 | controls: List[Measurement] 29 | ) 30 | -------------------------------------------------------------------------------- /model/src/main/scala/za/co/absa/atum/model/ControlMeasureMetadata.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.atum.model 18 | 19 | case class ControlMeasureMetadata 20 | ( 21 | sourceApplication: String, 22 | country: String, 23 | historyType: String, 24 | dataFilename: String, 25 | sourceType: String, 26 | version: Int, 27 | informationDate: String, 28 | additionalInfo: Map[String, String] 29 | ) 30 | -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/model/CheckpointImplicits.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.model 18 | 19 | import za.co.absa.atum.utils.BuildProperties 20 | 21 | object CheckpointImplicits { 22 | 23 | implicit class CheckpointExt(checkpoint: Checkpoint) { 24 | def withBuildProperties: Checkpoint = { 25 | checkpoint.copy(software = Some(BuildProperties.projectName), version = Some(BuildProperties.buildVersion)) 26 | } 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /.github/workflows/license_check.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2018 ABSA Group Limited 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | name: License Check 17 | 18 | on: 19 | push: 20 | branches: [ master, develop ] 21 | pull_request: 22 | branches: [ master, develop ] 23 | 24 | jobs: 25 | license-test: 26 | runs-on: ubuntu-latest 27 | steps: 28 | - name: Checkout code 29 | uses: actions/checkout@v2 30 | - name: Setup Scala 31 | uses: olafurpg/setup-scala@v10 32 | with: 33 | java-version: "adopt@1.8" 34 | - run: sbt headerCheck 35 | -------------------------------------------------------------------------------- /model/src/main/scala/za/co/absa/atum/model/RunStatus.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.model 18 | 19 | import com.fasterxml.jackson.core.`type`.TypeReference 20 | import com.fasterxml.jackson.module.scala.JsonScalaEnumeration 21 | import za.co.absa.atum.model.RunState.RunState 22 | 23 | class RunStateType extends TypeReference[RunState.type] 24 | 25 | case class RunStatus 26 | ( 27 | @JsonScalaEnumeration(classOf[RunStateType]) 28 | status: RunState, 29 | error: Option[RunError] 30 | ) 31 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2018 ABSA Group Limited 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | # top-most EditorConfig file 17 | root = true 18 | 19 | [*] 20 | charset = utf-8 21 | end_of_line = lf 22 | trim_trailing_whitespace = true 23 | 24 | [*.xml] 25 | indent_size = 4 26 | indent_style = space 27 | insert_final_newline = true 28 | 29 | [*.{java,scala,js,json,css}] 30 | indent_size = 2 31 | indent_style = space 32 | insert_final_newline = true 33 | max_line_length = 120 34 | 35 | [*.md] 36 | trim_trailing_whitespace = false 37 | 38 | [*.{cmd,bat}] 39 | end_of_line = crlf 40 | insert_final_newline = true 41 | -------------------------------------------------------------------------------- /atum-s3-sdk-extension/src/main/scala/za/co/absa/atum/persistence/S3ControlMeasuresStorer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.atum.persistence 18 | 19 | import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider 20 | import za.co.absa.atum.persistence.s3.{S3KmsSettings, SimpleS3LocationWithRegion} 21 | 22 | trait S3ControlMeasuresStorer extends ControlMeasuresStorer { 23 | def kmsSettings: S3KmsSettings 24 | def outputLocation: SimpleS3LocationWithRegion 25 | def credentialsProvider: AwsCredentialsProvider 26 | } 27 | -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/persistence/ControlMeasuresStorer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.persistence 18 | 19 | import org.apache.hadoop.fs.FileSystem 20 | import za.co.absa.atum.model.ControlMeasure 21 | 22 | /** Trait for saving control measurements to persistent storage */ 23 | trait ControlMeasuresStorer { 24 | def store(controlInfo: ControlMeasure): Unit 25 | def getInfo: String 26 | } 27 | 28 | trait HadoopFsControlMeasuresStorer extends ControlMeasuresStorer { 29 | def outputFs: FileSystem 30 | } 31 | -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/utils/FileUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.atum.utils 18 | 19 | object FileUtils { 20 | def readFileToString(path: String): String = { 21 | val testTxtSource = scala.io.Source.fromFile(path) 22 | val str = testTxtSource.mkString 23 | testTxtSource.close() 24 | 25 | str 26 | } 27 | 28 | implicit class PathJoin(path: String) { 29 | def /(pathSegment: String): String = { 30 | s"${path.stripSuffix("/")}/${pathSegment.stripPrefix("/")}" 31 | } 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /examples-s3-sdk-extension/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsS3RunnerExampleSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | package za.co.absa.atum.examples 17 | 18 | import org.scalatest.funsuite.AnyFunSuite 19 | import za.co.absa.atum.utils._ 20 | 21 | // integration tests are skipped via pom-settings 22 | class SampleMeasurementsS3RunnerExampleSpec extends AnyFunSuite 23 | with SparkJobRunnerMethods 24 | with SparkLocalMaster { 25 | 26 | runSparkJobAsTest[SampleSdkS3Measurements1.type] 27 | runSparkJobAsTest[SampleSdkS3Measurements2.type] 28 | } 29 | -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/utils/SparkLocalMaster.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.utils 18 | 19 | trait SparkLocalMaster { 20 | System.getProperties.setProperty("spark.master", "local[*]") 21 | 22 | // in order to run SampleMeasurements as tests, otherwise 23 | // java.lang.IllegalArgumentException: System memory 259522560 must be at least 471859200... 
is thrown 24 | System.getProperties.setProperty("spark.testing.memory", (1024*1024*1024).toString) // 1g 25 | System.getProperties.setProperty("spark.app.name", "unit-test") 26 | } 27 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2018 ABSA Group Limited 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | ############################### 16 | # Git Line Endings # 17 | ############################### 18 | 19 | # Set default behaviour to automatically normalize line endings. 20 | * text=auto 21 | 22 | # Force the following filetypes to have unix eols; Windows can usually handle it well 23 | *.* text eol=lf 24 | 25 | # Force batch scripts to always use CRLF line endings as they might not work correctly in some cases. 26 | # Also if a repo is accessed in Windows via a file share from Linux, the scripts will work too 27 | *.cmd text eol=crlf 28 | *.bat text eol=crlf 29 | -------------------------------------------------------------------------------- /examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsHdfsRunnerSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | package za.co.absa.atum.examples 17 | 18 | import org.scalatest.funsuite.AnyFunSuite 19 | import za.co.absa.atum.utils._ 20 | 21 | class SampleMeasurementsHdfsRunnerSpec extends AnyFunSuite 22 | with SparkJobRunnerMethods 23 | with SparkLocalMaster { 24 | 25 | // SampleMeasurements2 depends on SampleMeasurements1's output, so they must be run in this order 26 | runSparkJobAsTest[SampleMeasurements1.type] 27 | runSparkJobAsTest[SampleMeasurements2.type] 28 | } 29 | -------------------------------------------------------------------------------- /atum/src/test/scala/za/co/absa/atum/utils/FileUtilsSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | package za.co.absa.atum.utils 17 | 18 | import org.scalatest.flatspec.AnyFlatSpec 19 | import org.scalatest.matchers.should.Matchers 20 | 21 | class FileUtilsSpec extends AnyFlatSpec with Matchers { 22 | 23 | "PathJoin" should "join paths correctly" in { 24 | 25 | import za.co.absa.atum.utils.FileUtils.PathJoin 26 | "/path/to" / "file" shouldBe "/path/to/file" 27 | "/path/to/" / "file" shouldBe "/path/to/file" 28 | "/path/to" / "/file" shouldBe "/path/to/file" 29 | "/path/to/" / "/file" shouldBe "/path/to/file" 30 | 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2018 ABSA Group Limited 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | # use glob syntax. 17 | syntax: glob 18 | *.ser 19 | *.class 20 | *~ 21 | *.bak 22 | *.old 23 | 24 | # eclipse conf file 25 | .settings 26 | .classpath 27 | .project 28 | .manager 29 | .scala_dependencies 30 | .scalastyle 31 | 32 | # idea 33 | .idea 34 | *.iml 35 | 36 | # building 37 | target 38 | build 39 | null 40 | tmp* 41 | temp* 42 | dist 43 | test-output 44 | build.log 45 | 46 | # other scm 47 | .svn 48 | .CVS 49 | .hg* 50 | 51 | .cache* 52 | 53 | _testOutput 54 | myTestCheckpoints 55 | examples/data/output 56 | 57 | # switch to regexp syntax. 58 | # syntax: regexp 59 | # ^\.pc/ 60 | /examples-s3-sdk-extension/data/output_s3/ 61 | 62 | /data/ 63 | .bsp 64 | -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/core/Constants.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.atum.core 18 | 19 | /** 20 | * The object contains constants and defaults used throughout the Control Framework 21 | */ 22 | object Constants { 23 | // Keys to store in Spark Session 24 | val InitFlagKey = "control_framework.initialized_flag" 25 | val InfoFileVersionKey = "control_framework.info_version" 26 | val InfoFileDateKey = "control_framework.info_date" 27 | val RunUniqueIdKey = "control_framework.run_unique_id" 28 | 29 | val TimestampFormat = "dd-MM-yyyy HH:mm:ss [Z]" 30 | val DateFormat = "dd-MM-yyyy" 31 | val DefaultInfoFileName = "_INFO" 32 | 33 | val maxErrorMessageSize = 2048 34 | } 35 | -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/core/SparkEventListener.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.core 18 | 19 | import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationEnd} 20 | 21 | /** 22 | * The class is responsible for listening to Spark events and saving pending Control Framework checkpoint changes. 23 | */ 24 | class SparkEventListener(cf: ControlFrameworkState) extends SparkListener { 25 | override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = { 26 | if (cf.havePendingCheckpoints) { 27 | Atum.log.info(s"Saving control framework checkpoints") 28 | cf.updateRunCheckpoints(saveInfoFile = true) 29 | } 30 | cf.onApplicationEnd() 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/utils/BuildProperties.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.utils 18 | 19 | import java.util.Properties 20 | 21 | object BuildProperties { 22 | private val properties = new Properties() 23 | private val buildVersionKey = "build.version" 24 | private val buildSoftwareKey = "build.software" 25 | 26 | /** Returns the version of the build. */ 27 | lazy val buildVersion: String = properties.getProperty(buildVersionKey) 28 | /** Returns the name of the build. 
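(e.g. "atum_2.11", read from atum_build.properties, where build.software is set to the project artifactId). 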
*/ 29 | lazy val projectName: String = properties.getProperty(buildSoftwareKey) 30 | 31 | loadConfig() 32 | 33 | private def loadConfig(): Unit = { 34 | val is = getClass.getResourceAsStream("/atum_build.properties") 35 | try properties.load(is) 36 | finally if (is != null) is.close() 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /atum/src/test/scala/za/co/absa/atum/utils/BuildPropertiesSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | package za.co.absa.atum.utils 17 | 18 | import org.scalatest.flatspec.AnyFlatSpec 19 | import za.co.absa.commons.version.Version 20 | 21 | import scala.util.{Failure, Success, Try} 22 | 23 | class BuildPropertiesSpec extends AnyFlatSpec { 24 | private val version = BuildProperties.buildVersion 25 | private val name = BuildProperties.projectName 26 | 27 | "Project version" should "be parsable as SemVer" in { 28 | Try { 29 | Version.asSemVer(version) 30 | } match { 31 | case Success(_) => succeed 32 | case Failure(exception) => fail(exception.getMessage, exception.getCause) 33 | } 34 | } 35 | 36 | "Project Name" should "consist of atum and the Scala version" in { 37 | assert(name.matches("""^atum_(2\.11|2\.12)$""")) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /atum/src/test/scala/za/co/absa/atum/utils/OperatingSystemSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
14 | */ 15 | 16 | package za.co.absa.atum.utils 17 | 18 | import org.scalatest.flatspec.AnyFlatSpec 19 | import org.scalatest.matchers.should.Matchers 20 | import za.co.absa.atum.utils.OperatingSystem.OperatingSystems 21 | 22 | class OperatingSystemSuite extends AnyFlatSpec with Matchers { 23 | 24 | "OperatingSystem util" should "correctly determine the OS" in { 25 | OperatingSystem.getOsByOsName("Windows 10") shouldBe OperatingSystems.WINDOWS 26 | OperatingSystem.getOsByOsName("Linux") shouldBe OperatingSystems.LINUX 27 | OperatingSystem.getOsByOsName("Mac OS X") shouldBe OperatingSystems.MAC 28 | OperatingSystem.getOsByOsName("SunOs") shouldBe OperatingSystems.SOLARIS 29 | 30 | OperatingSystem.getOsByOsName("my own special os") shouldBe OperatingSystems.OTHER 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/utils/OperatingSystem.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.utils 18 | 19 | 20 | object OperatingSystem { 21 | 22 | // adapted from https://stackoverflow.com/a/31547504/1773349 23 | 24 | object OperatingSystems extends Enumeration { 25 | val WINDOWS, LINUX, MAC, SOLARIS, OTHER = Value 26 | } 27 | 28 | def getOsByOsName(osName: String): OperatingSystems.Value = { 29 | import za.co.absa.atum.utils.OperatingSystem.OperatingSystems._ 30 | osName.toLowerCase match { 31 | case os if os.contains("win") => WINDOWS 32 | case os if os.contains("nix") || os.contains("nux") || os.contains("aix") => LINUX 33 | case os if os.contains("mac") => MAC 34 | case os if os.contains("sunos") => SOLARIS 35 | case _ => OTHER 36 | } 37 | } 38 | 39 | def getCurrentOs: OperatingSystems.Value = { 40 | getOsByOsName(System.getProperty("os.name")) 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/utils/ARMImplicits.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.atum.utils 18 | 19 | import scala.language.reflectiveCalls 20 | 21 | object ARMImplicits { 22 | 23 | type Closeable = {def close(): Unit} 24 | 25 | implicit class ArmResourceWrapper[ResourceType <: Closeable](private val resource: ResourceType) { 26 | def usingResourceDo[ResultType](body: ResourceType => ResultType): ResultType = 27 | try body(resource) 28 | finally resource.close() 29 | 30 | // implementing a for-comprehension contract 31 | 32 | def foreach(f: (ResourceType) => Unit): Unit = usingResourceDo(f) 33 | 34 | def map[ResultType](body: ResourceType => ResultType): ResultType = usingResourceDo(body) 35 | 36 | def flatMap[ResultType](body: (ResourceType) => ResultType): ResultType = usingResourceDo(body) 37 | 38 | def withFilter(f: (ResourceType) => Boolean): ArmResourceWrapper[ResourceType] = this 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /atum/src/test/scala/za/co/absa/atum/persistence/TestResources.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | package za.co.absa.atum.persistence 17 | 18 | import za.co.absa.atum.model.{Checkpoint, ControlMeasure, ControlMeasureMetadata, Measurement} 19 | 20 | object TestResources { 21 | 22 | object InputInfo { 23 | val localPath: String = getClass.getResource("/example_input.info").getPath 24 | 25 | // conforms to the content of the Resource file `example_input.info` 26 | val controlMeasure = ControlMeasure( 27 | ControlMeasureMetadata("AtumTest", "CZ", "Snapshot", "example_input.csv", "public", 1, "01-01-2020", Map.empty), 28 | runUniqueId = None, 29 | List(Checkpoint("checkpointA", None, None, "01-01-2020 08:00:00", "01-01-2020 08:00:10", "wf1", 1, List( 30 | Measurement("control1", "someControlType", "column1", "1234") 31 | ))) 32 | ) 33 | } 34 | 35 | def filterWhitespaces(content: String): String = { 36 | content.filterNot(_.isWhitespace) 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /atum-s3-sdk-extension/src/main/scala/za/co/absa/atum/core/ControlFrameworkStateSdkS3.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.atum.core 18 | 19 | import org.apache.spark.sql.SparkSession 20 | import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider 21 | import za.co.absa.atum.persistence.s3.{ControlMeasuresSdkS3StorerJsonFile, S3KmsSettings, SimpleS3LocationWithRegion} 22 | 23 | /** 24 | * This class holds the control measurements' state 25 | */ 26 | class ControlFrameworkStateSdkS3(sparkSession: SparkSession) extends ControlFrameworkState(sparkSession) { 27 | 28 | def storeCurrentInfoFileOnSdkS3(s3Location: SimpleS3LocationWithRegion, s3KmsSettings: S3KmsSettings)(implicit credentialsProvider: AwsCredentialsProvider): Unit = { 29 | val storer = ControlMeasuresSdkS3StorerJsonFile(s3Location, s3KmsSettings) 30 | 31 | storer.store(accumulator.getControlMeasure) 32 | AtumSdkS3.log.info(s"Control measurements saved to ${s3Location.asSimpleS3LocationString}") 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/utils/SparkTestBase.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.utils 18 | 19 | import org.apache.log4j.{Level, Logger} 20 | import org.apache.spark.sql.SparkSession 21 | 22 | trait SparkTestBase { 23 | System.setProperty("user.timezone", "UTC") 24 | 25 | // Do not display INFO entries for tests 26 | Logger.getLogger("org").setLevel(Level.WARN) 27 | Logger.getLogger("akka").setLevel(Level.WARN) 28 | 29 | implicit val spark: SparkSession = SparkSession.builder() 30 | .master("local[1]") // Spark3 has async writes. When we move to AsyncFlatSpec, we will change this back to * 31 | .appName("test") 32 | .config("spark.sql.codegen.wholeStage", value = false) 33 | .config("spark.driver.bindAddress", "127.0.0.1") 34 | .config("spark.driver.host", "127.0.0.1") 35 | .config("spark.ui.enabled", "false") 36 | .config("spark.testing.memory", 1024*1024*1024) // otherwise may fail based on local machine settings 37 | .getOrCreate() 38 | 39 | } 40 | -------------------------------------------------------------------------------- /examples/src/main/scala/za/co/absa/atum/utils/SparkJobRunnerMethods.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | package za.co.absa.atum.utils 17 | 18 | import org.scalatest.funsuite.AnyFunSuiteLike 19 | 20 | import scala.reflect.ClassTag 21 | import scala.reflect.runtime.universe 22 | 23 | trait SparkJobRunnerMethods { 24 | this: AnyFunSuiteLike => 25 | 26 | private def runSparkJob[T](implicit ct: ClassTag[T]): Unit = { 27 | type MainClass = {def main(args: Array[String]): Unit} 28 | 29 | val jobClass = ct.runtimeClass 30 | val jobClassSymbol = universe runtimeMirror jobClass.getClassLoader classSymbol jobClass 31 | val jobInstance = 32 | if (jobClassSymbol.isModuleClass) jobClass getField "MODULE$" get jobClass 33 | else jobClass.newInstance 34 | 35 | jobInstance.asInstanceOf[MainClass].main(Array.empty) 36 | } 37 | 38 | def runSparkJobAsTest[T](implicit ct: ClassTag[T]): Unit = { 39 | val sampleName = ct.runtimeClass.getSimpleName 40 | test(sampleName)(runSparkJob[T](ct)) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /atum-s3-sdk-extension/src/test/scala/za/co/absa/atum/persistence/TestResources.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | package za.co.absa.atum.persistence 17 | 18 | import za.co.absa.atum.model.{Checkpoint, ControlMeasure, ControlMeasureMetadata, Measurement} 19 | 20 | object TestResources { 21 | 22 | object InputInfo { 23 | val localPath: String = getClass.getResource("/example_input.info").getPath 24 | 25 | // conforms to the content of the Resource file `example_input.info` 26 | val controlMeasure: ControlMeasure = ControlMeasure( 27 | ControlMeasureMetadata("AtumTest", "CZ", "Snapshot", "example_input.csv", "public", 1, "01-01-2020", Map.empty), 28 | runUniqueId = None, 29 | List(Checkpoint("checkpointA", None, None, "01-01-2020 08:00:00", "01-01-2020 08:00:10", "wf1", 1, List( 30 | Measurement("control1", "someControlType", "column1", "1234") 31 | ))) 32 | ) 33 | } 34 | 35 | def filterWhitespaces(content: String): String = { 36 | content.filterNot(_.isWhitespace) 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsStorerJsonFile.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.persistence.hdfs 18 | 19 | import org.apache.hadoop.fs.{FileSystem, Path} 20 | import za.co.absa.atum.model.ControlMeasure 21 | import za.co.absa.atum.persistence.{ControlMeasuresParser, HadoopFsControlMeasuresStorer} 22 | import za.co.absa.atum.utils.HdfsFileUtils 23 | 24 | /** A storer of control measurements to a Hadoop filesystem as a JSON file. */ 25 | case class ControlMeasuresHdfsStorerJsonFile(path: Path)(implicit val outputFs: FileSystem) extends HadoopFsControlMeasuresStorer { 26 | override def store(controlInfo: ControlMeasure): Unit = { 27 | val serialized = ControlMeasuresParser asJson controlInfo 28 | HdfsFileUtils.saveStringDataToFile(path, serialized, 29 | HdfsFileUtils.getInfoFilePermissionsFromConfig().getOrElse(HdfsFileUtils.DefaultFilePermissions)) 30 | } 31 | 32 | override def getInfo: String = { 33 | s"JSON serializer to ${path.toUri}" 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /atum/src/test/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsLoaderJsonSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
14 | */ 15 | 16 | package za.co.absa.atum.persistence.hdfs 17 | 18 | import org.apache.hadoop.conf.Configuration 19 | import org.apache.hadoop.fs.{FileSystem, Path} 20 | import org.scalatest.flatspec.AnyFlatSpec 21 | import org.scalatest.matchers.should.Matchers 22 | import za.co.absa.atum.persistence.TestResources 23 | import za.co.absa.atum.utils.SparkLocalMaster 24 | 25 | class ControlMeasuresHdfsLoaderJsonSpec extends AnyFlatSpec with Matchers with SparkLocalMaster { 26 | 27 | val inputPath: String = TestResources.InputInfo.localPath 28 | val expectedInputControlMeasure = TestResources.InputInfo.controlMeasure 29 | 30 | implicit val fs = FileSystem.get(new Configuration()) 31 | 32 | 33 | "ControlMeasuresHdfsLoaderJsonFile" should "load json file from HDFS" in { 34 | val loadedControlMeasure = ControlMeasuresHdfsLoaderJsonFile(new Path(inputPath)).load() 35 | 36 | loadedControlMeasure shouldBe expectedInputControlMeasure 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsLoaderJsonFile.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.persistence.hdfs 18 | 19 | import org.apache.hadoop.fs.{FileSystem, Path} 20 | import za.co.absa.atum.model.ControlMeasure 21 | import za.co.absa.atum.persistence.{ControlMeasuresLoader, ControlMeasuresParser} 22 | import za.co.absa.atum.utils.HdfsFileUtils 23 | import za.co.absa.atum.utils.controlmeasure.ControlMeasureUtils 24 | 25 | /** A loader of control measurements from a JSON file stored in hadoop filesystem. */ 26 | case class ControlMeasuresHdfsLoaderJsonFile(path: Path) 27 | (implicit inputFs: FileSystem) extends ControlMeasuresLoader { 28 | override def load(): ControlMeasure = { 29 | val controlInfoJson = HdfsFileUtils.readHdfsFileToString(path) 30 | 31 | ControlMeasureUtils.preprocessControlMeasure(ControlMeasuresParser fromJson controlInfoJson) 32 | } 33 | 34 | override def getInfo: String = { 35 | s"JSON deserializer from ${path.toUri}" 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /atum-s3-sdk-extension/src/main/scala/za/co/absa/atum/utils/SdkS3ClientUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.utils 18 | 19 | import software.amazon.awssdk.auth.credentials.{AwsCredentialsProvider, ProfileCredentialsProvider} 20 | import software.amazon.awssdk.regions.Region 21 | import software.amazon.awssdk.services.s3.S3Client 22 | import za.co.absa.atum.core.AtumSdkS3.log 23 | 24 | object SdkS3ClientUtils { 25 | 26 | def getLocalProfileCredentialsProvider(credentialsProfileName: String): ProfileCredentialsProvider = { 27 | val localProfileCredentials = ProfileCredentialsProvider.create(credentialsProfileName) 28 | log.debug(s"Credentials of local $credentialsProfileName profile =" + 29 | s" ${localProfileCredentials.resolveCredentials().accessKeyId()}, ${localProfileCredentials.resolveCredentials().secretAccessKey().take(5)}...") 30 | 31 | localProfileCredentials 32 | } 33 | 34 | def getS3Client(region: Region, credentialsProvider: AwsCredentialsProvider): S3Client = { 35 | S3Client.builder() 36 | .region(region) 37 | .credentialsProvider(credentialsProvider) 38 | .build() 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /project/BuildInfoTemplateSettings.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
14 | */ 15 | 16 | import sbt.Keys._ 17 | import sbt._ 18 | 19 | // heavily inspired by Cobrix 20 | object BuildInfoTemplateSettings { 21 | 22 | lazy val populateBuildInfoTemplate: Seq[Def.Setting[_]] = Seq( 23 | Compile / unmanagedResources / excludeFilter := excludeTemplateResource.value, 24 | Compile / resourceGenerators += populateResourceTemplate.taskValue 25 | ) 26 | 27 | private val excludeTemplateResource = Def.setting { 28 | val propsTemplate = ((Compile / resourceDirectory).value / "atum_build.properties").getCanonicalPath 29 | new SimpleFileFilter(_.getCanonicalPath == propsTemplate) 30 | } 31 | 32 | private val populateResourceTemplate = Def.task { 33 | val template = IO.read((Compile / resourceDirectory).value / "atum_build.properties") 34 | 35 | val filledTemplate = template 36 | .replace("${project.version}", version.value) 37 | .replace("${project.artifactId}", s"${artifact.value.name}_${scalaBinaryVersion.value}") 38 | 39 | val out = (Compile / resourceManaged).value / "atum_build.properties" 40 | IO.write(out, filledTemplate) 41 | Seq(out) 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /atum-s3-sdk-extension/src/main/scala/za/co/absa/atum/persistence/s3/Regional.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.persistence.s3 18 | 19 | import software.amazon.awssdk.regions.Region 20 | import software.amazon.awssdk.services.s3.model.ServerSideEncryption 21 | import za.co.absa.commons.s3.{S3Location, SimpleS3Location} 22 | 23 | trait Regional { 24 | def region: Region 25 | } 26 | 27 | case class SimpleS3LocationWithRegion(protocol: String, bucketName: String, path: String, region: Region) extends S3Location with Regional { 28 | def withRegion(region: Region): SimpleS3LocationWithRegion = this.copy(region = region) 29 | 30 | override def asSimpleS3LocationString: String = SimpleS3Location(protocol, bucketName, path).asSimpleS3LocationString 31 | } 32 | 33 | case class S3KmsSettings(kmsKeyId: String, serverSideEncryption: ServerSideEncryption = ServerSideEncryption.AWS_KMS) 34 | 35 | object S3LocationRegionImplicits { 36 | 37 | implicit class SimpleS3LocationRegionExt(s3Loc: S3Location) { 38 | def withRegion(region: Region): SimpleS3LocationWithRegion = 39 | SimpleS3LocationWithRegion(s3Loc.protocol, s3Loc.bucketName, s3Loc.path, region) 40 | } 41 | 42 | } 43 | 44 | -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/persistence/ControlMeasuresParser.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.persistence 18 | 19 | import za.co.absa.atum.model.ControlMeasure 20 | import za.co.absa.atum.utils.SerializationUtils 21 | 22 | /** 23 | * This object is used for [[za.co.absa.atum.model.ControlMeasure]] object serialization 24 | */ 25 | object ControlMeasuresParser { 26 | /** 27 | * The method returns a JSON representation of a [[za.co.absa.atum.model.ControlMeasure]] object 28 | */ 29 | def asJson(controlMeasure: ControlMeasure): String = { 30 | SerializationUtils.asJson[ControlMeasure](controlMeasure) 31 | } 32 | 33 | /** 34 | * The method returns a prettified JSON representation of a [[za.co.absa.atum.model.ControlMeasure]] object 35 | */ 36 | def asJsonPretty(controlMeasure: ControlMeasure): String = { 37 | SerializationUtils.asJsonPretty[ControlMeasure](controlMeasure) 38 | } 39 | 40 | /** 41 | * The method returns a [[za.co.absa.atum.model.ControlMeasure]] object parsed from a JSON string. 42 | */ 43 | def fromJson(jsonStr: String): ControlMeasure = { 44 | SerializationUtils.fromJson[ControlMeasure](jsonStr) 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /.github/workflows/build-sbt.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2018 ABSA Group Limited 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | name: SBT AutoBuild 17 | 18 | on: 19 | push: 20 | branches: [ master ] 21 | pull_request: 22 | branches: [ master ] 23 | 24 | jobs: 25 | build-sbt: 26 | runs-on: ubuntu-latest 27 | strategy: 28 | fail-fast: false 29 | matrix: 30 | scala: [ 2.11.12, 2.12.15 ] 31 | spark: [ 2.4.8, 3.2.2 ] 32 | exclude: 33 | - scala: 2.11.12 34 | spark: 3.2.2 35 | - scala: 2.12.15 36 | spark: 2.4.8 37 | name: SBT Spark ${{matrix.spark}} on Scala ${{matrix.scala}} 38 | steps: 39 | - name: Checkout code 40 | uses: actions/checkout@v2 41 | - uses: coursier/cache-action@v5 42 | - name: Setup Scala 43 | uses: olafurpg/setup-scala@v10 44 | with: 45 | java-version: "adopt@1.8" 46 | - name: Build and run tests 47 | if: ${{ ! contains( github.event.pull_request.labels.*.name, 'NoTestNeeded') }} 48 | run: sbt ++${{matrix.scala}} test 49 | - name: Build and run examples 50 | if: ${{ ! 
contains( github.event.pull_request.labels.*.name, 'NoTestNeeded') }} 51 | run: sbt ++${{matrix.scala}} examples/test s3sdkExamples/compile 52 | -------------------------------------------------------------------------------- /examples/src/test/scala/za/co/absa/atum/LocalFsTestUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | package za.co.absa.atum 17 | 18 | import java.io.File 19 | import java.nio.file.Files 20 | 21 | import org.apache.commons.io.FileUtils 22 | import org.apache.log4j.LogManager 23 | 24 | import scala.io.Source 25 | import scala.util.control.NonFatal 26 | 27 | object LocalFsTestUtils { 28 | private val log = LogManager.getLogger(this.getClass) 29 | 30 | /** 31 | * Creates a temporary directory in the local filesystem. 32 | * 33 | * @param prefix A prefix to use for the temporary directory. 34 | * @return A path to a temporary directory. 35 | */ 36 | def createLocalTemporaryDirectory(prefix: String): String = { 37 | val tmpPath = Files.createTempDirectory(prefix) 38 | tmpPath.toAbsolutePath.toString 39 | } 40 | 41 | def safeDeleteTestDir(path: String): Unit = { 42 | try { 43 | FileUtils.deleteDirectory(new File(path)) 44 | } catch { 45 | case NonFatal(_) => log.warn(s"Unable to delete a test directory $path") 46 | } 47 | } 48 | 49 | def readFileAsString(filename: String, lineSeparator: String = "\n"): String = { 50 | val sourceFile = Source.fromFile(filename) 51 | try { 52 | sourceFile.getLines().mkString(lineSeparator) 53 | } finally { 54 | sourceFile.close() 55 | } 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /model/src/main/scala/za/co/absa/atum/utils/SerializationUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.atum.utils 18 | 19 | import org.json4s.jackson.Serialization 20 | import org.json4s.jackson.Serialization.{write, writePretty} 21 | import org.json4s.{Formats, NoTypeHints, ext} 22 | import za.co.absa.atum.model._ 23 | 24 | /** 25 | * This object contains utilities used in Control Measurements processing 26 | */ 27 | object SerializationUtils { 28 | 29 | implicit private val formatsJson: Formats = Serialization.formats(NoTypeHints).withBigDecimal + new ext.EnumNameSerializer(RunState) 30 | 31 | /** 32 | * The method returns an arbitrary object as a JSON string. 33 | * 34 | * @return A string representing the object in JSON format 35 | */ 36 | def asJson[T <: AnyRef](obj: T): String = { 37 | write[T](obj) 38 | } 39 | 40 | /** 41 | * The method returns an arbitrary object as a pretty JSON string. 42 | * 43 | * @return A string representing the object in JSON format 44 | */ 45 | def asJsonPretty[T <: AnyRef](obj: T): String = { 46 | writePretty[T](obj) 47 | } 48 | 49 | /** 50 | * The method returns an arbitrary object parsed from a JSON string. 51 | * 52 | * @return An object deserialized from the JSON string 53 | */ 54 | def fromJson[T <: AnyRef](jsonStr: String)(implicit m: Manifest[T]): T = { 55 | Serialization.read[T](jsonStr) 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /model/src/test/scala/za/co/absa/atum/util/JacksonJsonSerializer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018-2019 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | package za.co.absa.atum.util 17 | 18 | import com.fasterxml.jackson.annotation.JsonInclude.Include 19 | import com.fasterxml.jackson.databind.ObjectMapper 20 | import com.fasterxml.jackson.module.scala.DefaultScalaModule 21 | 22 | import scala.reflect.ClassTag 23 | import scala.util.Try 24 | 25 | /** 26 | * Sample serializer that is expected to be used for Atum's model externally, e.g. in Enceladus 27 | */ 28 | object JacksonJsonSerializer { 29 | 30 | val objectMapper: ObjectMapper = new ObjectMapper() 31 | .registerModule(DefaultScalaModule) 32 | .setSerializationInclusion(Include.NON_EMPTY) // e.g. null-valued fields omitted
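A minimal round-trip sketch of this sample serializer (an illustrative aside, not part of the repository file; it assumes the Measurement model class from this repository and made-up values):

import za.co.absa.atum.model.Measurement
import za.co.absa.atum.util.JacksonJsonSerializer

object JacksonJsonSerializerRoundTripExample {
  def main(args: Array[String]): Unit = {
    // serialize a model object to JSON and parse it back
    val measurement = Measurement("recordCount", "controlType.Count", "*", "4964")
    val json = JacksonJsonSerializer.toJson(measurement)
    val parsed = JacksonJsonSerializer.fromJson[Measurement](json)
    assert(parsed == measurement) // the measurement survives the round trip
  }
}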
33 | 34 | 35 | def fromJson[T](json: String) 36 | (implicit ct: ClassTag[T]): T = { 37 | val clazz = ct.runtimeClass.asInstanceOf[Class[T]] 38 | if (clazz == classOf[String]) { 39 | json.asInstanceOf[T] 40 | } else { 41 | objectMapper.readValue(json, clazz) 42 | } 43 | } 44 | 45 | def toJson[T](entity: T): String = { 46 | entity match { 47 | case str: String => 48 | if (isValidJson(str)) str else objectMapper.writeValueAsString(entity) 49 | case _ => 50 | objectMapper.writeValueAsString(entity) 51 | } 52 | } 53 | 54 | def isValidJson[T](str: T with String): Boolean = { 55 | Try(objectMapper.readTree(str)).isSuccess 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /examples/src/test/resources/input/wikidata.csv.info: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "sourceApplication": "WikiApp", 4 | "country": "", 5 | "historyType": "Snapshot", 6 | "dataFilename": "wikidata.csv", 7 | "sourceType": "public", 8 | "version": 1, 9 | "informationDate": "01-01-2017", 10 | "additionalInfo": { } 11 | }, 12 | "checkpoints": [ 13 | { 14 | "name": "Source", 15 | "processStartTime": "01-01-2017 08:00:00", 16 | "processEndTime": "01-01-2017 08:00:00", 17 | "workflowName": "Source", 18 | "order": 1, 19 | "controls": [ 20 | { 21 | "controlName": "recordCount", 22 | "controlType": "controlType.Count", 23 | "controlCol": "*", 24 | "controlValue": 4964 25 | }, 26 | { 27 | "controlName": "pvControlTotal", 28 | "controlType": "controlType.aggregatedTotal", 29 | "controlCol": "count_views", 30 | "controlValue": 5819 31 | }, 32 | { 33 | "controlName": "SumOfReponseSize", 34 | "controlType": "controlType.aggregatedTotal", 35 | "controlCol": "total_response_size", 36 | "controlValue": 97729547 37 | } 38 | ] 39 | }, 40 | { 41 | "name": "Raw", 42 | "processStartTime": "01-01-2017 08:00:00", 43 | "processEndTime": "01-01-2017 08:00:00", 44 | "workflowName": "Raw", 45 | "order": 2, 46 | "controls": [ 47 | { 48 | "controlName": "recordCount", 49 | "controlType": "controlType.Count", 50 | "controlCol": "*", 51 | "controlValue": 4964 52 | }, 53 | { 54 | "controlName": "pvControlTotal", 55 | "controlType": "controlType.aggregatedTotal", 56 | "controlCol": "count_views", 57 | "controlValue": 5819 58 | }, 59 | { 60 | "controlName": "SumOfReponseSize", 61 | "controlType": "controlType.aggregatedTotal", 62 | "controlCol": "total_response_size", 63 | "controlValue": 97729547 64 | } 65 | ] 66 | } 67 | ] 68 | } 69 | -------------------------------------------------------------------------------- /atum/src/test/scala/za/co/absa/atum/CachingStorageLevelSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
14 | */ 15 | 16 | package za.co.absa.atum 17 | 18 | import org.apache.hadoop.conf.Configuration 19 | import org.apache.hadoop.fs.FileSystem 20 | import org.apache.spark.storage.StorageLevel 21 | import org.scalatest.BeforeAndAfter 22 | import org.scalatest.flatspec.AnyFlatSpec 23 | import org.scalatest.matchers.should.Matchers 24 | import za.co.absa.atum.core.Atum 25 | import za.co.absa.atum.utils.SparkTestBase 26 | 27 | class CachingStorageLevelSpec extends AnyFlatSpec with Matchers with SparkTestBase with BeforeAndAfter { 28 | 29 | implicit val fs: FileSystem = FileSystem.get(new Configuration()) 30 | 31 | before { 32 | Atum.init(spark) 33 | } 34 | 35 | after { 36 | Atum.dispose(spark) 37 | } 38 | 39 | "enableCaching" should "enable caching with the default storage level" in { 40 | Atum.enableCaching() 41 | assert(Atum.cachingStorageLevel == StorageLevel.MEMORY_AND_DISK) 42 | } 43 | 44 | 45 | "enableCaching" should "enable caching with a specified storage level" in { 46 | Atum.enableCaching(StorageLevel.MEMORY_ONLY) 47 | assert(Atum.cachingStorageLevel == StorageLevel.MEMORY_ONLY) 48 | } 49 | 50 | "setCachingStorageLevel" should "enable caching with a specified storage level expressed as a string" in { 51 | Atum.setCachingStorageLevel("MEMORY_ONLY") 52 | assert(Atum.cachingStorageLevel == StorageLevel.MEMORY_ONLY) 53 | } 54 | 55 | "disableCaching" should "disable caching" in { 56 | Atum.disableCaching() 57 | assert(Atum.cachingStorageLevel == StorageLevel.NONE) 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /atum/src/test/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsStorerJsonSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
14 | */ 15 | 16 | package za.co.absa.atum.persistence.hdfs 17 | 18 | import org.apache.hadoop.conf.Configuration 19 | import org.apache.hadoop.fs.{FileSystem, Path} 20 | import org.scalatest.flatspec.AnyFlatSpec 21 | import org.scalatest.matchers.should.Matchers 22 | import za.co.absa.atum.model.ControlMeasure 23 | import za.co.absa.atum.persistence.TestResources 24 | import za.co.absa.atum.utils.{FileUtils, HdfsFileUtils, SparkLocalMaster} 25 | 26 | class ControlMeasuresHdfsStorerJsonSpec extends AnyFlatSpec with Matchers with SparkLocalMaster { 27 | 28 | val expectedFilePath: String = TestResources.InputInfo.localPath 29 | val inputControlMeasure: ControlMeasure = TestResources.InputInfo.controlMeasure 30 | 31 | val hadoopConfiguration: Configuration = new Configuration() 32 | implicit val fs: FileSystem = FileSystem.get(hadoopConfiguration) 33 | 34 | "ControlMeasuresHdfsStorerJsonFile" should "store json file to HDFS" in { 35 | 36 | val outputPath = new Path("/tmp/json-hdfs-storing-test") 37 | fs.delete(outputPath, false) 38 | 39 | ControlMeasuresHdfsStorerJsonFile(outputPath).store(inputControlMeasure) 40 | 41 | val actualContent = HdfsFileUtils.readHdfsFileToString(outputPath) 42 | val expectedContent = FileUtils.readFileToString(expectedFilePath) 43 | 44 | // some output may be prettified while other output may not; we do not take this into account. 45 | TestResources.filterWhitespaces(actualContent) shouldBe TestResources.filterWhitespaces(expectedContent) 46 | 47 | fs.delete(outputPath, false) 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /publish.sbt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | // to successfully publish to Sonatype OSS (using sbt publishSigned), the setup outlined at 18 | // https://www.scala-sbt.org/1.x/docs/Using-Sonatype.html is expected 19 | 20 | ThisBuild / organizationName := "ABSA Group Limited" 21 | ThisBuild / organizationHomepage := Some(url("https://www.absa.africa")) 22 | ThisBuild / scmInfo := Some( 23 | ScmInfo( 24 | browseUrl = url("http://github.com/AbsaOSS/atum/tree/master"), 25 | connection = "scm:git:git://github.com/AbsaOSS/atum.git", 26 | devConnection = "scm:git:ssh://github.com/AbsaOSS/atum.git" 27 | ) 28 | ) 29 | 30 | ThisBuild / developers := List( 31 | Developer( 32 | id = "dk1844", 33 | name = "Daniel Kavan", 34 | email = "daniel.kavan@absa.africa", 35 | url = url("https://github.com/dk1844") 36 | ), 37 | Developer( 38 | id = "yruslan", 39 | name = "Ruslan Iushchenko", 40 | email = "ruslan.iushchenko@absa.africa", 41 | url = url("https://github.com/yruslan") 42 | ) 43 | ) 44 | 45 | ThisBuild / homepage := Some(url("https://github.com/AbsaOSS/atum")) 46 | ThisBuild / description := "Dynamic data completeness and accuracy at enterprise scale in Apache Spark" 47 | ThisBuild / startYear := Some(2018) 48 | ThisBuild / licenses += "Apache-2.0" -> url("https://www.apache.org/licenses/LICENSE-2.0.txt") 49 | 50 | ThisBuild / pomIncludeRepository := { _ => false } 51 | ThisBuild / publishTo := { 52 | val nexus = "https://oss.sonatype.org/" 53 | if (isSnapshot.value) { 54 | Some("snapshots" at s"${nexus}content/repositories/snapshots") 55 | } else { 56 | Some("releases" at s"${nexus}service/local/staging/deploy/maven2") 57 | } 58 | } 59 | ThisBuild / publishMavenStyle := true 60 | -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/plugins/EventListener.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.plugins 18 | 19 | import za.co.absa.atum.model.{ControlMeasure, RunStatus} 20 | 21 | /** Trait for a Control Framework event listener. */ 22 | trait EventListener { 23 | 24 | /** 25 | * Called when an _INFO file has been loaded. 26 | * 27 | * @param sparkApplicationId An application Id of the Spark job that triggered the event. 28 | * @param inputInfoFileName The path to an _INFO file that was read. 29 | * @param controlMeasure The control framework information that was read from the _INFO file. 30 | */ 31 | def onLoad(sparkApplicationId: String, inputInfoFileName: String, controlMeasure: ControlMeasure): Unit 32 | 33 | /** 34 | * Called when a checkpoint has been completed. 35 | * 36 | * @param controlMeasure The new control framework information containing the new checkpoint. 37 | */ 38 | def onControlMeasurementsUpdated(controlMeasure: ControlMeasure): Unit 39 | 40 | /** 41 | * Called when job status changes. 
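* For illustration, a sketch of an implementation (assumed, not taken from the original sources):
* {{{
*   override def onJobStatusChange(newStatus: RunStatus): Unit =
*     println(s"Run status changed to $newStatus") // e.g. forward the status to a tracking system
* }}}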
42 | * 43 | * @param newStatus The new status of the Spark job. 44 | */ 45 | def onJobStatusChange(newStatus: RunStatus): Unit 46 | 47 | /** 48 | * Called when a dataset controlled by Control Framework is saved. 49 | * 50 | * @param sparkApplicationId An application Id of the Spark job that triggered the event. 51 | * @param outputPath The path to the file that has been saved. 52 | */ 53 | def onSaveOutput(sparkApplicationId: String, outputPath: String): Unit 54 | 55 | /** 56 | * Called when the Spark application ends so the plugin can finalize and release resources. 57 | */ 58 | def onApplicationEnd(): Unit 59 | 60 | } 61 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "1.1.0") 17 | 18 | addSbtPlugin("de.heikoseeberger" % "sbt-header" % "5.6.0") 19 | 20 | addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2") // formerly known as com.jsuereth:sbt-pgp 21 | 22 | // sbt-jacoco - workaround: related dependencies required to download 23 | lazy val ow2Version = "9.5" 24 | lazy val jacocoVersion = "0.8.10-absa.1" 25 | 26 | def jacocoUrl(artifactName: String): String = s"https://github.com/AbsaOSS/jacoco/releases/download/$jacocoVersion/org.jacoco.$artifactName-$jacocoVersion.jar" 27 | def ow2Url(artifactName: String): String = s"https://repo1.maven.org/maven2/org/ow2/asm/$artifactName/$ow2Version/$artifactName-$ow2Version.jar" 28 | 29 | addSbtPlugin("com.jsuereth" %% "scala-arm" % "2.0" from "https://repo1.maven.org/maven2/com/jsuereth/scala-arm_2.11/2.0/scala-arm_2.11-2.0.jar") 30 | addSbtPlugin("com.jsuereth" %% "scala-arm" % "2.0" from "https://repo1.maven.org/maven2/com/jsuereth/scala-arm_2.12/2.0/scala-arm_2.12-2.0.jar") 31 | 32 | addSbtPlugin("za.co.absa.jacoco" % "report" % jacocoVersion from jacocoUrl("report")) 33 | addSbtPlugin("za.co.absa.jacoco" % "core" % jacocoVersion from jacocoUrl("core")) 34 | addSbtPlugin("za.co.absa.jacoco" % "agent" % jacocoVersion from jacocoUrl("agent")) 35 | addSbtPlugin("org.ow2.asm" % "asm" % ow2Version from ow2Url("asm")) 36 | addSbtPlugin("org.ow2.asm" % "asm-commons" % ow2Version from ow2Url("asm-commons")) 37 | addSbtPlugin("org.ow2.asm" % "asm-tree" % ow2Version from ow2Url("asm-tree")) 38 | 39 | addSbtPlugin("za.co.absa.sbt" % "sbt-jacoco" % "3.4.1-absa.3" from "https://github.com/AbsaOSS/sbt-jacoco/releases/download/3.4.1-absa.3/sbt-jacoco-3.4.1-absa.3.jar") 40 | 41 | addDependencyTreePlugin 42 | -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/core/ControlType.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the 
"License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.core 18 | 19 | class ControlType(val value: String, val onlyForNumeric: Boolean) 20 | object ControlType { 21 | case object Count extends ControlType("count", false) 22 | case object DistinctCount extends ControlType("distinctCount", false) 23 | case object AggregatedTotal extends ControlType("aggregatedTotal", true) 24 | case object AbsAggregatedTotal extends ControlType("absAggregatedTotal", true) 25 | case object AggregatedTruncTotal extends ControlType("aggregatedTruncTotal", true) 26 | case object AbsAggregatedTruncTotal extends ControlType("absAggregatedTruncTotal", true) 27 | case object HashCrc32 extends ControlType("hashCrc32", false) 28 | 29 | val values: Seq[ControlType] = Seq(Count, DistinctCount, AggregatedTotal, AbsAggregatedTotal, 30 | AggregatedTruncTotal, AbsAggregatedTruncTotal, HashCrc32) 31 | val valueNames: Seq[String] = values.map(_.value) 32 | 33 | def getNormalizedValueName(input: String): String = { 34 | valueNames.find(value => isControlMeasureTypeEqual(input, value)).getOrElse(input) 35 | } 36 | 37 | def withValueName(s: String): ControlType = values.find(_.value.toString == s).getOrElse( 38 | throw new NoSuchElementException(s"No value found for '$s'. Allowed values are: $valueNames")) 39 | 40 | def isControlMeasureTypeEqual(x: String, y: String): Boolean = { 41 | if (x.toLowerCase == y.toLowerCase) { 42 | true 43 | } else { 44 | val strippedX = if (x.contains('.')) x.split('.').last.toLowerCase else x.toLowerCase 45 | val strippedY = if (y.contains('.')) y.split('.').last.toLowerCase else y.toLowerCase 46 | strippedX == strippedY 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /atum-s3-sdk-extension/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresSdkS3LoaderJsonFile.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.atum.persistence.s3 18 | 19 | import software.amazon.awssdk.auth.credentials.{AwsCredentialsProvider, DefaultCredentialsProvider} 20 | import software.amazon.awssdk.services.s3.S3Client 21 | import software.amazon.awssdk.services.s3.model.GetObjectRequest 22 | import za.co.absa.atum.model.ControlMeasure 23 | import za.co.absa.atum.persistence.{ControlMeasuresLoader, ControlMeasuresParser} 24 | import za.co.absa.atum.utils.SdkS3ClientUtils 25 | import za.co.absa.atum.utils.controlmeasure.ControlMeasureUtils 26 | 27 | /** 28 | * A loader of control measurements from a JSON file stored in AWS S3. 29 | * @param inputLocation S3 location to read the json measurements from 30 | * @param credentialsProvider a specific credentials provider (e.g. SAML profile). Consider using [[DefaultCredentialsProvider#create()]] when in doubt. 31 | */ 32 | case class ControlMeasuresSdkS3LoaderJsonFile(inputLocation: SimpleS3LocationWithRegion) 33 | (implicit credentialsProvider: AwsCredentialsProvider) extends ControlMeasuresLoader { 34 | override def load(): ControlMeasure = { 35 | val s3Client: S3Client = getS3Client 36 | 37 | val getRequest = GetObjectRequest 38 | .builder().bucket(inputLocation.bucketName).key(inputLocation.path) 39 | .build() 40 | 41 | val controlInfoJson = s3Client.getObjectAsBytes(getRequest).asUtf8String() 42 | ControlMeasureUtils.preprocessControlMeasure(ControlMeasuresParser fromJson controlInfoJson) 43 | } 44 | 45 | override def getInfo: String = { 46 | s"JSON deserializer from ${inputLocation.asSimpleS3LocationString}" 47 | } 48 | 49 | private[s3] def getS3Client: S3Client = SdkS3ClientUtils.getS3Client(inputLocation.region, credentialsProvider) 50 | 51 | } 52 | -------------------------------------------------------------------------------- /atum-s3-sdk-extension/src/main/scala/za/co/absa/atum/core/AtumSdkS3.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.atum.core 18 | 19 | import org.apache.spark.sql.SparkSession 20 | 21 | /** 22 | * The object coordinates access to control measurements state 23 | */ 24 | object AtumSdkS3 extends AtumSdkS3 25 | 26 | class AtumSdkS3 extends Atum { 27 | 28 | private[atum] def controlFrameworkStateSdkS3: ControlFrameworkStateSdkS3 = state.asInstanceOf[ControlFrameworkStateSdkS3] 29 | 30 | override private[atum] def init(sparkSession: SparkSession): Unit = { 31 | preventDoubleInitialization(sparkSession) 32 | 33 | val s3State = new ControlFrameworkStateSdkS3(sparkSession) 34 | state = s3State // internal state assign 35 | 36 | sparkListener = new SparkEventListener(s3State) 37 | queryExecutionListener = new SparkQueryExecutionListenerSdkS3(s3State) 38 | 39 | sparkSession.sparkContext.addSparkListener(sparkListener) 40 | sparkSession.listenerManager.register(queryExecutionListener) 41 | 42 | val sessionConf = sparkSession.sessionState.conf 43 | sessionConf.setConfString(Constants.InitFlagKey, true.toString) 44 | } 45 | 46 | override private[atum] def dispose(sparkSession: SparkSession): Unit = { 47 | preventNotInitialized(sparkSession) 48 | 49 | if (state.havePendingCheckpoints) { 50 | AtumSdkS3.log.info(s"Saving control framework checkpoints") 51 | state.updateRunCheckpoints(saveInfoFile = true) 52 | } 53 | 54 | sparkSession.sparkContext.removeSparkListener(sparkListener) 55 | sparkSession.listenerManager.unregister(queryExecutionListener) 56 | 57 | sparkListener.onApplicationEnd(null) 58 | 59 | val sessionConf = sparkSession.sessionState.conf 60 | sessionConf.unsetConf(Constants.InitFlagKey) 61 | 62 | sparkListener = null 63 | queryExecutionListener = null 64 | state = null 65 | } 66 | 67 | } 68 | -------------------------------------------------------------------------------- /examples/src/main/scala/za/co/absa/atum/examples/SampleMeasurements1.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
14 | */ 15 | 16 | package za.co.absa.atum.examples 17 | 18 | import java.nio.file.{Files, Paths} 19 | 20 | import org.apache.hadoop.fs.{FileSystem, Path} 21 | import org.apache.spark.sql.{SaveMode, SparkSession} 22 | import org.scalatest.concurrent.Eventually 23 | import za.co.absa.atum.AtumImplicits._ // using basic Atum without extensions 24 | 25 | import scala.concurrent.duration.DurationInt 26 | 27 | object SampleMeasurements1 extends Eventually { 28 | def main(args: Array[String]) { 29 | val sparkBuilder = SparkSession.builder().appName("Sample Measurements 1 Job") 30 | val spark = sparkBuilder 31 | .getOrCreate() 32 | 33 | import spark.implicits._ 34 | val hadoopConfiguration = spark.sparkContext.hadoopConfiguration 35 | implicit val fs: FileSystem = FileSystem.get(hadoopConfiguration) 36 | 37 | // Initializing library to hook up to Apache Spark 38 | val inputCsvInfo = this.getClass.getResource("/input/wikidata.csv.info").toString // path from test resources 39 | 40 | spark.enableControlMeasuresTracking(Some(inputCsvInfo), None) 41 | .setControlMeasuresWorkflow("Job 1") 42 | 43 | val inputCsv = this.getClass.getResource("/input/wikidata.csv").toString // path from test resources 44 | // A business logic of a spark job ... 45 | spark.read 46 | .option("header", "true") 47 | .option("inferSchema", "true") 48 | .csv(inputCsv) 49 | .as("source") 50 | .filter($"total_response_size" > 1000) 51 | .setCheckpoint("checkpoint1") 52 | .write.mode(SaveMode.Overwrite) 53 | .parquet("data/output/stage1_job_results") 54 | 55 | eventually(timeout(scaled(20.seconds)), interval(scaled(500.millis))) { 56 | if (!fs.exists(new Path("data/output/stage1_job_results/_INFO"))) { 57 | throw new Exception("_INFO file not found at data/output/stage1_job_results") 58 | } 59 | } 60 | 61 | spark.disableControlMeasuresTracking() 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /examples/src/main/scala/za/co/absa/atum/examples/SampleMeasurements2.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
14 | */ 15 | 16 | package za.co.absa.atum.examples 17 | 18 | import org.apache.hadoop.fs.{FileSystem, Path} 19 | import org.apache.spark.sql.{SaveMode, SparkSession} 20 | import za.co.absa.atum.AtumImplicits._ // using basic Atum without extensions 21 | 22 | object SampleMeasurements2 { 23 | def main(args: Array[String]) { 24 | 25 | // This example is intended to run AFTER SampleMeasurements1, otherwise it will fail on input file absence 26 | 27 | val sparkBuilder = SparkSession.builder().appName("Sample Measurements 2 Job") 28 | val spark = sparkBuilder.getOrCreate() 29 | import spark.implicits._ 30 | 31 | val hadoopConfiguration = spark.sparkContext.hadoopConfiguration 32 | implicit val fs: FileSystem = FileSystem.get(hadoopConfiguration) 33 | 34 | // Initializing library to hook up to Apache Spark 35 | // No need to specify datasetName and datasetVersion as it is stage 2 and it will be determined automatically 36 | spark.enableControlMeasuresTracking() 37 | .setControlMeasuresWorkflow("Job 2") 38 | 39 | val sourceDS = spark.read 40 | .parquet("data/output/stage1_job_results") 41 | 42 | // A business logic of a spark job ... 43 | 44 | // An example - a column rename 45 | // If the renamed column is one of control measurement columns, the rename need to be registered in Control Framework 46 | sourceDS.as("target") 47 | .withColumnRenamed("total_response_size", "trs") // Renaming the column 48 | .registerColumnRename("total_response_size", "trs") // Registering the rename, from now on the new name for the column is 'trs' 49 | .filter($"trs" > 1000) 50 | .setCheckpoint("checkpoint2") 51 | .write.mode(SaveMode.Overwrite) 52 | .parquet("data/output/stage2_job_results") 53 | 54 | spark.disableControlMeasuresTracking() 55 | 56 | fs.delete(new Path("data/output/stage1_job_results"), true) 57 | fs.delete(new Path("data/output/stage2_job_results"), true) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /model/src/main/scala/za/co/absa/atum/model/ControlMeasure.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.model 18 | 19 | import za.co.absa.atum.utils.SerializationUtils 20 | 21 | case class ControlMeasure 22 | ( 23 | metadata: ControlMeasureMetadata, 24 | runUniqueId: Option[String], 25 | checkpoints: List[Checkpoint] 26 | ) { 27 | def asJson: String = SerializationUtils.asJson(this) 28 | def asJsonPretty: String = SerializationUtils.asJsonPretty(this) 29 | 30 | /** 31 | * A new ControlMeasure will be constructed with the supplied `checkpoint1` as the new first checkpoint (as-is, 32 | * e.g. its order value is neither checked nor adjusted). 33 | * Any existing checkpoints will be shifted behind with their order indices increased by 1. 
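* A usage sketch (illustrative; `cm` and `cpNew` are assumed values, not from the original sources):
* {{{
*   // cm holds checkpoints with orders 1 and 2; cpNew was built with order = 1
*   val updated = cm.withPrecedingCheckpoint(cpNew)
*   // updated.checkpoints now starts with cpNew, and the original orders become 2 and 3
* }}}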
34 | * 35 | * @param checkpoint1 a new checkpoint preceding all the existing ones 36 | */ 37 | def withPrecedingCheckpoint(checkpoint1: Checkpoint): ControlMeasure = { 38 | val shiftedCheckpoints = checkpoints.map { cp => 39 | cp.copy(order = cp.order + 1) 40 | } 41 | 42 | this.copy(checkpoints = checkpoint1 :: shiftedCheckpoints) 43 | 44 | } 45 | 46 | /** 47 | * Adds a key-value pair as additional information stored in the metadata. 48 | * 49 | * @param kv a tuple containing the key-value pair that will be added into the metadata -> additionalInfo Map. 50 | * @param replaceIfExists if the 'key' specified in the 'kv' parameter already exists in the 51 | * metadata -> additionalInfo Map, then this parameter will decide whether 52 | * the value in this Map will be overwritten or not. 53 | */ 54 | def setAdditionalInfo(kv: (String, String), replaceIfExists: Boolean): ControlMeasure = { 55 | kv match { 56 | case (key, _) if replaceIfExists || !this.metadata.additionalInfo.contains(key) => 57 | val newInfo = metadata.additionalInfo + kv 58 | val newMetadata = metadata.copy(additionalInfo = newInfo) 59 | copy(metadata = newMetadata) 60 | case _ => 61 | this 62 | } 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /examples/src/main/scala/za/co/absa/atum/examples/CreateInfoFileTool.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | package za.co.absa.atum.examples 17 | 18 | import org.apache.log4j.LogManager 19 | import org.apache.spark.sql.SparkSession 20 | import za.co.absa.atum.utils.controlmeasure.ControlMeasureUtils 21 | 22 | /** 23 | * The object is a Spark Job for creating an info file for a specific data file in a specific format 24 | * 25 | * An example command to generate an info file: 26 | * {{{ 27 | * spark-submit --master yarn \ 28 | * --deploy-mode client \ 29 | * --class za.co.absa.atum.examples.CreateInfoFileTool \ 30 | * atum-examples-0.2.3-SNAPSHOT.jar \ 31 | * SampleDataSet /user/data/input 3 2017-11-07 parquet employeeId address dealId 32 | * }}} 33 | * 34 | * This routine is only a reference example implementation; it is by no means complete. 
35 | * 36 | */ 37 | object CreateInfoFileTool { 38 | 39 | private val log = LogManager.getLogger("CreateInfoFileJob") 40 | 41 | def main(args: Array[String]) { 42 | 43 | val (sourceApplication: String, inputFileName: String, infoVersion: Int, infoDate: String, rawFormat: String, columnNames: Array[String]) = { 44 | if (args.length < 5) { 45 | System.err.println("Usage:\n\tprogram sourceApplication inputFileName datasetVersion date[yyyy-MM-dd] rawFormat [columns for aggregation...]") 46 | System.exit(1) 47 | } else { 48 | (args(0), args(1), args(2).toInt, args(3), args(4), args.slice(5, args.length)) 49 | } 50 | } 51 | 52 | val dateTokens = infoDate.split("-") 53 | val dateInDMYFormat = s"${dateTokens(2)}-${dateTokens(1)}-${dateTokens(0)}" 54 | 55 | val sparkBuilder = SparkSession.builder().appName("Create Info File Job") 56 | val spark = sparkBuilder 57 | // .master("local") 58 | .getOrCreate() 59 | 60 | val ds = spark.read.format(rawFormat).load(inputFileName) 61 | 62 | val strJson = ControlMeasureUtils.createInfoFile(ds, sourceApplication, inputFileName, dateInDMYFormat, infoVersion, aggregateColumns = columnNames.toSeq) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /examples-s3-sdk-extension/src/main/scala/za/co/absa/atum/examples/SampleSdkS3Measurements1.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
14 | */ 15 | 16 | package za.co.absa.atum.examples 17 | 18 | import org.apache.hadoop.fs.FileSystem 19 | import org.apache.spark.sql.{SaveMode, SparkSession} 20 | import software.amazon.awssdk.auth.credentials.ProfileCredentialsProvider 21 | import software.amazon.awssdk.regions.Region 22 | import za.co.absa.atum.AtumImplicitsSdkS3._ 23 | import za.co.absa.atum.persistence.s3.SimpleS3LocationWithRegion 24 | import za.co.absa.atum.utils.SdkS3ClientUtils 25 | 26 | object SampleSdkS3Measurements1 { 27 | def main(args: Array[String]) { 28 | val sparkBuilder = SparkSession.builder().appName("Sample S3 Measurements 1 Job") 29 | val spark = sparkBuilder 30 | // .master("local") 31 | .getOrCreate() 32 | 33 | import spark.implicits._ 34 | 35 | val hadoopConfiguration = spark.sparkContext.hadoopConfiguration 36 | implicit val fs: FileSystem = FileSystem.get(hadoopConfiguration) 37 | 38 | // This sample example relies on local credentials profile named "saml" with access to the s3 location defined below 39 | implicit val samlCredentialsProvider: ProfileCredentialsProvider = SdkS3ClientUtils.getLocalProfileCredentialsProvider("saml") 40 | val myBucket = System.getenv("TOOLING_BUCKET_NAME") // load from an environment property in order not to disclose it here 41 | 42 | // Initializing library to hook up to Apache Spark 43 | spark.enableControlMeasuresTrackingForSdkS3( 44 | sourceS3Location = Some(SimpleS3LocationWithRegion("s3", myBucket, "atum/input/wikidata.csv.info", Region.EU_WEST_1)), 45 | destinationS3Config = None 46 | ).setControlMeasuresWorkflow("Job 1 S3 ") 47 | 48 | // A business logic of a spark job ... 49 | 50 | spark.read 51 | .option("header", "true") 52 | .option("inferSchema", "true") 53 | .csv("data/input/wikidata.csv") 54 | .as("source") 55 | .filter($"total_response_size" > 10000) 56 | .setCheckpoint("checkpoint1") 57 | .write.mode(SaveMode.Overwrite) 58 | .parquet("data/output_s3/stage1_job_results") 59 | 60 | spark.disableControlMeasuresTracking() 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /atum/src/test/scala/za/co/absa/atum/utils/ExecutionPlanUtilsSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
14 | */ 15 | 16 | package za.co.absa.atum.utils 17 | 18 | import org.apache.hadoop.conf.Configuration 19 | import org.apache.hadoop.fs.Path 20 | import org.apache.spark.sql.execution.QueryExecution 21 | import org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand 22 | import org.mockito.Mockito 23 | import org.mockito.scalatest.IdiomaticMockito 24 | import org.scalatest.flatspec.AnyFlatSpec 25 | import org.scalatest.matchers.should.Matchers 26 | 27 | class ExecutionPlanUtilsSuite extends AnyFlatSpec with Matchers with IdiomaticMockito { 28 | 29 | val hadoopConf = new Configuration 30 | 31 | implicit class SimplePath(path: Path) { 32 | // disregarding hdfs nameserver prefix or local FS fallback (file://) 33 | def simplePath: String = path.toUri.getPath 34 | } 35 | 36 | "inferOutputInfoFileName" should "derive output file name for HDFS from SaveIntoDataSourceCommand" in { 37 | val qe = mock[QueryExecution] 38 | Mockito.when(qe.analyzed).thenReturn( 39 | SaveIntoDataSourceCommand(null, null, options = Map(("path", "/tmp")), null) 40 | ) 41 | 42 | ExecutionPlanUtils.inferOutputFileName(qe, hadoopConf).get.simplePath shouldBe "/tmp" 43 | } 44 | 45 | "inferOutputInfoFileName" should "derive output info file name for HDFS from SaveIntoDataSourceCommand" in { 46 | val qe = mock[QueryExecution] 47 | val myInfoName = "myInfo" 48 | Mockito.when(qe.analyzed).thenReturn( 49 | SaveIntoDataSourceCommand(null, null, options = Map(("path", "/tmp/here")), null) 50 | ) 51 | 52 | ExecutionPlanUtils.inferOutputInfoFileName(qe, myInfoName).get.simplePath shouldBe "/tmp/here/myInfo" 53 | } 54 | 55 | "inferOutputInfoFileNameOnS3" should "derive output info file name for S3 from SaveIntoDataSourceCommand" in { 56 | val qe = mock[QueryExecution] 57 | val myInfoName = "myInfo" 58 | Mockito.when(qe.analyzed).thenReturn( 59 | // trailing slash should get taken care of 60 | SaveIntoDataSourceCommand(null, null, options = Map(("path", "/tmp/here2/")), null) 61 | ) 62 | 63 | ExecutionPlanUtils.inferOutputInfoFileNameOnS3(qe, myInfoName).get shouldBe "/tmp/here2/myInfo" 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /.github/workflows/jacoco_check.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2022 ABSA Group Limited 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | name: JaCoCo report 18 | 19 | on: 20 | pull_request: 21 | branches: [ master ] 22 | types: [ opened, edited, synchronize, reopened ] 23 | 24 | jobs: 25 | test: 26 | runs-on: ubuntu-latest 27 | strategy: 28 | matrix: 29 | include: 30 | - scala: 2.11.12 31 | scala_short: 2.11 32 | spark: 2.4.8 33 | overall: 0.0 34 | changed: 80.0 35 | - scala: 2.12.15 36 | scala_short: 2.12 37 | spark: 3.2.2 38 | overall: 0.0 39 | changed: 80.0 40 | steps: 41 | - name: Checkout code 42 | uses: actions/checkout@v2 43 | - name: Setup Scala 44 | uses: olafurpg/setup-scala@v10 45 | with: 46 | java-version: "adopt@1.8" 47 | - name: Build and run tests 48 | run: sbt ++${{matrix.scala}} jacoco 49 | - name: Add coverage to PR 50 | id: jacoco 51 | uses: madrapps/jacoco-report@v1.7.1 52 | with: 53 | paths: > 54 | ${{ github.workspace }}/atum/target/scala-${{ matrix.scala_short }}/jacoco/report/jacoco.xml, 55 | ${{ github.workspace }}/atum-s3-sdk-extension/target/scala-${{ matrix.scala_short }}/jacoco/report/jacoco.xml, 56 | ${{ github.workspace }}/model/target/scala-${{ matrix.scala_short }}/jacoco/report/jacoco.xml 57 | token: ${{ secrets.GITHUB_TOKEN }} 58 | min-coverage-overall: ${{ matrix.overall }} 59 | min-coverage-changed-files: ${{ matrix.changed }} 60 | title: JaCoCo code coverage report - scala:${{ matrix.scala }} 61 | update-comment: true 62 | - name: Get the Coverage info 63 | run: | 64 | echo "Total coverage ${{ steps.jacoco.outputs.coverage-overall }}" 65 | echo "Changed Files coverage ${{ steps.jacoco.outputs.coverage-changed-files }}" 66 | - name: Fail PR if changed files coverage is less than ${{ matrix.changed }}% 67 | if: ${{ steps.jacoco.outputs.coverage-changed-files < matrix.changed }} 68 | uses: actions/github-script@v6 69 | with: 70 | script: | 71 | core.setFailed('Changed files coverage is less than ${{ matrix.changed }}%!') 72 | -------------------------------------------------------------------------------- /model/src/test/scala/za/co/absa/atum/util/SerializatonUtilsBigDecimalToJsonSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License.
14 | */ 15 | 16 | package za.co.absa.atum.util 17 | 18 | import org.json4s._ 19 | import org.json4s.jackson.Serialization 20 | import org.json4s.jackson.Serialization.write 21 | import org.scalatest.flatspec.AnyFlatSpec 22 | import org.scalatest.matchers.should.Matchers 23 | import za.co.absa.atum.model.Measurement 24 | 25 | class SerializatonUtilsBigDecimalToJsonSpec extends AnyFlatSpec with Matchers { 26 | implicit val formats: Formats = Serialization.formats(NoTypeHints).withBigDecimal 27 | 28 | "write" should "serialize a scala.math.BigDecimal" in 29 | { 30 | val number = BigDecimal(5.5) 31 | val s = write(number) 32 | s shouldEqual "5.5" 33 | } 34 | 35 | "write" should "serialize a big scala.math.BigDecimal" in 36 | { 37 | val number = BigDecimal("32847283324.324324") 38 | val s = write(number) 39 | s shouldEqual "32847283324.324324" 40 | } 41 | 42 | "write" should "serialize a collection with scala.math.BigDecimals" in 43 | { 44 | val numbers = Seq(BigDecimal(5.5), BigDecimal(0.56)) 45 | val s = write(numbers) 46 | s shouldEqual "[5.5,0.56]" 47 | } 48 | 49 | "write" should "serialize a map with a scala.math.BigDecimal" in 50 | { 51 | val map = Map[String, Any]("a" -> "a", "b" -> BigDecimal(5.5)) 52 | val s = write(map) 53 | s shouldEqual "{\"a\":\"a\",\"b\":5.5}" 54 | } 55 | 56 | "write" should "serialize a collection of measurements" in 57 | { 58 | val measurements = Seq( 59 | Measurement( 60 | controlName = "pvControlTotal", 61 | controlType = "controlType.aggregatedTotal", 62 | controlCol = "pv", 63 | controlValue = "32847283324.324324" 64 | ), 65 | Measurement( 66 | controlName = "recordCount", 67 | controlType = "controlType.Count", 68 | controlCol = "id", 69 | controlValue = "243" 70 | ) 71 | ) 72 | val s = write(measurements) 73 | s shouldEqual "[{\"controlName\":\"pvControlTotal\",\"controlType\":\"controlType.aggregatedTotal\"," + "\"controlCol\":\"pv\",\"controlValue\":\"32847283324.324324\"}," + 74 | "{\"controlName\":\"recordCount\",\"controlType\":\"controlType.Count\",\"controlCol\":\"id\",\"controlValue\":\"243\"}]" 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /examples/src/main/scala/za/co/absa/atum/examples/CreateInfoFileToolCSV.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | package za.co.absa.atum.examples 17 | 18 | import org.apache.log4j.LogManager 19 | import org.apache.spark.sql.SparkSession 20 | import za.co.absa.atum.utils.controlmeasure.ControlMeasureUtils 21 | 22 | /** 23 | * The object is a Spark Job for creating an info file for a specific CSV file.
24 | * 25 | * An example command to generate an info file: 26 | * {{{ 27 | * spark-submit --master yarn \ 28 | * --deploy-mode client \ 29 | * --class za.co.absa.atum.examples.CreateInfoFileToolCSV \ 30 | * atum-examples-0.2.3-SNAPSHOT.jar \ 31 | * SampleDataSet /user/data/input 3 2017-11-07 | false ZA Snapshot employeeId address dealId 32 | * }}} 33 | * 34 | * This routine is only a reference example implementation; it is by no means complete. 35 | * 36 | */ 37 | object CreateInfoFileToolCSV { 38 | 39 | private val log = LogManager.getLogger(this.getClass) 40 | 41 | def main(args: Array[String]) { 42 | 43 | val (sourceApplication: String, 44 | inputFileName: String, 45 | infoVersion: Int, 46 | infoDate: String, 47 | delimiter: String, 48 | haveHeaders: Boolean, 49 | country: String, 50 | historyType: String, 51 | columnNames: Array[String]) = { 52 | if (args.length < 8) { 53 | System.err.println("Usage:\n\tprogram sourceApplication inputFileName infoVersion infoDate[yyyy-MM-dd] delimiter headers[true/false] " + 54 | "country historyType [columns for aggregation...]") 55 | System.exit(1) 56 | } else { 57 | (args(0), args(1), args(2).toInt, args(3), args(4), args(5).toBoolean, args(6), args(7), args.slice(8, args.length)) 58 | } 59 | } 60 | 61 | val dateTokens = infoDate.split("-") 62 | val dateInDMYFormat = s"${dateTokens(2)}-${dateTokens(1)}-${dateTokens(0)}" 63 | 64 | val sparkBuilder = SparkSession.builder().appName("Create Info File Job") 65 | val spark = sparkBuilder 66 | // .master("local") 67 | .getOrCreate() 68 | 69 | val ds = spark 70 | .read 71 | .format("csv") 72 | .option("delimiter", delimiter) 73 | .option("header", haveHeaders) 74 | .load(inputFileName) 75 | 76 | val strJson = ControlMeasureUtils.createInfoFile(ds, 77 | sourceApplication, 78 | inputFileName, 79 | dateInDMYFormat, 80 | infoVersion, 81 | historyType = historyType, 82 | country = country, 83 | aggregateColumns = columnNames.toSeq) 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /atum-s3-sdk-extension/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresSdkS3StorerJsonFile.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.persistence.s3 18 | 19 | import software.amazon.awssdk.auth.credentials.{AwsCredentialsProvider, DefaultCredentialsProvider} 20 | import software.amazon.awssdk.core.exception.SdkException 21 | import software.amazon.awssdk.core.sync.RequestBody 22 | import software.amazon.awssdk.services.s3.S3Client 23 | import software.amazon.awssdk.services.s3.model.PutObjectRequest 24 | import za.co.absa.atum.model.ControlMeasure 25 | import za.co.absa.atum.persistence.{ControlMeasuresParser, S3ControlMeasuresStorer} 26 | import za.co.absa.atum.utils.SdkS3ClientUtils 27 | 28 | /** 29 | * A storer of control measurements to a JSON file stored in AWS S3.
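 *
 * A minimal usage sketch (the bucket name, key path, and KMS key id below are illustrative placeholders):
 * {{{
 *   implicit val credentialsProvider: AwsCredentialsProvider = DefaultCredentialsProvider.create()
 *   val storer = ControlMeasuresSdkS3StorerJsonFile(
 *     SimpleS3LocationWithRegion("s3", "my-bucket", "path/to/_INFO", Region.EU_WEST_1),
 *     S3KmsSettings("my-kms-key-id"))
 *   storer.store(controlMeasure) // controlMeasure: the ControlMeasure instance to persist
 * }}}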
30 | * 31 | * @param outputLocation s3 location to save measurements data to 32 | * @param kmsSettings KMS settings - server side encryption configuration 33 | * @param credentialsProvider a specific credentials provider (e.g. SAML profile). Consider using [[DefaultCredentialsProvider#create()]] when in doubt. 34 | */ 35 | case class ControlMeasuresSdkS3StorerJsonFile(outputLocation: SimpleS3LocationWithRegion, kmsSettings: S3KmsSettings) 36 | (implicit val credentialsProvider: AwsCredentialsProvider) extends S3ControlMeasuresStorer { 37 | 38 | /** 39 | * Stores the `controlInfo` measurement to an S3 location. 40 | * 41 | * @param controlInfo measurements to store 42 | * @throws SdkException when storing fails. 43 | */ 44 | override def store(controlInfo: ControlMeasure): Unit = { 45 | val serialized = ControlMeasuresParser asJson controlInfo 46 | saveDataToFile(serialized) 47 | } 48 | 49 | private def saveDataToFile(data: String): Unit = { 50 | val s3Client = getS3Client 51 | 52 | val putRequest = PutObjectRequest.builder.bucket(outputLocation.bucketName).key(outputLocation.path) 53 | .serverSideEncryption(kmsSettings.serverSideEncryption) 54 | .ssekmsKeyId(kmsSettings.kmsKeyId) 55 | .build() 56 | 57 | // would throw S3Exception or SdkClientException in case of failure (base exception class: SdkException) 58 | s3Client.putObject(putRequest, RequestBody.fromString(data)) 59 | } 60 | 61 | override def getInfo: String = { 62 | s"JSON serializer for Storer to ${outputLocation.asSimpleS3LocationString}" 63 | } 64 | 65 | private[s3] def getS3Client: S3Client = SdkS3ClientUtils.getS3Client(outputLocation.region, credentialsProvider) 66 | } 67 | -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/utils/InfoFile.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.utils 18 | 19 | import java.net.URI 20 | 21 | import org.apache.hadoop.conf.Configuration 22 | import org.apache.hadoop.fs.{FileSystem, Path} 23 | import za.co.absa.atum.AtumImplicits.{DefaultControlInfoLoader, DefaultControlInfoStorer, StringPathExt} 24 | import za.co.absa.commons.s3.SimpleS3Location.SimpleS3LocationExt 25 | 26 | object InfoFile { 27 | /** 28 | * Sanitizes (removes `?`s and `*`s) and converts string full path to Hadoop FS and Path, e.g. 29 | * `s3://mybucket1/path/to/file` -> S3 FS + `path/to/file` 30 | * `/path/on/hdfs/to/file` -> local HDFS + `/path/on/hdfs/to/file` 31 | * 32 | * Note that non-local HDFS paths are not supported in this method, e.g. hdfs://nameservice123:8020/path/on/hdfs/too.
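 *
 * A usage sketch (the S3 path is illustrative):
 * {{{
 *   implicit val conf: Configuration = spark.sparkContext.hadoopConfiguration
 *   val (fs, relPath) = InfoFile.convertFullPathToFsAndRelativePath("s3://mybucket1/path/to/file")
 *   // fs is an S3-backed Hadoop FileSystem; relPath is /path/to/file
 * }}}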
33 | * 34 | * @param fullPath path to convert to FS and relative path 35 | * @param hadoopConfiguration Hadoop configuration used to resolve the filesystem 36 | * @return FS + relative path 37 | */ 38 | def convertFullPathToFsAndRelativePath(fullPath: String)(implicit hadoopConfiguration: Configuration): (FileSystem, Path) = { 39 | val sanitizedFullPath = fullPath.replaceAll("[\\*\\?]", "") 40 | 41 | sanitizedFullPath.toSimpleS3Location match { 42 | 43 | case Some(s3Location) => 44 | // this is S3 over hadoop FS API, not SDK S3 approach 45 | val s3Uri = new URI(s3Location.asSimpleS3LocationString) // s3://<bucket> 46 | val s3Path = new Path(s"/${s3Location.path}") // /<path> 47 | 48 | val fs = FileSystem.get(s3Uri, hadoopConfiguration) 49 | 50 | (fs, s3Path) 51 | 52 | case None => // local hdfs location 53 | val fs = FileSystem.get(hadoopConfiguration) 54 | 55 | (fs, sanitizedFullPath.toPath) 56 | } 57 | } 58 | } 59 | 60 | private[atum] case class InfoFile(infoFilePath: String) { 61 | require(infoFilePath.nonEmpty, "Empty info file path cannot be used to construct control info storer/loader!") 62 | 63 | def toFsPath(implicit hadoopConfiguration: Configuration): (FileSystem, Path) = { 64 | InfoFile.convertFullPathToFsAndRelativePath(infoFilePath) 65 | } 66 | 67 | def toDefaultControlInfoLoader(implicit hadoopConfiguration: Configuration): DefaultControlInfoLoader = { 68 | val (fs, path) = toFsPath 69 | new DefaultControlInfoLoader(path)(fs) 70 | } 71 | 72 | def toDefaultControlInfoStorer(implicit hadoopConfiguration: Configuration): DefaultControlInfoStorer = { 73 | val (fs, path) = toFsPath 74 | new DefaultControlInfoStorer(path)(fs) 75 | } 76 | 77 | } 78 | -------------------------------------------------------------------------------- /atum/src/test/scala/za/co/absa/atum/ControlMeasureBaseTestSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License.
14 | */ 15 | 16 | package za.co.absa.atum 17 | 18 | import za.co.absa.atum.model.{Checkpoint, ControlMeasure} 19 | 20 | trait ControlMeasureBaseTestSuite { 21 | val testingVersion = "1.2.3" 22 | val testingSoftware = "Atum" 23 | val testingDate = "20-02-2020" 24 | val testingDateTime1 = "20-02-2020 10:20:30 +0100" 25 | val testingDateTime2 = "20-02-2020 10:20:40 +0100" 26 | 27 | /** 28 | * Replaces metadata.informationDate and checkpoints.[].{version, software, processStartTime, processEndTime} with this trait's testing* values 29 | * (and replaces CRLF endings with LF if found, too) in JSON 30 | * 31 | * @param actualJson JSON to stabilize 32 | * @return updated JSON 33 | */ 34 | def stabilizeJsonOutput(actualJson: String): String = { 35 | actualJson 36 | .replaceFirst("""(?<="informationDate"\s?:\s?")(\d{2}-\d{2}-\d{4})""", testingDate) 37 | .replaceAll("""(?<="processStartTime"\s?:\s?")([-+: \d]+)""", testingDateTime1) 38 | .replaceAll("""(?<="processEndTime"\s?:\s?")([-+: \d]+)""", testingDateTime2) 39 | .replaceAll("""(?<="version"\s?:\s?")([-\d\.A-z]+)""", testingVersion) 40 | .replaceAll("""(?<="software"\s?:\s?")([\d\.A-z_]+)""", testingSoftware) 41 | .replaceAll("\r\n", "\n") // Windows guard 42 | } 43 | 44 | implicit class ControlMeasureStabilizationExt(cm: ControlMeasure) { 45 | def replaceInformationDate(newDate: String): ControlMeasure = cm.copy(metadata = cm.metadata.copy(informationDate = newDate)) 46 | 47 | def updateCheckpoints(fn: Checkpoint => Checkpoint): ControlMeasure = cm.copy(checkpoints = cm.checkpoints.map(fn)) 48 | 49 | def replaceCheckpointsVersion(newVersion: Option[String]): ControlMeasure = cm.updateCheckpoints(_.copy(version = newVersion)) 50 | def replaceCheckpointsSoftware(newSoftware: Option[String]): ControlMeasure = cm.updateCheckpoints(_.copy(software = newSoftware)) 51 | def replaceCheckpointsProcessStartTime(newDateTime: String): ControlMeasure = cm.updateCheckpoints(_.copy(processStartTime = newDateTime)) 52 | def replaceCheckpointsProcessEndTime(newDateTime: String): ControlMeasure = cm.updateCheckpoints(_.copy(processEndTime = newDateTime)) 53 | 54 | def stabilizeTestingControlMeasure: ControlMeasure = { 55 | cm.replaceInformationDate(testingDate) 56 | .replaceCheckpointsVersion(Some(testingVersion)) 57 | .replaceCheckpointsSoftware(Some(testingSoftware)) 58 | .replaceCheckpointsProcessStartTime(testingDateTime1) 59 | .replaceCheckpointsProcessEndTime(testingDateTime2) 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /atum-s3-sdk-extension/src/test/scala/za/co/absa/atum/persistence/s3/ControlMeasuresSdkS3LoaderJsonSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License.
14 | */ 15 | 16 | package za.co.absa.atum.persistence.s3 17 | 18 | import org.mockito.captor.{ArgCaptor, Captor} 19 | import org.mockito.scalatest.IdiomaticMockito 20 | import org.mockito.{ArgumentMatchers, Mockito} 21 | import org.scalatest.flatspec.AnyFlatSpec 22 | import org.scalatest.matchers.should.Matchers 23 | import software.amazon.awssdk.auth.credentials.DefaultCredentialsProvider 24 | import software.amazon.awssdk.core.ResponseBytes 25 | import software.amazon.awssdk.regions.Region 26 | import software.amazon.awssdk.services.s3.S3Client 27 | import software.amazon.awssdk.services.s3.model.{GetObjectRequest, GetObjectResponse} 28 | import za.co.absa.atum.model.ControlMeasure 29 | import za.co.absa.atum.persistence.TestResources 30 | import za.co.absa.atum.utils.FileUtils 31 | 32 | class ControlMeasuresSdkS3LoaderJsonSpec extends AnyFlatSpec with Matchers with IdiomaticMockito { 33 | 34 | val expectedInputControlMeasure: ControlMeasure = TestResources.InputInfo.controlMeasure 35 | 36 | "ControlMeasuresSdkS3LoaderJsonFile" should "load measurements from a json file on (mocked) S3" in { 37 | 38 | val inputLocation = SimpleS3LocationWithRegion("s3", "bucket1", "path/to/json.info", Region.EU_WEST_2) 39 | val mockedS3Client = mock[S3Client] 40 | val mockedRequest: ResponseBytes[GetObjectResponse] = mock[ResponseBytes[GetObjectResponse]] 41 | 42 | implicit val credentialsProvider: DefaultCredentialsProvider = DefaultCredentialsProvider.create() 43 | val loader = new ControlMeasuresSdkS3LoaderJsonFile(inputLocation) { 44 | override def getS3Client: S3Client = mockedS3Client 45 | } 46 | 47 | // The content of this local file is served as the (mocked) S3 object 48 | val inputFilePath: String = TestResources.InputInfo.localPath 49 | val mockedS3Data = FileUtils.readFileToString(inputFilePath) 50 | 51 | // mock S3 response 52 | Mockito.when(mockedS3Client.getObjectAsBytes(ArgumentMatchers.any[GetObjectRequest]())).thenReturn(mockedRequest) 53 | Mockito.when(mockedRequest.asUtf8String()).thenReturn(mockedS3Data) 54 | val loadedControlMeasure = loader.load() 55 | 56 | // verify request content 57 | val getRequestCaptor: Captor[GetObjectRequest] = ArgCaptor[GetObjectRequest] 58 | Mockito.verify(mockedS3Client).getObjectAsBytes(getRequestCaptor.capture) 59 | val capturedGetRequest = getRequestCaptor.value 60 | 61 | capturedGetRequest.bucket shouldBe "bucket1" 62 | capturedGetRequest.key shouldBe "path/to/json.info" 63 | 64 | // verify returned value 65 | loadedControlMeasure shouldBe expectedInputControlMeasure 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /atum-s3-sdk-extension/src/main/scala/za/co/absa/atum/AtumImplicitsSdkS3.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | package za.co.absa.atum 18 | 19 | import org.apache.spark.sql.{Dataset, Row, SparkSession} 20 | import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider 21 | import za.co.absa.atum.core.AtumSdkS3 22 | import za.co.absa.atum.persistence.s3.{ControlMeasuresSdkS3LoaderJsonFile, ControlMeasuresSdkS3StorerJsonFile, S3KmsSettings, SimpleS3LocationWithRegion} 23 | 24 | import scala.language.implicitConversions 25 | 26 | /** 27 | * The object contains implicit methods for the Control Framework 28 | */ 29 | object AtumImplicitsSdkS3 extends AtumImplicitsBase { 30 | implicit val atum: AtumSdkS3 = AtumSdkS3 31 | 32 | /** 33 | * The class contains implicit methods for [[org.apache.spark.sql.SparkSession]]. 34 | */ 35 | implicit class AtumSparkSessionWrapperSdkS3(sparkSession: SparkSession)(implicit atum: AtumSdkS3) { 36 | 37 | /** 38 | * Enable S3-based control measurements tracking via SDK S3 39 | * 40 | * @param sourceS3Location S3 location to load info files from 41 | * @param destinationS3Config S3 location and KMS settings to save the data to 42 | * @param credentialsProvider if you do not have a specific credentials provider, use the default 43 | * [[software.amazon.awssdk.auth.credentials.DefaultCredentialsProvider#create()]] 44 | * @return Spark session with Atum tracking enabled 45 | */ 46 | def enableControlMeasuresTrackingForSdkS3(sourceS3Location: Option[SimpleS3LocationWithRegion], 47 | destinationS3Config: Option[(SimpleS3LocationWithRegion, S3KmsSettings)]) 48 | (implicit credentialsProvider: AwsCredentialsProvider): SparkSession = { 49 | 50 | val loader = sourceS3Location.map(ControlMeasuresSdkS3LoaderJsonFile(_)) 51 | val storer = destinationS3Config.map { case (destLoc, kms) => 52 | ControlMeasuresSdkS3StorerJsonFile(destLoc, kms) 53 | } 54 | 55 | sparkSession.enableControlMeasuresTrackingDirectly(loader, storer) 56 | } 57 | 58 | } 59 | 60 | /** 61 | * The class contains implicit methods for [[org.apache.spark.sql.Dataset]]. 62 | */ 63 | implicit class DataSetWrapperSdkS3(dataset: Dataset[Row])(implicit atum: AtumSdkS3) { 64 | 65 | def writeInfoFileOnS3(s3Location: SimpleS3LocationWithRegion, s3KmsSettings: S3KmsSettings)(implicit credentialsProvider: AwsCredentialsProvider): Dataset[Row] = { 66 | atum.controlFrameworkStateSdkS3.storeCurrentInfoFileOnSdkS3(s3Location, s3KmsSettings) 67 | dataset 68 | } 69 | 70 | } 71 | 72 | } 73 | -------------------------------------------------------------------------------- /atum-s3-sdk-extension/src/main/scala/za/co/absa/atum/core/SparkQueryExecutionListenerSdkS3.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | package za.co.absa.atum.core 18 | 19 | import org.apache.spark.sql.execution.QueryExecution 20 | import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider 21 | import software.amazon.awssdk.regions.Region 22 | import za.co.absa.atum.persistence.S3ControlMeasuresStorer 23 | import za.co.absa.atum.persistence.s3.S3KmsSettings 24 | import za.co.absa.atum.utils.ExecutionPlanUtils 25 | import za.co.absa.commons.s3.SimpleS3Location 26 | 27 | /** 28 | * The class is responsible for listening to DataSet save events and outputting corresponding control measurements. 29 | */ 30 | class SparkQueryExecutionListenerSdkS3(cf: ControlFrameworkStateSdkS3) extends SparkQueryExecutionListener(cf) { 31 | 32 | override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = { 33 | if (funcName == "save") { 34 | 35 | // adding s3 processing 36 | cf.accumulator.getStorer match { 37 | case Some(s3storer: S3ControlMeasuresStorer) => 38 | AtumSdkS3.log.debug(s"SparkQueryExecutionListener.onSuccess for S3ControlMeasuresStorer: writing to ${s3storer.outputLocation.asSimpleS3LocationString}") 39 | writeInfoFileForQueryForSdkS3(qe, s3storer.outputLocation.region, s3storer.kmsSettings)(s3storer.credentialsProvider) 40 | 41 | // Notify listeners 42 | cf.updateRunCheckpoints(saveInfoFile = true) 43 | cf.updateStatusSuccess() 44 | updateSplineRef(qe) 45 | 46 | case _ => 47 | // regular SQE processing 48 | super.onSuccess(funcName, qe, durationNs) 49 | } 50 | } 51 | } 52 | 53 | /** Write _INFO file with control measurements to the output directory based on the query plan */ 54 | private def writeInfoFileForQueryForSdkS3(qe: QueryExecution, region: Region, kmsSettings: S3KmsSettings)(implicit credentialsProvider: AwsCredentialsProvider): Unit = { 55 | val infoFilePath = ExecutionPlanUtils.inferOutputInfoFileNameOnS3(qe, cf.outputInfoFileName) 56 | 57 | // Write _INFO file to the output directory 58 | infoFilePath.foreach(path => { 59 | 60 | import za.co.absa.atum.persistence.s3.S3LocationRegionImplicits.SimpleS3LocationRegionExt 61 | 62 | val location = SimpleS3Location(path) // would throw IAE on apply (parse error) 63 | .withRegion(region) 64 | 65 | AtumSdkS3.log.debug(s"Inferred _INFO Location = $location") 66 | cf.storeCurrentInfoFileOnSdkS3(location, kmsSettings) 67 | }) 68 | 69 | // Write _INFO file to a registered storer 70 | if (cf.accumulator.isStorerLoaded) { 71 | cf.accumulator.store() 72 | } 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/utils/HdfsFileUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.atum.utils 18 | 19 | import java.io.IOException 20 | 21 | import com.typesafe.config.{Config, ConfigFactory} 22 | import org.apache.commons.io.IOUtils 23 | import org.apache.hadoop.fs.permission.FsPermission 24 | import org.apache.hadoop.fs.{FileSystem, Path} 25 | import org.apache.spark.SparkContext 26 | 27 | import scala.collection.JavaConverters._ 28 | 29 | object HdfsFileUtils { 30 | final val FilePermissionsKey = "atum.hdfs.info.file.permissions" 31 | 32 | private val hadoopConfiguration = SparkContext.getOrCreate().hadoopConfiguration 33 | final val DefaultFilePermissions = FsPermission.getFileDefault.applyUMask( 34 | FsPermission.getUMask(FileSystem.get(hadoopConfiguration).getConf) 35 | ) 36 | 37 | /** 38 | * Reads FS permissions from the typesafe config key [[za.co.absa.atum.utils.HdfsFileUtils#FilePermissionsKey()]] 39 | * Consider using za.co.absa.atum.utils.HdfsFileUtils#DefaultFilePermissions() when this method yields None, e.g.: 40 | * {{{ 41 | * HdfsFileUtils.getInfoFilePermissionsFromConfig() 42 | * .getOrElse(HdfsFileUtils.DefaultFilePermissions) 43 | * }}} 44 | * 45 | * @param config typesafe config to read the key from 46 | * @return Some(FsPermission) if the key/value was found, None otherwise 47 | */ 48 | def getInfoFilePermissionsFromConfig(config: Config = ConfigFactory.load()): Option[FsPermission] = { 49 | if (config.hasPath(FilePermissionsKey)) { 50 | Some(new FsPermission(config.getString(FilePermissionsKey))) 51 | } else { 52 | None 53 | } 54 | } 55 | 56 | def readHdfsFileToString(path: Path)(implicit inputFs: FileSystem): String = { 57 | val stream = inputFs.open(path) 58 | try 59 | IOUtils.readLines(stream).asScala.mkString("\n") 60 | finally 61 | stream.close() 62 | } 63 | 64 | /** 65 | * Writes string data to an HDFS Path 66 | * 67 | * @param path Path to write to 68 | * @param data data to write 69 | * @param outputFs Hadoop FS to use 70 | * @param filePermissions desired permissions to use for the file written 71 | * @throws IOException when data write errors occur 72 | */ 73 | def saveStringDataToFile(path: Path, data: String, filePermissions: FsPermission = DefaultFilePermissions) 74 | (implicit outputFs: FileSystem): Unit = { 75 | import ARMImplicits._ 76 | for (fos <- outputFs.create( 77 | path, 78 | filePermissions, 79 | true, 80 | 4096, 81 | outputFs.getDefaultReplication(path), 82 | outputFs.getDefaultBlockSize(path), 83 | null) 84 | ) { 85 | fos.write(data.getBytes) 86 | } 87 | } 88 | 89 | } 90 | -------------------------------------------------------------------------------- /atum/src/test/scala/za/co/absa/atum/core/AccumulatorSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License.
14 | */ 15 | 16 | package za.co.absa.atum.core 17 | 18 | import org.scalatest.flatspec.AnyFlatSpec 19 | import org.scalatest.matchers.should.Matchers.convertToAnyShouldWrapper 20 | import za.co.absa.atum.model.{ControlMeasure, ControlMeasureMetadata} 21 | import za.co.absa.atum.persistence.ControlMeasuresLoader 22 | 23 | class AccumulatorSpec extends AnyFlatSpec { 24 | 25 | private val emptyControlMeasureMetadata = ControlMeasureMetadata( 26 | sourceApplication = "", 27 | country = "", 28 | historyType = "", 29 | dataFilename = "", 30 | sourceType = "", 31 | version = 0, 32 | informationDate = "", 33 | additionalInfo = Map.empty 34 | ) 35 | private val emptyControlMeasure: ControlMeasure = ControlMeasure( 36 | emptyControlMeasureMetadata, 37 | None, 38 | List.empty 39 | ) 40 | private def initAccumulator(controlMeasure: ControlMeasure = emptyControlMeasure): Accumulator = { 41 | val result = new Accumulator 42 | val loader: ControlMeasuresLoader = new ControlMeasuresLoader{ 43 | override def load(): ControlMeasure = controlMeasure 44 | override def getInfo: String = "" 45 | } 46 | result.loadControlMeasurements(loader) 47 | result 48 | } 49 | 50 | private def initAccumulator(controlMeasureMetadata: ControlMeasureMetadata): Accumulator = { 51 | initAccumulator(emptyControlMeasure.copy(metadata = controlMeasureMetadata)) 52 | } 53 | 54 | "setAdditionalInfo" should "add additional key" in { 55 | val expected = emptyControlMeasureMetadata.copy(additionalInfo = Map("Luke"->"Skywalker", "Han"->"Solo")) 56 | 57 | val accumulator = initAccumulator() 58 | accumulator.setAdditionalInfo(("Luke","Skywalker"), replaceIfExists = false) 59 | accumulator.setAdditionalInfo(("Han","Solo"), replaceIfExists = true) 60 | val actual = accumulator.getControlMeasure.metadata 61 | actual shouldBe expected 62 | } 63 | 64 | it should "overwrite a key with overwrite on" in { 65 | val initControlMeasureMetadata = emptyControlMeasureMetadata.copy(additionalInfo = Map("Leia"->"Organa", "Han"->"Solo")) 66 | val expected = emptyControlMeasureMetadata.copy(additionalInfo = Map("Leia"->"Organa Solo", "Han"->"Solo")) 67 | val accumulator = initAccumulator(initControlMeasureMetadata) 68 | accumulator.setAdditionalInfo(("Leia","Organa Solo"), replaceIfExists = true) 69 | val actual = accumulator.getControlMeasure.metadata 70 | actual shouldBe expected 71 | } 72 | 73 | it should "keep the old value if overwrite is off" in { 74 | val initControlMeasureMetadata = emptyControlMeasureMetadata.copy(additionalInfo = Map("Luke"->"Skywalker", "Han"->"Solo")) 75 | val accumulator = initAccumulator(initControlMeasureMetadata) 76 | accumulator.setAdditionalInfo(("Luke","Vader"), replaceIfExists = false) 77 | val actual = accumulator.getControlMeasure.metadata 78 | actual shouldBe initControlMeasureMetadata 79 | } 80 | 81 | 82 | } 83 | -------------------------------------------------------------------------------- /atum-s3-sdk-extension/src/test/scala/za/co/absa/atum/persistence/s3/ControlMeasuresSdkS3StorerJsonSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | package za.co.absa.atum.persistence.s3 17 | 18 | import org.mockito.captor.{ArgCaptor, Captor} 19 | import org.mockito.scalatest.IdiomaticMockito 20 | import org.mockito.{ArgumentMatchers, Mockito} 21 | import org.scalatest.flatspec.AnyFlatSpec 22 | import org.scalatest.matchers.should.Matchers 23 | import software.amazon.awssdk.auth.credentials.DefaultCredentialsProvider 24 | import software.amazon.awssdk.core.sync.RequestBody 25 | import software.amazon.awssdk.regions.Region 26 | import software.amazon.awssdk.services.s3.S3Client 27 | import software.amazon.awssdk.services.s3.model.{PutObjectRequest, PutObjectResponse, ServerSideEncryption} 28 | import za.co.absa.atum.model.ControlMeasure 29 | import za.co.absa.atum.persistence.TestResources 30 | import za.co.absa.atum.utils.FileUtils 31 | 32 | import scala.io.Source 33 | 34 | class ControlMeasuresSdkS3StorerJsonSpec extends AnyFlatSpec with Matchers with IdiomaticMockito { 35 | 36 | val inputControlMeasure: ControlMeasure = TestResources.InputInfo.controlMeasure 37 | 38 | "ControlMeasuresSdkS3StorerJsonFile" should "store measurements to a json file on S3" in { 39 | 40 | val outputLocation = SimpleS3LocationWithRegion("s3", "bucket1", "path/to/json.info", Region.EU_WEST_2) 41 | val kmsSettings = S3KmsSettings("testingKeyId123") 42 | val mockedS3Client = mock[S3Client] 43 | 44 | implicit val credentialsProvider: DefaultCredentialsProvider = DefaultCredentialsProvider.create() 45 | 46 | val storer = new ControlMeasuresSdkS3StorerJsonFile(outputLocation, kmsSettings) { 47 | override def getS3Client: S3Client = mockedS3Client 48 | } 49 | 50 | // mock S3 response 51 | Mockito.when(mockedS3Client.putObject(ArgumentMatchers.any[PutObjectRequest], ArgumentMatchers.any[RequestBody])) 52 | .thenReturn(mock[PutObjectResponse]) // anything non-throwing 53 | storer.store(inputControlMeasure) 54 | 55 | // verify request content 56 | val putRequestCaptor: Captor[PutObjectRequest] = ArgCaptor[PutObjectRequest] 57 | val requestBodyCaptor: Captor[RequestBody] = ArgCaptor[RequestBody] 58 | 59 | Mockito.verify(mockedS3Client).putObject(putRequestCaptor.capture, requestBodyCaptor.capture) 60 | val (capturedPutRequest, capturedRequestBody) = (putRequestCaptor.value, requestBodyCaptor.value) 61 | 62 | capturedPutRequest.bucket shouldBe "bucket1" 63 | capturedPutRequest.key shouldBe "path/to/json.info" 64 | capturedPutRequest.ssekmsKeyId shouldBe "testingKeyId123" 65 | capturedPutRequest.serverSideEncryption() shouldBe ServerSideEncryption.AWS_KMS 66 | 67 | // This expected request body content should be the same as content of this file (conforms to `inputControlMeasure`) 68 | val sameContentFile: String = TestResources.InputInfo.localPath 69 | val expectedContent = FileUtils.readFileToString(sameContentFile) 70 | 71 | val requestDataContent = Source.fromInputStream(capturedRequestBody.contentStreamProvider().newStream()).mkString 72 | TestResources.filterWhitespaces(requestDataContent) shouldBe TestResources.filterWhitespaces(expectedContent) 73 | 74 | } 75 | 76 | } 77 |
-------------------------------------------------------------------------------- /atum/src/test/scala/za/co/absa/atum/utils/HdfsFileUtilsSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | package za.co.absa.atum.utils 17 | 18 | import com.typesafe.config.{ConfigFactory, ConfigValueFactory} 19 | import org.apache.hadoop.fs.permission.FsPermission 20 | import org.apache.hadoop.fs.{FileSystem, Path} 21 | import org.scalatest.flatspec.AnyFlatSpec 22 | import org.scalatest.matchers.should.Matchers 23 | import za.co.absa.atum.utils.OperatingSystem.OperatingSystems 24 | 25 | class HdfsFileUtilsSpec extends AnyFlatSpec with Matchers with SparkTestBase { 26 | 27 | implicit val fs: FileSystem = FileSystem.get(spark.sparkContext.hadoopConfiguration) 28 | 29 | private val Content = "Testing Content" 30 | 31 | "HdfsFileUtils" should "write a file to HDFS (default permissions)" in { 32 | assume(OperatingSystem.getCurrentOs != OperatingSystems.WINDOWS) 33 | val path = new Path("/tmp/hdfs-file-utils-test/def-perms.file") 34 | 35 | HdfsFileUtils.getInfoFilePermissionsFromConfig() shouldBe None // key not present, testing default => 36 | HdfsFileUtils.saveStringDataToFile(path, Content) 37 | 38 | fs.exists(path) shouldBe true 39 | fs.getFileStatus(path).getPermission shouldBe HdfsFileUtils.DefaultFilePermissions 40 | fs.deleteOnExit(path) 41 | } 42 | 43 | it should "write a file to HDFS (max permissions 777 - default umask 022 -> 755)" in { 44 | assume(OperatingSystem.getCurrentOs != OperatingSystems.WINDOWS) 45 | 46 | val path = new Path("/tmp/hdfs-file-utils-test/max-perms.file") 47 | 48 | val customConfig = ConfigFactory.empty() 49 | .withValue("atum.hdfs.info.file.permissions", ConfigValueFactory.fromAnyRef("755")) 50 | HdfsFileUtils.saveStringDataToFile(path, Content, HdfsFileUtils.getInfoFilePermissionsFromConfig(customConfig).get) 51 | 52 | fs.exists(path) shouldBe true 53 | // Default 022 umask allows max fsPermissions 755 54 | fs.getFileStatus(path).getPermission shouldBe new FsPermission("755") 55 | fs.deleteOnExit(path) 56 | } 57 | 58 | it should "write a file to HDFS (min permissions)" in { 59 | assume(OperatingSystem.getCurrentOs != OperatingSystems.WINDOWS) 60 | 61 | val path = new Path("/tmp/hdfs-file-utils-test/min-perms.file") 62 | val customConfig = ConfigFactory.empty() 63 | .withValue("atum.hdfs.info.file.permissions", ConfigValueFactory.fromAnyRef("000")) 64 | HdfsFileUtils.saveStringDataToFile(path, Content, HdfsFileUtils.getInfoFilePermissionsFromConfig(customConfig).get) 65 | 66 | fs.exists(path) shouldBe true 67 | fs.getFileStatus(path).getPermission shouldBe new FsPermission("000") 68 | fs.deleteOnExit(path) 69 | } 70 | 71 | it should "write a file to HDFS (custom permissions)" in { 72 | assume(OperatingSystem.getCurrentOs != OperatingSystems.WINDOWS) 73 | 74 | val path = new 
Path("/tmp/hdfs-file-utils-test/custom-perms.file") 75 | val customConfig = ConfigFactory.empty() 76 | .withValue("atum.hdfs.info.file.permissions", ConfigValueFactory.fromAnyRef("751")) 77 | HdfsFileUtils.saveStringDataToFile(path, Content, HdfsFileUtils.getInfoFilePermissionsFromConfig(customConfig).get) 78 | 79 | fs.exists(path) shouldBe true 80 | fs.getFileStatus(path).getPermission shouldBe new FsPermission("751") 81 | fs.deleteOnExit(path) 82 | } 83 | 84 | Seq( 85 | "garbage$55%$", 86 | "", 87 | "1" 88 | ).foreach { invalidFsPermissionString => 89 | it should s"fail on invalid permissions config (case $invalidFsPermissionString)" in { 90 | val customConfig = ConfigFactory.empty() 91 | .withValue("atum.hdfs.info.file.permissions", ConfigValueFactory.fromAnyRef(invalidFsPermissionString)) 92 | 93 | intercept[IllegalArgumentException] { 94 | HdfsFileUtils.getInfoFilePermissionsFromConfig(customConfig) 95 | } 96 | } 97 | } 98 | 99 | 100 | } 101 | -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/core/SparkQueryExecutionListener.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.core 18 | 19 | import java.io.{PrintWriter, StringWriter} 20 | import org.apache.hadoop.conf.Configuration 21 | import org.apache.hadoop.fs.{FileSystem, Path} 22 | import org.apache.log4j.LogManager 23 | import org.apache.spark.sql.execution.QueryExecution 24 | import org.apache.spark.sql.execution.datasources.{InsertIntoHadoopFsRelationCommand, SaveIntoDataSourceCommand} 25 | import org.apache.spark.sql.util.QueryExecutionListener 26 | import za.co.absa.atum.utils.ExecutionPlanUtils.log 27 | import za.co.absa.atum.utils.{ExecutionPlanUtils, InfoFile} 28 | 29 | /** 30 | * The class is responsible for listening to DataSet save events and outputting corresponding control measurements. 31 | */ 32 | class SparkQueryExecutionListener(cf: ControlFrameworkState) extends QueryExecutionListener { 33 | private val log = LogManager.getLogger("SparkQueryExecutionListener") 34 | 35 | override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = { 36 | (funcName, qe.analyzed) match { 37 | case ("save", _) => writeInfoFileCommon(qe) // < spark 3.1.x 38 | case ("command", saveCommand) 39 | if saveCommand.isInstanceOf[SaveIntoDataSourceCommand] || saveCommand.isInstanceOf[InsertIntoHadoopFsRelationCommand] // spark 3.2+ 40 | => writeInfoFileCommon(qe) 41 | case _ => 42 | 43 | // explanation: https://spark.apache.org/docs/latest/sql-migration-guide.html#upgrading-from-spark-sql-31-to-32 44 | // "In Spark 3.2, the query executions triggered by DataFrameWriter are always named command when being sent 45 | // to QueryExecutionListener. In Spark 3.1 and earlier, the name is one of save, insertInto, saveAsTable." 
46 | } 47 | 48 | def writeInfoFileCommon(qe: QueryExecution) = { 49 | Atum.log.debug(s"SparkQueryExecutionListener.onSuccess: writing to Hadoop FS") 50 | writeInfoFileForQuery(qe) 51 | 52 | // Notify listeners 53 | cf.updateRunCheckpoints(saveInfoFile = true) 54 | cf.updateStatusSuccess() 55 | 56 | updateSplineRef(qe) 57 | } 58 | } 59 | 60 | override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = { 61 | val sw = new StringWriter 62 | exception.printStackTrace(new PrintWriter(sw)) 63 | 64 | cf.updateStatusFailure(qe.sparkSession.sparkContext.appName, funcName, exception.getMessage, 65 | sw.toString + "\r\n\r\n" + qe.optimizedPlan.toString) 66 | } 67 | 68 | /** Write _INFO file with control measurements to the output directory based on the query plan */ 69 | private[core] def writeInfoFileForQuery(qe: QueryExecution)(): Unit = { 70 | val infoFileDir: Option[String] = ExecutionPlanUtils.inferOutputInfoFileDir(qe) 71 | 72 | implicit val hadoopConf: Configuration = qe.sparkSession.sparkContext.hadoopConfiguration 73 | val fsWithDir: Option[(FileSystem, Path)] = infoFileDir 74 | .map(InfoFile(_).toFsPath) // path + FS based on HDFS or S3 over hadoopFS 75 | 76 | // Write _INFO file to the output directory 77 | fsWithDir.foreach { case (fs, dir) => 78 | val path = new Path(dir, cf.outputInfoFileName) 79 | 80 | Atum.log.info(s"Inferred _INFO Path = ${path.toUri.toString}") 81 | cf.storeCurrentInfoFile(path)(fs) 82 | } 83 | 84 | // Write _INFO file to a registered storer 85 | if (cf.accumulator.isStorerLoaded) { 86 | cf.accumulator.store() 87 | } 88 | } 89 | 90 | /** Update Spline reference of the job and notify plugins */ 91 | protected def updateSplineRef(qe: QueryExecution): Unit = { 92 | val outputPath = ExecutionPlanUtils.inferOutputFileName(qe, qe.sparkSession.sparkContext.hadoopConfiguration) 93 | outputPath.foreach(path => { 94 | cf.updateSplineRef(path.toUri.toString) 95 | Atum.log.info(s"Inferred Output Path = ${path.toUri.toString}") 96 | }) 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/utils/ConfigurationImplicits.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.utils 18 | 19 | import org.apache.commons.configuration.Configuration 20 | import org.apache.commons.lang.StringUtils.isNotBlank 21 | 22 | /** 23 | * The object contains extension methods for the [[org.apache.commons.configuration.Configuration Configuration]] interface. 24 | */ 25 | object ConfigurationImplicits { 26 | 27 | /** 28 | * The class wraps the [[org.apache.commons.configuration.Configuration Configuration]] interface in order to provide extension methods. 
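 *
 * A usage sketch (the key name is illustrative; `config` is any Configuration instance):
 * {{{
 *   import za.co.absa.atum.utils.ConfigurationImplicits._
 *   val appName: String = config.getRequiredString("application.name") // throws IllegalArgumentException if missing or blank
 * }}}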
29 | * 30 | * @param conf A configuration instance 31 | * @tparam T A specific type implementing the [[org.apache.commons.configuration.Configuration Configuration]] interface 32 | */ 33 | implicit class ConfigurationRequiredWrapper[T <: Configuration](conf: T) { 34 | 35 | /** 36 | * Gets the value of a string configuration property and checks whether the property exists. 37 | * 38 | * @return The value of the string configuration property if it exists, otherwise throws an exception. 39 | */ 40 | def getRequiredString: (String) => String = getRequired(conf.getString, isNotBlank) 41 | 42 | /** 43 | * Gets the value of a boolean configuration property and checks whether the property exists. 44 | * 45 | * @return The value of the boolean configuration property if it exists, otherwise throws an exception. 46 | */ 47 | def getRequiredBoolean: (String) => Boolean = getRequired(conf.getBoolean(_, null), null.!=) //NOSONAR 48 | 49 | /** 50 | * Gets the value of a big decimal configuration property and checks whether the property exists. 51 | * 52 | * @return The value of the big decimal configuration property if it exists, otherwise throws an exception. 53 | */ 54 | def getRequiredBigDecimal: (String) => BigDecimal = getRequired(conf.getBigDecimal, null.!=) //NOSONAR 55 | 56 | /** 57 | * Gets the value of a byte configuration property and checks whether the property exists. 58 | * 59 | * @return The value of the byte configuration property if it exists, otherwise throws an exception. 60 | */ 61 | def getRequiredByte: (String) => Byte = getRequired(conf.getByte(_, null), null.!=) //NOSONAR 62 | 63 | /** 64 | * Gets the value of a short configuration property and checks whether the property exists. 65 | * 66 | * @return The value of the short configuration property if it exists, otherwise throws an exception. 67 | */ 68 | def getRequiredShort: (String) => Short = getRequired(conf.getShort(_, null), null.!=) //NOSONAR 69 | 70 | /** 71 | * Gets the value of an int configuration property and checks whether the property exists. 72 | * 73 | * @return The value of the int configuration property if it exists, otherwise throws an exception. 74 | */ 75 | def getRequiredInt: (String) => Int = getRequired(conf.getInteger(_, null), null.!=) //NOSONAR 76 | 77 | /** 78 | * Gets the value of a long configuration property and checks whether the property exists. 79 | * 80 | * @return The value of the long configuration property if it exists, otherwise throws an exception. 81 | */ 82 | def getRequiredLong: (String) => Long = getRequired(conf.getLong(_, null), null.!=) //NOSONAR 83 | 84 | /** 85 | * Gets the value of a float configuration property and checks whether the property exists. 86 | * 87 | * @return The value of the float configuration property if it exists, otherwise throws an exception. 88 | */ 89 | def getRequiredFloat: (String) => Float = getRequired(conf.getFloat(_, null), null.!=) //NOSONAR 90 | 91 | /** 92 | * Gets the value of a double configuration property and checks whether the property exists. 93 | * 94 | * @return The value of the double configuration property if it exists, otherwise throws an exception.
95 | */ 96 | def getRequiredDouble: (String) => Double = getRequired(conf.getDouble(_, null), null.!=) //NOSONAR 97 | 98 | private def getRequired[V](get: String => V, check: V => Boolean)(key: String): V = { 99 | val v = get(key) 100 | require(check(v), s"Missing configuration property $key") 101 | v 102 | } 103 | } 104 | 105 | } 106 | -------------------------------------------------------------------------------- /examples-s3-sdk-extension/src/main/scala/za/co/absa/atum/examples/SampleSdkS3Measurements2.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | package za.co.absa.atum.examples 17 | 18 | import org.apache.hadoop.fs.FileSystem 19 | import org.apache.log4j.{LogManager, Logger} 20 | import org.apache.spark.sql.{SaveMode, SparkSession} 21 | import software.amazon.awssdk.auth.credentials.ProfileCredentialsProvider 22 | import software.amazon.awssdk.regions.Region 23 | import za.co.absa.atum.AtumImplicitsSdkS3._ 24 | import za.co.absa.atum.model.{ControlMeasure, Measurement} 25 | import za.co.absa.atum.persistence.s3.{ControlMeasuresSdkS3LoaderJsonFile, S3KmsSettings, SimpleS3LocationWithRegion} 26 | import za.co.absa.atum.utils.SdkS3ClientUtils 27 | 28 | object SampleSdkS3Measurements2 { 29 | val log: Logger = LogManager.getLogger("SampleSdkS3Measurements2") 30 | 31 | def main(args: Array[String]) { 32 | 33 | // This example is intended to run AFTER SampleSdkS3Measurements1, otherwise it will fail on input file absence 34 | 35 | val sparkBuilder = SparkSession.builder().appName("Sample Measurements 2 Job") 36 | //val spark = sparkBuilder.master("local").getOrCreate() 37 | val spark = sparkBuilder.getOrCreate() 38 | import spark.implicits._ 39 | 40 | val hadoopConfiguration = spark.sparkContext.hadoopConfiguration 41 | implicit val fs: FileSystem = FileSystem.get(hadoopConfiguration) 42 | 43 | // This example relies on a local credentials profile named "saml" with access to the s3 location defined below 44 | // AND on an explicitly defined KMS Key ID 45 | implicit val samlCredentialsProvider: ProfileCredentialsProvider = SdkS3ClientUtils.getLocalProfileCredentialsProvider("saml") 46 | val kmsKeyId = System.getenv("TOOLING_KMS_KEY_ID") // load from an environment variable in order not to disclose it here 47 | val myBucket = System.getenv("TOOLING_BUCKET_NAME") // same reason 48 | log.info(s"kmsKeyId from env loaded = ${kmsKeyId.take(10)}...") 49 | log.info(s"BucketName from env loaded = $myBucket") 50 | 51 | val infoFileOutputLocation = SimpleS3LocationWithRegion("s3", myBucket, "atum/output/wikidata.csv.info", Region.EU_WEST_1) 52 | 53 | // Initializing library to hook up to Apache Spark 54 | // No need to specify datasetName and datasetVersion as it is stage 2 and it will be determined automatically 55 | spark.enableControlMeasuresTrackingForSdkS3( 56 | sourceS3Location = None, 57 | destinationS3Config
= Some( 58 | infoFileOutputLocation, 59 | S3KmsSettings(kmsKeyId) 60 | ) 61 | ).setControlMeasuresWorkflow("Job 2") 62 | 63 | val sourceDS = spark.read 64 | .parquet("data/output_s3/stage1_job_results") 65 | 66 | // The business logic of a Spark job ... 67 | 68 | // An example - a column rename 69 | // If the renamed column is one of the control measurement columns, the rename needs to be registered in the Control Framework 70 | sourceDS.as("target") 71 | .withColumnRenamed("total_response_size", "trs") // Renaming the column 72 | .registerColumnRename("total_response_size", "trs") // Registering the rename, from now on the new name for the column is 'trs' 73 | .filter($"trs" > 15000) 74 | .setCheckpoint("checkpoint2") 75 | .write.mode(SaveMode.Overwrite) 76 | .parquet("data/output_s3/stage2_job_results") 77 | 78 | spark.disableControlMeasuresTracking() 79 | 80 | // checking info file presence via the wrapped AWS SDK S3 client 81 | val loader = ControlMeasuresSdkS3LoaderJsonFile(infoFileOutputLocation) 82 | val controlMeasure = loader.load() 83 | 84 | val expectedCheckpointRecordCounts = Seq( 85 | "Source" -> 4964, "Raw" -> 4964, "checkpoint1" -> 3072, "checkpoint2" -> 1651) 86 | val extractedCounts = extractCheckpointsRecordCounts(controlMeasure) 87 | assert(extractedCounts == expectedCheckpointRecordCounts, s"expecting control measure counts to be: $expectedCheckpointRecordCounts, but $extractedCounts found.") 88 | } 89 | 90 | private def extractCheckpointsRecordCounts(controlMeasure: ControlMeasure): Seq[(String, Int)] = { 91 | controlMeasure.checkpoints.map { checkpoint => 92 | val count: Int = checkpoint.controls.collectFirst { case Measurement("recordCount", _, _, value) => value.toString.toInt }.getOrElse(0) 93 | (checkpoint.name, count) 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/core/Accumulator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.core 18 | 19 | import za.co.absa.atum.core.Atum.log 20 | import za.co.absa.atum.model 21 | import za.co.absa.atum.persistence.{ControlMeasuresLoader, ControlMeasuresParser, ControlMeasuresStorer} 22 | import za.co.absa.atum.model._ 23 | import za.co.absa.atum.model.CheckpointImplicits.CheckpointExt 24 | import za.co.absa.atum.utils.controlmeasure.ControlMeasureUtils 25 | 26 | import scala.util.control.NonFatal 27 | 28 | /** 29 | * The class is responsible for accumulating control measurements.
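 *
 * A typical lifecycle sketch (the loader, storer and measurements instances are assumed to exist):
 * {{{
 *   val accumulator = new Accumulator()
 *   accumulator.loadControlMeasurements(loader) // initialize from a ControlMeasuresLoader
 *   accumulator.setStorer(storer)               // register a ControlMeasuresStorer
 *   accumulator.addCheckpoint("checkpoint1", "workflow1", measurements)
 *   accumulator.store()                         // persist the accumulated control measure
 * }}}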
-------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/core/Accumulator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.core 18 | 19 | import za.co.absa.atum.core.Atum.log 20 | import za.co.absa.atum.model 21 | import za.co.absa.atum.persistence.{ControlMeasuresLoader, ControlMeasuresParser, ControlMeasuresStorer} 22 | import za.co.absa.atum.model._ 23 | import za.co.absa.atum.model.CheckpointImplicits.CheckpointExt 24 | import za.co.absa.atum.utils.controlmeasure.ControlMeasureUtils 25 | 26 | import scala.util.control.NonFatal 27 | 28 | /** 29 | * This class is responsible for accumulating control measurements. 30 | */ 31 | class Accumulator() { 32 | private var controlMeasure: ControlMeasure = _ 33 | private var storer: ControlMeasuresStorer = _ 34 | private var lastProcessingDate: String = ControlMeasureUtils.getTimestampAsString 35 | 36 | def isControlMeasuresLoaded: Boolean = controlMeasure != null 37 | def isStorerLoaded: Boolean = storer != null 38 | 39 | /** Loads control measurements. */ 40 | def loadControlMeasurements(loaderIn: ControlMeasuresLoader): Unit = { 41 | try { 42 | controlMeasure = loaderIn.load() 43 | } catch { 44 | case NonFatal(e) => 45 | log.warn(s"Unable to load control measurements via ${loaderIn.getInfo}. Error: ${e.getMessage}") 46 | throw e 47 | } 48 | } 49 | 50 | /** Sets a persistor object for saving control checkpoints. */ 51 | def setStorer(storerIn: ControlMeasuresStorer): Unit = { 52 | storer = storerIn 53 | } 54 | 55 | /** Gets information about the storer. */ 56 | def getStorerInfo: String = { 57 | if (storer != null) { 58 | storer.getInfo 59 | } else { 60 | "" 61 | } 62 | } 63 | 64 | /** 65 | * Returns the storer, if one has been set. 66 | * @return the storer wrapped in an Option, or None if no storer is set 67 | */ 68 | private[atum] def getStorer: Option[ControlMeasuresStorer] = if (isStorerLoaded) Some(storer) else None 69 | 70 | /** 71 | * Returns the ControlMeasure object in which checkpoints are sorted by calculation order. 72 | */ 73 | def getControlMeasure: ControlMeasure = this.synchronized( 74 | ControlMeasure(controlMeasure.metadata, controlMeasure.runUniqueId, controlMeasure.checkpoints.sortBy(c => c.order))) 75 | 76 | /** Returns the checkpoints that have been accumulated up until now. */ 77 | def getCheckpoints: List[Checkpoint] = { 78 | controlMeasure.checkpoints 79 | } 80 | 81 | /** Returns the unique ID associated with the sequence of jobs (aka runs) if available. */ 82 | def getUniqueRunId: Option[String] = { 83 | controlMeasure.runUniqueId 84 | } 85 | 86 | /** Sets a unique ID to be associated with the sequence of jobs (aka runs). */ 87 | def setRunUniqueId(runUniqueId: String): Unit = { 88 | controlMeasure = controlMeasure.copy(runUniqueId = Some(runUniqueId)) 89 | } 90 | 91 | /** Adds a key-value pair as additional information stored in the metadata. */ 92 | def setAdditionalInfo(kv: (String, String), replaceIfExists: Boolean): Unit = { 93 | controlMeasure = controlMeasure.setAdditionalInfo(kv, replaceIfExists) 94 | } 95 | 96 | /** Returns the ControlMeasure object as a JSON string. */ 97 | def asJson: String = { 98 | ControlMeasuresParser.asJson(getControlMeasure) 99 | } 100 | 101 | /** Stores control measurements by delegating to the persistence layer. */ 102 | def store(): Unit = { 103 | if (storer != null) { 104 | storer.store(getControlMeasure) 105 | } 106 | } 107 |
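  // A minimal lifecycle sketch (illustration only, not part of this class; the `loader` and
  // `storer` values are assumptions, e.g. the HDFS JSON implementations from
  // za.co.absa.atum.persistence.hdfs):
  //   val acc = new Accumulator()
  //   acc.loadControlMeasurements(loader)                     // populates the internal ControlMeasure
  //   acc.setStorer(storer)                                   // optional persistence target
  //   acc.addCheckpoint("checkpoint1", "wf1",
  //     Seq(Measurement("recordCount", "count", "*", "42")))  // appended with the next order number
  //   acc.store()                                             // persists via the configured storer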
108 | /** Adds a new checkpoint to the accumulator. */ 109 | def addCheckpoint(name: String, 110 | workflowName: String, 111 | controls: Seq[Measurement] 112 | ): Checkpoint = this.synchronized { 113 | val order = controlMeasure.checkpoints.map(c => c.order).fold(0)(Math.max) + 1 114 | val timestampStr = ControlMeasureUtils.getTimestampAsString 115 | val checkpoint = model.Checkpoint(name = name, 116 | processStartTime = lastProcessingDate, 117 | processEndTime = timestampStr, 118 | workflowName = workflowName, 119 | order = order, 120 | controls = controls.toList) 121 | .withBuildProperties 122 | lastProcessingDate = timestampStr 123 | controlMeasure = ControlMeasure(controlMeasure.metadata, controlMeasure.runUniqueId, checkpoint :: controlMeasure.checkpoints) 124 | checkpoint 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /model/src/test/scala/za/co/absa/atum/model/ControlMeasureSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | package za.co.absa.atum.model 17 | 18 | import org.scalatest.matchers.should.Matchers 19 | 20 | class ControlMeasureSpec extends org.scalatest.flatspec.AnyFlatSpec with Matchers { 21 | 22 | private val cp1 = Checkpoint("prependingCp", None, None, "01-01-2020 07:00:00", "01-01-2020 07:00:10", "wf1", 1, List( 23 | Measurement("control1", "someControlType", "column1", "1234") 24 | )) 25 | 26 | "ControlMeasure" should "get a new preceding Checkpoint when it has no checkpoints" in { 27 | val cm = getTestingControlMeasure(0) 28 | cm.checkpoints should have length 0 29 | 30 | val updatedCm = cm.withPrecedingCheckpoint(cp1) 31 | 32 | // cp1 prepended as-is 33 | val expectedCm: ControlMeasure = ControlMeasure( 34 | ControlMeasureMetadata("AtumTest", "CZ", "Snapshot", "example_input.csv", "public", 1, "01-01-2020", Map.empty), 35 | runUniqueId = None, 36 | checkpoints = List(Checkpoint("prependingCp", None, None, "01-01-2020 07:00:00", "01-01-2020 07:00:10", "wf1", 1, List( 37 | Measurement("control1", "someControlType", "column1", "1234")) 38 | ))) 39 | 40 | updatedCm shouldBe expectedCm 41 | } 42 | 43 | "ControlMeasure" should "get a new preceding Checkpoint with the existing checkpoints shifted back in order, too" in { 44 | val cm = getTestingControlMeasure(2) 45 | cm.checkpoints should have length 2 46 | cm.checkpoints.map(_.order) shouldBe Seq(1,2) 47 | 48 | val updatedCm = cm.withPrecedingCheckpoint(cp1) 49 | 50 | // cp1 prepended as-is 51 | updatedCm.checkpoints should have length 3 52 | updatedCm.checkpoints.head shouldBe cp1 53 | updatedCm.checkpoints.tail.map(_.order) shouldBe Seq(2,3) // existing order shifted back 54 | } 55 | 56 | "ControlMeasure" should "allow adding a new metadata.additionalInfo field" in { 57 | val cm = getTestingControlMeasure(0) 58 | 59 | val updatedCm = cm.setAdditionalInfo(("Some", "Metadata"), replaceIfExists = false) 60 | 61 | val expectedCm: ControlMeasure =
ControlMeasure( 62 | ControlMeasureMetadata( 63 | "AtumTest", "CZ", "Snapshot", "example_input.csv", "public", 1, "01-01-2020", Map("Some" -> "Metadata") 64 | ), 65 | runUniqueId = None, 66 | checkpoints = List() 67 | ) 68 | 69 | updatedCm shouldBe expectedCm 70 | } 71 | 72 | "ControlMeasure" should "not overwrite an existing metadata.additionalInfo field when replaceIfExists is false" in { 73 | val cm = getTestingControlMeasure(0, Map("Some" -> "MetadataOld")) 74 | 75 | val updatedCm = cm.setAdditionalInfo(("Some", "MetadataNew"), replaceIfExists = false) 76 | 77 | val expectedCm: ControlMeasure = ControlMeasure( 78 | ControlMeasureMetadata( 79 | "AtumTest", "CZ", "Snapshot", "example_input.csv", "public", 1, "01-01-2020", Map("Some" -> "MetadataOld") 80 | ), 81 | runUniqueId = None, 82 | checkpoints = List() 83 | ) 84 | 85 | updatedCm shouldBe expectedCm 86 | } 87 | 88 | "ControlMeasure" should "allow changing an existing metadata.additionalInfo field" in { 89 | val cm = getTestingControlMeasure(0, Map("Some" -> "MetadataOld")) 90 | 91 | val updatedCm = cm.setAdditionalInfo(("Some", "MetadataNew"), replaceIfExists = true) 92 | 93 | val expectedCm: ControlMeasure = ControlMeasure( 94 | ControlMeasureMetadata( 95 | "AtumTest", "CZ", "Snapshot", "example_input.csv", "public", 1, "01-01-2020", Map("Some" -> "MetadataNew") 96 | ), 97 | runUniqueId = None, 98 | checkpoints = List() 99 | ) 100 | 101 | updatedCm shouldBe expectedCm 102 | } 103 | 104 | private def getTestingControlMeasure( 105 | cpCount: Int, metadataAdditionalInfo: Map[String, String] = Map.empty 106 | ): ControlMeasure = { 107 | require(cpCount >= 0 && cpCount < 10) 108 | val testingCheckpoints = Range(0, cpCount).map(_ + 1) // starting with order: 1 109 | .map { order => 110 | Checkpoint(s"orig-cp$order", None, None, s"01-01-2020 0$order:00:00", s"01-01-2020 0$order:00:10", "wf1", order, List( 111 | Measurement("control1", "someControlType", "column1", "1234") 112 | )) 113 | } 114 | 115 | ControlMeasure( 116 | ControlMeasureMetadata( 117 | "AtumTest", "CZ", "Snapshot", "example_input.csv", "public", 1, "01-01-2020", metadataAdditionalInfo 118 | ), 119 | runUniqueId = None, 120 | checkpoints = testingCheckpoints.toList 121 | ) 122 | } 123 | 124 | } 125 | -------------------------------------------------------------------------------- /examples/src/test/scala/za/co/absa/atum/HdfsInfoIntegrationSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License.
14 | */ 15 | 16 | package za.co.absa.atum 17 | 18 | import java.nio.file.{Files, Paths} 19 | 20 | import org.apache.hadoop.fs.FileSystem 21 | import org.apache.log4j.LogManager 22 | import org.apache.spark.sql.{DataFrame, SaveMode} 23 | import org.scalatest.BeforeAndAfterAll 24 | import org.scalatest.concurrent.Eventually 25 | import org.scalatest.concurrent.PatienceConfiguration.Timeout 26 | import org.scalatest.flatspec.{AnyFlatSpec, AsyncFlatSpec} 27 | import org.scalatest.matchers.should.Matchers 28 | import org.specs2.matcher.Matchers.concurrentExecutionContext 29 | import za.co.absa.atum.model.{Checkpoint, Measurement} 30 | import za.co.absa.atum.persistence.ControlMeasuresParser 31 | import za.co.absa.atum.utils.SparkTestBase 32 | import za.co.absa.atum.AtumImplicits._ 33 | 34 | import scala.concurrent.duration.{Duration, DurationInt} 35 | import scala.concurrent.{Await, Future} 36 | 37 | class HdfsInfoIntegrationSuite extends AnyFlatSpec with SparkTestBase with Matchers with BeforeAndAfterAll with Eventually { 38 | 39 | private val log = LogManager.getLogger(this.getClass) 40 | val tempDir: String = LocalFsTestUtils.createLocalTemporaryDirectory("hdfsTestOutput") 41 | 42 | override def afterAll: Unit = { 43 | LocalFsTestUtils.safeDeleteTestDir(tempDir) 44 | } 45 | 46 | private val inputCsv = getClass.getResource("/input/wikidata.csv").toString 47 | private def readSparkInputCsv(inputCsvPath: String): DataFrame = spark.read 48 | .option("header", "true") 49 | .option("inferSchema", "true") 50 | .csv(inputCsvPath) 51 | 52 | private def writeSparkData(df: DataFrame, outputPath: String): Unit = { 53 | df.write.mode(SaveMode.Overwrite) 54 | .parquet(outputPath) 55 | 56 | eventually(timeout(scaled(20.seconds)), interval(scaled(500.millis))) { 57 | if (!Files.exists(Paths.get(outputPath))) { 58 | throw new Exception("Output path not found: " + outputPath) 59 | } 60 | } 61 | } 62 | 63 | { 64 | val outputPath = s"$tempDir/outputCheck1" 65 | // The implicit variant only writes to the derived outputPath; the explicit variant writes both to the implicitly derived path and to the explicit one.
66 | Seq( 67 | ("implicit output _INFO path only", None, Seq(s"$outputPath/_INFO")), 68 | ("implicit & explicit output _INFO path", Some(s"$outputPath/extra/_INFO2"), Seq(s"$outputPath/_INFO", s"$outputPath/extra/_INFO2")) 69 | ).foreach { case (testCaseName, destinationOptInfoFilePath, expectedPaths) => 70 | 71 | "_INFO" should s"be written on spark.write ($testCaseName)" in { 72 | val hadoopConfiguration = spark.sparkContext.hadoopConfiguration 73 | implicit val fs: FileSystem = FileSystem.get(hadoopConfiguration) 74 | 75 | // Initializing the library to hook up to Apache Spark 76 | val inputCsvInfo = getClass.getResource("/input/wikidata.csv.info").toString 77 | spark.enableControlMeasuresTracking(sourceInfoFilePath = Some(inputCsvInfo), destinationInfoFilePath = destinationOptInfoFilePath) 78 | .setControlMeasuresWorkflow("Job 1") 79 | 80 | import spark.implicits._ 81 | val df1 = readSparkInputCsv(inputCsv) 82 | df1.setCheckpoint("Checkpoint0") 83 | val filteredDf1 = df1.filter($"total_response_size" > 1000) 84 | filteredDf1.setCheckpoint("Checkpoint1") // stateful, the return value is not needed 85 | writeSparkData(filteredDf1, outputPath) // the implicit output _INFO file path is derived from this path passed to spark.write 86 | 87 | spark.disableControlMeasuresTracking() 88 | 89 | expectedPaths.foreach { expectedPath => 90 | log.info(s"Checking that $expectedPath contains the expected values") 91 | 92 | val infoControlMeasures = eventually(timeout(scaled(20.seconds)), interval(scaled(2.seconds))) { 93 | log.info(s"Reading $expectedPath") 94 | val infoContentJson = LocalFsTestUtils.readFileAsString(expectedPath) 95 | ControlMeasuresParser.fromJson(infoContentJson) 96 | } 97 | 98 | infoControlMeasures.checkpoints.map(_.name) shouldBe Seq("Source", "Raw", "Checkpoint0", "Checkpoint1") 99 | val checkpoint0 = infoControlMeasures.checkpoints.collectFirst { case c: Checkpoint if c.name == "Checkpoint0" => c }.get 100 | checkpoint0.controls should contain(Measurement("recordCount", "count", "*", "5000")) 101 | 102 | val checkpoint1 = infoControlMeasures.checkpoints.collectFirst { case c: Checkpoint if c.name == "Checkpoint1" => c }.get 103 | checkpoint1.controls should contain(Measurement("recordCount", "count", "*", "4964")) 104 | } 105 | } 106 | } 107 | } 108 | 109 | } 110 | -------------------------------------------------------------------------------- /model/src/test/scala/za/co/absa/atum/util/SerializationUtilsJsonSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License.
14 | */ 15 | 16 | package za.co.absa.atum.util 17 | 18 | import org.scalatest.flatspec.AnyFlatSpec 19 | import org.scalatest.matchers.should.Matchers 20 | import za.co.absa.atum.model.{Checkpoint, ControlMeasure, ControlMeasureMetadata, Measurement, RunError, RunState, RunStatus} 21 | import za.co.absa.atum.utils.SerializationUtils 22 | 23 | /** 24 | * Unit tests for ControlMeasure and RunStatus SerializationUtils-based object serialization 25 | */ 26 | class SerializationUtilsJsonSpec extends AnyFlatSpec with Matchers { 27 | 28 | val exampleCtrlInfo = ControlMeasure( 29 | metadata = ControlMeasureMetadata( 30 | sourceApplication = "FrontArena", 31 | country = "ZA", 32 | historyType = "Snapshot", 33 | dataFilename = "example.dat", 34 | sourceType = "", 35 | version = 1, 36 | informationDate = "01-01-2017", 37 | additionalInfo = Map("key1" -> "value1", "key2" -> "value2") 38 | ), None, 39 | checkpoints = List(Checkpoint( 40 | name = "Source", 41 | processStartTime = "01-01-2017 08:00:00", 42 | processEndTime = "01-01-2017 08:00:00", 43 | workflowName = "Source", 44 | order = 1, 45 | controls = List( 46 | Measurement( 47 | controlName = "pvControlTotal", 48 | controlType = "aggregatedTotal", 49 | controlCol = "pv", 50 | controlValue = "32847283324.324324" 51 | ), 52 | Measurement( 53 | controlName = "recordCount", 54 | controlType = "count", 55 | controlCol = "id", 56 | controlValue = "243" 57 | )), 58 | software = Some("ExampleSoftware"), 59 | version = Some("1.2.3") 60 | ), Checkpoint( 61 | name = "Raw", 62 | processStartTime = "01-01-2017 08:00:00", 63 | processEndTime = "01-01-2017 08:00:00", 64 | workflowName = "Raw", 65 | order = 2, 66 | controls = List( 67 | Measurement( 68 | controlName = "pvControlTotal", 69 | controlType = "aggregatedTotal", 70 | controlCol = "pv", 71 | controlValue = "32847283324.324324" 72 | ), 73 | Measurement( 74 | controlName = "recordCount", 75 | controlType = "count", 76 | controlCol = "id", 77 | controlValue = "243" 78 | ) 79 | ), 80 | software = Some("AnotherExampleSoftware"), 81 | version = Some("3.4.5") 82 | ) 83 | ) 84 | ) 85 | val exampleOutputJson: String = "{\"metadata\":{\"sourceApplication\":\"FrontArena\",\"country\":\"ZA\"," + 86 | "\"historyType\":\"Snapshot\",\"dataFilename\":\"example.dat\",\"sourceType\":\"\"," + 87 | "\"version\":1,\"informationDate\":\"01-01-2017\",\"additionalInfo\":{\"key1\":\"value1\",\"key2\":\"value2\"}}," + 88 | "\"checkpoints\":[{\"name\":\"Source\"," + 89 | "\"software\":\"ExampleSoftware\",\"version\":\"1.2.3\"," + 90 | "\"processStartTime\":\"01-01-2017 08:00:00\"," + 91 | "\"processEndTime\":\"01-01-2017 08:00:00\",\"workflowName\":\"Source\",\"order\":1," + 92 | "\"controls\":[{\"controlName\":\"pvControlTotal\",\"controlType\":\"aggregatedTotal\"," + 93 | "\"controlCol\":\"pv\",\"controlValue\":\"32847283324.324324\"},{\"controlName\":\"recordCount\"," + 94 | "\"controlType\":\"count\",\"controlCol\":\"id\",\"controlValue\":\"243\"}]},{\"name\":\"Raw\"," + 95 | "\"software\":\"AnotherExampleSoftware\",\"version\":\"3.4.5\"," + 96 | "\"processStartTime\":\"01-01-2017 08:00:00\",\"processEndTime\":\"01-01-2017 08:00:00\"," + 97 | "\"workflowName\":\"Raw\",\"order\":2,\"controls\":[{\"controlName\":\"pvControlTotal\"," + 98 | "\"controlType\":\"aggregatedTotal\",\"controlCol\":\"pv\",\"controlValue\":\"32847283324.324324\"}," + 99 | "{\"controlName\":\"recordCount\",\"controlType\":\"count\",\"controlCol\":\"id\"," + 100 | "\"controlValue\":\"243\"}]}]}" 101 | 102 | "ControlInfo" should "serialize a ControlInfo 
object via asJson" in { 103 | val s = SerializationUtils.asJson(exampleCtrlInfo) 104 | s shouldEqual exampleOutputJson 105 | } 106 | 107 | it should "deserialize a ControlInfo object via fromJson" in { 108 | val s = SerializationUtils.fromJson[ControlMeasure](exampleOutputJson) 109 | s shouldEqual exampleCtrlInfo 110 | } 111 | 112 | it should "serialize via asJsonPretty and deserialize back" in { 113 | SerializationUtils.fromJson[ControlMeasure](SerializationUtils.asJsonPretty(exampleCtrlInfo)) shouldEqual exampleCtrlInfo 114 | } 115 | 116 | val runStatuses = Seq( 117 | RunStatus(RunState.failed, Some(RunError("job1", "step1", "example job1", "X=1, Z=ABC"))), 118 | RunStatus(RunState.allSucceeded, None) 119 | ) 120 | 121 | val runStatusesJson = 122 | """[ 123 | |{"status":"failed","error":{"job":"job1","step":"step1","description":"example job1","technicalDetails":"X=1, Z=ABC"}}, 124 | |{"status":"allSucceeded"} 125 | |]""".stripMargin.replaceAll("\n", "") 126 | 127 | "RunStatus" should "serialize via asJson" in { 128 | SerializationUtils.asJson(runStatuses) shouldBe runStatusesJson 129 | } 130 | 131 | it should "deserialize via fromJson" in { 132 | SerializationUtils.fromJson[Seq[RunStatus]](runStatusesJson) shouldBe runStatuses 133 | } 134 | 135 | it should "serialize via asJsonPretty and deserialize back" in { 136 | SerializationUtils.fromJson[Seq[RunStatus]](SerializationUtils.asJsonPretty(runStatuses)) shouldEqual runStatuses 137 | } 138 | 139 | // jackson serialization support (notice the `runStatusesJson` being reused): 140 | it should "serialize via Jackson's toJson" in { 141 | JacksonJsonSerializer.toJson(runStatuses) shouldBe runStatusesJson 142 | } 143 | 144 | it should "deserialize via Jackson's fromJson" in { 145 | JacksonJsonSerializer.fromJson[Array[RunStatus]](runStatusesJson) shouldBe runStatuses // Array to overcome runtime erasure 146 | } 147 | 148 | } 149 | -------------------------------------------------------------------------------- /project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
14 | */ 15 | 16 | import sbt._ 17 | 18 | object Dependencies { 19 | 20 | object Versions { 21 | val spark2 = "2.4.8" 22 | val spark3 = "3.2.2" 23 | 24 | val json4s_spark2 = "3.5.3" 25 | val json4s_spark3 = "3.7.0-M11" 26 | 27 | val jacksonModuleScala_spark2 = "2.12.7" 28 | val jacksonModuleScala_spark3 = "2.14.1" 29 | val jacksonDatabind_spark2 = "2.12.7.1" // databind has an extra patch version here - for Spark2 30 | val jacksonDatabind_spark3 = jacksonModuleScala_spark3 // for Spark3 - the latest version is ok 31 | 32 | val absaCommons = "0.0.27" 33 | val typesafeConfig = "1.4.1" 34 | val mockitoScala = "1.17.12" 35 | val scalatest = "3.2.9" 36 | val specs2 = "2.5" 37 | val aws = "2.17.85" 38 | 39 | val apacheCommonsLang3 = "3.12.0" 40 | val commonsConfiguration = "1.6" 41 | } 42 | 43 | // the basic idea of cross-version picking is based on https://github.com/scala/scala-module-dependency-sample 44 | 45 | // this is just for the compile-dependencies printing task 46 | def sparkVersionForScala(scalaVersion: String): String = { 47 | scalaVersion match { 48 | case _ if scalaVersion.startsWith("2.11") => Versions.spark2 49 | case _ if scalaVersion.startsWith("2.12") => Versions.spark3 50 | case _ => throw new IllegalArgumentException("Only Scala 2.11 and 2.12 are currently supported.") 51 | } 52 | } 53 | 54 | // general wrapper to simplify Scala 2.11/2.12 version assignment (usage sketch below) 55 | def moduleByScala(moduleIdWithoutVersion: String => ModuleID) 56 | (scala211Version: String, scala212Version: String) 57 | (actualScalaVersion: String): ModuleID = { 58 | actualScalaVersion match { 59 | case _ if actualScalaVersion.startsWith("2.11") => moduleIdWithoutVersion.apply(scala211Version) 60 | case _ if actualScalaVersion.startsWith("2.12") => moduleIdWithoutVersion.apply(scala212Version) 61 | case _ => throw new IllegalArgumentException("Only Scala 2.11 and 2.12 are currently supported.") 62 | } 63 | } 64 | 65 | 66 | // extended version where the moduleId function takes 2 params: the module version and the Scala version (to pass along) 67 | def moduleByScalaUsingScalaVersion(moduleIdWithoutVersionNeedsScalaVersion: (String, String) => ModuleID) 68 | (scala211Version: String, scala212Version: String) 69 | (actualScalaVersion: String): ModuleID = { 70 | actualScalaVersion match { 71 | case _ if actualScalaVersion.startsWith("2.11") => moduleIdWithoutVersionNeedsScalaVersion.apply(scala211Version, actualScalaVersion) 72 | case _ if actualScalaVersion.startsWith("2.12") => moduleIdWithoutVersionNeedsScalaVersion.apply(scala212Version, actualScalaVersion) 73 | case _ => throw new IllegalArgumentException("Only Scala 2.11 and 2.12 are currently supported.") 74 | } 75 | } 76 | 77 |
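  // For illustration only (not used by the build), the partial application resolves like this;
  // the val name is hypothetical and exists only to document the wrapper's behaviour:
  //   val json4sCoreFor = moduleByScala("org.json4s" %% "json4s-core" % _)(Versions.json4s_spark2, Versions.json4s_spark3) _
  //   json4sCoreFor("2.11.12")  // == "org.json4s" %% "json4s-core" % "3.5.3"
  //   json4sCoreFor("2.12.17")  // == "org.json4s" %% "json4s-core" % "3.7.0-M11"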
lazy val sparkCore = { 79 | def coreWithExcludes(version: String, scalaVersion: String): ModuleID = "org.apache.spark" %% "spark-core" % version % Provided exclude( 80 | "com.fasterxml.jackson.core", "jackson-databind" 81 | ) exclude( 82 | "com.fasterxml.jackson.module", "jackson-module-scala_" + scalaVersion.substring(0, 4) // e.g. 2.11 83 | ) 84 | moduleByScalaUsingScalaVersion(coreWithExcludes)(Versions.spark2, Versions.spark3) _ 85 | } 86 | 87 | lazy val sparkSql = moduleByScala("org.apache.spark" %% "spark-sql" % _ % Provided)(Versions.spark2, Versions.spark3) _ 88 | 89 | lazy val scalaTest = "org.scalatest" %% "scalatest" % Versions.scalatest % Test 90 | 91 | lazy val json4sExt = moduleByScala("org.json4s" %% "json4s-ext" % _)(Versions.json4s_spark2, Versions.json4s_spark3) _ 92 | lazy val json4sCore = moduleByScala("org.json4s" %% "json4s-core" % _ % Provided)(Versions.json4s_spark2, Versions.json4s_spark3) _ 93 | lazy val json4sJackson = moduleByScala("org.json4s" %% "json4s-jackson" % _ % Provided)(Versions.json4s_spark2, Versions.json4s_spark3) _ 94 | lazy val json4sNative = moduleByScala("org.json4s" %% "json4s-native" % _ % Provided)(Versions.json4s_spark2, Versions.json4s_spark3) _ 95 | 96 | lazy val jacksonModuleScala = moduleByScala("com.fasterxml.jackson.module" %% "jackson-module-scala" % _)(Versions.jacksonModuleScala_spark2, Versions.jacksonModuleScala_spark3) _ 97 | lazy val jacksonDatabind = moduleByScala("com.fasterxml.jackson.core" % "jackson-databind" % _)(Versions.jacksonDatabind_spark2, Versions.jacksonDatabind_spark3) _ 98 | 99 | lazy val absaCommons = "za.co.absa.commons" %% "commons" % Versions.absaCommons 100 | lazy val commonsConfiguration = "commons-configuration" % "commons-configuration" % Versions.commonsConfiguration 101 | lazy val apacheCommons = "org.apache.commons" % "commons-lang3" % Versions.apacheCommonsLang3 102 | lazy val typeSafeConfig = "com.typesafe" % "config" % Versions.typesafeConfig 103 | 104 | lazy val mockitoScala = "org.mockito" %% "mockito-scala" % Versions.mockitoScala % Test 105 | lazy val mockitoScalaScalatest = "org.mockito" %% "mockito-scala-scalatest" % Versions.mockitoScala % Test 106 | 107 | lazy val scalaTestProvided = "org.scalatest" %% "scalatest" % Versions.scalatest % Provided 108 | lazy val specs2core = "org.specs2" %% "specs2-core" % Versions.specs2 % Test 109 | 110 | lazy val sdkS3 = "software.amazon.awssdk" % "s3" % Versions.aws 111 | 112 | def rootDependencies(scalaVersion: String): Seq[ModuleID] = Seq( 113 | sparkCore(scalaVersion), 114 | sparkSql(scalaVersion), 115 | scalaTest, 116 | json4sExt(scalaVersion) 117 | ) 118 | 119 | def modelDependencies(scalaVersion: String): Seq[ModuleID] = Seq( 120 | json4sCore(scalaVersion), 121 | json4sJackson(scalaVersion), 122 | json4sNative(scalaVersion), 123 | jacksonModuleScala(scalaVersion), 124 | jacksonDatabind(scalaVersion) 125 | ) 126 | 127 | def coreDependencies(scalaVersion: String): Seq[ModuleID] = Seq( 128 | absaCommons, 129 | commonsConfiguration, 130 | apacheCommons, 131 | typeSafeConfig, 132 | 133 | mockitoScala, 134 | mockitoScalaScalatest, 135 | ) 136 | 137 | lazy val examplesDependencies: Seq[ModuleID] = Seq( 138 | specs2core, 139 | scalaTestProvided 140 | ) 141 | 142 | lazy val s3sdkExtensionDependencies: Seq[ModuleID] = Seq( 143 | absaCommons, 144 | sdkS3, 145 | mockitoScala, 146 | mockitoScalaScalatest 147 | ) 148 | 149 | } 150 | -------------------------------------------------------------------------------- /atum/src/test/scala/za/co/absa/atum/ControlInfoToJsonSerializationSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | package za.co.absa.atum 17 | 18 | import org.scalatest.flatspec.AnyFlatSpec 19 | import org.scalatest.matchers.should.Matchers 20 | import za.co.absa.atum.model.{Checkpoint, ControlMeasure, ControlMeasureMetadata, Measurement} 21 | import za.co.absa.atum.utils.{BuildProperties, SerializationUtils} 22 | import za.co.absa.atum.model.CheckpointImplicits.CheckpointExt 23 | import za.co.absa.atum.utils.controlmeasure.ControlMeasureUtils 24 | 25 | /** 26 | * Unit tests for ControlInfo object serialization 27 | */ 28 | class ControlInfoToJsonSerializationSpec extends AnyFlatSpec with Matchers { 29 | private val version = BuildProperties.buildVersion 30 | private val software = BuildProperties.projectName 31 | 32 | private val exampleCtrlInfo = ControlMeasure( 33 | metadata = ControlMeasureMetadata( 34 | sourceApplication = "FrontArena", 35 | country = "ZA", 36 | historyType = "Snapshot", 37 | dataFilename = "example.dat", 38 | sourceType = "", 39 | version = 1, 40 | informationDate = "01-01-2017", 41 | additionalInfo = Map("key1" -> "value1", "key2" -> "value2") 42 | ), None, 43 | checkpoints = List(Checkpoint( 44 | name = "Source", 45 | processStartTime = "01-01-2017 08:00:00", 46 | processEndTime = "01-01-2017 08:00:00", 47 | workflowName = "Source", 48 | order = 1, 49 | controls = List( 50 | Measurement( 51 | controlName = "pvControlTotal", 52 | controlType = "aggregatedTotal", 53 | controlCol = "pv", 54 | controlValue = "32847283324.324324" 55 | ), 56 | Measurement( 57 | controlName = "recordCount", 58 | controlType = "count", 59 | controlCol = "id", 60 | controlValue = "243" 61 | )) 62 | ).withBuildProperties, Checkpoint( 63 | name = "Raw", 64 | processStartTime = "01-01-2017 08:00:00", 65 | processEndTime = "01-01-2017 08:00:00", 66 | workflowName = "Raw", 67 | order = 2, 68 | controls = List( 69 | Measurement( 70 | controlName = "pvControlTotal", 71 | controlType = "aggregatedTotal", 72 | controlCol = "pv", 73 | controlValue = "32847283324.324324" 74 | ), 75 | Measurement( 76 | controlName = "recordCount", 77 | controlType = "count", 78 | controlCol = "id", 79 | controlValue = "243" 80 | ) 81 | ) 82 | ).withBuildProperties 83 | ) 84 | ) 85 | 86 | private val exampleInputJson: String = s"""{ 87 | |"metadata":{ 88 | |"sourceApplication":"FrontArena", 89 | |"country":"ZA", 90 | |"historyType":"Snapshot", 91 | |"dataFilename":"example.dat", 92 | |"sourceType":"", 93 | |"version":1, 94 | |"informationDate":"01-01-2017", 95 | |"additionalInfo":{ 96 | |"key1":"value1", 97 | |"key2":"value2" 98 | |} 99 | |}, 100 | |"checkpoints":[{ 101 | |"name":"Source", 102 | |"software":"$software", 103 | |"version":"$version", 104 | |"processStartTime":"01-01-2017 08:00:00", 105 | |"processEndTime":"01-01-2017 08:00:00", 106 | |"workflowName":"Source", 107 | |"order":1, 108 | |"controls":[{ 109 | |"controlName":"pvControlTotal", 110 | |"controlType":"type.aggregatedTotal", 111 | |"controlCol":"pv", 112 | |"controlValue":"32847283324.324324" 113 | |},{ 114 | |"controlName":"recordCount", 115 | |"controlType":"type.Count", 116 | |"controlCol":"id", 117 | 
|"controlValue":243 118 | |}] 119 | |},{ 120 | |"name":"Raw", 121 | |"software":"$software", 122 | |"version":"$version", 123 | |"processStartTime":"01-01-2017 08:00:00", 124 | |"processEndTime":"01-01-2017 08:00:00", 125 | |"workflowName":"Raw", 126 | |"order":2, 127 | |"controls":[{ 128 | |"controlName":"pvControlTotal", 129 | |"controlType":"type.aggregatedTotal", 130 | |"controlCol":"pv", 131 | |"controlValue":"32847283324.324324" 132 | |},{ 133 | |"controlName":"recordCount", 134 | |"controlType":"type.Count", 135 | |"controlCol":"id", 136 | |"controlValue":243 137 | |}] 138 | |}] 139 | |}""".stripMargin.filter(_ >= ' ') 140 | 141 | private val exampleOutputJson: String = s"""{ 142 | |"metadata":{ 143 | |"sourceApplication":"FrontArena", 144 | |"country":"ZA", 145 | |"historyType":"Snapshot", 146 | |"dataFilename":"example.dat", 147 | |"sourceType":"", 148 | |"version":1, 149 | |"informationDate":"01-01-2017", 150 | |"additionalInfo":{ 151 | |"key1":"value1", 152 | |"key2":"value2" 153 | |} 154 | |}, 155 | |"checkpoints":[{ 156 | |"name":"Source", 157 | |"software":"$software", 158 | |"version":"$version", 159 | |"processStartTime":"01-01-2017 08:00:00", 160 | |"processEndTime":"01-01-2017 08:00:00", 161 | |"workflowName":"Source", 162 | |"order":1, 163 | |"controls":[{ 164 | |"controlName":"pvControlTotal", 165 | |"controlType":"aggregatedTotal", 166 | |"controlCol":"pv", 167 | |"controlValue":"32847283324.324324" 168 | |},{ 169 | |"controlName":"recordCount", 170 | |"controlType":"count", 171 | |"controlCol":"id", 172 | |"controlValue":"243" 173 | |}] 174 | |},{ 175 | |"name":"Raw", 176 | |"software":"$software", 177 | |"version":"$version", 178 | |"processStartTime":"01-01-2017 08:00:00", 179 | |"processEndTime":"01-01-2017 08:00:00", 180 | |"workflowName":"Raw", 181 | |"order":2, 182 | |"controls":[{ 183 | |"controlName":"pvControlTotal", 184 | |"controlType":"aggregatedTotal", 185 | |"controlCol":"pv", 186 | |"controlValue":"32847283324.324324" 187 | |},{ 188 | |"controlName":"recordCount", 189 | |"controlType":"count", 190 | |"controlCol":"id", 191 | |"controlValue":"243" 192 | |}] 193 | |}] 194 | |}""".stripMargin.filter(_ >= ' ') 195 | 196 | "toJson" should "serialize a ControlInfo object" in 197 | { 198 | val s = SerializationUtils.asJson(exampleCtrlInfo) 199 | s shouldEqual exampleOutputJson 200 | } 201 | 202 | "fromJson" should "deserialize a ControlInfo object" in 203 | { 204 | val obj = ControlMeasureUtils.preprocessControlMeasure(SerializationUtils.fromJson[ControlMeasure](exampleInputJson)) 205 | obj shouldEqual exampleCtrlInfo 206 | } 207 | 208 | "asJson" should "return the json with control values converted to strings and normalized control type" in 209 | { 210 | val obj = ControlMeasureUtils.preprocessControlMeasure(SerializationUtils.fromJson[ControlMeasure](exampleInputJson)) 211 | val str = SerializationUtils.asJson(obj) 212 | str shouldEqual exampleOutputJson 213 | } 214 | } 215 | -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/utils/ExecutionPlanUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.atum.utils 18 | 19 | import org.apache.hadoop.conf.Configuration 20 | import org.apache.hadoop.fs.{FileSystem, Path} 21 | import org.apache.log4j.LogManager 22 | import org.apache.spark.sql.catalyst.plans.logical.{BinaryNode, LeafNode, LogicalPlan, UnaryNode} 23 | import org.apache.spark.sql.execution.QueryExecution 24 | import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, InsertIntoHadoopFsRelationCommand, LogicalRelation, SaveIntoDataSourceCommand} 25 | import org.apache.spark.sql.{Dataset, Row} 26 | import za.co.absa.atum.core.Constants 27 | import za.co.absa.atum.utils.FileUtils.PathJoin 28 | 29 | /** 30 | * This object contains utils for traversing the execution plan DAG to infer control measurement input/output paths 31 | */ 32 | object ExecutionPlanUtils { 33 | 34 | private val log = LogManager.getLogger("Atum.ExecutionPlanUtils") 35 | 36 | /** 37 | * The method returns the input control measurements info file name inferred from the source dataset. 38 | * It ensures that one and only one input path name has input control measurements 39 | * 40 | * @param dataset A dataset where input path name(s) will be searched 41 | * @param infoFileName A file name of an info file, e.g. "_INFO" 42 | * 43 | * @return The inferred input control measurements file path of the source dataset 44 | */ 45 | def inferInputInfoFilePath(dataset: Dataset[Row], infoFileName: String = Constants.DefaultInfoFileName)(implicit inputFs: FileSystem): Path = { 46 | val plan = dataset.queryExecution.logical 47 | val paths = getSourceFileNames(plan) 48 | if (paths.isEmpty) { 49 | throw new IllegalStateException("Control framework was unable to infer dataset input file name.") 50 | } 51 | val infoNames = paths.flatMap(p => { 52 | val infoName = new Path(p, infoFileName) 53 | log.info(s"Inferred info file name: $infoName, from path $p and name $infoFileName") 54 | Some(infoName).filter(inputFs.exists) 55 | }) 56 | val path = infoNames match { 57 | case List(p) => p 58 | case _ :: _ => throw new IllegalStateException("Ambiguous control measurements file names: " + infoNames 59 | .mkString(",")) 60 | case _ => throw new IllegalStateException("Control framework was unable to infer dataset input file name.") 61 | } 62 | path 63 | } 64 | 65 | def getHadoopFullPath(path: Path, hadoopConfiguration: Configuration): Path = { 66 | val fs = FileSystem.get(hadoopConfiguration) 67 | fs.getFileStatus(path).getPath 68 | } 69 |
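  // Usage sketch for inferInputInfoFilePath above (illustration only; the input path is
  // hypothetical, and an active SparkSession plus an implicit FileSystem are assumed):
  //   implicit val fs: FileSystem = FileSystem.get(spark.sparkContext.hadoopConfiguration)
  //   val df = spark.read.parquet("/data/input/stage1")
  //   val infoPath = inferInputInfoFilePath(df)   // e.g. /data/input/stage1/_INFO, if it exists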
${qe.logical.treeString}") 85 | log.warn(s"Analyzed plan: ${qe.analyzed.treeString}") 86 | log.warn(s"Optimized plan: ${qe.optimizedPlan.treeString}") 87 | log.error(s"Unable to infer storage path to output control measurements to for query execution $qe.") 88 | None 89 | } 90 | } 91 | 92 | /** 93 | * The method returns output control measurements info file name inferred from the source dataset on HDFS 94 | * 95 | * @param qe A query execution object where output path name will be searched 96 | * @param infoFileName A file name of an info file, e.g. "_INFO" 97 | * 98 | * @return The inferred output control measurements file path of the source dataset 99 | */ 100 | def inferOutputInfoFileName(qe: QueryExecution, infoFileName: String = Constants.DefaultInfoFileName): Option[Path] = { 101 | inferOutputInfoFileDir(qe).map { dir => 102 | new Path(dir, infoFileName) 103 | } 104 | } 105 | 106 | /** 107 | * Based on the `qe` supplied, output _INFO file path is inference is attempted 108 | * @param qe QueryExecution - path inference basis 109 | * @return optional inferred _INFO file path 110 | */ 111 | def inferOutputInfoFileDir(qe: QueryExecution): Option[String] = { 112 | qe.analyzed match { 113 | case s: SaveIntoDataSourceCommand => 114 | Some(s.options("path")) 115 | case h: InsertIntoHadoopFsRelationCommand => 116 | Some(h.outputPath.toString) 117 | case a => 118 | log.warn(s"Logical plan: ${qe.logical.treeString}") 119 | log.warn(s"Analyzed plan: ${qe.analyzed.treeString}") 120 | log.warn(s"Optimized plan: ${qe.optimizedPlan.treeString}") 121 | log.error(s"Unable to infer output path for control measurements for query execution $qe.") 122 | None 123 | } 124 | } 125 | 126 | /** 127 | * The method returns output control measurements info file name inferred from the source dataset on S3 128 | * 129 | * @param qe A query execution object where output path name will be searched 130 | * @param infoFileName A file name of an info file, e.g. "_INFO" 131 | * 132 | * @return The inferred output control measurements file path of the source dataset 133 | */ 134 | def inferOutputInfoFileNameOnS3(qe: QueryExecution, infoFileName: String = Constants.DefaultInfoFileName): Option[String] = { 135 | qe.analyzed match { 136 | case s: SaveIntoDataSourceCommand => 137 | Some(s.options("path") / infoFileName) 138 | case _ => 139 | log.warn(s"Logical plan: ${qe.logical.treeString}") 140 | log.warn(s"Analyzed plan: ${qe.analyzed.treeString}") 141 | log.warn(s"Optimized plan: ${qe.optimizedPlan.treeString}") 142 | log.error(s"Unable to infer output path for control measurements for query execution $qe.") 143 | None 144 | } 145 | } 146 | 147 | /** 148 | * The method returns source file names of a DataSet execution plan by traversing the DAG. 
149 | * Thanks to za.co.absa.spline.core 150 | * 151 | * @param plan A logical plan of execution 152 | * 153 | * @return The list of input file paths 154 | */ 155 | def getSourceFileNames(plan: LogicalPlan): List[Path] = { 156 | plan match { 157 | case n: LeafNode => n match { 158 | case a: LogicalRelation => 159 | // LogicalRelation is a data source node 160 | a.relation match { 161 | case hfsr: HadoopFsRelation => hfsr.location.rootPaths.toList 162 | case _ => Nil 163 | } 164 | case _ => Nil 165 | } 166 | case n: UnaryNode => getSourceFileNames(n.child) 167 | case b: BinaryNode => getSourceFileNames(b.left) ::: getSourceFileNames(b.right) 168 | } 169 | } 170 | } 171 | -------------------------------------------------------------------------------- /scalastyle-config.xml: -------------------------------------------------------------------------------- [Scalastyle standard configuration; the XML check markup of this file is not preserved in this dump. Recoverable values: an Apache-2.0 ABSA Group Limited (2018) license header template; numeric limits 300 and 900; maximum line length 140 with tab size 2; class/object name pattern [A-Z][A-Za-z0-9]*; package-object and method name pattern ^[a-z][A-Za-z0-9]*$; illegal imports sun._,java.awt._; maximum 8 parameters; allowed magic numbers -1,0,1,2,3,1000; banned regex \bprint(|ln|f)\(; further limits 30, 10, 60 and 30; flags true/false; type parameter pattern ^[A-Z_]$; multiple string literals allowed 2, ignoring ^""$.] -------------------------------------------------------------------------------- /atum/src/main/scala/za/co/absa/atum/core/MeasurementProcessor.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | package za.co.absa.atum.core 18 | 19 | import org.apache.spark.sql.functions._ 20 | import org.apache.spark.sql.types.{DecimalType, LongType, StringType} 21 | import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} 22 | import za.co.absa.atum.core.ControlType._ 23 | import za.co.absa.atum.core.MeasurementProcessor.MeasurementFunction 24 | import za.co.absa.atum.model.Measurement 25 | import za.co.absa.atum.utils.controlmeasure.ControlMeasureUtils 26 | 27 | import scala.util.{Failure, Success, Try} 28 | 29 | object MeasurementProcessor { 30 | type MeasurementFunction = DataFrame => String 31 | 32 | private val valueColumnName: String = "value" 33 | 34 | /** 35 | * Assembles the measurement function for controlCol based on the controlType 36 | * @param controlCol the column the measurement is calculated on 37 | * @param controlType the type of the control measurement 38 | * @return the measurement function that computes the control value for a DataFrame 39 | */ 40 | private[atum] def getMeasurementFunction(controlCol: String, controlType: ControlType): MeasurementFunction = { 41 | controlType match { 42 | case Count => (ds: Dataset[Row]) => 43 | ds.select(col(controlCol)).count().toString 44 | case DistinctCount => (ds: Dataset[Row]) => { 45 | ds.select(col(controlCol)).distinct().count().toString 46 | } 47 | case AggregatedTotal => 48 | (ds: Dataset[Row]) => { 49 | val aggCol = sum(col(valueColumnName)) 50 | aggregateColumn(ds, controlCol, aggCol) 51 | } 52 | case AbsAggregatedTotal => 53 | (ds: Dataset[Row]) => { 54 | val aggCol = sum(abs(col(valueColumnName))) 55 | aggregateColumn(ds, controlCol, aggCol) 56 | } 57 | case HashCrc32 => 58 | (ds: Dataset[Row]) => { 59 | val aggColName = ControlMeasureUtils.getTemporaryColumnName(ds) 60 | val v = ds.withColumn(aggColName, crc32(col(controlCol).cast("String"))) 61 | .agg(sum(col(aggColName))).collect()(0)(0) 62 | if (v == null) "" else v.toString 63 | } 64 | case AggregatedTruncTotal => 65 | (ds: Dataset[Row]) => { 66 | val aggCol = sum(col(valueColumnName).cast(LongType)) 67 | aggregateColumn(ds, controlCol, aggCol) 68 | } 69 | case AbsAggregatedTruncTotal => 70 | (ds: Dataset[Row]) => { 71 | val aggCol = sum(abs(col(valueColumnName).cast(LongType))) 72 | aggregateColumn(ds, controlCol, aggCol) 73 | } 74 | } 75 | } 76 |
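  // Sketch: assembling and applying a measurement function directly (illustration only;
  // the column name, the DataFrame `df` and the resulting value are hypothetical):
  //   val f: MeasurementFunction = getMeasurementFunction("price", ControlType.AggregatedTotal)
  //   val controlValue: String = f(df)   // e.g. "32847283324.324324" for an aggregated total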
77 | private def aggregateColumn(ds: Dataset[Row], measureColumn: String, aggExpression: Column): String = { 78 | val dataType = ds.select(measureColumn).schema.fields(0).dataType 79 | val aggregatedValue = dataType match { 80 | case _: LongType => 81 | // This is protection against long overflow, e.g. Long.MaxValue = 9223372036854775807: 82 | // scala> sc.parallelize(List(Long.MaxValue, 1)).toDF.agg(sum("value")).take(1)(0)(0) 83 | // res11: Any = -9223372036854775808 84 | // Converting to BigDecimal fixes the issue 85 | //val ds2 = ds.select(col(measurement.controlCol).cast(DecimalType(38, 0)).as("value")) 86 | //ds2.agg(sum(abs($"value"))).collect()(0)(0) 87 | val ds2 = ds.select(col(measureColumn).cast(DecimalType(38, 0)).as(valueColumnName)) 88 | val collected = ds2.agg(aggExpression).collect()(0)(0) 89 | if (collected == null) 0 else collected 90 | case _: StringType => 91 | // Support for string type aggregation 92 | val ds2 = ds.select(col(measureColumn).cast(DecimalType(38, 18)).as(valueColumnName)) 93 | val collected = ds2.agg(aggExpression).collect()(0)(0) 94 | val value = if (collected==null) new java.math.BigDecimal(0) else collected.asInstanceOf[java.math.BigDecimal] 95 | value 96 | .stripTrailingZeros // removes trailing zeros (2001.500000 -> 2001.5), but can introduce scientific notation (600.000 -> 6E+2) 97 | .toPlainString // converts to a normal string (6E+2 -> "600") 98 | case _ => 99 | val ds2 = ds.select(col(measureColumn).as(valueColumnName)) 100 | val collected = ds2.agg(aggExpression).collect()(0)(0) 101 | if (collected == null) 0 else collected 102 | } 103 | // check if the total is required to be presented as a larger type - BigDecimal 104 | workaroundBigDecimalIssues(aggregatedValue) 105 | } 106 | 107 | private def workaroundBigDecimalIssues(value: Any): String = { 108 | // If the aggregated value is a BigDecimal (Java or Scala), normalize it and render it as a plain string 109 | value match { 110 | case v: java.math.BigDecimal => 111 | // Convert the value to string to work around different serializers generating different JSONs for BigDecimal 112 | v.stripTrailingZeros // removes trailing zeros (2001.500000 -> 2001.5), but can introduce scientific notation (600.000 -> 6E+2) 113 | .toPlainString // converts to a normal string (6E+2 -> "600") 114 | case v: BigDecimal => 115 | // Convert the value to string to work around different serializers generating different JSONs for BigDecimal 116 | new java.math.BigDecimal(v.toString()) 117 | .stripTrailingZeros // removes trailing zeros (2001.500000 -> 2001.5), but can introduce scientific notation (600.000 -> 6E+2) 118 | .toPlainString // converts to a normal string (6E+2 -> "600") 119 | case a => a.toString 120 | } 121 | } 122 | 123 | } 124 | 125 | /** 126 | * This class is used for processing a Spark Dataset to calculate aggregates / control measures 127 | */ 128 | class MeasurementProcessor(private var measurements: Seq[Measurement]) { 129 | type MeasurementProcessor = (Measurement, MeasurementFunction) 130 | 131 | // Assigning a measurement function to each measurement 132 | var processors: Seq[MeasurementProcessor] = 133 | measurements.map(m => (m, getMeasurementFunction(m))) 134 | 135 | 136 | /** The method calculates measurements for each control. */ 137 | private[atum] def measureDataset(ds: Dataset[Row]): Seq[Measurement] = { 138 | Atum.log.debug(s"Schema: ${ds.schema.treeString}") 139 | processors.map(p => Measurement(controlName = p._1.controlName, 140 | controlType = p._1.controlType, 141 | controlCol = p._1.controlCol, 142 | controlValue = p._2(ds) // call the measurement function 143 | )) 144 | } 145 | 146 | /** Registers a new column name so that the new name is used when calculating the next checkpoint.
*/ 147 | private[atum] def registerColumnRename(oldName: String, newName: String): Unit = { 148 | val oldLowercaseName = oldName.trim.toLowerCase 149 | val newMeasurements = measurements.map(measure => { 150 | if (measure.controlCol.trim.toLowerCase == oldLowercaseName) { 151 | measure.copy(controlCol = newName) 152 | } 153 | else { 154 | measure 155 | } 156 | }) 157 | 158 | processors = newMeasurements.map(m => (m, getMeasurementFunction(m))) 159 | measurements = newMeasurements 160 | } 161 | 162 | /** Registers a column drop so that no measurement tracking is done for the dropped column. */ 163 | private[atum] def registerColumnDrop(columnName: String): Unit = { 164 | val oldLowercaseName = columnName.trim.toLowerCase 165 | val newMeasurements = measurements.filter(measure => measure.controlCol.trim.toLowerCase != oldLowercaseName) 166 | 167 | processors = newMeasurements.map(m => (m, getMeasurementFunction(m))) 168 | measurements = newMeasurements 169 | } 170 | 171 | /** The method maps the string representation of a control type to a measurement function. */ 172 | private def getMeasurementFunction(measurement: Measurement): MeasurementFunction = { 173 | Try { 174 | ControlType.withValueName(measurement.controlType) 175 | } match { 176 | case Failure(exception) => 177 | Atum.log.error(s"Unrecognized control measurement type '${measurement.controlType}'. Available control measurement types are: " + 178 | s"${ControlType.values.mkString(",")}.") 179 | Atum.log.error(exception.getLocalizedMessage) 180 | (_: Dataset[Row]) => "N/A" 181 | 182 | case Success(controlType) => 183 | MeasurementProcessor.getMeasurementFunction(measurement.controlCol, controlType) 184 | } 185 | } 186 | 187 | } 188 | --------------------------------------------------------------------------------