├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── .gitignore ├── .travis.yml ├── CNAME ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── DaFlow.png ├── LICENSE ├── README.md ├── daflow-commons ├── README.md ├── pom.xml └── src │ ├── main │ └── scala │ │ └── com │ │ └── abhioncbr │ │ └── daflow │ │ └── commons │ │ ├── CommonConstants.scala │ │ ├── Context.scala │ │ ├── ContextConstantEnum.scala │ │ ├── ExecutionResult.scala │ │ ├── NotificationMessages.scala │ │ ├── ProcessFrequencyEnum.scala │ │ ├── conf │ │ ├── DaFlowJobConf.scala │ │ ├── JobStaticParamConf.scala │ │ ├── common │ │ │ ├── DataPath.scala │ │ │ ├── FieldMappingConf.scala │ │ │ ├── GeneralParamConf.scala │ │ │ └── QueryConf.scala │ │ ├── extract │ │ │ ├── ExtractConf.scala │ │ │ └── ExtractionType.scala │ │ ├── load │ │ │ ├── LoadConf.scala │ │ │ ├── LoadType.scala │ │ │ └── PartitioningDataConf.scala │ │ └── transform │ │ │ └── TransformConf.scala │ │ └── util │ │ └── FileUtil.scala │ └── test │ └── scala │ └── com │ └── abhioncbr │ └── daflow │ └── commons │ ├── CommonSpec.scala │ ├── Fixture.scala │ └── util │ └── FileUtilSpec.scala ├── daflow-core ├── README.md ├── pom.xml └── src │ └── main │ └── scala │ └── com │ └── abhioncbr │ └── daflow │ └── core │ ├── LaunchETLSparkJobExecution.scala │ ├── extractData │ ├── AbstractExtractData.scala │ ├── ExtractData.scala │ ├── ExtractDataFromDB.scala │ ├── ExtractDataFromFileSystem.scala │ ├── ExtractDataFromHive.scala │ └── ExtractUtil.scala │ ├── loadData │ ├── LoadData.scala │ ├── LoadDataIntoFileSystem.scala │ ├── LoadDataIntoHive.scala │ └── LoadUtil.scala │ ├── transformData │ ├── Transform.scala │ ├── TransformData.scala │ ├── TransformRule.scala │ ├── TransformStep.scala │ └── TransformUtil.scala │ └── validateData │ ├── ValidateData.scala │ └── ValidateTransformedData.scala ├── daflow-examples ├── README.md ├── daflow-xml-templates │ ├── extract_jdbc_import.xml │ ├── extract_json_import.xml │ ├── multiple_group_name.xml │ └── multiple_transform_rule.xml ├── demo │ ├── daflow-job-xml │ │ └── json_etl_example.xml │ └── sample-data │ │ └── json_data.json └── scripts │ ├── execute_etl_feed.sh │ └── execute_etl_feed_airflow.sh ├── daflow-job-conf ├── daflow-job-conf-xml │ ├── README.md │ ├── daflow-feed-job.xsd │ ├── pom.xml │ └── src │ │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── abhioncbr │ │ │ └── daflow │ │ │ └── job │ │ │ └── conf │ │ │ └── xml │ │ │ ├── AttributeTags.scala │ │ │ ├── NodeTags.scala │ │ │ ├── ParseDaFlowJobXml.scala │ │ │ ├── ParseDataPath.scala │ │ │ ├── ParseExtract.scala │ │ │ ├── ParseFieldMapping.scala │ │ │ ├── ParseGeneralParams.scala │ │ │ ├── ParseJobStaticParam.scala │ │ │ ├── ParseLoad.scala │ │ │ ├── ParsePartitioningData.scala │ │ │ ├── ParseQuery.scala │ │ │ ├── ParseTransform.scala │ │ │ ├── ParseTransformRule.scala │ │ │ └── ParseUtil.scala │ │ └── test │ │ └── scala │ │ └── com │ │ └── abhioncbr │ │ └── daflow │ │ └── job │ │ └── conf │ │ └── xml │ │ ├── ParseDaFlowJobXmlSpec.scala │ │ ├── ParseDataPathSpec.scala │ │ ├── ParseExtractSpec.scala │ │ ├── ParseFieldMappingSpec.scala │ │ ├── ParseGeneralParamsSpec.scala │ │ ├── ParseJobStaticParamSpec.scala │ │ ├── ParseLoadSpec.scala │ │ ├── ParsePartitioningRuleSpec.scala │ │ ├── ParseQuerySpec.scala │ │ ├── ParseTransformRuleSpec.scala │ │ ├── ParseTransformSpec.scala │ │ ├── ParseUtilSpec.scala │ │ └── XmlJobConfBase.scala └── daflow-job-conf-yaml │ └── pom.xml ├── daflow-metrics ├── pom.xml ├── scripts │ └── daflow-feed-stat.sh ├── sql │ └── daflow-feed-stat 
└── src │ └── main │ └── scala │ └── com │ └── abhioncbr │ └── daflow │ └── metrics │ ├── promethus │ └── PrometheusObject.scala │ └── stats │ ├── JobResult.scala │ └── UpdateFeedStats.scala ├── daflow-sql-parser ├── README.md ├── pom.xml └── src │ ├── main │ └── scala │ │ └── com │ │ └── abhioncbr │ │ └── daflow │ │ └── sqlParser │ │ ├── QueryDsl.scala │ │ └── SQLParser.scala │ └── test │ └── scala │ └── com │ └── abhioncbr │ └── daflow │ └── sqlParser │ └── SqlParserSpec.scala ├── docker ├── compose │ ├── docker-compose-daflow.yml │ └── hadoop.env ├── images │ ├── hadoop │ │ ├── base │ │ │ ├── Dockerfile │ │ │ ├── entrypoint.sh │ │ │ ├── export_container_ip.sh │ │ │ └── pom.xml │ │ ├── datanode │ │ │ ├── Dockerfile │ │ │ ├── pom.xml │ │ │ └── run_dn.sh │ │ ├── historyserver │ │ │ ├── Dockerfile │ │ │ ├── pom.xml │ │ │ └── run_history.sh │ │ └── namenode │ │ │ ├── Dockerfile │ │ │ ├── pom.xml │ │ │ └── run_nn.sh │ ├── hive │ │ ├── Dockerfile │ │ ├── conf │ │ │ ├── beeline-log4j2.properties │ │ │ ├── hive-env.sh │ │ │ ├── hive-exec-log4j2.properties │ │ │ ├── hive-log4j2.properties │ │ │ ├── hive-site.xml │ │ │ ├── ivysettings.xml │ │ │ └── llap-daemon-log4j2.properties │ │ ├── entrypoint.sh │ │ ├── pom.xml │ │ └── startup.sh │ ├── pom.xml │ └── spark │ │ ├── adhoc │ │ ├── Dockerfile │ │ ├── adhoc.sh │ │ └── pom.xml │ │ ├── base │ │ ├── Dockerfile │ │ ├── execute-step.sh │ │ ├── finish-step.sh │ │ ├── pom.xml │ │ └── wait-for-step.sh │ │ ├── master │ │ ├── Dockerfile │ │ ├── master.sh │ │ └── pom.xml │ │ └── worker │ │ ├── Dockerfile │ │ ├── pom.xml │ │ └── worker.sh ├── scripts │ └── setup_demo_container.sh ├── setup_demo.sh └── stop_demo.sh ├── pom.xml └── style ├── checkstyle-suppressions.xml ├── checkstyle.xml ├── eclipse-java-google-style.xml ├── intellij-java-google-style.xml └── scalastyle-config.xml /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | labels: 5 | 6 | --- 7 | 8 | **Describe the bug** 9 | A clear and concise description of what the bug is. 10 | 11 | **To Reproduce** 12 | Steps to reproduce the behavior: 13 | 1. Go to '...' 14 | 2. Click on '....' 15 | 3. Scroll down to '....' 16 | 4. See error 17 | 18 | **Expected behavior** 19 | A clear and concise description of what you expected to happen. 20 | 21 | **Screenshots** 22 | If applicable, add screenshots to help explain your problem. 23 | 24 | **Desktop (please complete the following information):** 25 | - OS: [e.g. iOS] 26 | - Browser [e.g. chrome, safari] 27 | - Version [e.g. 22] 28 | 29 | **Smartphone (please complete the following information):** 30 | - Device: [e.g. iPhone6] 31 | - OS: [e.g. iOS8.1] 32 | - Browser [e.g. stock browser, safari] 33 | - Version [e.g. 22] 34 | 35 | **Additional context** 36 | Add any other context about the problem here. 37 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | labels: 5 | 6 | --- 7 | 8 | **Is your feature request related to a problem? Please describe.** 9 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 10 | 11 | **Describe the solution you'd like** 12 | A clear and concise description of what you want to happen. 
13 | 14 | **Describe alternatives you've considered** 15 | A clear and concise description of any alternative solutions or features you've considered. 16 | 17 | **Additional context** 18 | Add any other context or screenshots about the feature request here. 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .DS_Store 3 | local 4 | target 5 | project/target 6 | project/project/target 7 | daflow-core/target 8 | daflow-core/.DS_Store 9 | daflow-core/src/.DS_Store 10 | daflow-core/src/main/.DS_Store 11 | daflow-core/src/test/.DS_Store 12 | daflow-core/src/main/resources/.DS_Store 13 | daflow-commons/target 14 | daflow-commons/.DS_Store 15 | daflow-commons/src/.DS_Store 16 | daflow-commons/src/main/.DS_Store 17 | daflow-core/src/test/.DS_Store 18 | daflow-commons/src/main/resources/.DS_Store 19 | daflow-core/*.iml 20 | daflow-commons/*.iml 21 | daflow-metrics/*.iml 22 | daflow-sql-parser/*.iml 23 | daflow-job-conf/daflow-job-conf-xml/*.iml 24 | daflow-job-conf/daflow-job-conf-yaml/*.iml 25 | docker/images/*.iml 26 | docker/images/hive/*.iml 27 | docker/images/hadoop/base/*.iml 28 | docker/images/hadoop/datanode/*.iml 29 | docker/images/hadoop/namenode/*.iml 30 | docker/images/hadoop/historyserver/*.iml 31 | docker/images/spark/base/*.iml 32 | docker/images/spark/master/*.iml 33 | docker/images/spark/worker/*.iml 34 | daflow-examples/demo/aritfacts/* 35 | daflow-examples/demo/sample-data/daflow-result/* 36 | daflow-examples/demo/sample-feed-stats/* 37 | 38 | *.iml 39 | **/*.iml -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | 3 | jdk: 4 | - oraclejdk8 5 | 6 | sudo: required 7 | 8 | services: 9 | - docker 10 | 11 | cache: 12 | directories: 13 | - $HOME/.m2 14 | 15 | script: 16 | - mvn -Ddocker.build.skip=$DOCKER_BUILD_SKIP -Ddocker.reg=$DOCKER_REGISTRY -Ddocker.reg.username=$DOCKER_USERNAME -Ddocker.reg.password=$DOCKER_PASSWORD -q clean install 17 | 18 | after_success: 19 | - mvn -q clean cobertura:cobertura 20 | - bash <(curl -s https://codecov.io/bash) 21 | -------------------------------------------------------------------------------- /CNAME: -------------------------------------------------------------------------------- 1 | daflow.sparsecode.io -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at abhioncbr@yahoo.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at abhioncbr@yahoo.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. 
The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 |
73 | [homepage]: https://www.contributor-covenant.org
74 |
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 |
--------------------------------------------------------------------------------
/DaFlow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sparsecode/DaFlow/068ec7582ef58cbc6c782d2bc19a43aca2dead81/DaFlow.png
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | This software is licensed under the Apache 2 license, quoted below.
2 |
3 | Copyright (C) 2018-2019 Abhishek Sharma
4 |
5 | Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 | use this file except in compliance with the License. You may obtain a copy of
7 | the License at
8 |
9 | [http://www.apache.org/licenses/LICENSE-2.0]
10 |
11 | Unless required by applicable law or agreed to in writing, software
12 | distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 | WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 | License for the specific language governing permissions and limitations under
15 | the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # DaFlow [Data Flow (ETL) Framework]
4 |
5 | [![Build Status](https://travis-ci.org/abhioncbr/DaFlow.svg?branch=master)](https://travis-ci.org/abhioncbr/DaFlow/)
6 | [![License](https://img.shields.io/:license-Apache%202-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0.txt)
7 | [![codecov](https://codecov.io/gh/abhioncbr/DaFlow/branch/master/graph/badge.svg)](https://codecov.io/gh/abhioncbr/DaFlow)
8 | [![Code Climate](https://codeclimate.com/github/codeclimate/codeclimate/badges/gpa.svg)](https://codeclimate.com/github/abhioncbr/DaFlow)
9 |
10 | An Apache Spark-based data-flow (ETL) framework that supports multiple read and write destinations of different types, and also supports multiple categories of transformation rules.
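As an illustration of the extract -> transform -> load flow the framework generalizes, here is a minimal sketch written directly against Spark rather than DaFlow's own API; it assumes a Spark 2.x-style SparkSession, and the paths, column names, and app name are invented (the JSON path only echoes daflow-examples/demo/sample-data/json_data.json):

import org.apache.spark.sql.{DataFrame, SparkSession}

object EtlFlowSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("etl-flow-sketch").getOrCreate()

    // extract: read a raw JSON feed
    val raw: DataFrame = spark.read.json("/demo/sample-data/json_data.json")

    // transform: apply one "rule" -- drop null ids and project two columns
    val transformed: DataFrame = raw.filter("id IS NOT NULL").select("id", "name")

    // load: write the result out, partitioned by one column
    transformed.write.mode("overwrite").partitionBy("name").parquet("/demo/daflow-result")

    spark.stop()
  }
}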
--------------------------------------------------------------------------------
/daflow-commons/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sparsecode/DaFlow/068ec7582ef58cbc6c782d2bc19a43aca2dead81/daflow-commons/README.md
--------------------------------------------------------------------------------
/daflow-commons/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |   <parent>
6 |     <groupId>com.abhioncbr.daflow</groupId>
7 |     <artifactId>daflow</artifactId>
8 |     <version>${revision}</version>
9 |   </parent>
10 |   <modelVersion>4.0.0</modelVersion>
11 |
12 |   <packaging>jar</packaging>
13 |   <artifactId>daflow-commons</artifactId>
14 |   <name>daflow-commons</name>
15 |   <version>${daflow.common.version}</version>
16 |
17 |   <licenses>
18 |     <license>
19 |       <name>Apache License, Version 2.0</name>
20 |       <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
21 |       <distribution>repo</distribution>
22 |       <comments>A business-friendly OSS license</comments>
23 |     </license>
24 |   </licenses>
25 |
26 |   <repositories>
27 |     <repository>
28 |       <id>scala-tools.org</id>
29 |       <name>Scala-Tools Maven2 Repository</name>
30 |       <url>http://scala-tools.org/repo-releases</url>
31 |     </repository>
32 |   </repositories>
33 |
34 |   <pluginRepositories>
35 |     <pluginRepository>
36 |       <id>scala-tools.org</id>
37 |       <name>Scala-Tools Maven2 Repository</name>
38 |       <url>http://scala-tools.org/repo-releases</url>
39 |     </pluginRepository>
40 |   </pluginRepositories>
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |   <build>
52 |     <plugins>
53 |       <plugin>
54 |         <groupId>org.scala-tools</groupId>
55 |         <artifactId>maven-scala-plugin</artifactId>
56 |         <configuration>
57 |           <scalaVersion>${scala.version}</scalaVersion>
58 |         </configuration>
59 |       </plugin>
60 |     </plugins>
61 |   </build>
62 |
63 | </project>
--------------------------------------------------------------------------------
/daflow-commons/src/main/scala/com/abhioncbr/daflow/commons/CommonConstants.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 |
18 | package com.abhioncbr.daflow.commons
19 |
20 | object CommonConstants {
21 |   val DIRECTORY_SEPARATOR: Char = '/'
22 | }
23 |
--------------------------------------------------------------------------------
/daflow-commons/src/main/scala/com/abhioncbr/daflow/commons/Context.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
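A minimal usage sketch for the Context registry defined just below; the Configuration registration mirrors what CommonSpec does in the test sources further down, and only the main-method wrapper is added for illustration:

import com.abhioncbr.daflow.commons.Context
import com.abhioncbr.daflow.commons.ContextConstantEnum.HADOOP_CONF
import org.apache.hadoop.conf.Configuration

object ContextUsageSketch {
  def main(args: Array[String]): Unit = {
    // register a shared object once at job start-up...
    Context.addContextualObject[Configuration](HADOOP_CONF, new Configuration())
    // ...and look it up anywhere later; the caller supplies the expected type
    val hadoopConf: Configuration = Context.getContextualObject[Configuration](HADOOP_CONF)
    println(hadoopConf)
  }
}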
17 |
18 | package com.abhioncbr.daflow.commons
19 |
20 | object Context {
21 |   private val contextualObjects = scala.collection.mutable.Map[ContextConstantEnum.constant, Any]()
22 |
23 |   def addContextualObject[T](key: ContextConstantEnum.constant, obj: T): Unit = {
24 |     contextualObjects += (key -> obj)
25 |   }
26 |
27 |   def getContextualObject[T](key: ContextConstantEnum.constant): T = {
28 |     val output = contextualObjects.getOrElse(key, None) // a missing key falls back to None; the cast below fails only when the value is used
29 |     output.asInstanceOf[T]
30 |   }
31 | }
32 |
--------------------------------------------------------------------------------
/daflow-commons/src/main/scala/com/abhioncbr/daflow/commons/ContextConstantEnum.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 |
18 | package com.abhioncbr.daflow.commons
19 |
20 | object ContextConstantEnum extends Enumeration {
21 |   type constant = Value
22 |   val START_DATE, END_DATE,
23 |   HADOOP_CONF, SPARK_CONTEXT, SQL_CONTEXT,
24 |   JOB_STATIC_PARAM_CONF, EXTRACT_CONF, TRANSFORM_CONF, LOAD_CONF,
25 |   SCHEMA, OTHER_PARAM = Value
26 | }
27 |
--------------------------------------------------------------------------------
/daflow-commons/src/main/scala/com/abhioncbr/daflow/commons/ExecutionResult.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
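Feed implementations in daflow-core (further down in this listing) return Either[ExecutionResult, String], with Left carrying the extracted frame and Right an error message. A small illustrative helper in that style; the emptiness check and error text are invented:

import com.abhioncbr.daflow.commons.ExecutionResult
import org.apache.spark.sql.DataFrame

object ResultSketch {
  // wrap a frame in the framework's result convention
  def wrap(feedName: String, df: DataFrame): Either[ExecutionResult, String] =
    if (df.schema.nonEmpty) Left(ExecutionResult(feedName, df))
    else Right(s"feed '$feedName' produced no columns")
}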
17 |
18 | package com.abhioncbr.daflow.commons
19 |
20 | import org.apache.spark.sql.DataFrame
21 |
22 | case class ExecutionResult(feedName: String, resultDF: DataFrame, otherAttributes: Option[Map[String, Any]] = None)
23 |
--------------------------------------------------------------------------------
/daflow-commons/src/main/scala/com/abhioncbr/daflow/commons/NotificationMessages.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 |
18 | package com.abhioncbr.daflow.commons
19 |
20 | object NotificationMessages {
21 |   val fileDoesNotExist: String => String =
22 |     (filePath: String) => { s"Provided file path '$filePath' doesn't exist." }
23 |
24 |   val jobXmlFileDoesNotExist: String => String =
25 |     (filePath: String) => { s"Not able to load job xml file. Provided path: '$filePath'" }
26 |
27 |   val exceptionMessage: Exception => String =
28 |     (exception: Exception) => { s"Exception message: ${exception.getMessage}" }
29 |
30 |   val unknownXMLEntity: String = "Unknown entity found instead of ''"
31 |   val exceptionWhileParsing: String = "Exception while parsing job xml file. Please validate xml."
32 |
33 |   // extract
34 |   val extractNotSupported: String => String =
35 |     (extractType: String) => { s"extracting data from $extractType is not supported right now" }
36 | }
37 |
--------------------------------------------------------------------------------
/daflow-commons/src/main/scala/com/abhioncbr/daflow/commons/ProcessFrequencyEnum.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
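A quick illustration of the frequency parsing defined below; the "DAILY" input is arbitrary:

import com.abhioncbr.daflow.commons.ProcessFrequencyEnum

object FrequencySketch {
  def main(args: Array[String]): Unit = {
    val freq = ProcessFrequencyEnum.getProcessFrequencyEnum("DAILY") // ProcessFrequencyEnum.DAILY
    println(freq)
    // an unrecognized string such as "FORTNIGHTLY" would throw a RuntimeException
  }
}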
17 |
18 | package com.abhioncbr.daflow.commons
19 |
20 | object ProcessFrequencyEnum extends Enumeration {
21 |   type frequencyType = Value
22 |   val ONCE, HOURLY, DAILY, WEEKLY, MONTHLY, YEARLY, DATE_RANGE = Value
23 |
24 |   def getProcessFrequencyEnum(frequencyString: String): ProcessFrequencyEnum.frequencyType = {
25 |     val processFrequencyEnum = frequencyString match {
26 |       case "ONCE" => ProcessFrequencyEnum.ONCE
27 |       case "HOURLY" => ProcessFrequencyEnum.HOURLY
28 |       case "DAILY" => ProcessFrequencyEnum.DAILY
29 |       case "WEEKLY" => ProcessFrequencyEnum.WEEKLY
30 |       case "MONTHLY" => ProcessFrequencyEnum.MONTHLY
31 |       case "YEARLY" => ProcessFrequencyEnum.YEARLY
32 |       case "DATE_RANGE" => ProcessFrequencyEnum.DATE_RANGE
33 |       case _ => throw new RuntimeException(s"'$frequencyString', process frequency not supported.")
34 |     }
35 |     processFrequencyEnum
36 |   }
37 | }
38 |
--------------------------------------------------------------------------------
/daflow-commons/src/main/scala/com/abhioncbr/daflow/commons/conf/DaFlowJobConf.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 |
18 | package com.abhioncbr.daflow.commons.conf
19 |
20 | import com.abhioncbr.daflow.commons.conf.extract.ExtractConf
21 | import com.abhioncbr.daflow.commons.conf.load.LoadConf
22 | import com.abhioncbr.daflow.commons.conf.transform.TransformConf
23 |
24 | case class DaFlowJobConf(jobStaticParam: JobStaticParamConf, extract: ExtractConf, transform: TransformConf, load: LoadConf)
25 |
--------------------------------------------------------------------------------
/daflow-commons/src/main/scala/com/abhioncbr/daflow/commons/conf/JobStaticParamConf.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
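An illustrative construction of the job-level static parameters defined below; the job name is borrowed from the daflow-examples demo, everything else is arbitrary:

import com.abhioncbr.daflow.commons.ProcessFrequencyEnum
import com.abhioncbr.daflow.commons.conf.JobStaticParamConf

object JobStaticParamSketch {
  val staticParams: JobStaticParamConf = JobStaticParamConf(
    processFrequency = ProcessFrequencyEnum.DAILY,
    jobName = "json_etl_example",
    publishStats = false) // otherParams defaults to None
}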
17 |
18 | package com.abhioncbr.daflow.commons.conf
19 |
20 | import com.abhioncbr.daflow.commons.ProcessFrequencyEnum
21 | import com.abhioncbr.daflow.commons.conf.common.GeneralParamConf
22 |
23 | case class JobStaticParamConf(processFrequency: ProcessFrequencyEnum.frequencyType, jobName: String, publishStats: Boolean,
24 |   otherParams: Option[Array[GeneralParamConf]] = None)
25 |
--------------------------------------------------------------------------------
/daflow-commons/src/main/scala/com/abhioncbr/daflow/commons/conf/common/DataPath.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 |
18 | package com.abhioncbr.daflow.commons.conf.common
19 |
20 | case class DataPath(pathPrefix: Option[String], cataloguePatterns: Option[Array[PathInfixParam]] = None,
21 |   feedPattern: Option[PathInfixParam] = None, fileName: Option[FileNameParam] = None)
22 |
23 | case class PathInfixParam(order: Option[Int] = None, infixPattern: String,
24 |   formatInfix: Option[Boolean] = Some(false), formatInfixArgs: Option[Array[GeneralParamConf]] = None)
25 |
26 | case class FileNameParam(fileNamePrefix: Option[String] = None, fileNameSuffix: Option[String] = None,
27 |   fileNameSeparator: Option[String] = Some("."))
28 |
--------------------------------------------------------------------------------
/daflow-commons/src/main/scala/com/abhioncbr/daflow/commons/conf/common/FieldMappingConf.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
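The DataPath pieces above compose into a physical path; a sketch with invented values, whose expected rendering follows the behaviour exercised in FileUtilSpec further down (prefix, catalogue, feed, and file name joined by '/'):

import com.abhioncbr.daflow.commons.conf.common.{DataPath, FileNameParam, PathInfixParam}

object DataPathSketch {
  val path: DataPath = DataPath(
    pathPrefix = Some("/data"),
    cataloguePatterns = Some(Array(PathInfixParam(infixPattern = "raw"))),
    feedPattern = Some(PathInfixParam(infixPattern = "feed1")),
    fileName = Some(FileNameParam(Some("part"), Some("json"), Some("."))))
  // FileUtil.getFilePathString(path) would yield something like "/data/raw/feed1/part.json"
}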
17 |
18 | package com.abhioncbr.daflow.commons.conf.common
19 |
20 | case class FieldMappingConf(sourceFieldName: String, targetFieldName: String)
21 |
--------------------------------------------------------------------------------
/daflow-commons/src/main/scala/com/abhioncbr/daflow/commons/conf/common/GeneralParamConf.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 |
18 | package com.abhioncbr.daflow.commons.conf.common
19 |
20 | case class GeneralParamConf(order: Int, paramName: String, paramValue: String, paramDefaultValue: String)
21 |
22 |
--------------------------------------------------------------------------------
/daflow-commons/src/main/scala/com/abhioncbr/daflow/commons/conf/common/QueryConf.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 |
18 | package com.abhioncbr.daflow.commons.conf.common
19 |
20 | case class QueryConf(queryFile: QueryFilesConf, queryArgs: Option[Array[GeneralParamConf]])
21 | case class QueryFilesConf(configurationFile: Option[DataPath], queryFile: Option[DataPath])
22 |
--------------------------------------------------------------------------------
/daflow-commons/src/main/scala/com/abhioncbr/daflow/commons/conf/extract/ExtractConf.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
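A sketch of the extract configuration defined below, wiring one file-system feed; the feed name and attribute key are invented:

import com.abhioncbr.daflow.commons.conf.extract.{ExtractConf, ExtractFeedConf, ExtractionType}

object ExtractConfSketch {
  val extract: ExtractConf = ExtractConf(Array(
    ExtractFeedConf(
      extractFeedName = "json_feed",
      extractionType = ExtractionType.FILE_SYSTEM,
      extractionAttributesMap = Map("fileType" -> "JSON"),
      dataPath = None,
      query = None,
      validateExtractedData = true)))
}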
17 |
18 | package com.abhioncbr.daflow.commons.conf.extract
19 |
20 | import com.abhioncbr.daflow.commons.conf.common.DataPath
21 | import com.abhioncbr.daflow.commons.conf.common.QueryConf
22 |
23 | case class ExtractConf(feeds: Array[ExtractFeedConf])
24 | case class ExtractFeedConf(extractFeedName: String, extractionType: ExtractionType.valueType,
25 |   extractionAttributesMap: Map[String, String], dataPath: Option[DataPath], query: Option[QueryConf],
26 |   validateExtractedData: Boolean)
27 |
--------------------------------------------------------------------------------
/daflow-commons/src/main/scala/com/abhioncbr/daflow/commons/conf/extract/ExtractionType.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 |
18 | package com.abhioncbr.daflow.commons.conf.extract
19 |
20 | object ExtractionType extends Enumeration {
21 |   type valueType = Value
22 |   val JDBC, HIVE, FILE_SYSTEM, UNSUPPORTED = Value
23 |
24 |   def getValueType(valueTypeString: String): ExtractionType.valueType = {
25 |     val valueType = valueTypeString match {
26 |       case "JDBC" => ExtractionType.JDBC
27 |       case "HIVE" => ExtractionType.HIVE
28 |       case "FILESYSTEM" | "FILE_SYSTEM" => ExtractionType.FILE_SYSTEM // accept both spellings; getDataValue below emits "FILE_SYSTEM"
29 |       case _ => ExtractionType.UNSUPPORTED // unrecognized types map to UNSUPPORTED instead of throwing a MatchError
30 |     }
31 |     valueType
32 |   }
33 |
34 |   def getDataValue(valueType: ExtractionType.valueType): String = {
35 |     val output = valueType match {
36 |       case JDBC => "JDBC"
37 |       case HIVE => "HIVE"
38 |       case FILE_SYSTEM => "FILE_SYSTEM"
39 |       case UNSUPPORTED => "UNSUPPORTED"
40 |     }
41 |     output
42 |   }
43 | }
44 |
--------------------------------------------------------------------------------
/daflow-commons/src/main/scala/com/abhioncbr/daflow/commons/conf/load/LoadConf.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
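Its load-side counterpart is defined below; a sketch of a Hive-targeted feed with invented names:

import com.abhioncbr.daflow.commons.conf.common.DataPath
import com.abhioncbr.daflow.commons.conf.load.{LoadConf, LoadFeedConf, LoadType}

object LoadConfSketch {
  val load: LoadConf = LoadConf(Array(
    LoadFeedConf(
      loadFeedName = "hive_target",
      loadType = LoadType.HIVE,
      attributesMap = Map("tableName" -> "daflow_demo"),
      dataPath = DataPath(pathPrefix = Some("/warehouse")),
      partitioningData = None)))
}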
17 |
18 | package com.abhioncbr.daflow.commons.conf.load
19 |
20 | import com.abhioncbr.daflow.commons.conf.common.DataPath
21 |
22 | case class LoadConf(feeds: Array[LoadFeedConf])
23 | case class LoadFeedConf(loadFeedName: String, loadType: LoadType.valueType, attributesMap: Map[String, String],
24 |   dataPath: DataPath, partitioningData: Option[PartitioningDataConf])
25 |
--------------------------------------------------------------------------------
/daflow-commons/src/main/scala/com/abhioncbr/daflow/commons/conf/load/LoadType.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 |
18 | package com.abhioncbr.daflow.commons.conf.load
19 |
20 | object LoadType extends Enumeration {
21 |   type valueType = Value
22 |   val JDBC, HIVE, FILE_SYSTEM = Value
23 |
24 |   def getValueType(valueTypeString: String): LoadType.valueType = {
25 |     val valueType = valueTypeString match {
26 |       case "JDBC" => LoadType.JDBC
27 |       case "HIVE" => LoadType.HIVE
28 |       case "FILESYSTEM" => LoadType.FILE_SYSTEM
29 |       case _ => throw new RuntimeException(s"'$valueTypeString', load type not supported.") // explicit failure, matching ProcessFrequencyEnum's convention
30 |     }
31 |     valueType
32 |   }
33 |
34 |   def getDataValue(valueType: LoadType.valueType): String = {
35 |     val output = valueType match {
36 |       case JDBC => "JDBC"
37 |       case HIVE => "HIVE"
38 |       case FILE_SYSTEM => "FILESYSTEM"
39 |     }
40 |     output
41 |   }
42 | }
43 |
--------------------------------------------------------------------------------
/daflow-commons/src/main/scala/com/abhioncbr/daflow/commons/conf/load/PartitioningDataConf.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
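Partitioned loads are described by the structure defined below; a sketch with one invented partition column:

import com.abhioncbr.daflow.commons.conf.common.GeneralParamConf
import com.abhioncbr.daflow.commons.conf.load.PartitioningDataConf

object PartitioningSketch {
  val partitioning: PartitioningDataConf = PartitioningDataConf(
    coalesce = false, overwrite = true, coalesceCount = 0,
    partitionColumns = List(GeneralParamConf(1, "date", "2019-01-01", "")))
}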
17 |
18 | package com.abhioncbr.daflow.commons.conf.load
19 |
20 | import com.abhioncbr.daflow.commons.conf.common.GeneralParamConf
21 |
22 | case class PartitioningDataConf(coalesce: Boolean, overwrite: Boolean, coalesceCount: Int,
23 |   partitionColumns: List[GeneralParamConf])
24 |
--------------------------------------------------------------------------------
/daflow-commons/src/main/scala/com/abhioncbr/daflow/commons/conf/transform/TransformConf.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 |
18 | package com.abhioncbr.daflow.commons.conf.transform
19 |
20 | case class TransformConf(transformSteps: List[TransformStepConf], validateTransformedData: Boolean)
21 | case class TransformStepConf(order: Int, rules: Map[String, TransformRuleConf])
22 | case class TransformRuleConf(ruleType: String, condition: String, ruleAttributesMap: Map[String, String])
23 |
--------------------------------------------------------------------------------
/daflow-commons/src/test/scala/com/abhioncbr/daflow/commons/CommonSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
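A sketch of the transform configuration just above — one step holding one filter-style rule; the rule key, ruleType, and condition values are invented:

import com.abhioncbr.daflow.commons.conf.transform.{TransformConf, TransformRuleConf, TransformStepConf}

object TransformConfSketch {
  val transform: TransformConf = TransformConf(
    transformSteps = List(TransformStepConf(order = 1,
      rules = Map("rule1" -> TransformRuleConf(
        ruleType = "FILTER", condition = "id IS NOT NULL", ruleAttributesMap = Map.empty)))),
    validateTransformedData = true)
}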
17 |
18 | package com.abhioncbr.daflow.commons
19 |
20 | import ContextConstantEnum.HADOOP_CONF
21 | import org.apache.hadoop.conf.Configuration
22 | import org.scalatest.BeforeAndAfterEach
23 | import org.scalatest.FlatSpec
24 | import org.scalatest.Matchers
25 |
26 | class CommonSpec extends FlatSpec with Matchers with BeforeAndAfterEach {
27 |
28 |   override def beforeEach(): Unit = {
29 |     super.beforeEach()
30 |
31 |     val dir: String = System.getProperty("user.dir")
32 |     System.setProperty("hadoop.home.dir", dir)
33 |     Context.addContextualObject[Configuration](HADOOP_CONF, new Configuration())
34 |   }
35 |
36 | }
37 |
--------------------------------------------------------------------------------
/daflow-commons/src/test/scala/com/abhioncbr/daflow/commons/util/FileUtilSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 |
18 | package com.abhioncbr.daflow.commons.util
19 |
20 | import com.abhioncbr.daflow.commons.CommonConstants.{DIRECTORY_SEPARATOR => DS}
21 | import com.abhioncbr.daflow.commons.CommonSpec
22 | import com.abhioncbr.daflow.commons.Fixture
23 |
24 | class FileUtilSpec extends CommonSpec {
25 |
26 |   "getFilePathString" should "return only pathPrefix as path string" in {
27 |     val expectedPath: String = Fixture.pathPrefix + DS
28 |     val pathString: String = FileUtil.getFilePathString(Fixture.dataPath)
29 |     pathString should not be None
30 |     pathString should be(expectedPath)
31 |   }
32 |
33 |   "getFilePathString" should "return pathPrefix & catalogue as path string" in {
34 |     val expectedPath: String = Fixture.pathPrefix + DS + Fixture.catalogueStaticInfixPattern1 + DS
35 |     val pathString: String = FileUtil.getFilePathString(Fixture.dataPath1)
36 |     pathString should not be None
37 |     pathString should be(expectedPath)
38 |   }
39 |
40 |   "getFilePathString" should "return pathPrefix, catalogue & feed as path string" in {
41 |     val expectedPath
42 |     : String = Fixture.pathPrefix + DS + Fixture.catalogueStaticInfixPattern1 + DS + Fixture.feedStaticInfixParam + DS
43 |     val pathString: String = FileUtil.getFilePathString(Fixture.dataPath2)
44 |     pathString should not be None
45 |     pathString should be(expectedPath)
46 |   }
47 |
48 |   "getFilePathString" should "return pathPrefix, catalogue, feed and fileName as path string" in {
49 |     val expectedPath: String = Fixture.pathPrefix + DS + Fixture.catalogueStaticInfixPattern1 +
50 |       DS + Fixture.feedStaticInfixParam + DS + Fixture.fileNamePrefix1 + "." + Fixture.fileNameSuffix1
51 |     val pathString: String = FileUtil.getFilePathString(Fixture.dataPath3)
52 |     pathString should not be None
53 |     pathString should be(expectedPath)
54 |   }
55 |
56 | }
57 |
--------------------------------------------------------------------------------
/daflow-core/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sparsecode/DaFlow/068ec7582ef58cbc6c782d2bc19a43aca2dead81/daflow-core/README.md
--------------------------------------------------------------------------------
/daflow-core/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |   <parent>
6 |     <groupId>com.abhioncbr.daflow</groupId>
7 |     <artifactId>daflow</artifactId>
8 |     <version>${revision}</version>
9 |   </parent>
10 |   <modelVersion>4.0.0</modelVersion>
11 |
12 |   <packaging>jar</packaging>
13 |   <artifactId>daflow-core</artifactId>
14 |   <version>${daflow.core.version}</version>
15 |   <name>daflow-core</name>
16 |
17 |   <licenses>
18 |     <license>
19 |       <name>Apache License, Version 2.0</name>
20 |       <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
21 |       <distribution>repo</distribution>
22 |       <comments>A business-friendly OSS license</comments>
23 |     </license>
24 |   </licenses>
25 |
26 |   <repositories>
27 |     <repository>
28 |       <id>scala-tools.org</id>
29 |       <name>Scala-Tools Maven2 Repository</name>
30 |       <url>http://scala-tools.org/repo-releases</url>
31 |     </repository>
32 |   </repositories>
33 |
34 |   <pluginRepositories>
35 |     <pluginRepository>
36 |       <id>scala-tools.org</id>
37 |       <name>Scala-Tools Maven2 Repository</name>
38 |       <url>http://scala-tools.org/repo-releases</url>
39 |     </pluginRepository>
40 |   </pluginRepositories>
41 |
42 |   <properties>
43 |     <scopt.version>3.3.0</scopt.version>
44 |   </properties>
45 |
46 |   <dependencies>
47 |     <dependency>
48 |       <groupId>com.abhioncbr.daflow</groupId>
49 |       <artifactId>daflow-commons</artifactId>
50 |       <version>${daflow.common.version}</version>
51 |     </dependency>
52 |
53 |     <dependency>
54 |       <groupId>com.abhioncbr.daflow</groupId>
55 |       <artifactId>daflow-job-conf-xml</artifactId>
56 |       <version>${daflow.job.conf.xml.version}</version>
57 |     </dependency>
58 |
59 |     <dependency>
60 |       <groupId>com.abhioncbr.daflow</groupId>
61 |       <artifactId>daflow-metrics</artifactId>
62 |       <version>${daflow.metrics.version}</version>
63 |     </dependency>
64 |
65 |     <dependency>
66 |       <groupId>com.abhioncbr.daflow</groupId>
67 |       <artifactId>daflow-sql-parser</artifactId>
68 |       <version>${daflow.sql.parser.version}</version>
69 |     </dependency>
70 |
71 |     <dependency>
72 |       <groupId>com.github.scopt</groupId>
73 |       <artifactId>scopt_2.11</artifactId>
74 |       <version>${scopt.version}</version>
75 |     </dependency>
76 |   </dependencies>
77 |
78 |
79 |   <build>
80 |     <plugins>
81 |       <plugin>
82 |         <groupId>org.apache.maven.plugins</groupId>
83 |         <artifactId>maven-shade-plugin</artifactId>
84 |         <version>3.2.1</version>
85 |         <executions>
86 |           <execution>
87 |             <phase>package</phase>
88 |             <goals>
89 |               <goal>shade</goal>
90 |             </goals>
91 |           </execution>
92 |         </executions>
93 |       </plugin>
94 |
95 |
96 |
97 |
98 |
99 |       <plugin>
100 |         <groupId>org.scala-tools</groupId>
101 |         <artifactId>maven-scala-plugin</artifactId>
102 |         <configuration>
103 |           <scalaVersion>${scala.version}</scalaVersion>
104 |         </configuration>
105 |       </plugin>
106 |     </plugins>
107 |   </build>
108 |
109 | </project>
--------------------------------------------------------------------------------
/daflow-core/src/main/scala/com/abhioncbr/daflow/core/extractData/AbstractExtractData.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
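The extract layer is organized around the small trait/abstract-class pair defined below. As an illustration, here is a hypothetical file-based extractor in the same Left/Right style that ExtractDataFromDB (further down) uses; the class name, CSV format choice, and error text are invented, and a Spark version whose DataFrameReader supports "csv" is assumed:

import com.abhioncbr.daflow.commons.{Context, ExecutionResult}
import com.abhioncbr.daflow.commons.ContextConstantEnum.SQL_CONTEXT
import com.abhioncbr.daflow.core.extractData.AbstractExtractData
import org.apache.spark.sql.SQLContext

class ExtractDataFromCsv(feedName: String, path: String) extends AbstractExtractData {
  def getRawData: Either[ExecutionResult, String] =
    try {
      // the SQLContext is registered in the shared Context at job start-up
      val sqlContext = Context.getContextualObject[SQLContext](SQL_CONTEXT)
      Left(ExecutionResult(feedName, sqlContext.read.format("csv").load(path)))
    } catch {
      case e: Exception => Right(s"[ExtractDataFromCsv]-[getRawData]: ${e.getMessage}")
    }
}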
17 |
18 | package com.abhioncbr.daflow.core.extractData
19 |
20 | abstract class AbstractExtractData extends ExtractData
21 |
--------------------------------------------------------------------------------
/daflow-core/src/main/scala/com/abhioncbr/daflow/core/extractData/ExtractData.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 |
18 | package com.abhioncbr.daflow.core.extractData
19 |
20 | import com.abhioncbr.daflow.commons.ExecutionResult
21 |
22 | trait ExtractData { def getRawData: Either[ExecutionResult, String] }
23 |
--------------------------------------------------------------------------------
/daflow-core/src/main/scala/com/abhioncbr/daflow/core/extractData/ExtractDataFromDB.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 | */ 17 | 18 | package com.abhioncbr.daflow.core.extractData 19 | 20 | import com.abhioncbr.daflow.commons.Context 21 | import com.abhioncbr.daflow.commons.ContextConstantEnum._ 22 | import com.abhioncbr.daflow.commons.ExecutionResult 23 | import com.abhioncbr.daflow.commons.NotificationMessages.{exceptionMessage => EM} 24 | import com.abhioncbr.daflow.commons.conf.common.GeneralParamConf 25 | import com.abhioncbr.daflow.commons.conf.common.QueryConf 26 | import com.abhioncbr.daflow.commons.conf.extract.ExtractFeedConf 27 | import com.abhioncbr.daflow.commons.util.FileUtil 28 | import com.typesafe.scalalogging.Logger 29 | import java.io.BufferedReader 30 | import java.io.InputStreamReader 31 | import java.util.Properties 32 | import org.apache.hadoop.conf.Configuration 33 | import org.apache.hadoop.fs.FileSystem 34 | import org.apache.hadoop.fs.Path 35 | import org.apache.spark.sql.DataFrame 36 | import org.apache.spark.sql.SQLContext 37 | 38 | class ExtractDataFromDB(feed: ExtractFeedConf) extends AbstractExtractData { 39 | private val logger = Logger(this.getClass) 40 | val query: Option[QueryConf] = feed.query 41 | 42 | def getRawData: Either[ExecutionResult, String] = { 43 | try { 44 | lazy val fs = FileSystem.get(new Configuration()) 45 | 46 | // reading database properties from property file. 47 | val propertyFilePath = 48 | FileUtil.getFilePathString(query.get.queryFile.configurationFile.get) 49 | logger.info( 50 | s"[ExtractDataFromDB]-[getRawData]: DB property file path: $propertyFilePath" 51 | ) 52 | 53 | val connectionProps = new Properties() 54 | connectionProps.load(fs.open(new Path(propertyFilePath))) 55 | val dbUri = connectionProps.getProperty("dburi") 56 | 57 | // reading query from the query file. 58 | val sqlQueryFile = 59 | FileUtil.getFilePathString(query.get.queryFile.queryFile.get) 60 | val tableQueryReader = new BufferedReader( 61 | new InputStreamReader(fs.open(new Path(sqlQueryFile))) 62 | ) 63 | val rawQuery = Stream 64 | .continually(tableQueryReader.readLine()) 65 | .takeWhile(_ != null) 66 | .toArray[String] 67 | .mkString 68 | .stripMargin 69 | 70 | val sqlQueryParams: Array[GeneralParamConf] = query.get.queryArgs.get 71 | val queryParams = ExtractUtil.getParamsValue(sqlQueryParams.toList) 72 | 73 | logger.info( 74 | "[ExtractDataFromDB]-[getRawData]: Query param values: " + queryParams 75 | .mkString(" , ") 76 | ) 77 | val tableQuery = String.format(rawQuery, queryParams: _*) 78 | logger.info( 79 | s"[ExtractDataFromDB]-[getRawData]: Going to execute jdbc query: \\n $tableQuery" 80 | ) 81 | 82 | val sqlContext = Context.getContextualObject[SQLContext](SQL_CONTEXT) 83 | val dataFrame: DataFrame = sqlContext.read.jdbc( 84 | url = dbUri, 85 | table = tableQuery, 86 | properties = connectionProps 87 | ) 88 | Left(ExecutionResult(feed.extractFeedName, dataFrame)) 89 | } catch { 90 | case exception: Exception => 91 | logger.error("[ExtractDataFromDB]-[getRawData]: ", exception) 92 | Right(s"[ExtractDataFromDB]-[getRawData]: ${EM(exception)}".stripMargin) 93 | } 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /daflow-core/src/main/scala/com/abhioncbr/daflow/core/extractData/ExtractDataFromFileSystem.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
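--------------------------------------------------------------------------------
Editor's sketch: ExtractDataFromDB above expects two external files -- a Java properties file that must define at least "dburi" (remaining keys are handed to the JDBC driver as-is), and a SQL file used as a java.lang.String.format template whose %s placeholders are filled positionally from the ordered query params. A small, self-contained sketch of that substitution step; the table and parameter values are illustrative.

object JdbcQueryTemplateSketch {
  def main(args: Array[String]): Unit = {
    // What a query file might hold; Spark's read.jdbc treats the "table"
    // argument as a relation, so subqueries are conventionally aliased.
    val rawQuery = "(SELECT * FROM events WHERE dt = '%s' AND site = '%s') AS t"
    val queryParams: Array[Object] = Array("2019-01-01", "example.com")
    val tableQuery = String.format(rawQuery, queryParams: _*)
    println(tableQuery)
    // (SELECT * FROM events WHERE dt = '2019-01-01' AND site = 'example.com') AS t
  }
}
--------------------------------------------------------------------------------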
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.abhioncbr.daflow.core.extractData 19 | 20 | import com.abhioncbr.daflow.commons.Context 21 | import com.abhioncbr.daflow.commons.ContextConstantEnum._ 22 | import com.abhioncbr.daflow.commons.ExecutionResult 23 | import com.abhioncbr.daflow.commons.NotificationMessages.{exceptionMessage => EM} 24 | import com.abhioncbr.daflow.commons.NotificationMessages.{extractNotSupported => ENS} 25 | import com.abhioncbr.daflow.commons.conf.common.DataPath 26 | import com.abhioncbr.daflow.commons.conf.extract.ExtractFeedConf 27 | import com.abhioncbr.daflow.commons.util.FileUtil 28 | import com.typesafe.scalalogging.Logger 29 | import org.apache.spark.sql.SQLContext 30 | 31 | class ExtractDataFromFileSystem(feed: ExtractFeedConf) extends ExtractData { 32 | private val logger = Logger(this.getClass) 33 | val dataPath: Option[DataPath] = feed.dataPath 34 | 35 | def getRawData: Either[ExecutionResult, String] = { 36 | try { 37 | val sqlContext: SQLContext = 38 | Context.getContextualObject[SQLContext](SQL_CONTEXT) 39 | val fileNamePatternString = FileUtil.getFilePathString(dataPath.get) 40 | logger.info( 41 | s"[ExtractDataFromFileSystem]-[getRawData]: path of data extraction: $fileNamePatternString" 42 | ) 43 | 44 | val output: Either[ExecutionResult, String] = 45 | feed.extractionAttributesMap("fileType") match { 46 | case "CSV" => 47 | Left( 48 | ExecutionResult( 49 | feed.extractFeedName, 50 | sqlContext.read.csv(fileNamePatternString) 51 | ) 52 | ) 53 | case "JSON" => 54 | Left( 55 | ExecutionResult( 56 | feed.extractFeedName, 57 | sqlContext.read.json(fileNamePatternString) 58 | ) 59 | ) 60 | case "PARQUET" => 61 | Left( 62 | ExecutionResult( 63 | feed.extractFeedName, 64 | sqlContext.read.parquet(fileNamePatternString) 65 | ) 66 | ) 67 | case _ => 68 | Right( 69 | s"[ExtractDataFromFileSystem]-[getRawData]: ${ENS(feed.extractionAttributesMap("fileType"))}" 70 | ) 71 | } 72 | output 73 | } catch { 74 | case exception: Exception => 75 | logger.error("[ExtractDataFromFileSystem]-[getRawData]: ", exception) 76 | Right( 77 | s"[ExtractDataFromFileSystem]-[getRawData]: ${EM(exception)}".stripMargin 78 | ) 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /daflow-core/src/main/scala/com/abhioncbr/daflow/core/extractData/ExtractDataFromHive.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
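--------------------------------------------------------------------------------
Editor's sketch: the file-system extractor above dispatches purely on the feed's "fileType" attribute (uppercase CSV, JSON or PARQUET; anything else yields a Right error). A usage sketch assuming an already-built DataPath; the enum member name ExtractionType.FILE_SYSTEM is an assumption -- only the XML tag "fileSystem" is visible in this export.

import com.abhioncbr.daflow.commons.ExecutionResult
import com.abhioncbr.daflow.commons.conf.common.DataPath
import com.abhioncbr.daflow.commons.conf.extract.{ExtractFeedConf, ExtractionType}

def readJsonFeed(dataPath: DataPath): Either[ExecutionResult, String] = {
  val feed = ExtractFeedConf(
    extractFeedName = "json_data",
    extractionType = ExtractionType.FILE_SYSTEM, // assumed member name
    extractionAttributesMap = Map("fileType" -> "JSON"),
    dataPath = Some(dataPath),
    query = None,
    validateExtractedData = false)
  new ExtractDataFromFileSystem(feed).getRawData
}
--------------------------------------------------------------------------------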
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */
17 |
18 | package com.abhioncbr.daflow.core.extractData
19 |
20 | import com.abhioncbr.daflow.commons.Context
21 | import com.abhioncbr.daflow.commons.ContextConstantEnum._
22 | import com.abhioncbr.daflow.commons.ExecutionResult
23 | import com.abhioncbr.daflow.commons.NotificationMessages.{exceptionMessage => EM}
24 | import com.abhioncbr.daflow.commons.conf.common.GeneralParamConf
25 | import com.abhioncbr.daflow.commons.conf.common.QueryConf
26 | import com.abhioncbr.daflow.commons.conf.extract.ExtractFeedConf
27 | import com.abhioncbr.daflow.commons.util.FileUtil
28 | import com.typesafe.scalalogging.Logger
29 | import java.io.BufferedReader
30 | import java.io.InputStreamReader
31 | import org.apache.hadoop.conf.Configuration
32 | import org.apache.hadoop.fs.FileSystem
33 | import org.apache.hadoop.fs.Path
34 | import org.apache.spark.sql.DataFrame
35 | import org.apache.spark.sql.SQLContext
36 |
37 | class ExtractDataFromHive(feed: ExtractFeedConf) extends AbstractExtractData {
38 |   private val logger = Logger(this.getClass)
39 |   val query: Option[QueryConf] = feed.query
40 |
41 |   def getRawData: Either[ExecutionResult, String] = {
42 |     try {
43 |       lazy val fs = FileSystem.get(new Configuration())
44 |
45 |       // reading query from the query file.
46 |       val sqlQueryFile: String =
47 |         FileUtil.getFilePathString(query.get.queryFile.queryFile.get)
48 |       val tableQueryReader = new BufferedReader(
49 |         new InputStreamReader(fs.open(new Path(sqlQueryFile)))
50 |       )
51 |       val rawQuery = Stream
52 |         .continually(tableQueryReader.readLine())
53 |         .takeWhile(_ != null)
54 |         .toArray[String]
55 |         .mkString
56 |         .stripMargin
57 |
58 |       val sqlQueryParams: Array[GeneralParamConf] = query.get.queryArgs.get
59 |       val queryParams = ExtractUtil.getParamsValue(sqlQueryParams.toList)
60 |       logger.info(
61 |         "[ExtractDataFromHive]-[getRawData]: Query param values: " + queryParams
62 |           .mkString(" , ")
63 |       )
64 |       val tableQuery = String.format(rawQuery, queryParams: _*)
65 |       logger.info(
66 |         s"[ExtractDataFromHive]-[getRawData]: Going to execute hive query: \\n $tableQuery"
67 |       )
68 |
69 |       val sqlContext = Context.getContextualObject[SQLContext](SQL_CONTEXT)
70 |       val dataFrame: DataFrame = sqlContext.sql(tableQuery)
71 |       Left(ExecutionResult(feed.extractFeedName, dataFrame))
72 |     } catch {
73 |       case exception: Exception =>
74 |         logger.error("[ExtractDataFromHive]-[getRawData]: ", exception)
75 |         Right(
76 |           s"[ExtractDataFromHive]-[getRawData]: ${EM(exception)}".stripMargin
77 |         )
78 |     }
79 |   }
80 | }
81 |
-------------------------------------------------------------------------------- /daflow-core/src/main/scala/com/abhioncbr/daflow/core/extractData/ExtractUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.abhioncbr.daflow.core.extractData 19 | 20 | import com.abhioncbr.daflow.commons.conf.common.GeneralParamConf 21 | import com.abhioncbr.daflow.commons.util.FileUtil 22 | 23 | object ExtractUtil { 24 | def getParamsValue(paramList: List[GeneralParamConf]): Array[Object] = { 25 | paramList 26 | .map( 27 | queryParam => 28 | (queryParam.order, FileUtil.mapFormatArgs(Some(paramList.toArray))) 29 | ) 30 | .sortBy(_._1) 31 | .map(_._2) 32 | .toArray 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /daflow-core/src/main/scala/com/abhioncbr/daflow/core/loadData/LoadData.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.abhioncbr.daflow.core.loadData 19 | 20 | import org.apache.spark.sql.DataFrame 21 | import org.joda.time.DateTime 22 | 23 | trait LoadData{ 24 | def loadTransformedData(dataFrame: DataFrame, date: Option[DateTime]): Either[Boolean, String] 25 | } 26 | 27 | -------------------------------------------------------------------------------- /daflow-core/src/main/scala/com/abhioncbr/daflow/core/loadData/LoadDataIntoFileSystem.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
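--------------------------------------------------------------------------------
Editor's sketch: ExtractUtil.getParamsValue exists to turn the configured query params into String.format arguments ordered by their "order" attribute (FileUtil.mapFormatArgs, which resolves each raw value, lives in daflow-commons and is not shown in this export). A sketch of the intended call with illustrative parameter values, deliberately listed out of order:

import com.abhioncbr.daflow.commons.conf.common.GeneralParamConf

// GeneralParamConf args: (order, paramName, paramValue, paramDefaultValue)
val params = List(
  GeneralParamConf(2, "site", "example.com", ""),
  GeneralParamConf(1, "dt", "2019-01-01", ""))

// Values come back sorted by `order`, ready for String.format.
val orderedArgs: Array[Object] = ExtractUtil.getParamsValue(params)
--------------------------------------------------------------------------------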
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */
17 |
18 | package com.abhioncbr.daflow.core.loadData
19 |
20 | import com.abhioncbr.daflow.commons.Context
21 | import com.abhioncbr.daflow.commons.ContextConstantEnum.JOB_STATIC_PARAM_CONF
22 | import com.abhioncbr.daflow.commons.conf.JobStaticParamConf
23 | import com.abhioncbr.daflow.commons.conf.common.DataPath
24 | import com.abhioncbr.daflow.commons.conf.load.LoadFeedConf
25 | import com.abhioncbr.daflow.commons.util.FileUtil
26 | import com.typesafe.scalalogging.Logger
27 | import org.apache.spark.sql.DataFrame
28 | import org.apache.spark.sql.SaveMode
29 | import org.joda.time.DateTime
30 |
31 | class LoadDataIntoFileSystem(feed: LoadFeedConf) extends LoadData {
32 |   private val logger = Logger(this.getClass)
33 |   private val processFrequency = Context
34 |     .getContextualObject[JobStaticParamConf](JOB_STATIC_PARAM_CONF)
35 |     .processFrequency
36 |
37 |   private val datasetName: String =
38 |     feed.attributesMap.getOrElse("catalogName", "")
39 |   private val feedName = feed.attributesMap.getOrElse("feedName", "")
40 |   private val dataPath: DataPath = feed.dataPath
41 |
42 |   def loadTransformedData(
43 |     dataFrame: DataFrame,
44 |     date: Option[DateTime] = None
45 |   ): Either[Boolean, String] = {
46 |     val path = FileUtil.getFilePathString(dataPath)
47 |
48 |     try {
49 |       logger.info(
50 |         s"Writing $processFrequency dataFrame for dataset: $datasetName, feed $feedName to ($path). " +
51 |           s"Total number of data rows saved: ${dataFrame.count}"
52 |       )
53 |
54 |       val fileType = feed.attributesMap("fileType")
55 |
56 |       val output: Either[Boolean, String] = fileType match {
57 |         case "CSV" =>
58 |           dataFrame.write.mode(SaveMode.Overwrite).csv(path)
59 |           logger.info(s"Data written at ($path) successfully.")
60 |           Left(true)
61 |
62 |         case "JSON" =>
63 |           dataFrame.write.mode(SaveMode.Overwrite).json(path)
64 |           logger.info(s"Data written at ($path) successfully.")
65 |           Left(true)
66 |
67 |         case "PARQUET" =>
68 |           dataFrame.write.mode(SaveMode.Overwrite).parquet(path)
69 |           logger.info(s"Data written at ($path) successfully.")
70 |           Left(true)
71 |
72 |         case _ => Right(s"file type '$fileType' not supported.")
73 |       }
74 |
75 |       output
76 |     } catch {
77 |       case exception: Exception =>
78 |         logger.error("[LoadDataIntoFileSystem]-[loadTransformedData]: ", exception)
79 |         Right(s"[LoadDataIntoFileSystem]-[loadTransformedData]: ${exception.getMessage}")
80 |     }
81 |   }
82 | }
83 |
-------------------------------------------------------------------------------- /daflow-core/src/main/scala/com/abhioncbr/daflow/core/loadData/LoadUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
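--------------------------------------------------------------------------------
Editor's sketch: invoking the file-system loader above. It reads "fileType" (required) plus optional "catalogName"/"feedName" from attributesMap, and every branch writes with SaveMode.Overwrite, so the target path is replaced on each run. LoadFeedConf field names follow ParseLoadFeed later in this export; the enum member LoadType.FILE_SYSTEM is an assumption, and the sketch assumes the job Context is initialised (the class reads JOB_STATIC_PARAM_CONF at construction).

import com.abhioncbr.daflow.commons.conf.common.DataPath
import com.abhioncbr.daflow.commons.conf.load.{LoadFeedConf, LoadType}
import org.apache.spark.sql.DataFrame

def writeJson(dataPath: DataPath, dataFrame: DataFrame): Either[Boolean, String] = {
  val feed = LoadFeedConf(
    loadFeedName = "json_result",
    loadType = LoadType.FILE_SYSTEM, // assumed member name
    attributesMap = Map("fileType" -> "JSON"),
    dataPath = dataPath,
    partitioningData = None)
  new LoadDataIntoFileSystem(feed).loadTransformedData(dataFrame, None)
}
--------------------------------------------------------------------------------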
16 | */ 17 | 18 | package com.abhioncbr.daflow.core.loadData 19 | 20 | import com.abhioncbr.daflow.commons.conf.load.PartitioningDataConf 21 | 22 | object LoadUtil { 23 | def getPartitioningString(data: PartitioningDataConf): String = { 24 | data.partitionColumns.map(col => s"${col.paramName} = '${col.paramValue}'").mkString(" , ") 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /daflow-core/src/main/scala/com/abhioncbr/daflow/core/transformData/Transform.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.abhioncbr.daflow.core.transformData 19 | 20 | case class Transform(transformSteps: List[TransformStep], validateTransformedData: Boolean) 21 | -------------------------------------------------------------------------------- /daflow-core/src/main/scala/com/abhioncbr/daflow/core/transformData/TransformData.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
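--------------------------------------------------------------------------------
Editor's sketch: LoadUtil.getPartitioningString renders the configured partition columns as a comma-separated, Hive-style assignment list. A worked example with illustrative column values (constructor argument names follow ParsePartitioningData later in this export):

import com.abhioncbr.daflow.commons.conf.common.GeneralParamConf
import com.abhioncbr.daflow.commons.conf.load.PartitioningDataConf

val partitionData = PartitioningDataConf(
  coalesce = true, overwrite = true, coalesceCount = 1,
  partitionColumns = List(
    GeneralParamConf(1, "date", "2019-01-01", ""),
    GeneralParamConf(2, "hour", "09", "")))

// prints: date = '2019-01-01' , hour = '09'
println(LoadUtil.getPartitioningString(partitionData))
--------------------------------------------------------------------------------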
16 | */
17 |
18 | package com.abhioncbr.daflow.core.transformData
19 |
20 | import com.abhioncbr.daflow.commons.ExecutionResult
21 | import com.typesafe.scalalogging.Logger
22 |
23 | class TransformData(transform: Transform) {
24 |   private val logger = Logger(this.getClass)
25 |
26 |   // Applies a single transform step: a Left carries the running results into
27 |   // the step's rules; a Right (error) is passed through untouched.
28 |   val applyStep: (Either[Array[ExecutionResult], String], TransformStep) => Either[Array[ExecutionResult], String] = (input, step) => {
29 |     input match {
30 |       case Left(array) =>
31 |         step.addInputData(array.map(res => res.resultDF)) match {
32 |           case None =>
33 |             val stepOutput: List[Either[Array[ExecutionResult], String]] = step.getRules.zipWithIndex.map(rule => {
34 |               logger.info(s"step order: ${step.getOrder}, rule: $rule - checking condition")
35 |               if (rule._1._2.condition(step.getInputData)) {
36 |                 logger.info(s"step order: ${step.getOrder}, rule: $rule - executing")
37 |                 rule._1._2.execute(step.getInputData) match {
38 |                   case Left(outputArray) => Left(outputArray)
39 |                   case Right(s) => Right(s)
40 |                 }
41 |               } else {
42 |                 Right(s"For transformation step order: ${step.getOrder}, rule group:${rule._1._2.getGroup} : condition failed.")
43 |               }
44 |             }).toList
45 |
46 |             val stepErrors = stepOutput.filter(_.isRight)
47 |             if (stepErrors.nonEmpty) { Right(stepErrors.map(_.right.get).mkString(" \n ")) }
48 |             else { Left(stepOutput.flatMap(_.left.get).toArray) }
49 |
50 |           case Some(s) => Right(s)
51 |         }
52 |
53 |       case Right(e) => Right(e)
54 |     }
55 |   }
56 |
57 |   def performTransformation(extractResult: Array[ExecutionResult]): Either[Array[ExecutionResult], String] = {
58 |     val steps = transform.transformSteps
59 |     val stepOutput: Either[Array[ExecutionResult], String] = Left(extractResult)
60 |
61 |     val output: Either[Array[ExecutionResult], String] = steps.foldLeft(stepOutput)((c, n) => applyStep(c, n))
62 |     output
63 |   }
64 | }
65 |
-------------------------------------------------------------------------------- /daflow-core/src/main/scala/com/abhioncbr/daflow/core/transformData/TransformStep.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
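--------------------------------------------------------------------------------
Editor's sketch: performTransformation above is a left fold of the step function over the configured steps -- a Left keeps flowing into the next step, while the first Right (error) is passed through unchanged by every remaining step. A type-reduced, self-contained sketch of that short-circuit shape, with plain Ints standing in for ExecutionResult arrays:

val steps: List[Int => Either[Int, String]] = List(
  i => Left(i + 1),
  i => if (i > 0) Left(i * 2) else Right("bad input"))

val result = steps.foldLeft(Left(1): Either[Int, String]) { (acc, step) =>
  acc match {
    case Left(value)  => step(value)   // keep transforming
    case Right(error) => Right(error)  // first failure wins
  }
}
// result == Left(4): Left(1) -> Left(2) -> Left(4)
--------------------------------------------------------------------------------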
16 | */
17 |
18 | package com.abhioncbr.daflow.core.transformData
19 |
20 | import org.apache.spark.sql.DataFrame
21 |
22 | class TransformStep(order: Int, rules: Map[String, TransformRule]){
23 |   override def toString: String = s" step order: $order, step rules: $rules"
24 |   def getOrder: Int = order
25 |   def getRules: Map[String, TransformRule] = rules
26 |
27 |   val inputData: scala.collection.mutable.Map[String, DataFrame] = scala.collection.mutable.Map[String, DataFrame]()
28 |   def getInputData(i: String): DataFrame = inputData(i)
29 |
30 |   lazy val requiredDF: Array[String] = rules.values.flatMap {
31 |     case merge: MergeRule =>
32 |       val mergeGroup = merge.getMergeGroup
33 |       Array(mergeGroup._1, mergeGroup._2)
34 |     case default: TransformRule => Array(default.getGroup)
35 |   }.toArray
36 |
37 |   def addInputData(dataArray: Array[DataFrame]) : Option[String] = {
38 |     if (dataArray.length == requiredDF.length) {
39 |       inputData.clear
40 |       inputData ++= requiredDF.zip(dataArray).toMap
41 |       None
42 |     } else {
43 |       Some(s"For transformation step ${this.getOrder}: input data frames size(${dataArray.length}) " +
44 |         s"is not equal to required data frames size(${requiredDF.length})")
45 |     }
46 |   }
47 | }
48 |
-------------------------------------------------------------------------------- /daflow-core/src/main/scala/com/abhioncbr/daflow/core/validateData/ValidateData.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.abhioncbr.daflow.core.validateData 19 | 20 | import com.abhioncbr.daflow.commons.ExecutionResult 21 | import org.apache.spark.sql.types.StructType 22 | 23 | trait ValidateData{ 24 | def validateSchema(input: ExecutionResult) : (Boolean, Option[StructType], Option[StructType]) 25 | def validateData(input: ExecutionResult, structType: StructType, first: Any, second: Any): Array[ExecutionResult] 26 | } 27 | -------------------------------------------------------------------------------- /daflow-core/src/main/scala/com/abhioncbr/daflow/core/validateData/ValidateTransformedData.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License.
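--------------------------------------------------------------------------------
Editor's sketch: addInputData above positionally zips the incoming DataFrames with the group names the step's rules require (a merge rule contributes two groups, every other rule one), and refuses a length mismatch instead of silently dropping frames. A minimal sketch of that pairing, with strings standing in for DataFrames:

val requiredDF = Array("groupA", "groupB")   // what the step's rules need
val dataArray  = Array("dfA", "dfB")         // stand-ins for DataFrames

val inputData =
  if (dataArray.length == requiredDF.length) Some(requiredDF.zip(dataArray).toMap)
  else None
// Some(Map(groupA -> dfA, groupB -> dfB))
--------------------------------------------------------------------------------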
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.abhioncbr.daflow.core.validateData 19 | 20 | abstract class ValidateTransformedData extends ValidateData 21 | /* private val logger = Logger(this.getClass) 22 | private val sparkContext: SparkContext = Context.getContextualObject[SparkContext](SPARK_CONTEXT) 23 | private val sqlContext: SQLContext = Context.getContextualObject[SQLContext](SQL_CONTEXT) 24 | 25 | private val tableName = Context.getContextualObject[LoadFeedConf](LOAD_CONF).attributesMap("tableName") 26 | private val databaseName = Context.getContextualObject[LoadFeedConf](LOAD_CONF).attributesMap("databaseName") 27 | val partitionColumns: List[String] = Context.getContextualObject[LoadFeedConf](LOAD_CONF).partitioningData.get. 28 | partitionColumns.map(column => column.paramName) 29 | 30 | def validateSchema(dataFrame: DataFrame): (Boolean, Option[StructType], Option[StructType]) = { 31 | logger.info("Validating data frame schema and hive table schema") 32 | 33 | val dataFrameSchema = dataFrame.schema 34 | 35 | var tableSchema = Context.getContextualObject[(Option[StructType], Option[StructType])](SCHEMA) 36 | if(tableSchema == null) 37 | tableSchema = TransformUtil.tableMetadata(tableName, databaseName, sqlContext, partitionColumns) 38 | 39 | val output = if(tableSchema._1.isDefined) tableSchema._1.get == dataFrameSchema else false 40 | (output, tableSchema._1, Some(dataFrameSchema)) 41 | } 42 | 43 | def validateData(dataFrame: DataFrame, structType: StructType, first: Any, second: Any): 44 | Array[(DataFrame, DataFrame, Any, Any)] ={ 45 | logger.info("Validating data frame row schema and hive table schema") 46 | 47 | //val temp1 = dataFrame.collect 48 | //val temp = temp1.partition(row => compareSchema( row, structType)) 49 | //val validatedRdd = sparkContext.parallelize(temp._1) 50 | val validatedDataFrame = sqlContext.createDataFrame(dataFrame.rdd.filter(_.schema == structType), structType) 51 | 52 | //val nonValidatedRdd = sparkContext.parallelize(temp._2) 53 | val nonValidatedDataFrame = sqlContext.createDataFrame(dataFrame.rdd.filter(_.schema != structType), structType) 54 | 55 | Array((validatedDataFrame,nonValidatedDataFrame, first, second)) 56 | } 57 | 58 | def compareSchema(row: Row, structType: StructType): Boolean = { 59 | try{ row.schema == structType } 60 | catch { case e: Throwable => println(row.mkString); false } 61 | } */ 62 | -------------------------------------------------------------------------------- /daflow-examples/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sparsecode/DaFlow/068ec7582ef58cbc6c782d2bc19a43aca2dead81/daflow-examples/README.md -------------------------------------------------------------------------------- /daflow-examples/daflow-xml-templates/extract_jdbc_import.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | {sql-query-file-path.sql} 13 | {db-property-file-path} 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | test_string2 24 | 25 | 26 | 27 | 28 | 29 | 
30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | {partition-file-path-initial} 39 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /daflow-examples/daflow-xml-templates/extract_json_import.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | {json-file-path-suffix} 13 | 14 | json_data 15 | json 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | {col1}, {col2}, {col3} 27 | 28 | 29 | 30 | 31 | 32 | {col1}, {col2}, {col3} 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | {partition-file-path-initial} 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /daflow-examples/daflow-xml-templates/multiple_group_name.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | {json-file-path-suffix} 18 | 19 | json_data 20 | json 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | {col1} is not null 31 | 32 | 33 | 34 | {cond2} 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | records.value 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | {path} 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /daflow-examples/daflow-xml-templates/multiple_transform_rule.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | {json-file-path-suffix} 18 | 19 | 20 | 1 21 | group_%s 22 | true 23 | 24 | 25 | 26 | 27 | 28 | 29 | feed_%s 30 | true 31 | 32 | 33 | 34 | 35 | 36 | json_data 37 | json 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | {col1} like 'my%' 49 | 50 | 51 | 52 | 53 | 54 | {col2}, {col3}, {col4}, {col5} 55 | 56 | 57 | 58 | 59 | 60 | {col2}, {col3}, {col4}, {col5} 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | {path} 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /daflow-examples/demo/daflow-job-xml/json_etl_example.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | daflow-examples/demo/sample-data 11 | 12 | json_data 13 | json 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | test_string1,test_string2 24 | 25 | 26 | 27 | 28 | test_string2 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | daflow-examples/demo/sample-data/daflow-result/ 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /daflow-examples/demo/sample-data/json_data.json: -------------------------------------------------------------------------------- 1 | {"test_string1":"1","test_string2":"2"} 2 | {"test_string2":"22","test_string3":"3"} -------------------------------------------------------------------------------- /daflow-examples/scripts/execute_etl_feed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | spark-submit \ 5 | --class com.abhioncbr.daflow.core.LaunchDaFlowSparkJobExecution \ 6 | daflow-examples/demo/artifacts/daflow-core-0.1-SNAPSHOT.jar \ 7 | -j example -c daflow-examples/demo/daflow-job-xml/json_etl_example.xml -------------------------------------------------------------------------------- /daflow-examples/scripts/execute_etl_feed_airflow.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | {{ params.conf['spark_home'] }}/spark-submit --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ 3 | --conf spark.sql.hive.convertMetastoreParquet=false \ 4 | --conf spark.yarn.executor.memoryOverhead={{ params.conf['memoryOverhead'] }} \ 5 | --conf spark.memory.useLegacyMode=true \ 6 | --conf spark.shuffle.memoryFraction=0.5 \ 7 | --conf spark.storage.memoryFraction=0.5 \ 8 | --conf spark.io.compression.codec=org.apache.spark.io.SnappyCompressionCodec \ 9 | --master yarn-client --queue {{ params.conf['queue'] }} \ 10 | --num-executors {{ params.conf['num_executors'] }} --driver-memory {{ params.conf['driver_memory'] }} --executor-cores {{ params.conf['executor_cores'] }} \ 11 | --executor-memory {{ params.conf['executor_memory'] }} \ 12 | --class {{ params.conf['entry_class'] }} \ 13 | {{params.conf['app'] }} \ 14 | --date "{{ execution_date }}" {{params.arg_string }} -------------------------------------------------------------------------------- /daflow-job-conf/daflow-job-conf-xml/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sparsecode/DaFlow/068ec7582ef58cbc6c782d2bc19a43aca2dead81/daflow-job-conf/daflow-job-conf-xml/README.md -------------------------------------------------------------------------------- /daflow-job-conf/daflow-job-conf-xml/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | com.abhioncbr.daflow 7 | daflow 8 | ${revision} 9 | ../../pom.xml 10 | 11 | 4.0.0 12 | 13 | jar 14 | daflow-job-conf-xml 15 | daflow-job-conf-xml 16 | ${daflow.job.conf.xml.version} 17 | 18 | 19 | 20 | Apache License, Version 2.0 21 | http://www.apache.org/licenses/LICENSE-2.0.txt 22 | repo 23 | A business-friendly OSS license 24 | 25 | 26 | 27 | 28 | 29 | scala-tools.org 30 | Scala-Tools Maven2 Repository 31 | http://scala-tools.org/repo-releases 32 | 33 | 34 | 35 | 36 | 37 | scala-tools.org 38 | Scala-Tools Maven2 Repository 39 | http://scala-tools.org/repo-releases 40 | 41 | 42 | 43 | 44 | ${project.parent.basedir} 45 | 46 | 47 | 48 | 49 | com.abhioncbr.daflow 50 | daflow-commons 51 | ${daflow.common.version} 52 | test-jar 53 | test 54 | 55 | 56 | 57 | com.abhioncbr.daflow 58 | daflow-commons 59 | ${daflow.common.version} 60 | 61 | 62 | 63 | 64 | 65 | 66 | org.scalatest 67 | scalatest-maven-plugin 68 | 69 | 70 | ${main.basedir} 71 | 72 | 73 | 74 | 75 | test 76 | 77 | test 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | org.scala-tools 90 | maven-scala-plugin 91 | 92 | ${scala.version} 93 | 94 | 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /daflow-job-conf/daflow-job-conf-xml/src/main/scala/com/abhioncbr/daflow/job/conf/xml/AttributeTags.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.abhioncbr.daflow.job.conf.xml 19 | 20 | object AttributeTags { 21 | val NAME: String = "@name" 22 | val TYPE: String = "@type" 23 | val VALUE: String = "@value" 24 | val ORDER: String = "@order" 25 | val JOB_NAME: String = "@jobName" 26 | val FREQUENCY: String = "@frequency" 27 | val FEED_NAME: String = "@feedName" 28 | val SOURCE_NAME: String = "@sourceName" 29 | val TARGET_NAME: String = "@targetName" 30 | val DEFAULT_VALUE: String = "@defaultValue" 31 | val PUBLISH_STATS: String = "@publishStats" 32 | val COALESCE_PARTITION: String = "@coalescePartition" 33 | val OVERWRITE_PARTITION: String = "@overwritePartition" 34 | val VALIDATE_EXTRACTED_DATA: String = "@validateExtractedData" 35 | val COALESCE_PARTITION_COUNT: String = "@coalescePartitionCount" 36 | val VALIDATE_TRANSFORMED_DATA: String = "@validateTransformedData" 37 | } 38 | -------------------------------------------------------------------------------- /daflow-job-conf/daflow-job-conf-xml/src/main/scala/com/abhioncbr/daflow/job/conf/xml/NodeTags.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.abhioncbr.daflow.job.conf.xml 19 | 20 | object NodeTags { 21 | // Four root node tags 22 | val LOAD: String = "load" 23 | val EXTRACT: String = "extract" 24 | val TRANSFORM: String = "transform" 25 | val JOB_STATIC_PARAM: String = "jobStaticParam" 26 | 27 | // General node tags 28 | val FEED: String = "feed" 29 | val DATA_PATH: String = "dataPath" 30 | 31 | // Extract node tags 32 | val JDBC: String = "jdbc" 33 | val QUERY: String = "query" 34 | val FILE_SYSTEM: String = "fileSystem" 35 | val QUERY_PARAMS: String = "queryParams" 36 | val SQL_QUERY_FILE: String = "sqlQueryFile" 37 | val CONFIGURATION_FILE: String = "configurationFile" 38 | 39 | // Transform node tags 40 | val RULE: String = "rule" 41 | val STEP: String = "step" 42 | val GROUP: String = "group" 43 | val CONDITION: String = "condition" 44 | 45 | // Load node tags 46 | val HIVE: String = "hive" 47 | val COLUMN: String = "column" 48 | val PARTITION_DATA: String = "partitionData" 49 | val PARTITION_COLUMNS: String = "partitionColumns" 50 | 51 | // Other node tags 52 | val PARAM: String = "param" 53 | val OTHER_PARAMS: String = "otherParams" 54 | val FIELD_MAPPING: String = "fieldMapping" 55 | 56 | // Data path node tags 57 | val PATH: String = "path" 58 | val ORDER: String = "order" 59 | val MEMBER: String = "member" 60 | val PREFIX: String = "prefix" 61 | val SUFFIX: String = "suffix" 62 | val FILE_NAME: String = "fileName" 63 | val SEPARATOR: String = "separator" 64 | val PATH_PATTERN: String = "pathPattern" 65 | val FEED_PATTERN: String = "feedPattern" 66 | val INITIAL_PATH: String = "initialPath" 67 | val GROUP_PATTERN: String = "groupPattern" 68 | val FORMAT_FEED_NAME: String = "formatFeedName" 69 | val FEED_NAME_PATTERN: String = "feedNamePattern" 70 | val FORMAT_ARG_VALUES: String = "formatArgValues" 71 | val FORMAT_GROUP_NAME: String = "formatGroupName" 72 | val GROUP_NAME_PATTERN: String = "groupNamePattern" 73 | } 74 | -------------------------------------------------------------------------------- /daflow-job-conf/daflow-job-conf-xml/src/main/scala/com/abhioncbr/daflow/job/conf/xml/ParseDaFlowJobXml.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.abhioncbr.daflow.job.conf.xml 19 | 20 | import com.abhioncbr.daflow.commons.Context 21 | import com.abhioncbr.daflow.commons.ContextConstantEnum.HADOOP_CONF 22 | import com.abhioncbr.daflow.commons.NotificationMessages.{exceptionMessage => EM} 23 | import com.abhioncbr.daflow.commons.NotificationMessages.{unknownXMLEntity => UE} 24 | import com.abhioncbr.daflow.commons.NotificationMessages.{jobXmlFileDoesNotExist => JXF} 25 | import com.abhioncbr.daflow.commons.NotificationMessages.{exceptionWhileParsing => EWP} 26 | import com.abhioncbr.daflow.commons.conf.DaFlowJobConf 27 | import com.abhioncbr.daflow.commons.conf.JobStaticParamConf 28 | import com.abhioncbr.daflow.commons.conf.extract.ExtractConf 29 | import com.abhioncbr.daflow.commons.conf.load.LoadConf 30 | import com.abhioncbr.daflow.commons.conf.transform.TransformConf 31 | import com.abhioncbr.daflow.job.conf.xml.NodeTags._ 32 | import java.io._ 33 | import javax.xml.XMLConstants 34 | import javax.xml.transform.stream.StreamSource 35 | import javax.xml.validation.SchemaFactory 36 | import org.apache.hadoop.conf.Configuration 37 | import org.apache.hadoop.fs.FileSystem 38 | import org.apache.hadoop.fs.Path 39 | import scala.util.Try 40 | import scala.xml.Node 41 | 42 | object DaFlowJob{ 43 | def fromXML(node: scala.xml.NodeSeq): DaFlowJobConf = { 44 | DaFlowJobConf(ParseJobStaticParam.fromXML(node \ JOB_STATIC_PARAM), 45 | ParseExtract.fromXML(node \ EXTRACT), 46 | ParseTransform.fromXML(node \ TRANSFORM), 47 | ParseLoad.fromXML(node \ LOAD)) 48 | } 49 | } 50 | 51 | class ParseDaFlowJobXml { 52 | def parseXml(path: String, loadFromHDFS: Boolean): Either[String, String] = { 53 | try { 54 | val reader: BufferedReader = if (loadFromHDFS) { 55 | val fs = FileSystem.get(Context.getContextualObject[Configuration](HADOOP_CONF)) 56 | new BufferedReader(new InputStreamReader(fs.open(new Path(path)))) 57 | } else { new BufferedReader(new InputStreamReader(new FileInputStream(path))) } 58 | 59 | val lines = Stream.continually(reader.readLine()).takeWhile(_ != null).toArray[String].mkString 60 | reader.close() 61 | Left(lines) 62 | } catch { 63 | case fileNotFoundException: FileNotFoundException => Right(s"${JXF(path)}. 
${EM(fileNotFoundException)}".stripMargin) 64 | case exception: Exception => Right(s"$EWP ${EM(exception)}".stripMargin) 65 | } 66 | } 67 | 68 | def validateXml(xsdFile: String, xmlFile: String): Boolean = { 69 | Try({ 70 | val factory: SchemaFactory = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI) 71 | val schema = factory.newSchema(new StreamSource(new FileInputStream(xsdFile))) 72 | schema.newValidator().validate(new StreamSource(new FileInputStream(xmlFile))) 73 | true 74 | }).getOrElse(false) 75 | } 76 | 77 | def parseNode(node: scala.xml.Node): Either[(JobStaticParamConf, ExtractConf, TransformConf, LoadConf), String] = { 78 | val trimmedNode: Node = scala.xml.Utility.trim(node) 79 | trimmedNode match { 80 | case {_*} => val daFlowJob = DaFlowJob.fromXML(trimmedNode) 81 | Left((daFlowJob.jobStaticParam, daFlowJob.extract, daFlowJob.transform, daFlowJob.load)) 82 | case _ => Right(UE) 83 | } 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /daflow-job-conf/daflow-job-conf-xml/src/main/scala/com/abhioncbr/daflow/job/conf/xml/ParseExtract.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
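--------------------------------------------------------------------------------
Editor's sketch: the intended end-to-end use of ParseDaFlowJobXml -- validate the job file against the bundled XSD, read it (optionally from HDFS), then parse the root node into the four configuration sections. Both paths ship with this repository; error handling is reduced to sys.error for brevity.

import com.abhioncbr.daflow.job.conf.xml.ParseDaFlowJobXml

object ParseJobSketch {
  def main(args: Array[String]): Unit = {
    val parser = new ParseDaFlowJobXml
    val xsdPath = "daflow-job-conf/daflow-job-conf-xml/daflow-feed-job.xsd"
    val xmlPath = "daflow-examples/demo/daflow-job-xml/json_etl_example.xml"

    if (parser.validateXml(xsdPath, xmlPath)) {
      parser.parseXml(xmlPath, loadFromHDFS = false) match {
        case Left(content) =>
          parser.parseNode(scala.xml.XML.loadString(content)) match {
            case Left((jobStaticParam, extract, transform, load)) =>
              println(s"parsed job: ${jobStaticParam.jobName}")
            case Right(parseError) => sys.error(parseError)
          }
        case Right(readError) => sys.error(readError)
      }
    }
  }
}
--------------------------------------------------------------------------------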
16 | */ 17 | 18 | package com.abhioncbr.daflow.job.conf.xml 19 | 20 | import com.abhioncbr.daflow.commons.conf.common.DataPath 21 | import com.abhioncbr.daflow.commons.conf.common.QueryConf 22 | import com.abhioncbr.daflow.commons.conf.extract.ExtractConf 23 | import com.abhioncbr.daflow.commons.conf.extract.ExtractFeedConf 24 | import com.abhioncbr.daflow.commons.conf.extract.ExtractionType 25 | import com.abhioncbr.daflow.job.conf.xml.AttributeTags._ 26 | import com.abhioncbr.daflow.job.conf.xml.NodeTags._ 27 | 28 | object ParseExtract { 29 | def fromXML(node: scala.xml.NodeSeq): ExtractConf = { 30 | val extract: ExtractConf = 31 | ExtractConf(feeds = Array[ExtractFeedConf]((node \ FEED).toList map { s => ParseExtractFeed.fromXML(s) }: _*)) 32 | extract 33 | } 34 | } 35 | 36 | object ParseExtractFeed { 37 | def fromXML(node: scala.xml.NodeSeq): ExtractFeedConf = { 38 | val feedName: String = (node \ FEED_NAME).text 39 | val validateExtractedData: Boolean = ParseUtil.parseBoolean((node \ VALIDATE_EXTRACTED_DATA).text) 40 | 41 | val extractionType: ExtractionType.valueType = 42 | ExtractionType.getValueType(valueTypeString = (node \ "_").head.label.toUpperCase) 43 | 44 | val attributesMap: Map[String, String] = 45 | (node \ "_").head.attributes.map(meta => (meta.key, meta.value.toString)).toMap 46 | 47 | val query: Option[QueryConf] = ParseUtil.parseNode[QueryConf](node \ JDBC \ QUERY, None, ParseQuery.fromXML) 48 | val dataPath: Option[DataPath] = ParseUtil.parseNode[DataPath](node \ FILE_SYSTEM \ DATA_PATH, None, ParseDataPath.fromXML) 49 | 50 | val feed: ExtractFeedConf = ExtractFeedConf(extractFeedName = feedName, 51 | extractionType = extractionType, extractionAttributesMap = attributesMap, 52 | dataPath = dataPath, query = query, validateExtractedData = validateExtractedData) 53 | feed 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /daflow-job-conf/daflow-job-conf-xml/src/main/scala/com/abhioncbr/daflow/job/conf/xml/ParseFieldMapping.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
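--------------------------------------------------------------------------------
Editor's sketch: the feed shape ParseExtractFeed.fromXML consumes -- feedName and validateExtractedData are attributes on <feed>, the label of the first child element (jdbc or fileSystem) selects the ExtractionType, and that child's own attributes become extractionAttributesMap. The fragment below is an assumption reconstructed from this parser and NodeTags, since the XML templates in this export lost their tags; attribute names on <fileSystem> other than "fileType", and the dataPath nesting, are illustrative.

val feedNode = scala.xml.XML.loadString(
  """<feed feedName="json_data" validateExtractedData="false">
    |  <fileSystem dataSourceName="json_data" fileType="JSON">
    |    <dataPath>
    |      <path><initialPath>daflow-examples/demo/sample-data</initialPath></path>
    |    </dataPath>
    |  </fileSystem>
    |</feed>""".stripMargin)

val feedConf = ParseExtractFeed.fromXML(feedNode)
// feedConf.extractionAttributesMap == Map("dataSourceName" -> "json_data", "fileType" -> "JSON")
--------------------------------------------------------------------------------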
16 | */ 17 | 18 | package com.abhioncbr.daflow.job.conf.xml 19 | 20 | import com.abhioncbr.daflow.commons.conf.common.FieldMappingConf 21 | import com.abhioncbr.daflow.job.conf.xml.AttributeTags._ 22 | import com.abhioncbr.daflow.job.conf.xml.NodeTags._ 23 | 24 | object ParseFieldMappings { 25 | def fromXML(node: scala.xml.NodeSeq): List[FieldMappingConf] = { 26 | List[FieldMappingConf]((node \ FIELD_MAPPING).toList map { s => ParseFieldMapping.fromXML(s) }: _*) 27 | } 28 | } 29 | 30 | object ParseFieldMapping { 31 | def fromXML(node: scala.xml.NodeSeq): FieldMappingConf = { 32 | FieldMappingConf(sourceFieldName = (node \ SOURCE_NAME).text, 33 | targetFieldName = (node \ TARGET_NAME).text) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /daflow-job-conf/daflow-job-conf-xml/src/main/scala/com/abhioncbr/daflow/job/conf/xml/ParseGeneralParams.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.abhioncbr.daflow.job.conf.xml 19 | 20 | import com.abhioncbr.daflow.commons.conf.common.GeneralParamConf 21 | import com.abhioncbr.daflow.job.conf.xml.AttributeTags._ 22 | import com.abhioncbr.daflow.job.conf.xml.NodeTags._ 23 | 24 | object ParseGeneralParams { 25 | def fromXML(node: scala.xml.NodeSeq): Array[GeneralParamConf] = { 26 | Array[GeneralParamConf]((node \ PARAM).toList map { s => ParseGeneralParam.fromXML(s) }: _*) 27 | } 28 | } 29 | 30 | object ParseGeneralParam { 31 | def fromXML(node: scala.xml.NodeSeq): GeneralParamConf = { 32 | val order = ParseUtil.parseInt((node \ AttributeTags.ORDER).text) 33 | val paramName = (node \ NAME).text 34 | val paramValue = (node \ VALUE).text 35 | val paramDefaultValue = (node \ DEFAULT_VALUE).text 36 | GeneralParamConf(order, paramName, paramValue, paramDefaultValue) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /daflow-job-conf/daflow-job-conf-xml/src/main/scala/com/abhioncbr/daflow/job/conf/xml/ParseJobStaticParam.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
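--------------------------------------------------------------------------------
Editor's sketch: ParseGeneralParam.fromXML above reads everything from attributes whose names live in AttributeTags ("@order", "@name", "@value", "@defaultValue"). A round-trip example:

val paramNode = scala.xml.XML.loadString(
  """<param order="1" name="dt" value="2019-01-01" defaultValue=""/>""")

val paramConf = ParseGeneralParam.fromXML(paramNode)
// paramConf == GeneralParamConf(1, "dt", "2019-01-01", "")
--------------------------------------------------------------------------------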
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.abhioncbr.daflow.job.conf.xml 19 | 20 | import com.abhioncbr.daflow.commons.ProcessFrequencyEnum 21 | import com.abhioncbr.daflow.commons.conf.JobStaticParamConf 22 | import com.abhioncbr.daflow.commons.conf.common.GeneralParamConf 23 | import com.abhioncbr.daflow.job.conf.xml.AttributeTags._ 24 | import com.abhioncbr.daflow.job.conf.xml.NodeTags._ 25 | 26 | object ParseJobStaticParam { 27 | def fromXML(node: scala.xml.NodeSeq): JobStaticParamConf = { 28 | JobStaticParamConf(processFrequency = ProcessFrequencyEnum.getProcessFrequencyEnum((node \ FREQUENCY).text), 29 | jobName = (node \ JOB_NAME).text, 30 | publishStats = ParseUtil.parseBoolean((node \ PUBLISH_STATS).text), 31 | otherParams = ParseUtil.parseNode[Array[GeneralParamConf]](node \ OTHER_PARAMS, None, ParseGeneralParams.fromXML) 32 | ) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /daflow-job-conf/daflow-job-conf-xml/src/main/scala/com/abhioncbr/daflow/job/conf/xml/ParseLoad.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.abhioncbr.daflow.job.conf.xml 19 | 20 | import com.abhioncbr.daflow.commons.conf.common.DataPath 21 | import com.abhioncbr.daflow.commons.conf.load.LoadConf 22 | import com.abhioncbr.daflow.commons.conf.load.LoadFeedConf 23 | import com.abhioncbr.daflow.commons.conf.load.LoadType 24 | import com.abhioncbr.daflow.commons.conf.load.PartitioningDataConf 25 | import com.abhioncbr.daflow.job.conf.xml.AttributeTags._ 26 | import com.abhioncbr.daflow.job.conf.xml.NodeTags._ 27 | 28 | object ParseLoad { 29 | def fromXML(node: scala.xml.NodeSeq): LoadConf = { 30 | val load: LoadConf = LoadConf(feeds = 31 | Array[LoadFeedConf]((node \ FEED).toList map { s => ParseLoadFeed.fromXML(s) }: _*)) 32 | load 33 | } 34 | } 35 | 36 | object ParseLoadFeed { 37 | def fromXML(node: scala.xml.NodeSeq): LoadFeedConf = { 38 | val loadFeedName: String = (node \ NAME).text 39 | val loadType: LoadType.valueType = 40 | LoadType.getValueType(valueTypeString = (node \ "_").head.label.toUpperCase) 41 | 42 | val attributesMap: Map[String, String] = (node \ "_").head.attributes.map(meta => (meta.key, meta.value.toString)).toMap 43 | val dataPath: DataPath = ParseUtil.parseNode[DataPath](node \ "_" \ DATA_PATH, None, ParseDataPath.fromXML).orNull 44 | val partitioningData: Option[PartitioningDataConf] = 45 | ParseUtil.parseNode[PartitioningDataConf](node \ HIVE \ PARTITION_DATA, None, ParsePartitioningData.fromXML) 46 | 47 | val feed: LoadFeedConf = LoadFeedConf(loadFeedName = loadFeedName, 48 | loadType = loadType, attributesMap = attributesMap, dataPath = dataPath, partitioningData = partitioningData) 49 | 50 | feed 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /daflow-job-conf/daflow-job-conf-xml/src/main/scala/com/abhioncbr/daflow/job/conf/xml/ParsePartitioningData.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.abhioncbr.daflow.job.conf.xml 19 | 20 | import com.abhioncbr.daflow.commons.conf.common.GeneralParamConf 21 | import com.abhioncbr.daflow.commons.conf.load.PartitioningDataConf 22 | import com.abhioncbr.daflow.job.conf.xml.AttributeTags._ 23 | import com.abhioncbr.daflow.job.conf.xml.NodeTags._ 24 | 25 | object ParsePartitioningData { 26 | def fromXML(node: scala.xml.NodeSeq): PartitioningDataConf = { 27 | val coalesce = ParseUtil.parseBoolean((node \ COALESCE_PARTITION).text) 28 | val overwrite = ParseUtil.parseBoolean((node \ OVERWRITE_PARTITION).text) 29 | val coalesceCount = ParseUtil.parseInt((node \ COALESCE_PARTITION_COUNT).text) 30 | val partitionColumns = List[GeneralParamConf]((node \ PARTITION_COLUMNS \ COLUMN). 
31 | toList map { s => ParseGeneralParam.fromXML(s) }: _*) 32 | 33 | PartitioningDataConf(coalesce = coalesce, overwrite = overwrite, 34 | coalesceCount = coalesceCount, partitionColumns = partitionColumns) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /daflow-job-conf/daflow-job-conf-xml/src/main/scala/com/abhioncbr/daflow/job/conf/xml/ParseQuery.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.abhioncbr.daflow.job.conf.xml 19 | 20 | import com.abhioncbr.daflow.commons.conf.common.DataPath 21 | import com.abhioncbr.daflow.commons.conf.common.GeneralParamConf 22 | import com.abhioncbr.daflow.commons.conf.common.QueryConf 23 | import com.abhioncbr.daflow.commons.conf.common.QueryFilesConf 24 | import com.abhioncbr.daflow.job.conf.xml.NodeTags._ 25 | 26 | object ParseQuery { 27 | def fromXML(node: scala.xml.NodeSeq): QueryConf = { 28 | val configurationFile: Option[DataPath] = 29 | ParseUtil.parseNode[DataPath](node \ CONFIGURATION_FILE, None, ParseDataPath.fromXML) 30 | 31 | val queryFile: Option[DataPath] = ParseUtil.parseNode[DataPath](node \ SQL_QUERY_FILE, None, ParseDataPath.fromXML) 32 | 33 | val queryArgs: Option[Array[GeneralParamConf]] = 34 | ParseUtil.parseNode[Array[GeneralParamConf]](node \ QUERY_PARAMS, None, ParseGeneralParams.fromXML) 35 | 36 | val queryFiles: QueryFilesConf = QueryFilesConf(configurationFile = configurationFile, queryFile = queryFile) 37 | val query: QueryConf = QueryConf(queryFile = queryFiles, queryArgs = queryArgs) 38 | query 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /daflow-job-conf/daflow-job-conf-xml/src/main/scala/com/abhioncbr/daflow/job/conf/xml/ParseTransform.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
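[usage sketch] ParseQuery above assembles a QueryConf from optional configuration-file, query-file and query-parameter nodes; each piece falls back to None when its node is absent. A hedged sketch — node tag spellings are assumptions:
    val q: QueryConf = ParseQuery.fromXML(scala.xml.XML.loadString(
      """<query><sqlQueryFile path="/tmp/q.sql"/></query>"""))
    // q.queryFile.queryFile is Some(DataPath(...)) only when ParseDataPath recognizes the node;
    // q.queryArgs stays None because no query-params node is present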
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package com.abhioncbr.daflow.job.conf.xml 18 | 19 | import com.abhioncbr.daflow.commons.conf.transform 20 | import com.abhioncbr.daflow.commons.conf.transform.TransformConf 21 | import com.abhioncbr.daflow.commons.conf.transform.TransformStepConf 22 | import com.abhioncbr.daflow.job.conf.xml.AttributeTags._ 23 | import com.abhioncbr.daflow.job.conf.xml.NodeTags._ 24 | 25 | object ParseTransform { 26 | def fromXML(node: scala.xml.NodeSeq): TransformConf = { 27 | val steps: List[TransformStepConf] = 28 | List[TransformStepConf]((node \ STEP).toList map { s => ParseTransformStep.fromXML(s) }: _*) 29 | 30 | transform.TransformConf(transformSteps = steps, 31 | validateTransformedData = ParseUtil.parseBoolean((node \ VALIDATE_TRANSFORMED_DATA).text)) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /daflow-job-conf/daflow-job-conf-xml/src/main/scala/com/abhioncbr/daflow/job/conf/xml/ParseTransformRule.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
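[usage sketch] ParseTransform above collects every step node into an ordered List[TransformStepConf] and reads the validate flag; a missing flag becomes false via ParseUtil.parseBoolean. A hedged sketch — STEP/RULE tag and order/group attribute names are assumptions:
    val t: TransformConf = ParseTransform.fromXML(scala.xml.XML.loadString(
      """<transform><step order="1"><rule type="custom" group="g1"><condition>col1 != ''</condition></rule></step></transform>"""))
    // t.transformSteps.head.order == 1; t.validateTransformedData == false (flag absent)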
16 | */ 17 | 18 | package com.abhioncbr.daflow.job.conf.xml 19 | 20 | import com.abhioncbr.daflow.commons.conf.transform 21 | import com.abhioncbr.daflow.commons.conf.transform.TransformRuleConf 22 | import com.abhioncbr.daflow.commons.conf.transform.TransformStepConf 23 | import com.abhioncbr.daflow.job.conf.xml.AttributeTags._ 24 | import com.abhioncbr.daflow.job.conf.xml.NodeTags._ 25 | 26 | object ParseTransformStep { 27 | def fromXML(node: scala.xml.NodeSeq): TransformStepConf = { 28 | val order = ParseUtil.parseInt((node \ AttributeTags.ORDER).text) 29 | 30 | val rules: Map[String, TransformRuleConf] = List[TransformRuleConf]((node \ RULE).toList map { 31 | s => ParseTransformRule.fromXML(s) 32 | }: _*).map(rule => (rule.ruleAttributesMap(GROUP), rule)).toMap 33 | 34 | transform.TransformStepConf(order = order, rules = rules) 35 | } 36 | } 37 | 38 | object ParseTransformRule { 39 | def fromXML(node: scala.xml.NodeSeq): TransformRuleConf = { 40 | val ruleType = (node \ TYPE).text 41 | val condition = (node \ CONDITION).text 42 | val ruleAttributesMap: Map[String, String] = node.head.attributes.map(meta => (meta.key, meta.value.toString)).toMap 43 | TransformRuleConf(ruleType = ruleType, condition = condition, ruleAttributesMap = ruleAttributesMap) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /daflow-job-conf/daflow-job-conf-xml/src/main/scala/com/abhioncbr/daflow/job/conf/xml/ParseUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
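[behavior note] ParseTransformStep above keys each rule by rule.ruleAttributesMap(GROUP), so two rules in one step that share a group attribute silently collapse into a single map entry — the last one wins, which is plain toMap semantics:
    // List(("g", "r1"), ("g", "r2")).toMap == Map("g" -> "r2")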
16 | */ 17 | 18 | package com.abhioncbr.daflow.job.conf.xml 19 | 20 | import com.abhioncbr.daflow.commons.conf.common.DataPath 21 | import com.abhioncbr.daflow.commons.util.FileUtil 22 | import scala.util.Try 23 | 24 | object ParseUtil { 25 | def parseNode[T](node: scala.xml.NodeSeq, defaultValue: Option[T], fun: scala.xml.NodeSeq => T): Option[T] 26 | = if (node.nonEmpty){ Some(fun(node)) } else { defaultValue } 27 | 28 | def parseNodeText(node: scala.xml.NodeSeq): String = node.text 29 | 30 | def parseBoolean(node: scala.xml.NodeSeq): Boolean = parseBoolean(node.text) 31 | def parseBoolean(text: String): Boolean = Try(text.toBoolean).getOrElse(false) 32 | 33 | def parseInt(node: scala.xml.NodeSeq): Int = parseInt(node.text) 34 | def parseInt(text: String): Int = Try(text.toInt).getOrElse(-1) 35 | 36 | def parseFilePathString(node: scala.xml.NodeSeq): Either[DataPath, String] = parseFilePathString(node.text) 37 | def parseFilePathString(text: String, fileNameSeparator: String = "."): Either[DataPath, String] 38 | = FileUtil.getFilePathObject(text, fileNameSeparator) 39 | } 40 | -------------------------------------------------------------------------------- /daflow-job-conf/daflow-job-conf-xml/src/test/scala/com/abhioncbr/daflow/job/conf/xml/ParseDaFlowJobXmlSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
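[usage sketch] parseNode above is the core combinator of this package: it applies `fun` only when the selected NodeSeq is non-empty, otherwise it returns the supplied default. For example:
    val n = scala.xml.XML.loadString("<a><b>1</b></a>")
    ParseUtil.parseNode[Int](n \ "b", None, ns => ns.text.toInt)    // Some(1)
    ParseUtil.parseNode[Int](n \ "c", Some(0), ns => ns.text.toInt) // Some(0): node absent, default returned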
16 | */ 17 | 18 | package com.abhioncbr.daflow.job.conf.xml 19 | 20 | class ParseDaFlowJobXmlSpec extends XmlJobConfBase{ 21 | "validateXml" should "return true when valid xml file is provided as input" in { 22 | val xsdFile = xsdFilePath 23 | val xmlFile = s"$daflowExampleDemoJobXmlPath/json_etl_example.xml" 24 | val parse: ParseDaFlowJobXml = new ParseDaFlowJobXml 25 | val output: Boolean = parse.validateXml(xsdFile, xmlFile) 26 | 27 | output should not equal None 28 | output should be (true) 29 | } 30 | 31 | "validateXml" should "return true when valid xml jdbc_template file is provided as input" in { 32 | val xsdFile = xsdFilePath 33 | val xmlFile = s"$daflowExampleJobXmlTemplatePath/extract_jdbc_import.xml" 34 | val parse: ParseDaFlowJobXml = new ParseDaFlowJobXml 35 | val output: Boolean = parse.validateXml(xsdFile, xmlFile) 36 | 37 | output should not equal None 38 | output should be (true) 39 | } 40 | 41 | "validateXml" should "return true when valid xml json_template file is provided as input" in { 42 | val xsdFile = xsdFilePath 43 | val xmlFile = s"$daflowExampleJobXmlTemplatePath/extract_json_import.xml" 44 | val parse: ParseDaFlowJobXml = new ParseDaFlowJobXml 45 | val output: Boolean = parse.validateXml(xsdFile, xmlFile) 46 | 47 | output should not equal None 48 | output should be (true) 49 | } 50 | 51 | "validateXml" should "return true when valid xml multiple_group_template file is provided as input" in { 52 | val xsdFile = xsdFilePath 53 | val xmlFile = s"$daflowExampleJobXmlTemplatePath/multiple_group_name.xml" 54 | val parse: ParseDaFlowJobXml = new ParseDaFlowJobXml 55 | val output: Boolean = parse.validateXml(xsdFile, xmlFile) 56 | 57 | output should not equal None 58 | output should be (true) 59 | } 60 | 61 | "validateXml" should "return true when valid xml multiple_transform_template file is provided as input" in { 62 | val xsdFile = xsdFilePath 63 | val xmlFile = s"$daflowExampleJobXmlTemplatePath/multiple_transform_rule.xml" 64 | val parse: ParseDaFlowJobXml = new ParseDaFlowJobXml 65 | val output: Boolean = parse.validateXml(xsdFile, xmlFile) 66 | 67 | output should not equal None 68 | output should be (true) 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /daflow-job-conf/daflow-job-conf-xml/src/test/scala/com/abhioncbr/daflow/job/conf/xml/ParseFieldMappingSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
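[usage sketch] The same validateXml entry point exercised by the spec above can guard parsing outside the test suite; a minimal sketch reusing the signature shown there (file paths illustrative):
    val parser = new ParseDaFlowJobXml
    val ok: Boolean = parser.validateXml("daflow-feed-job.xsd", "daflow-job.xml")
    // proceed to parse the job XML only when ok is true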
16 | */ 17 | 18 | package com.abhioncbr.daflow.job.conf.xml 19 | 20 | import com.abhioncbr.daflow.commons.conf.common.FieldMappingConf 21 | 22 | class ParseFieldMappingSpec extends XmlJobConfBase { 23 | 24 | "ParseFieldMapping-fromXML" should "return FieldMapping object" in { 25 | val xmlContent: String = """<fieldMapping sourceName="source" targetName="target"/>""" // XML literal reconstructed from the assertions below; tag/attribute names are assumed 26 | val fieldMappingObject: FieldMappingConf = ParseFieldMapping.fromXML(node(xmlContent)) 27 | fieldMappingObject should not equal None 28 | fieldMappingObject.sourceFieldName should be ("source") 29 | fieldMappingObject.targetFieldName should be ("target") 30 | } 31 | 32 | "ParseFieldMappings-fromXML" should "return list of FieldMapping objects" in { 33 | val xmlContent: String = 34 | """<fieldMappings> 35 | |<fieldMapping sourceName="source1" targetName="target1"/> 36 | |<fieldMapping sourceName="source2" targetName="target2"/> 37 | |<fieldMapping sourceName="source3" targetName="target3"/> 38 | |</fieldMappings>""".stripMargin 39 | val fieldMappingArrayObject: List[FieldMappingConf] = ParseFieldMappings.fromXML(node(xmlContent)) 40 | fieldMappingArrayObject should not equal None 41 | fieldMappingArrayObject.length should be (3) 42 | fieldMappingArrayObject.head.sourceFieldName should be ("source1") 43 | fieldMappingArrayObject.head.targetFieldName should be ("target1") 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /daflow-job-conf/daflow-job-conf-xml/src/test/scala/com/abhioncbr/daflow/job/conf/xml/ParseJobStaticParamSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | package com.abhioncbr.daflow.job.conf.xml 19 | 20 | import com.abhioncbr.daflow.commons.conf.JobStaticParamConf 21 | 22 | class ParseJobStaticParamSpec extends XmlJobConfBase { 23 | 24 | "ParseJobStaticParam-fromXML" should "return JobStaticParam object" in { 25 | val xmlContent: String = """<jobStaticParam frequency="ONCE" jobName="Job1" publishStats="false"/>""" // XML literal reconstructed from the assertions below; tag/attribute names are assumed 26 | val jobStaticParamObject: JobStaticParamConf = ParseJobStaticParam.fromXML(node(xmlContent)) 27 | jobStaticParamObject should not equal None 28 | jobStaticParamObject.jobName should be ("Job1") 29 | jobStaticParamObject.processFrequency.toString should be ("ONCE") 30 | jobStaticParamObject.publishStats should be (false) 31 | } 32 | 33 | "ParseJobStaticParam-fromXML" should "return JobStaticParam object with otherParams also" in { 34 | val xmlContent: String = 35 | """<jobStaticParam frequency="ONCE" jobName="Job1" publishStats="false"> 36 | |<otherParams> 37 | |<param order="1" name="param1" value="value1"/> <param order="2" name="param2" value="value2"/> 38 | |</otherParams> 39 | |</jobStaticParam>""".stripMargin 40 | val jobStaticParamObject: JobStaticParamConf = ParseJobStaticParam.fromXML(node(xmlContent)) 41 | jobStaticParamObject should not equal None 42 | jobStaticParamObject.jobName should be ("Job1") 43 | jobStaticParamObject.processFrequency.toString should be ("ONCE") 44 | jobStaticParamObject.publishStats should be (false) 45 | jobStaticParamObject.otherParams should not equal None 46 | jobStaticParamObject.otherParams.get should not equal None 47 | jobStaticParamObject.otherParams.get.length should be (2) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /daflow-job-conf/daflow-job-conf-xml/src/test/scala/com/abhioncbr/daflow/job/conf/xml/ParsePartitioningRuleSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | package com.abhioncbr.daflow.job.conf.xml 19 | 20 | import com.abhioncbr.daflow.commons.conf.load.PartitioningDataConf 21 | 22 | class ParsePartitioningRuleSpec extends XmlJobConfBase { 23 | 24 | "ParsePartitioningData" should "return PartitioningData object with all variables having values" in { 25 | val xmlContent = """<partitionData coalesce="true" overwrite="true" coalesceCount="10"> 26 | <partitionColumns> 27 | <column name="Date" value="date"/> 28 | </partitionColumns> 29 | </partitionData>""".stripMargin // XML reconstructed from the assertions below; tag/attribute names are assumed 30 | val partitioningDataObject: PartitioningDataConf = ParsePartitioningData.fromXML(node(xmlContent)) 31 | partitioningDataObject should not equal None 32 | partitioningDataObject.coalesce should be (true) 33 | partitioningDataObject.overwrite should be (true) 34 | partitioningDataObject.coalesceCount should be (10) 35 | partitioningDataObject.partitionColumns should not be None 36 | partitioningDataObject.partitionColumns.length should be (1) 37 | partitioningDataObject.partitionColumns.head.paramName should be ("Date") 38 | partitioningDataObject.partitionColumns.head.paramValue should be ("date") 39 | } 40 | 41 | "ParsePartitioningData" should "return PartitioningData object with only provided variables" in { 42 | val xmlContent = """<partitionData overwrite="true"> 43 | <partitionColumns> 44 | <column name="Date" value="date"/> 45 | </partitionColumns> 46 | </partitionData>""".stripMargin 47 | val partitioningDataObject: PartitioningDataConf = ParsePartitioningData.fromXML(node(xmlContent)) 48 | partitioningDataObject should not equal None 49 | partitioningDataObject.coalesce should be (false) 50 | partitioningDataObject.overwrite should be (true) 51 | partitioningDataObject.coalesceCount should be (-1) 52 | partitioningDataObject.partitionColumns should not be None 53 | partitioningDataObject.partitionColumns.length should be (1) 54 | partitioningDataObject.partitionColumns.head.paramName should be ("Date") 55 | partitioningDataObject.partitionColumns.head.paramValue should be ("date") 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /daflow-job-conf/daflow-job-conf-xml/src/test/scala/com/abhioncbr/daflow/job/conf/xml/ParseTransformSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | package com.abhioncbr.daflow.job.conf.xml 19 | 20 | import com.abhioncbr.daflow.commons.conf.transform.TransformConf 21 | 22 | class ParseTransformSpec extends XmlJobConfBase { 23 | 24 | "ParseTransform" should "return TransformConf object with array of TransformStepsConf" in { 25 | val xmlContent = """<transform> 26 | <step order="23"><rule type="filter" group="group1"><condition>{col1} like 'my%'</condition></rule></step> 27 | </transform> 28 | """ // XML reconstructed around the surviving condition text; tag/attribute names are assumed 29 | 30 | val transformConfObject: TransformConf = ParseTransform.fromXML(node(xmlContent)) 31 | transformConfObject should not equal None 32 | transformConfObject.validateTransformedData should be (false) 33 | transformConfObject.transformSteps.size should be (1) 34 | transformConfObject.transformSteps.head.order should be (23) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /daflow-job-conf/daflow-job-conf-xml/src/test/scala/com/abhioncbr/daflow/job/conf/xml/XmlJobConfBase.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | package com.abhioncbr.daflow.job.conf.xml 19 | 20 | import com.abhioncbr.daflow.commons.CommonSpec 21 | import scala.xml.XML 22 | 23 | class XmlJobConfBase extends CommonSpec{ 24 | 25 | val userDirectory: String = System.getProperty("user.dir") 26 | 27 | val daflowExamplesPath = s"$userDirectory/daflow-examples" 28 | val daflowExamplesDemoPath = s"$daflowExamplesPath/demo" 29 | val daflowExamplesDemoSampleDataPath = s"$daflowExamplesDemoPath/sample-data" 30 | val daflowExampleDemoJobXmlPath = s"$daflowExamplesDemoPath/daflow-job-xml" 31 | val daflowExampleJobXmlTemplatePath = s"$daflowExamplesPath/daflow-xml-templates" 32 | 33 | val xsdFilePath = s"$userDirectory/daflow-job-conf/daflow-job-conf-xml/daflow-feed-job.xsd" 34 | 35 | val node: String => scala.xml.NodeSeq = (xmlContent: String) => { XML.loadString(xmlContent) } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /daflow-job-conf/daflow-job-conf-yaml/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | com.abhioncbr.daflow 7 | daflow 8 | ${revision} 9 | ../../pom.xml 10 | 11 | 4.0.0 12 | 13 | jar 14 | daflow-job-conf-yaml 15 | daflow-job-conf-yaml 16 | ${daflow.job.conf.yaml.version} 17 | 18 | 19 | 20 | Apache License, Version 2.0 21 | http://www.apache.org/licenses/LICENSE-2.0.txt 22 | repo 23 | A business-friendly OSS license 24 | 25 | 26 | 27 | 28 | 29 | scala-tools.org 30 | Scala-Tools Maven2 Repository 31 | http://scala-tools.org/repo-releases 32 | 33 | 34 | 35 | 36 | 37 | scala-tools.org 38 | Scala-Tools Maven2 Repository 39 | http://scala-tools.org/repo-releases 40 | 41 | 42 | 43 | 44 | 0.11.1 45 | 0.10.0 46 | 47 | 48 | 49 | 50 | com.abhioncbr.daflow 51 | daflow-commons 52 | ${daflow.common.version} 53 | 54 | 55 | 56 | io.circe 57 | circe-yaml_2.11 58 | ${circe-yaml.version} 59 | 60 | 61 | 62 | io.circe 63 | circe-generic_2.11 64 | ${circe-generic.version} 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | org.scala-tools 75 | maven-scala-plugin 76 | 77 | ${scala.version} 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /daflow-metrics/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | com.abhioncbr.daflow 7 | daflow 8 | ${revision} 9 | 10 | 4.0.0 11 | 12 | jar 13 | daflow-metrics 14 | daflow-metrics 15 | ${daflow.metrics.version} 16 | 17 | 18 | 19 | Apache License, Version 2.0 20 | http://www.apache.org/licenses/LICENSE-2.0.txt 21 | repo 22 | A business-friendly OSS license 23 | 24 | 25 | 26 | 27 | 28 | scala-tools.org 29 | Scala-Tools Maven2 Repository 30 | http://scala-tools.org/repo-releases 31 | 32 | 33 | 34 | 35 | 36 | scala-tools.org 37 | Scala-Tools Maven2 Repository 38 | http://scala-tools.org/repo-releases 39 | 40 | 41 | 42 | 43 | 0.6.0 44 | 45 | 46 | 47 | 48 | com.abhioncbr.daflow 49 | daflow-commons 50 | ${daflow.core.version} 51 | 52 | 53 | 54 | io.prometheus 55 | simpleclient 56 | ${prometheusVersion} 57 | 58 | 59 | 60 | io.prometheus 61 | simpleclient_servlet 62 | ${prometheusVersion} 63 | 64 | 65 | 66 | io.prometheus 67 | simpleclient_pushgateway 68 | ${prometheusVersion} 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | org.scala-tools 78 | maven-scala-plugin 79 | 80 | ${scala.version} 81 | 82 | 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /daflow-metrics/scripts/daflow-feed-stat.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | JOB_NAME=$1 4 | JOB_SUBTASK_NAME=$2 5 | 6 | STATUS=$3 7 | FREQUENCY=$4 8 | DATE=$5 9 | HOUR=$6 10 | 11 | V_PASSED_COUNT=$7 12 | V_FAILED_COUNT=$8 13 | EXECUTION_TIME=${9} 14 | 15 | T_PASSED_COUNT=${10} 16 | T_FAILED_COUNT=${11} 17 | FAILURE_REASON="${12}" 18 | 19 | echo "updating daflow feed stat table for job=$JOB_NAME, job_subtask=$JOB_SUBTASK_NAME, date=$DATE" 20 | QUERY="INSERT INTO TABLE daflow.daflow_feed_stat PARTITION (job_name = '$JOB_NAME') (job_subtask, status, frequency, 21 | data_date, data_hour, schema_validation_passed_data_count, schema_validation_failed_data_count, feed_execution_time, transformation_passed_data_count, transformation_failed_data_count, failure_reason) VALUES ('$JOB_SUBTASK_NAME', '$STATUS', '$FREQUENCY', '$DATE', '$HOUR', $V_PASSED_COUNT, $V_FAILED_COUNT, $EXECUTION_TIME, $T_PASSED_COUNT, $T_FAILED_COUNT, '$FAILURE_REASON');" 22 | echo "Going to execute query: $QUERY" 23 | 24 | hive -e " 25 | SET mapred.job.queue.name=pipelines; 26 | $QUERY 27 | " 28 | 29 | exit_code=$? 30 | exit ${exit_code} -------------------------------------------------------------------------------- /daflow-metrics/sql/daflow-feed-stat: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE daflow_feed_stat ( 2 | job_subtask string, 3 | status string, 4 | frequency string, 5 | data_date date, 6 | data_hour string, 7 | failure_reason string, 8 | transformation_passed_data_count bigint, 9 | transformation_failed_data_count bigint, 10 | schema_validation_passed_data_count bigint, 11 | schema_validation_failed_data_count bigint, 12 | feed_execution_time bigint) 13 | PARTITIONED BY (job_name string) 14 | STORED AS parquetfile 15 | LOCATION "/data/daflow_data/daflow_feed_stat" -------------------------------------------------------------------------------- /daflow-metrics/src/main/scala/com/abhioncbr/daflow/metrics/promethus/PrometheusObject.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
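# [usage sketch] daflow-feed-stat.sh above expects twelve positional arguments in the
# order declared at its top; a sample invocation (all values illustrative):
#   ./daflow-feed-stat.sh my_job subtask_1 SUCCESS HOURLY 2019-01-01 04 \
#       1000 5 360 990 10 "none"
# The quoted last argument matters: FAILURE_REASON may contain spaces.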
16 | */ 17 | 18 | package com.abhioncbr.daflow.metrics.promethus 19 | 20 | import com.abhioncbr.daflow.commons.NotificationMessages 21 | import com.typesafe.scalalogging.Logger 22 | import io.prometheus.client.CollectorRegistry 23 | import io.prometheus.client.Gauge 24 | import io.prometheus.client.exporter.PushGateway 25 | import scala.util.Failure 26 | import scala.util.Success 27 | import scala.util.Try 28 | 29 | class PrometheusObject(feedName: String, pushGatewayIpAddress: String) { 30 | private val logger = Logger(this.getClass) 31 | 32 | @transient lazy val feedDataStatGauge: Gauge = Gauge.build() 33 | .name(feedName).help(s"number of entries for a given $feedName").register() 34 | 35 | def pushMetrics(metricsJobName: String, metricData: Long): Either[Unit, String] = { 36 | @transient val conf: Map[String, String] = Map() 37 | val pushGatewayAddress = conf.getOrElse("pushGatewayAddr", pushGatewayIpAddress) 38 | val pushGateway = new PushGateway(pushGatewayAddress) 39 | 40 | feedDataStatGauge.labels(feedName).set(metricData) 41 | 42 | val output: Either[Unit, String] = Try(pushGateway.push(CollectorRegistry.defaultRegistry, metricsJobName)) match { 43 | case Success(u: Unit) => Left(u) 44 | case Failure(ex: Exception) => val str = s"Unable to push metrics. ${NotificationMessages.exceptionMessage(ex)}" 45 | logger.warn(str) 46 | Right(str) 47 | } 48 | output 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /daflow-metrics/src/main/scala/com/abhioncbr/daflow/metrics/stats/JobResult.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
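[usage sketch] PrometheusObject above registers one gauge per feed and pushes it to a Pushgateway; Left(Unit) signals a successful push, Right(message) a failure that was already logged at WARN. A sketch with an illustrative gateway address:
    val metrics = new PrometheusObject("demo_feed", "localhost:9091")
    metrics.pushMetrics(metricsJobName = "daflow-demo-job", metricData = 42L) match {
      case Left(_)    => // metric pushed
      case Right(msg) => // push failed; msg carries the exception text
    }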
16 | */ 17 | 18 | package com.abhioncbr.daflow.metrics.stats 19 | 20 | case class JobResult(success: Boolean, feedName: String, transformationPassedCount: Long, 21 | transformationFailedCount: Long, validateCount: Long, nonValidatedCount: Long, failureReason: String) 22 | -------------------------------------------------------------------------------- /daflow-sql-parser/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sparsecode/DaFlow/068ec7582ef58cbc6c782d2bc19a43aca2dead81/daflow-sql-parser/README.md -------------------------------------------------------------------------------- /daflow-sql-parser/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | com.abhioncbr.daflow 7 | daflow 8 | ${revision} 9 | 10 | 4.0.0 11 | 12 | jar 13 | daflow-sql-parser 14 | daflow-sql-parser 15 | ${daflow.sql.parser.version} 16 | 17 | 18 | 19 | Apache License, Version 2.0 20 | http://www.apache.org/licenses/LICENSE-2.0.txt 21 | repo 22 | A business-friendly OSS license 23 | 24 | 25 | 26 | 27 | 28 | scala-tools.org 29 | Scala-Tools Maven2 Repository 30 | http://scala-tools.org/repo-releases 31 | 32 | 33 | 34 | 35 | 36 | scala-tools.org 37 | Scala-Tools Maven2 Repository 38 | http://scala-tools.org/repo-releases 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | org.scala-tools 53 | maven-scala-plugin 54 | 55 | ${scala.version} 56 | 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /daflow-sql-parser/src/main/scala/com/abhioncbr/daflow/sqlParser/QueryDsl.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
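[usage sketch] The QueryDsl file that follows defines a small SQL-like combinator DSL; a sketch composed from the case classes it declares (see below):
    val query: Query =
      Select("id", "name").from("users")
        .where(NotNull("id") and Like("name", "my%"))
    // Select.from returns a From; From.where closes it into a Query with an optional Where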
16 | */ 17 | package com.abhioncbr.daflow.sqlParser 18 | 19 | abstract class Operation { def from(table: String): From = From(table, Option(this)) } 20 | case class Select(fields: String*) extends Operation 21 | case class From(table: String, operation: Option[Operation] = None) { 22 | def where(clauses: Clause*): Query = Query(from = this, operation = operation.get, Option(Where(clauses: _*))) 23 | } 24 | 25 | case class Query(from: From, operation: Operation, where: Option[Where], order: Option[Direction] = None) { 26 | def order(dir: Direction): Query = this.copy(order = Option(dir)) 27 | } 28 | 29 | case class Where(clauses: Clause*) 30 | 31 | abstract class Clause { 32 | def hasField: Boolean 33 | def getFields: Map[String, Any] 34 | def and(otherField: Clause): Clause = And(this, otherField) 35 | def or(otherField: Clause): Clause = Or(this, otherField) 36 | } 37 | abstract class ValueClause(t: String, f: String, values: Any*) extends Clause { 38 | override def hasField: Boolean = true 39 | override def getFields: Map[String, Any] = Map("type" -> t, "field" -> f, "value" -> values) 40 | } 41 | abstract class ExpressionValueClause(t: String, f: String, expr: String, values: Any*) extends ValueClause(t, f, 42 | values) { 43 | override def hasField: Boolean = true 44 | override def getFields: Map[String, Any] = Map("type" -> t, "field" -> f, "value" -> values, "expr" -> expr) 45 | } 46 | abstract class ReferenceClause(t: String, lClause: Clause, rClause: Clause) extends Clause { 47 | override def hasField: Boolean = false 48 | override def getFields: Map[String, Any] = Map("type" -> t, "lClause" -> lClause, "rClause" -> rClause) 49 | } 50 | 51 | case class Null(f: String) extends ValueClause("Null", f) 52 | case class NotNull(f: String) extends ValueClause("NotNull", f) 53 | case class Like(f: String, value: Any) extends ValueClause("Like", f, value) 54 | case class In(f: String, values: String*) extends ValueClause("in", f, values) 55 | case class Between(f: String, values: Any*) extends ValueClause("Between", f, values) 56 | case class StringExpressions(f: String, expr: String, value: String) 57 | extends ExpressionValueClause("stringEquals", f, expr: String, value) 58 | case class NumberExpressions(f: String, expr: String, value: Number) 59 | extends ExpressionValueClause("numberEquals", f, expr: String, value) 60 | case class BooleanExpressions(f: String, expr: String, value: Boolean) 61 | extends ExpressionValueClause("booleanEquals", f, expr, value) 62 | 63 | case class And(lClause: Clause, rClause: Clause) extends ReferenceClause("and", lClause, rClause) 64 | case class Or(lClause: Clause, rClause: Clause) extends ReferenceClause("or", lClause, rClause) 65 | 66 | abstract class Direction 67 | case class Asc(field: String) extends Direction 68 | case class Desc(field: String) extends Direction 69 | -------------------------------------------------------------------------------- /docker/compose/hadoop.env: -------------------------------------------------------------------------------- 1 | HIVE_SITE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://daflow-hive-metastore-postgresql/metastore 2 | HIVE_SITE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver 3 | HIVE_SITE_CONF_javax_jdo_option_ConnectionUserName=hive 4 | HIVE_SITE_CONF_javax_jdo_option_ConnectionPassword=hive 5 | HIVE_SITE_CONF_datanucleus_autoCreateSchema=false 6 | HIVE_SITE_CONF_hive_metastore_uris=thrift://daflow-hivemetastore:9083 7 | 
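# Note on the key naming scheme (decoded by entrypoint.sh in the hadoop base image below):
# after the *_CONF prefix, `___` becomes `-`, `__` becomes `_`, and `_` becomes `.`,
# so the next key turns into dfs.namenode.datanode.registration.ip-hostname-check.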
HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false 8 | 9 | HDFS_CONF_dfs_webhdfs_enabled=true 10 | HDFS_CONF_dfs_permissions_enabled=false 11 | #HDFS_CONF_dfs_client_use_datanode_hostname=true 12 | #HDFS_CONF_dfs_namenode_use_datanode_hostname=true 13 | 14 | CORE_CONF_fs_defaultFS=hdfs://daflow-namenode:8020 15 | CORE_CONF_hadoop_http_staticuser_user=root 16 | CORE_CONF_hadoop_proxyuser_hue_hosts=* 17 | CORE_CONF_hadoop_proxyuser_hue_groups=* 18 | 19 | YARN_CONF_yarn_log___aggregation___enable=true 20 | YARN_CONF_yarn_resourcemanager_recovery_enabled=true 21 | YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore 22 | YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate 23 | YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs 24 | YARN_CONF_yarn_log_server_url=http://daflow-historyserver:8188/applicationhistory/logs/ 25 | YARN_CONF_yarn_timeline___service_enabled=true 26 | YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true 27 | YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true 28 | YARN_CONF_yarn_resourcemanager_hostname=daflow-resourcemanager 29 | YARN_CONF_yarn_timeline___service_hostname=daflow-historyserver 30 | YARN_CONF_yarn_resourcemanager_address=daflow-resourcemanager:8032 31 | YARN_CONF_yarn_resourcemanager_scheduler_address=daflow-resourcemanager:8030 32 | YARN_CONF_yarn_resourcemanager_resource___tracker_address=daflow-resourcemanager:8031 33 | YARN_CONF_yarn_nodemanager_vmem___check___enabled=false 34 | -------------------------------------------------------------------------------- /docker/images/hadoop/base/Dockerfile: -------------------------------------------------------------------------------- 1 | #COPIED FROM -> https://github.com/apache/incubator-hudi/blob/master/docker/hoodie/hadoop/base/Dockerfile 2 | #Also idea from, https://github.com/big-data-europe/docker-hadoop 3 | 4 | FROM openjdk:8u212-jdk-slim-stretch 5 | MAINTAINER DaFlow 6 | USER root 7 | 8 | # Default to UTF-8 file.encoding 9 | ENV LANG C.UTF-8 10 | 11 | ARG HADOOP_VERSION=2.8.4 12 | ARG HADOOP_URL=https://www.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz 13 | ENV HADOOP_VERSION ${HADOOP_VERSION} 14 | ENV HADOOP_URL ${HADOOP_URL} 15 | 16 | RUN set -x \ 17 | && DEBIAN_FRONTEND=noninteractive apt-get -yq update && apt-get -yq install curl wget netcat procps \ 18 | && echo "Fetch URL2 is : ${HADOOP_URL}" \ 19 | && curl -fSL "${HADOOP_URL}" -o /tmp/hadoop.tar.gz \ 20 | && curl -fSL "${HADOOP_URL}.asc" -o /tmp/hadoop.tar.gz.asc \ 21 | && mkdir -p /opt/hadoop-$HADOOP_VERSION/logs \ 22 | && tar -xvf /tmp/hadoop.tar.gz -C /opt/ \ 23 | && rm /tmp/hadoop.tar.gz* \ 24 | && ln -s /opt/hadoop-$HADOOP_VERSION/etc/hadoop /etc/hadoop \ 25 | && cp /etc/hadoop/mapred-site.xml.template /etc/hadoop/mapred-site.xml \ 26 | && mkdir /hadoop-data 27 | 28 | ENV HADOOP_PREFIX=/opt/hadoop-$HADOOP_VERSION 29 | ENV HADOOP_CONF_DIR=/etc/hadoop 30 | ENV MULTIHOMED_NETWORK=1 31 | ENV HADOOP_HOME=${HADOOP_PREFIX} 32 | ENV HADOOP_INSTALL=${HADOOP_HOME} 33 | ENV USER=root 34 | ENV PATH /usr/bin:/bin:$HADOOP_PREFIX/bin/:$PATH 35 | 36 | # Exposing a union of ports across hadoop versions 37 | # Well known ports including ssh 38 | EXPOSE 0-1024 4040 7000-10100 5000-5100 50000-50200 58188 58088 58042 39 | 40 | ADD entrypoint.sh /entrypoint.sh 41 | ADD export_container_ip.sh /usr/bin/ 42 | RUN chmod a+x /usr/bin/export_container_ip.sh \ 43 | && chmod 
a+x /entrypoint.sh 44 | 45 | ENTRYPOINT ["/bin/bash", "/entrypoint.sh"] -------------------------------------------------------------------------------- /docker/images/hadoop/base/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ####################################################################################### 4 | ## COPIED FROM ## 5 | ## https://github.com/big-data-europe/docker-hadoop/blob/master/base/entrypoint.sh ## 6 | ## ## 7 | ####################################################################################### 8 | 9 | # Set some sensible defaults 10 | export CORE_CONF_fs_defaultFS=${CORE_CONF_fs_defaultFS:-hdfs://`hostname -f`:8020} 11 | 12 | function addProperty() { 13 | local path=$1 14 | local name=$2 15 | local value=$3 16 | 17 | local entry="<property><name>$name</name><value>${value}</value></property>" # XML tags restored; they were stripped in this dump (matches the upstream big-data-europe script) 18 | local escapedEntry=$(echo $entry | sed 's/\//\\\//g') 19 | sed -i "/<\/configuration>/ s/.*/${escapedEntry}\n&/" $path 20 | } 21 | 22 | function configure() { 23 | local path=$1 24 | local module=$2 25 | local envPrefix=$3 26 | 27 | local var 28 | local value 29 | 30 | echo "Configuring $module" 31 | for c in `printenv | perl -sne 'print "$1 " if m/^${envPrefix}_(.+?)=.*/' -- -envPrefix=$envPrefix`; do 32 | name=`echo ${c} | perl -pe 's/___/-/g; s/__/@/g; s/_/./g; s/@/_/g;'` 33 | var="${envPrefix}_${c}" 34 | value=${!var} 35 | echo " - Setting $name=$value" 36 | addProperty /etc/hadoop/$module-site.xml $name "$value" 37 | done 38 | } 39 | 40 | configure /etc/hadoop/core-site.xml core CORE_CONF 41 | configure /etc/hadoop/hdfs-site.xml hdfs HDFS_CONF 42 | configure /etc/hadoop/yarn-site.xml yarn YARN_CONF 43 | configure /etc/hadoop/httpfs-site.xml httpfs HTTPFS_CONF 44 | configure /etc/hadoop/kms-site.xml kms KMS_CONF 45 | 46 | if [ "$MULTIHOMED_NETWORK" = "1" ]; then 47 | echo "Configuring for multihomed network" 48 | 49 | # HDFS 50 | addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.rpc-bind-host 0.0.0.0 51 | addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.servicerpc-bind-host 0.0.0.0 52 | addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.http-bind-host 0.0.0.0 53 | addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.https-bind-host 0.0.0.0 54 | addProperty /etc/hadoop/hdfs-site.xml dfs.client.use.datanode.hostname true 55 | addProperty /etc/hadoop/hdfs-site.xml dfs.datanode.use.datanode.hostname true 56 | 57 | # YARN 58 | addProperty /etc/hadoop/yarn-site.xml yarn.resourcemanager.bind-host 0.0.0.0 59 | addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0 60 | addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0 61 | addProperty /etc/hadoop/yarn-site.xml yarn.timeline-service.bind-host 0.0.0.0 62 | 63 | # MAPRED 64 | addProperty /etc/hadoop/mapred-site.xml yarn.nodemanager.bind-host 0.0.0.0 65 | fi 66 | 67 | if [ -n "$GANGLIA_HOST" ]; then 68 | mv /etc/hadoop/hadoop-metrics.properties /etc/hadoop/hadoop-metrics.properties.orig 69 | mv /etc/hadoop/hadoop-metrics2.properties /etc/hadoop/hadoop-metrics2.properties.orig 70 | 71 | for module in mapred jvm rpc ugi; do 72 | echo "$module.class=org.apache.hadoop.metrics.ganglia.GangliaContext31" 73 | echo "$module.period=10" 74 | echo "$module.servers=$GANGLIA_HOST:8649" 75 | done > /etc/hadoop/hadoop-metrics.properties 76 | 77 | for module in namenode datanode resourcemanager nodemanager mrappmaster jobhistoryserver; do 78 | echo "$module.sink.ganglia.class=org.apache.hadoop.metrics2.sink.ganglia.GangliaSink31" 79 | echo "$module.sink.ganglia.period=10" 80
| echo "$module.sink.ganglia.supportsparse=true" 81 | echo "$module.sink.ganglia.slope=jvm.metrics.gcCount=zero,jvm.metrics.memHeapUsedM=both" 82 | echo "$module.sink.ganglia.dmax=jvm.metrics.threadsBlocked=70,jvm.metrics.memHeapUsedM=40" 83 | echo "$module.sink.ganglia.servers=$GANGLIA_HOST:8649" 84 | done > /etc/hadoop/hadoop-metrics2.properties 85 | fi 86 | 87 | # Save Container IP in ENV variable 88 | /usr/bin/export_container_ip.sh 89 | 90 | exec "$@" 91 | -------------------------------------------------------------------------------- /docker/images/hadoop/base/export_container_ip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ############################################################################################################# 4 | ## COPIED FROM ## 5 | ## https://github.com/apache/incubator-hudi/blob/master/docker/hoodie/hadoop/base/export_container_ip.sh ## 6 | ## ## 7 | ############################################################################################################# 8 | 9 | interfaces=( "en0" "eth0" ) 10 | 11 | ipAddr="" 12 | for interface in "${interfaces[@]}" 13 | do 14 | ipAddr=`ifconfig ${interface} | grep -Eo 'inet (addr:)?([0-9]+\.){3}[0-9]+' | grep -Eo '([0-9]+\.){3}[0-9]+' | grep -v '127.0.0.1' | head` 15 | if [[ -n "$ipAddr" ]]; then 16 | break 17 | fi 18 | done 19 | 20 | echo "Container IP is set to : $ipAddr" 21 | export MY_CONTAINER_IP=${ipAddr} 22 | -------------------------------------------------------------------------------- /docker/images/hadoop/base/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | com.abhioncbr.daflow 4 | daflow-docker 5 | ${revision} 6 | ../../pom.xml 7 | 8 | 9 | pom 10 | ${daflow.docker.hadoop.base.version} 11 | 4.0.0 12 | daflow-hadoop-base-docker 13 | DaFlow's Docker Image of Hadoop Base 14 | 15 | 16 | UTF-8 17 | 18 | 19 | 20 | 21 | com.abhioncbr.daflow 22 | daflow-docker 23 | ${project.version} 24 | pom 25 | import 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | com.spotify 35 | dockerfile-maven-plugin 36 | ${dockerfile.maven.version} 37 | 38 | 39 | tag-latest 40 | pre-integration-test 41 | 42 | build 43 | tag 44 | push 45 | 46 | 47 | latest 48 | ${docker.build.skip} 49 | false 50 | ${docker.reg.username} 51 | ${docker.reg.password} 52 | abhioncbr/daflow-hadoop-base 53 | 54 | ${docker.hadoop.version} 55 | 56 | 57 | 58 | 59 | tag-version 60 | pre-integration-test 61 | 62 | build 63 | tag 64 | push 65 | 66 | 67 | ${docker.hadoop.version} 68 | ${docker.build.skip} 69 | false 70 | ${docker.reg.username} 71 | ${docker.reg.password} 72 | abhioncbr/daflow-hadoop-base 73 | 74 | ${docker.hadoop.version} 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /docker/images/hadoop/datanode/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG HADOOP_VERSION=2.8.4 2 | ARG HADOOP_DN_PORT=50075 3 | FROM abhioncbr/daflow-hadoop-base:${HADOOP_VERSION} 4 | 5 | ENV HADOOP_DN_PORT ${HADOOP_DN_PORT} 6 | 7 | ENV HDFS_CONF_dfs_datanode_data_dir=file:///hadoop/dfs/data 8 | RUN mkdir -p /hadoop/dfs/data 9 | VOLUME /hadoop/dfs/data 10 | 11 | ADD run_dn.sh /run_dn.sh 12 | RUN chmod a+x /run_dn.sh 13 | 14 | CMD ["/run_dn.sh"] -------------------------------------------------------------------------------- /docker/images/hadoop/datanode/pom.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | com.abhioncbr.daflow 4 | daflow-docker 5 | ${revision} 6 | ../../pom.xml 7 | 8 | 9 | pom 10 | ${daflow.docker.hadoop.datanode.version} 11 | 4.0.0 12 | daflow-hadoop-datanode-docker 13 | DaFlow's Docker Image of Hadoop Data Node. 14 | 15 | 16 | UTF-8 17 | 18 | 19 | 20 | 21 | com.abhioncbr.daflow 22 | daflow-hadoop-base-docker 23 | ${project.version} 24 | pom 25 | import 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | com.spotify 34 | dockerfile-maven-plugin 35 | ${dockerfile.maven.version} 36 | 37 | 38 | tag-latest 39 | pre-integration-test 40 | 41 | build 42 | tag 43 | push 44 | 45 | 46 | latest 47 | ${docker.build.skip} 48 | false 49 | ${docker.reg.username} 50 | ${docker.reg.password} 51 | abhioncbr/daflow-hadoop-datanode 52 | 53 | ${docker.hadoop.version} 54 | ${docker.hadoop.dn.port} 55 | 56 | 57 | 58 | 59 | tag-version 60 | pre-integration-test 61 | 62 | build 63 | tag 64 | push 65 | 66 | 67 | ${docker.hadoop.version}-${project.version} 68 | ${docker.build.skip} 69 | false 70 | ${docker.reg.username} 71 | ${docker.reg.password} 72 | abhioncbr/daflow-hadoop-datanode 73 | 74 | ${docker.hadoop.version} 75 | ${docker.hadoop.dn.port} 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /docker/images/hadoop/datanode/run_dn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | datadir=`echo $HDFS_CONF_dfs_datanode_data_dir | perl -pe 's#file://##'` 4 | if [ ! -d $datadir ]; then 5 | echo "Datanode data directory not found: $datadir" 6 | exit 2 7 | fi 8 | 9 | $HADOOP_PREFIX/bin/hdfs --config $HADOOP_CONF_DIR datanode -------------------------------------------------------------------------------- /docker/images/hadoop/historyserver/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG HADOOP_VERSION=2.8.4 2 | ARG HADOOP_HISTORY_PORT=8188 3 | FROM abhioncbr/daflow-hadoop-base:${HADOOP_VERSION} 4 | 5 | ENV HADOOP_HISTORY_PORT ${HADOOP_HISTORY_PORT} 6 | 7 | ENV YARN_CONF_yarn_timeline___service_leveldb___timeline___store_path=/hadoop/yarn/timeline 8 | RUN mkdir -p /hadoop/yarn/timeline 9 | VOLUME /hadoop/yarn/timeline 10 | 11 | ADD run_history.sh /run_history.sh 12 | RUN chmod a+x /run_history.sh 13 | 14 | CMD ["/run_history.sh"] -------------------------------------------------------------------------------- /docker/images/hadoop/historyserver/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | com.abhioncbr.daflow 4 | daflow-docker 5 | ${revision} 6 | ../../pom.xml 7 | 8 | 9 | pom 10 | ${daflow.docker.hadoop.historyserver.version} 11 | 4.0.0 12 | daflow-hadoop-historyserver-docker 13 | DaFlow's Docker Image of Hadoop History Server. 
14 | 15 | 16 | UTF-8 17 | 18 | 19 | 20 | 21 | com.abhioncbr.daflow 22 | daflow-hadoop-base-docker 23 | ${project.version} 24 | pom 25 | import 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | com.spotify 34 | dockerfile-maven-plugin 35 | ${dockerfile.maven.version} 36 | 37 | 38 | tag-latest 39 | pre-integration-test 40 | 41 | build 42 | tag 43 | push 44 | 45 | 46 | latest 47 | ${docker.build.skip} 48 | false 49 | ${docker.reg.username} 50 | ${docker.reg.password} 51 | abhioncbr/daflow-hadoop-historyserver 52 | 53 | ${docker.hadoop.version} 54 | ${docker.hadoop.hs.port} 55 | 56 | 57 | 58 | 59 | tag-version 60 | pre-integration-test 61 | 62 | build 63 | tag 64 | push 65 | 66 | 67 | ${docker.hadoop.version}-${project.version} 68 | ${docker.build.skip} 69 | false 70 | ${docker.reg.username} 71 | ${docker.reg.password} 72 | abhioncbr/daflow-hadoop-historyserver 73 | 74 | ${docker.hadoop.version} 75 | ${docker.hadoop.hs.port} 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /docker/images/hadoop/historyserver/run_history.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | $HADOOP_PREFIX/bin/yarn --config $HADOOP_CONF_DIR historyserver -------------------------------------------------------------------------------- /docker/images/hadoop/namenode/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG HADOOP_VERSION=2.8.4 2 | ARG HADOOP_WEBHDFS_PORT=50070 3 | FROM abhioncbr/daflow-hadoop-base:${HADOOP_VERSION} 4 | 5 | ENV HADOOP_WEBHDFS_PORT ${HADOOP_WEBHDFS_PORT} 6 | 7 | ENV HDFS_CONF_dfs_namenode_name_dir=file:///hadoop/dfs/name 8 | RUN mkdir -p /hadoop/dfs/name 9 | VOLUME /hadoop/dfs/name 10 | 11 | ADD run_nn.sh /run_nn.sh 12 | RUN chmod a+x /run_nn.sh 13 | 14 | CMD ["/run_nn.sh"] -------------------------------------------------------------------------------- /docker/images/hadoop/namenode/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | com.abhioncbr.daflow 4 | daflow-docker 5 | ${revision} 6 | ../../pom.xml 7 | 8 | 9 | pom 10 | ${daflow.docker.hadoop.namenode.version} 11 | 4.0.0 12 | daflow-hadoop-namenode-docker 13 | DaFlow's Docker Image of Hadoop Name Node. 
14 | 15 | 16 | UTF-8 17 | 18 | 19 | 20 | 21 | com.abhioncbr.daflow 22 | daflow-hadoop-base-docker 23 | ${project.version} 24 | pom 25 | import 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | com.spotify 34 | dockerfile-maven-plugin 35 | ${dockerfile.maven.version} 36 | 37 | 38 | tag-latest 39 | pre-integration-test 40 | 41 | build 42 | tag 43 | push 44 | 45 | 46 | latest 47 | ${docker.build.skip} 48 | false 49 | ${docker.reg.username} 50 | ${docker.reg.password} 51 | abhioncbr/daflow-hadoop-namenode 52 | 53 | ${docker.hadoop.version} 54 | ${docker.hadoop.webHdfs.port} 55 | 56 | 57 | 58 | 59 | tag-version 60 | pre-integration-test 61 | 62 | build 63 | tag 64 | push 65 | 66 | 67 | ${docker.hadoop.version}-${project.version} 68 | ${docker.build.skip} 69 | false 70 | ${docker.reg.username} 71 | ${docker.reg.password} 72 | abhioncbr/daflow-hadoop-namenode 73 | 74 | ${docker.hadoop.version} 75 | ${docker.hadoop.webHdfs.port} 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /docker/images/hadoop/namenode/run_nn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | namedir=`echo $HDFS_CONF_dfs_namenode_name_dir | perl -pe 's#file://##'` 4 | if [ ! -d $namedir ]; then 5 | echo "Namenode name directory not found: $namedir" 6 | exit 2 7 | fi 8 | 9 | if [ -z "$CLUSTER_NAME" ]; then 10 | echo "Cluster name not specified" 11 | exit 2 12 | fi 13 | 14 | if [ "`ls -A $namedir`" == "" ]; then 15 | echo "Formatting namenode name directory: $namedir" 16 | $HADOOP_PREFIX/bin/hdfs --config $HADOOP_CONF_DIR namenode -format $CLUSTER_NAME 17 | fi 18 | 19 | $HADOOP_PREFIX/bin/hdfs --config $HADOOP_CONF_DIR namenode -------------------------------------------------------------------------------- /docker/images/hive/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG HADOOP_VERSION=2.8.4 2 | FROM abhioncbr/daflow-hadoop-base:${HADOOP_VERSION} 3 | 4 | ENV HIVE_HOME /opt/hive 5 | ENV PATH $HIVE_HOME/bin:$PATH 6 | ENV HADOOP_HOME /opt/hadoop-$HADOOP_VERSION 7 | 8 | WORKDIR /opt 9 | 10 | ARG HIVE_VERSION=2.3.3 11 | ARG HIVE_URL=https://archive.apache.org/dist/hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz 12 | ENV HIVE_VERSION ${HIVE_VERSION} 13 | ENV HIVE_URL ${HIVE_URL} 14 | 15 | #Install Hive MySQL, PostgreSQL JDBC 16 | RUN echo "Hive URL is :${HIVE_URL}" && wget ${HIVE_URL} -O hive.tar.gz && \ 17 | tar -xzvf hive.tar.gz && mv *hive*-bin hive && \ 18 | ln -s /usr/share/java/mysql-connector-java.jar $HIVE_HOME/lib/mysql-connector-java.jar && \ 19 | wget https://jdbc.postgresql.org/download/postgresql-9.4.1212.jar -O $HIVE_HOME/lib/postgresql-jdbc.jar && \ 20 | rm hive.tar.gz && mkdir -p /var/daflow/ws/docker/daflow/hadoop/hive_base/target/ 21 | 22 | #Spark should be compiled with Hive to be able to use it 23 | #hive-site.xml should be copied to $SPARK_HOME/conf folder 24 | 25 | #Custom configuration goes here 26 | ADD conf/hive-site.xml $HADOOP_CONF_DIR 27 | ADD conf/beeline-log4j2.properties $HIVE_HOME/conf 28 | ADD conf/hive-env.sh $HIVE_HOME/conf 29 | ADD conf/hive-exec-log4j2.properties $HIVE_HOME/conf 30 | ADD conf/hive-log4j2.properties $HIVE_HOME/conf 31 | ADD conf/ivysettings.xml $HIVE_HOME/conf 32 | ADD conf/llap-daemon-log4j2.properties $HIVE_HOME/conf 33 | 34 | # Setup DaFlow Library jars 35 | ADD target/demo /var/daflow/ws/daflow-examples/demo 36 | 37 | ENV DAFLOW_BUNDLE=/var/daflow/demo/artifacts/daflow-*.jar 38 | 39 | 
--------------------------------------------------------------------------------
/docker/images/hive/conf/beeline-log4j2.properties:
--------------------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

status = INFO
name = BeelineLog4j2
packages = org.apache.hadoop.hive.ql.log

# list of properties
property.hive.log.level = WARN
property.hive.root.logger = console

# list of all appenders
appenders = console

# console appender
appender.console.type = Console
appender.console.name = console
appender.console.target = SYSTEM_ERR
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} [%t]: %p %c{2}: %m%n

# list of all loggers
loggers = HiveConnection

# HiveConnection logs useful info for dynamic service discovery
logger.HiveConnection.name = org.apache.hive.jdbc.HiveConnection
logger.HiveConnection.level = INFO

# root logger
rootLogger.level = ${sys:hive.log.level}
rootLogger.appenderRefs = root
rootLogger.appenderRef.root.ref = ${sys:hive.root.logger}
--------------------------------------------------------------------------------
/docker/images/hive/conf/hive-env.sh:
--------------------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Set Hive and Hadoop environment variables here. These variables can be used
# to control the execution of Hive. This file should be used by admins to configure
# the Hive installation (so that users do not have to set environment variables
# or set command line parameters to get correct behavior).
#
# The Hive service being invoked (CLI/HWI etc.) is available via the environment
# variable SERVICE


# Hive client memory usage can be an issue if a large number of clients
# are running at the same time. The flags below have been useful in
# reducing memory usage:
#
# if [ "$SERVICE" = "cli" ]; then
#   if [ -z "$DEBUG" ]; then
#     export HADOOP_OPTS="$HADOOP_OPTS -XX:NewRatio=12 -Xms10m -XX:MaxHeapFreeRatio=40 -XX:MinHeapFreeRatio=15 -XX:+UseParNewGC -XX:-UseGCOverheadLimit"
#   else
#     export HADOOP_OPTS="$HADOOP_OPTS -XX:NewRatio=12 -Xms10m -XX:MaxHeapFreeRatio=40 -XX:MinHeapFreeRatio=15 -XX:-UseGCOverheadLimit"
#   fi
# fi

# The heap size of the JVM started by the Hive shell script can be controlled via:
#
# export HADOOP_HEAPSIZE=1024
#
# A larger heap size may be required when running queries over a large number of files
# or partitions. By default the Hive shell scripts use a heap size of 256 (MB). A larger
# heap size would also be appropriate for the Hive server (HWI etc.).


# Set HADOOP_HOME to point to a specific hadoop install directory
# HADOOP_HOME=${bin}/../../hadoop

# Hive Configuration Directory can be controlled by:
# export HIVE_CONF_DIR=

# Folder containing extra libraries required for hive compilation/execution can be controlled by:
# export HIVE_AUX_JARS_PATH=
--------------------------------------------------------------------------------
/docker/images/hive/conf/hive-exec-log4j2.properties:
--------------------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

status = INFO
name = HiveExecLog4j2
packages = org.apache.hadoop.hive.ql.log

# list of properties
property.hive.log.level = INFO
property.hive.root.logger = FA
property.hive.query.id = hadoop
property.hive.log.dir = ${sys:java.io.tmpdir}/${sys:user.name}
property.hive.log.file = ${sys:hive.query.id}.log

# list of all appenders
appenders = console, FA

# console appender
appender.console.type = Console
appender.console.name = console
appender.console.target = SYSTEM_ERR
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} [%t]: %p %c{2}: %m%n

# simple file appender
appender.FA.type = File
appender.FA.name = FA
appender.FA.fileName = ${sys:hive.log.dir}/${sys:hive.log.file}
appender.FA.layout.type = PatternLayout
appender.FA.layout.pattern = %d{ISO8601} %-5p [%t]: %c{2} (%F:%M(%L)) - %m%n

# list of all loggers
loggers = NIOServerCnxn, ClientCnxnSocketNIO, DataNucleus, Datastore, JPOX

logger.NIOServerCnxn.name = org.apache.zookeeper.server.NIOServerCnxn
logger.NIOServerCnxn.level = WARN

logger.ClientCnxnSocketNIO.name = org.apache.zookeeper.ClientCnxnSocketNIO
logger.ClientCnxnSocketNIO.level = WARN

logger.DataNucleus.name = DataNucleus
logger.DataNucleus.level = ERROR

logger.Datastore.name = Datastore
logger.Datastore.level = ERROR

logger.JPOX.name = JPOX
logger.JPOX.level = ERROR

# root logger
rootLogger.level = ${sys:hive.log.level}
rootLogger.appenderRefs = root
rootLogger.appenderRef.root.ref = ${sys:hive.root.logger}
--------------------------------------------------------------------------------
/docker/images/hive/conf/hive-log4j2.properties:
--------------------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

status = INFO
name = HiveLog4j2
packages = org.apache.hadoop.hive.ql.log

# list of properties
property.hive.log.level = INFO
property.hive.root.logger = DRFA
property.hive.log.dir = ${sys:java.io.tmpdir}/${sys:user.name}
property.hive.log.file = hive.log

# list of all appenders
appenders = console, DRFA

# console appender
appender.console.type = Console
appender.console.name = console
appender.console.target = SYSTEM_ERR
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} [%t]: %p %c{2}: %m%n

# daily rolling file appender
appender.DRFA.type = RollingFile
appender.DRFA.name = DRFA
appender.DRFA.fileName = ${sys:hive.log.dir}/${sys:hive.log.file}
# Use %pid in the filePattern to append @<pid> to the filename if you want separate log files per CLI session
appender.DRFA.filePattern = ${sys:hive.log.dir}/${sys:hive.log.file}.%d{yyyy-MM-dd}
appender.DRFA.layout.type = PatternLayout
appender.DRFA.layout.pattern = %d{ISO8601} %-5p [%t]: %c{2} (%F:%M(%L)) - %m%n
appender.DRFA.policies.type = Policies
appender.DRFA.policies.time.type = TimeBasedTriggeringPolicy
appender.DRFA.policies.time.interval = 1
appender.DRFA.policies.time.modulate = true
appender.DRFA.strategy.type = DefaultRolloverStrategy
appender.DRFA.strategy.max = 30

# list of all loggers
loggers = NIOServerCnxn, ClientCnxnSocketNIO, DataNucleus, Datastore, JPOX

logger.NIOServerCnxn.name = org.apache.zookeeper.server.NIOServerCnxn
logger.NIOServerCnxn.level = WARN

logger.ClientCnxnSocketNIO.name = org.apache.zookeeper.ClientCnxnSocketNIO
logger.ClientCnxnSocketNIO.level = WARN

logger.DataNucleus.name = DataNucleus
logger.DataNucleus.level = ERROR

logger.Datastore.name = Datastore
logger.Datastore.level = ERROR

logger.JPOX.name = JPOX
logger.JPOX.level = ERROR

# root logger
rootLogger.level = ${sys:hive.log.level}
rootLogger.appenderRefs = root
rootLogger.appenderRef.root.ref = ${sys:hive.root.logger}
--------------------------------------------------------------------------------
/docker/images/hive/conf/hive-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed to the Apache Software Foundation (ASF) under one
  or more contributor license agreements.  See the NOTICE file
  distributed with this work for additional information
  regarding copyright ownership.  The ASF licenses this file
  to you under the Apache License, Version 2.0 (the
  "License"); you may not use this file except in compliance
  with the License.  You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->
<configuration>
</configuration>
--------------------------------------------------------------------------------
/docker/images/hive/conf/ivysettings.xml:
--------------------------------------------------------------------------------
<!-- [ivysettings.xml: Apache Hive's stock Ivy settings file; the XML markup
     did not survive extraction, so the body is not reproduced here.] -->
--------------------------------------------------------------------------------
/docker/images/hive/startup.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Provision the Hive warehouse directories on HDFS (idempotent)
hadoop fs -mkdir -p /tmp
hadoop fs -mkdir -p /user/hive/warehouse
hadoop fs -chmod g+w /tmp
hadoop fs -chmod g+w /user/hive/warehouse

cd ${HIVE_HOME}/bin
export AUX_CLASSPATH=file://${DAFLOW_BUNDLE}
./hiveserver2 --hiveconf hive.server2.enable.doAs=false --hiveconf hive.aux.jars.path=file://${DAFLOW_BUNDLE}
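startup.sh above provisions the HDFS warehouse directories and then starts
HiveServer2 with the DaFlow bundle on the aux classpath. A plausible smoke
test once the server is up — 10000 is HiveServer2's default port; the host
and user are illustrative:

# Connect to HiveServer2 from inside the hive container (defaults assumed)
beeline -u jdbc:hive2://localhost:10000 -n root -e 'show databases;'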
--------------------------------------------------------------------------------
/docker/images/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <parent>
    <groupId>com.abhioncbr.daflow</groupId>
    <artifactId>daflow</artifactId>
    <version>${revision}</version>
    <relativePath>../../pom.xml</relativePath>
  </parent>

  <packaging>pom</packaging>
  <version>${revision}</version>
  <modelVersion>4.0.0</modelVersion>
  <artifactId>daflow-docker</artifactId>

  <modules>
    <module>hive</module>
    <module>spark/base</module>
    <module>hadoop/base</module>
    <module>spark/adhoc</module>
    <module>spark/worker</module>
    <module>spark/master</module>
    <module>hadoop/datanode</module>
    <module>hadoop/namenode</module>
    <module>hadoop/historyserver</module>
  </modules>

  <properties>
    <!-- Property names in this block are reconstructed (the XML markup was
         stripped in extraction); the values are from the original file. -->
    <docker.maintainer.id>daflow</docker.maintainer.id>
    <docker.maintainer.name>daflow</docker.maintainer.name>
    <docker.maintainer.email>abc@daflow.com</docker.maintainer.email>

    <docker.build.skip>true</docker.build.skip>

    <docker.hive.version>2.3.3</docker.hive.version>
    <docker.spark.version>2.4.3</docker.spark.version>
    <docker.hadoop.version>2.8.4</docker.hadoop.version>

    <docker.hadoop.hs.port>8188</docker.hadoop.hs.port>
    <docker.hadoop.dataNode.port>50075</docker.hadoop.dataNode.port>
    <docker.hadoop.webHdfs.port>50070</docker.hadoop.webHdfs.port>

    <docker.spark.hadoop.version>2.7</docker.spark.hadoop.version>

    <dockerfile.maven.version>1.4.10</dockerfile.maven.version>
  </properties>

  <build>
    <extensions>
      <extension>
        <groupId>com.spotify</groupId>
        <artifactId>dockerfile-maven-extension</artifactId>
        <version>${dockerfile.maven.version}</version>
      </extension>
    </extensions>

    <pluginManagement>
      <plugins>
        <plugin>
          <groupId>com.spotify</groupId>
          <artifactId>dockerfile-maven-plugin</artifactId>
          <version>${dockerfile.maven.version}</version>
          <configuration>
            <skip>true</skip>
            <useMavenSettingsForAuth>false</useMavenSettingsForAuth>
            <pullNewerImage>false</pullNewerImage>
          </configuration>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>
--------------------------------------------------------------------------------
/docker/images/spark/adhoc/Dockerfile:
--------------------------------------------------------------------------------
ARG HADOOP_VERSION=2.8.4
ARG HIVE_VERSION=2.3.3
ARG SPARK_VERSION=2.3.1
FROM abhioncbr/daflow-spark-base:${SPARK_VERSION}

COPY adhoc.sh /opt/spark

ENV SPARK_WORKER_WEBUI_PORT 8081
ENV SPARK_WORKER_LOG /spark/logs
ENV SPARK_MASTER "spark://spark-master:7077"

CMD ["/bin/bash", "/opt/spark/adhoc.sh"]
--------------------------------------------------------------------------------
/docker/images/spark/adhoc/adhoc.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Spark is installed under /opt/spark in the base image.
. "/opt/spark/sbin/spark-config.sh"

. "/opt/spark/bin/load-spark-env.sh"

export SPARK_HOME=/opt/spark

date
echo "SPARK HOME is : $SPARK_HOME"

# Keep the container alive for interactive (adhoc) use.
tail -f /dev/null
--------------------------------------------------------------------------------
/docker/images/spark/adhoc/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <parent>
    <groupId>com.abhioncbr.daflow</groupId>
    <artifactId>daflow-docker</artifactId>
    <version>${revision}</version>
    <relativePath>../../pom.xml</relativePath>
  </parent>

  <packaging>pom</packaging>
  <version>${daflow.docker.spark.adhoc.version}</version>
  <modelVersion>4.0.0</modelVersion>
  <artifactId>daflow-adhoc1</artifactId>
  <description>DaFlow's Docker Image of Spark adhoc node.</description>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencyManagement>
    <dependencies>
      <dependency>
        <groupId>com.abhioncbr.daflow</groupId>
        <artifactId>daflow-spark-base-docker</artifactId>
        <version>${project.version}</version>
        <type>pom</type>
        <scope>import</scope>
      </dependency>
    </dependencies>
  </dependencyManagement>

  <build>
    <plugins>
      <plugin>
        <groupId>com.spotify</groupId>
        <artifactId>dockerfile-maven-plugin</artifactId>
        <version>${dockerfile.maven.version}</version>
        <executions>
          <execution>
            <id>tag-latest</id>
            <phase>pre-integration-test</phase>
            <goals>
              <goal>build</goal>
              <goal>tag</goal>
              <goal>push</goal>
            </goals>
            <configuration>
              <tag>latest</tag>
              <skip>${docker.build.skip}</skip>
              <pullNewerImage>false</pullNewerImage>
              <username>${docker.reg.username}</username>
              <password>${docker.reg.password}</password>
              <repository>abhioncbr/daflow-adhoc1</repository>
              <buildArgs>
                <HIVE_VERSION>${docker.hive.version}</HIVE_VERSION>
                <SPARK_VERSION>${docker.spark.version}</SPARK_VERSION>
                <HADOOP_VERSION>${docker.hadoop.version}</HADOOP_VERSION>
              </buildArgs>
            </configuration>
          </execution>
          <execution>
            <id>tag-version</id>
            <phase>pre-integration-test</phase>
            <goals>
              <goal>build</goal>
              <goal>tag</goal>
              <goal>push</goal>
            </goals>
            <configuration>
              <tag>${docker.spark.version}-${project.version}</tag>
              <skip>${docker.build.skip}</skip>
              <pullNewerImage>false</pullNewerImage>
              <username>${docker.reg.username}</username>
              <password>${docker.reg.password}</password>
              <repository>abhioncbr/daflow-adhoc1</repository>
              <buildArgs>
                <HIVE_VERSION>${docker.hive.version}</HIVE_VERSION>
                <SPARK_VERSION>${docker.spark.version}</SPARK_VERSION>
                <HADOOP_VERSION>${docker.hadoop.version}</HADOOP_VERSION>
              </buildArgs>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
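The adhoc image's entry script (adhoc.sh above) only keeps the container
alive, so it acts as a driver box for interactive work. A plausible way in,
assuming the container name used by docker/setup_demo.sh further below and the
master URL set in the adhoc Dockerfile:

# Interactive Spark shell against the compose cluster (container name assumed)
docker exec -it daflow-adhoc-1 /opt/spark/bin/spark-shell --master spark://spark-master:7077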
--------------------------------------------------------------------------------
/docker/images/spark/base/Dockerfile:
--------------------------------------------------------------------------------
ARG HADOOP_VERSION=2.8.4
ARG HIVE_VERSION=2.3.3

FROM abhioncbr/daflow-hive:${HADOOP_VERSION}-${HIVE_VERSION}

ENV ENABLE_INIT_DAEMON true
ENV INIT_DAEMON_BASE_URI http://identifier/init-daemon
ENV INIT_DAEMON_STEP spark_master_init

ARG SPARK_VERSION=2.4.3
ARG SPARK_HADOOP_VERSION=2.7

ENV SPARK_VERSION ${SPARK_VERSION}
# Note: HADOOP_VERSION is re-pointed at the Hadoop version the Spark binaries
# are bundled against (2.7); it is only used to form the download URL below.
ENV HADOOP_VERSION ${SPARK_HADOOP_VERSION}

COPY wait-for-step.sh /
COPY execute-step.sh /
COPY finish-step.sh /

RUN echo "Installing Spark-version (${SPARK_VERSION})" \
    && wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
    && tar -xvzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
    && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} /opt/spark \
    && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
    && cd /

# Give permission to execute scripts
RUN chmod +x /wait-for-step.sh && chmod +x /execute-step.sh && chmod +x /finish-step.sh

# Fix the value of PYTHONHASHSEED
# Note: this is needed when you use Python 3.3 or greater
ENV PYTHONHASHSEED 1

ENV SPARK_HOME /opt/spark
ENV SPARK_INSTALL ${SPARK_HOME}
ENV SPARK_CONF_DIR ${SPARK_HOME}/conf
ENV PATH $SPARK_INSTALL/bin:$PATH

ENV SPARK_DRIVER_PORT 5001
ENV SPARK_UI_PORT 5002
ENV SPARK_BLOCKMGR_PORT 5003

EXPOSE $SPARK_DRIVER_PORT $SPARK_UI_PORT $SPARK_BLOCKMGR_PORT

# Without this jar spark-shell fails - download it if it is not already in $SPARK_INSTALL
RUN wget -nc -q -O "${SPARK_INSTALL}/jars/jersey-bundle-1.19.4.jar" "https://repo1.maven.org/maven2/com/sun/jersey/jersey-bundle/1.19.4/jersey-bundle-1.19.4.jar"
--------------------------------------------------------------------------------
/docker/images/spark/base/execute-step.sh:
--------------------------------------------------------------------------------
#!/bin/bash

if [ "$ENABLE_INIT_DAEMON" = "true" ]
then
    echo "Execute step ${INIT_DAEMON_STEP} in pipeline"
    # Poll the init daemon until it acknowledges the step with HTTP 204.
    while true; do
        sleep 5
        echo -n '.'
        string=$(curl -sL -w "%{http_code}" -X PUT $INIT_DAEMON_BASE_URI/execute?step=$INIT_DAEMON_STEP -o /dev/null)
        [ "$string" = "204" ] && break
    done
    echo "Notified execution of step ${INIT_DAEMON_STEP}"
fi
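execute-step.sh and the two sibling scripts below poll an external init daemon
so that multi-container pipelines can be sequenced; when ENABLE_INIT_DAEMON is
anything other than "true", they return immediately. A minimal sketch for
running the base image without any init daemon (image name and tag assumed
from the POMs above):

# Skip the init-daemon handshake entirely
docker run --rm -e ENABLE_INIT_DAEMON=false abhioncbr/daflow-spark-base:2.4.3 /wait-for-step.sh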
--------------------------------------------------------------------------------
/docker/images/spark/base/finish-step.sh:
--------------------------------------------------------------------------------
#!/bin/bash

if [ "$ENABLE_INIT_DAEMON" = "true" ]
then
    echo "Finish step ${INIT_DAEMON_STEP} in pipeline"
    while true; do
        sleep 5
        echo -n '.'
        string=$(curl -sL -w "%{http_code}" -X PUT $INIT_DAEMON_BASE_URI/finish?step=$INIT_DAEMON_STEP -o /dev/null)
        [ "$string" = "204" ] && break
    done
    echo "Notified finish of step ${INIT_DAEMON_STEP}"
fi
--------------------------------------------------------------------------------
/docker/images/spark/base/wait-for-step.sh:
--------------------------------------------------------------------------------
#!/bin/bash

if [ "$ENABLE_INIT_DAEMON" = "true" ]
then
    echo "Validating if step ${INIT_DAEMON_STEP} can start in pipeline"
    while true; do
        sleep 5
        echo -n '.'
        string=$(curl -s $INIT_DAEMON_BASE_URI/canStart?step=$INIT_DAEMON_STEP)
        [ "$string" = "true" ] && break
    done
    echo "Can start step ${INIT_DAEMON_STEP}"
fi
--------------------------------------------------------------------------------
/docker/images/spark/master/Dockerfile:
--------------------------------------------------------------------------------
ARG HADOOP_VERSION=2.8.4
ARG HIVE_VERSION=2.3.3
ARG SPARK_VERSION=2.4.3
FROM abhioncbr/daflow-spark-base:${SPARK_VERSION}

COPY master.sh /opt/spark

ENV SPARK_MASTER_PORT 7077
ENV SPARK_MASTER_WEBUI_PORT 8080
ENV SPARK_MASTER_LOG /opt/spark/logs

EXPOSE 8080 7077 6066

CMD ["/bin/bash", "/opt/spark/master.sh"]
--------------------------------------------------------------------------------
/docker/images/spark/master/master.sh:
--------------------------------------------------------------------------------
#!/bin/bash

export SPARK_MASTER_HOST=$(hostname)

. "/opt/spark/sbin/spark-config.sh"

. "/opt/spark/bin/load-spark-env.sh"

mkdir -p $SPARK_MASTER_LOG

export SPARK_HOME=/opt/spark

# Mirror the master log to stdout so `docker logs` shows it.
ln -sf /dev/stdout $SPARK_MASTER_LOG/spark-master.out

cd /opt/spark/bin && /opt/spark/sbin/../bin/spark-class org.apache.spark.deploy.master.Master \
    --ip $SPARK_MASTER_HOST --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT >> $SPARK_MASTER_LOG/spark-master.out
--------------------------------------------------------------------------------
/docker/images/spark/worker/Dockerfile:
--------------------------------------------------------------------------------
ARG HADOOP_VERSION=2.8.4
ARG HIVE_VERSION=2.3.3
ARG SPARK_VERSION=2.4.3
FROM abhioncbr/daflow-spark-base:${SPARK_VERSION}

COPY worker.sh /opt/spark

ENV SPARK_WORKER_WEBUI_PORT 8081
ENV SPARK_WORKER_LOG /spark/logs
ENV SPARK_MASTER "spark://spark-master:7077"

EXPOSE 8081

CMD ["/bin/bash", "/opt/spark/worker.sh"]
--------------------------------------------------------------------------------
/docker/images/spark/worker/worker.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Spark is installed under /opt/spark in the base image.
. "/opt/spark/sbin/spark-config.sh"

. "/opt/spark/bin/load-spark-env.sh"

mkdir -p $SPARK_WORKER_LOG

export SPARK_HOME=/opt/spark

# Mirror the worker log to stdout so `docker logs` shows it.
ln -sf /dev/stdout ${SPARK_WORKER_LOG}/spark-worker.out

date
echo "SPARK HOME is : $SPARK_HOME"
/opt/spark/sbin/../bin/spark-class org.apache.spark.deploy.worker.Worker \
    --webui-port ${SPARK_WORKER_WEBUI_PORT} ${SPARK_MASTER} >> ${SPARK_WORKER_LOG}/spark-worker.out
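With the master and worker containers up, DaFlow jobs are submitted the usual
Spark way. A hypothetical submission — the jar path comes from DAFLOW_BUNDLE in
the Hive image above, but the main class and argument list are assumptions, not
taken from this repo (see daflow-examples/scripts for the shipped entry points):

# Illustrative only: main class and arguments are assumed
/opt/spark/bin/spark-submit \
    --master spark://spark-master:7077 \
    --class com.abhioncbr.daflow.core.LaunchETLSparkJobExecution \
    /var/daflow/demo/artifacts/daflow-*.jar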
"/spark/sbin/spark-config.sh" 4 | 5 | . "/spark/bin/load-spark-env.sh" 6 | 7 | mkdir -p $SPARK_WORKER_LOG 8 | 9 | export SPARK_HOME=/opt/spark 10 | 11 | ln -sf /dev/stdout ${SPARK_WORKER_LOG}/spark-worker.out 12 | 13 | date 14 | echo "SPARK HOME is : $SPARK_HOME" 15 | /opt/spark/sbin/../bin/spark-class org.apache.spark.deploy.worker.Worker \ 16 | --webui-port ${SPARK_WORKER_WEBUI_PORT} ${SPARK_MASTER} >> ${SPARK_WORKER_LOG}/spark-worker.out 17 | -------------------------------------------------------------------------------- /docker/scripts/setup_demo_container.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | hadoop fs -mkdir -p /user/root/daflow-examples/demo/sample-data 3 | hadoop fs -copyFromLocal -f /var/daflow/ws/daflow-examples/demo/sample-data/json_data.json /user/root/daflow-examples/demo/sample-data 4 | hadoop fs -mkdir -p /user/root/daflow-examples/demo/sample-data/daflow-result 5 | -------------------------------------------------------------------------------- /docker/setup_demo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Create host mount directory and copy 4 | mkdir -p /tmp/daflow_hadoop_namenode 5 | mkdir -p /tmp/daflow_hadoop_datanode 6 | 7 | DAFLOW_ROOT=`dirname $PWD` 8 | 9 | # restart cluster 10 | DAFLOW_WS=${DAFLOW_ROOT} docker-compose -f compose/docker-compose-daflow.yml down 11 | DAFLOW_WS=${DAFLOW_ROOT} docker-compose -f compose/docker-compose-daflow.yml pull 12 | rm -rf /tmp/daflow_hadoop_datanode/* 13 | rm -rf /tmp/daflow_hadoop_namenode/* 14 | sleep 5 15 | 16 | DAFLOW_WS=${DAFLOW_ROOT} docker-compose -f compose/docker-compose-daflow.yml up -d 17 | sleep 15 18 | 19 | docker exec -it daflow-adhoc-1 /bin/bash /var/daflow/ws/docker/scripts/setup_demo_container.sh -------------------------------------------------------------------------------- /docker/stop_demo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # set up root directory 3 | DAFLOW_ROOT=`dirname $PWD` 4 | 5 | # shut down cluster 6 | DAFLOW_WS=${DAFLOW_ROOT} docker-compose -f compose/docker-compose-daflow.yml down 7 | 8 | # remove houst mount directory 9 | rm -rf /tmp/daflow_hadoop_datanode 10 | rm -rf /tmp/daflow_hadoop_namenode 11 | -------------------------------------------------------------------------------- /style/checkstyle-suppressions.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | --------------------------------------------------------------------------------