├── .github └── PULL_REQUEST_TEMPLATE ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── build └── mvn ├── examples └── zeepline_notebook │ ├── SAC_ Spark ML.json │ ├── SAC_ Spark SQL.json │ ├── SAC_ Spark Streaming and ML.json │ ├── Spark ETL_Lineage.png │ ├── Spark ML_Lineage.png │ └── Spark_ML_Streaming_Lineage.png ├── patch ├── 1100-spark_model.json ├── Spark_ML_Listener_2.3.diff ├── Spark_ML_Listener_2.3.patch ├── Spark_ML_Listener_2.4.diff └── Spark_ML_Listener_2.4.patch ├── pom.xml ├── scalastyle-config.xml ├── spark-atlas-connector-assembly └── pom.xml └── spark-atlas-connector ├── pom.xml └── src ├── main └── scala │ ├── com │ └── hortonworks │ │ └── spark │ │ └── atlas │ │ ├── AbstractEventProcessor.scala │ │ ├── AtlasClient.scala │ │ ├── AtlasClientConf.scala │ │ ├── AtlasEntityCreationRequestHelper.scala │ │ ├── AtlasEntityReadHelper.scala │ │ ├── AtlasUtils.scala │ │ ├── KafkaAtlasClient.scala │ │ ├── RestAtlasClient.scala │ │ ├── SACAtlasEntity.scala │ │ ├── SparkAtlasEventTracker.scala │ │ ├── SparkAtlasStreamingQueryEventTracker.scala │ │ ├── ml │ │ └── MLPipelineEventProcessor.scala │ │ ├── sql │ │ ├── CommandsHarvester.scala │ │ ├── Harvester.scala │ │ ├── KafkaTopicInformation.scala │ │ ├── SparkCatalogEventProcessor.scala │ │ ├── SparkExecutionPlanProcessor.scala │ │ └── SparkExtension.scala │ │ ├── types │ │ ├── AtlasEntityUtils.scala │ │ ├── external.scala │ │ ├── internal.scala │ │ └── metadata.scala │ │ └── utils │ │ ├── CatalogUtils.scala │ │ ├── JdbcUtils.scala │ │ ├── Logging.scala │ │ ├── ReflectionHelper.scala │ │ └── SparkUtils.scala │ └── org │ └── apache │ └── spark │ └── sql │ └── kafka010 │ └── atlas │ └── ExtractFromDataSource.scala └── test ├── resources ├── atlas-application.properties ├── log4j.properties └── users.parquet └── scala └── com ├── hortonworks └── spark │ └── atlas │ ├── AtlasEntityCreationRequestHelperSuite.scala │ ├── BaseResourceIT.scala │ ├── KafkaClientIT.scala │ ├── TestUtils.scala │ ├── WithHDFSSupport.scala │ ├── WithHiveSupport.scala │ ├── WithRemoteHiveMetastoreServiceSupport.scala │ ├── ml │ ├── MLPipelineTrackerIT.scala │ └── MLPipelineWithSaveIntoSuite.scala │ ├── sql │ ├── CatalogEventToAtlasIT.scala │ ├── CreateDataSourceTableAsSelectHarvesterSuite.scala │ ├── CreateHiveTableAsSelectHarvesterSuite.scala │ ├── CreateViewHarvesterSuite.scala │ ├── InsertIntoHarvesterSuite.scala │ ├── InsertIntoHiveDirHarvesterSuite.scala │ ├── LoadDataHarvesterSuite.scala │ ├── SparkCatalogEventProcessorSuite.scala │ ├── SparkExecutionPlanProcessForRdbmsQuerySuite.scala │ ├── SparkExecutionPlanProcessorForBatchQuerySuite.scala │ ├── SparkExecutionPlanProcessorForComplicatedQuerySuite.scala │ ├── SparkExecutionPlanProcessorForStreamingQuerySuite.scala │ ├── SparkExecutionPlanProcessorForViewSuite.scala │ ├── SparkExecutionPlanProcessorWithRemoteHiveMetastoreServiceSuite.scala │ └── testhelper │ │ ├── AtlasQueryExecutionListener.scala │ │ ├── AtlasStreamingQueryProgressListener.scala │ │ ├── BaseHarvesterSuite.scala │ │ ├── CreateEntitiesTrackingAtlasClient.scala │ │ ├── DirectProcessSparkExecutionPlanProcessor.scala │ │ ├── FsEntityValidator.scala │ │ ├── KafkaTopicEntityValidator.scala │ │ ├── ProcessEntityValidator.scala │ │ └── TableEntityValidator.scala │ ├── types │ ├── AtlasExternalEntityUtilsSuite.scala │ ├── MLAtlasEntityUtilsSuite.scala │ └── SparkAtlasEntityUtilsSuite.scala │ └── utils │ ├── JdbcUtilsTest.scala │ └── SparkUtilsSuite.scala └── hotels └── beeju └── ThriftHiveMetaStoreTestUtil.scala 
/.github/PULL_REQUEST_TEMPLATE: -------------------------------------------------------------------------------- 1 | ## What changes were proposed in this pull request? 2 | 3 | (Please fill in changes proposed in this fix) 4 | 5 | ## How was this patch tested? 6 | 7 | (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | .idea 4 | *.iml 5 | target 6 | metastore_db 7 | tmp 8 | dependency-reduced-pom.xml 9 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | sudo: false 17 | dist: trusty 18 | 19 | language: java 20 | jdk: 21 | - oraclejdk8 22 | 23 | cache: 24 | directories: 25 | - $HOME/.m2 26 | 27 | notifications: 28 | email: false 29 | 30 | install: 31 | - mvn -q clean checkstyle:check scalastyle:check package -DskipTests 32 | -------------------------------------------------------------------------------- /build/mvn: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | # 19 | 20 | # Determine the current working directory 21 | _DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 22 | # Preserve the calling directory 23 | _CALLING_DIR="$(pwd)" 24 | # Options used during compilation 25 | _COMPILE_JVM_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m" 26 | 27 | # Installs any application tarball given a URL, the expected tarball name, 28 | # and, optionally, a checkable binary path to determine if the binary has 29 | # already been installed 30 | ## Arg1 - URL 31 | ## Arg2 - Tarball Name 32 | ## Arg3 - Checkable Binary 33 | install_app() { 34 | local remote_tarball="$1/$2" 35 | local local_tarball="${_DIR}/$2" 36 | local binary="${_DIR}/$3" 37 | 38 | # setup `curl` and `wget` silent options if we're running on Jenkins 39 | local curl_opts="-L" 40 | local wget_opts="" 41 | if [ -n "$AMPLAB_JENKINS" ]; then 42 | curl_opts="-s ${curl_opts}" 43 | wget_opts="--quiet ${wget_opts}" 44 | else 45 | curl_opts="--progress-bar ${curl_opts}" 46 | wget_opts="--progress=bar:force ${wget_opts}" 47 | fi 48 | 49 | if [ -z "$3" -o ! -f "$binary" ]; then 50 | # check if we already have the tarball 51 | # check if we have curl installed 52 | # download application 53 | [ ! -f "${local_tarball}" ] && [ $(command -v curl) ] && \ 54 | echo "exec: curl ${curl_opts} ${remote_tarball}" 1>&2 && \ 55 | curl ${curl_opts} "${remote_tarball}" > "${local_tarball}" 56 | # if the file still doesn't exist, lets try `wget` and cross our fingers 57 | [ ! -f "${local_tarball}" ] && [ $(command -v wget) ] && \ 58 | echo "exec: wget ${wget_opts} ${remote_tarball}" 1>&2 && \ 59 | wget ${wget_opts} -O "${local_tarball}" "${remote_tarball}" 60 | # if both were unsuccessful, exit 61 | [ ! -f "${local_tarball}" ] && \ 62 | echo -n "ERROR: Cannot download $2 with cURL or wget; " && \ 63 | echo "please install manually and try again." && \ 64 | exit 2 65 | cd "${_DIR}" && tar -xzf "$2" 66 | rm -rf "$local_tarball" 67 | fi 68 | } 69 | 70 | # Determine the Maven version from the root pom.xml file and 71 | # install maven under the build/ folder if needed. 72 | install_mvn() { 73 | local MVN_VERSION=`grep "<maven.version>" "${_DIR}/../pom.xml" | head -n1 | awk -F '[<>]' '{print $3}'` 74 | MVN_BIN="$(command -v mvn)" 75 | if [ "$MVN_BIN" ]; then 76 | local MVN_DETECTED_VERSION="$(mvn --version | head -n1 | awk '{print $3}')" 77 | fi 78 | # See simple version normalization: http://stackoverflow.com/questions/16989598/bash-comparing-version-numbers 79 | function version { echo "$@" | awk -F. 
'{ printf("%03d%03d%03d\n", $1,$2,$3); }'; } 80 | if [ $(version $MVN_DETECTED_VERSION) -lt $(version $MVN_VERSION) ]; then 81 | local APACHE_MIRROR=${APACHE_MIRROR:-'https://www.apache.org/dyn/closer.lua?action=download&filename='} 82 | 83 | install_app \ 84 | "${APACHE_MIRROR}/maven/maven-3/${MVN_VERSION}/binaries" \ 85 | "apache-maven-${MVN_VERSION}-bin.tar.gz" \ 86 | "apache-maven-${MVN_VERSION}/bin/mvn" 87 | 88 | MVN_BIN="${_DIR}/apache-maven-${MVN_VERSION}/bin/mvn" 89 | fi 90 | } 91 | 92 | # Install the proper version of Scala, Zinc and Maven for the build 93 | install_mvn 94 | 95 | # Reset the current working directory 96 | cd "${_CALLING_DIR}" 97 | 98 | # Set any `mvn` options if not already present 99 | export MAVEN_OPTS=${MAVEN_OPTS:-"$_COMPILE_JVM_OPTS"} 100 | 101 | echo "Using \`mvn\` from path: $MVN_BIN" 1>&2 102 | "${MVN_BIN}" "$@" 103 | -------------------------------------------------------------------------------- /examples/zeepline_notebook/SAC_ Spark ML.json: -------------------------------------------------------------------------------- 1 | {"paragraphs":[{"text":"%conf\nspark.app.name Spark-ML\nspark.jars /tmp/atlas/spark-atlas-connector-assembly_2.11-0.1.0-SNAPSHOT.jar\nspark.jars.packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.0\nspark.extraListeners com.hortonworks.spark.atlas.SparkAtlasEventTracker\nspark.sql.queryExecutionListeners com.hortonworks.spark.atlas.SparkAtlasEventTracker\nspark.sql.streaming.streamingQueryListeners com.hortonworks.spark.atlas.SparkAtlasStreamingQueryEventTracker\n","user":"admin","dateUpdated":"2018-06-21T22:26:50+0000","config":{"colWidth":12,"fontSize":9,"enabled":true,"results":{},"editorSetting":{"language":"text","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"editorMode":"ace/mode/text"},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1529020561038_-89670889","id":"20180614-235601_1829676857","dateCreated":"2018-06-14T23:56:01+0000","dateStarted":"2018-06-21T22:26:50+0000","dateFinished":"2018-06-21T22:26:50+0000","status":"FINISHED","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:754"},{"title":" Step2: Build a Spark ML pipeline and store the trained model into HDFS","text":"%spark2\nimport org.apache.spark.ml.{Pipeline, PipelineModel}\nimport org.apache.spark.ml.feature.{StopWordsRemover, Tokenizer}\nimport org.apache.spark.sql.SparkSession\nimport org.apache.spark.sql.SparkSession\nimport spark.implicits._\n\nval training = spark.sql(\"select * from training_table\")\n\ntraining.show()\n\n// Configure an ML pipeline, which consists of three stages: tokenizer, remover.\nval tokenizer = new Tokenizer().setInputCol(\"text\").setOutputCol(\"words\")\n\nval remover = new StopWordsRemover().setInputCol(\"words\").setOutputCol(\"filtered\")\n\nval pipeline = new Pipeline().setStages(Array(tokenizer, remover))\n\nval model = pipeline.fit(training)\n\nval pipelineDir = \"/tmp/pipeline_streaming_dir\"\n\nval modelDir = \"/tmp/model_streaming_dir\"\n\npipeline.write.overwrite().save(pipelineDir)\n\nmodel.write.overwrite().save(modelDir)\n","user":"admin","dateUpdated":"2018-06-21T22:28:02+0000","config":{"editorSetting":{"language":"scala","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"colWidth":12,"editorMode":"ace/mode/scala","title":true,"results":{},"enabled":true,"fontSize":9},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"import 
org.apache.spark.ml.{Pipeline, PipelineModel}\nimport org.apache.spark.ml.feature.{StopWordsRemover, Tokenizer}\nimport org.apache.spark.sql.SparkSession\nimport org.apache.spark.sql.SparkSession\nimport spark.implicits._\ntraining: org.apache.spark.sql.DataFrame = [text: string]\n+--------------------+\n| text|\n+--------------------+\n|\"Hortonworks is a...|\n|Temporary views i...|\n|\"Datasets are sim...|\n+--------------------+\n\ntokenizer: org.apache.spark.ml.feature.Tokenizer = tok_01d4ad905c67\nremover: org.apache.spark.ml.feature.StopWordsRemover = stopWords_c83b472c2ee9\npipeline: org.apache.spark.ml.Pipeline = pipeline_b4b3bb3719d6\nmodel: org.apache.spark.ml.PipelineModel = pipeline_b4b3bb3719d6\npipelineDir: String = /tmp/pipeline_streaming_dir\nmodelDir: String = /tmp/model_streaming_dir\n"}]},"runtimeInfos":{"jobUrl":{"propertyName":"jobUrl","label":"SPARK JOB","tooltip":"View in Spark web UI","group":"spark","values":["http://ctr-e138-1518143905142-364859-01-000004.hwx.site:4041/jobs/job?id=0","http://ctr-e138-1518143905142-364859-01-000004.hwx.site:4041/jobs/job?id=1","http://ctr-e138-1518143905142-364859-01-000004.hwx.site:4041/jobs/job?id=2","http://ctr-e138-1518143905142-364859-01-000004.hwx.site:4041/jobs/job?id=3","http://ctr-e138-1518143905142-364859-01-000004.hwx.site:4041/jobs/job?id=4","http://ctr-e138-1518143905142-364859-01-000004.hwx.site:4041/jobs/job?id=5","http://ctr-e138-1518143905142-364859-01-000004.hwx.site:4041/jobs/job?id=6","http://ctr-e138-1518143905142-364859-01-000004.hwx.site:4041/jobs/job?id=7"],"interpreterSettingId":"spark2"}},"apps":[],"jobName":"paragraph_1529009689077_1168589782","id":"20180503-200931_1296064876","dateCreated":"2018-06-14T20:54:49+0000","dateStarted":"2018-06-21T22:28:02+0000","dateFinished":"2018-06-21T22:28:53+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:755"},{"text":"%spark2\n","user":"admin","dateUpdated":"2018-06-14T20:54:49+0000","config":{"colWidth":12,"editorMode":"ace/mode/scala","results":{},"enabled":true,"editorSetting":{"language":"scala","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"fontSize":9},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1529009689079_-1579730663","id":"20180506-231703_1473196617","dateCreated":"2018-06-14T20:54:49+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:756"}],"name":"SAC: Spark ML","id":"2DF4E4NE9","noteParams":{},"noteForms":{},"angularObjects":{"spark2:shared_process":[]},"config":{"isZeppelinNotebookCronEnable":false,"looknfeel":"default","personalizedMode":"false"},"info":{}} -------------------------------------------------------------------------------- /examples/zeepline_notebook/SAC_ Spark SQL.json: -------------------------------------------------------------------------------- 1 | {"paragraphs":[{"text":"%conf\nspark.app.name Spark-ETL\nspark.jars /tmp/atlas/spark-atlas-connector-assembly_2.11-0.1.0-SNAPSHOT.jar\nspark.jars.packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.0\nspark.extraListeners com.hortonworks.spark.atlas.SparkAtlasEventTracker\nspark.sql.queryExecutionListeners com.hortonworks.spark.atlas.SparkAtlasEventTracker\nspark.sql.streaming.streamingQueryListeners 
com.hortonworks.spark.atlas.SparkAtlasStreamingQueryEventTracker","user":"admin","dateUpdated":"2018-06-21T22:18:16+0000","config":{"colWidth":12,"fontSize":9,"enabled":true,"results":{},"editorSetting":{"language":"text","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"editorMode":"ace/mode/text"},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1529009709294_238885199","id":"20180614-205509_1930236875","dateCreated":"2018-06-14T20:55:09+0000","dateStarted":"2018-06-21T22:18:16+0000","dateFinished":"2018-06-21T22:18:16+0000","status":"FINISHED","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:2470"},{"title":"Step1: Build a Hive Table","text":"%spark2\nimport org.apache.spark.sql.SparkSession\nimport spark.implicits._\nimport org.apache.spark.sql.{SaveMode, SparkSession}\n\nspark.sql(\"CREATE TABLE IF NOT EXISTS training_table (text STRING) USING hive\")\n\nval trainData = Seq(\n (\"Hortonworks is a big data software company based in Santa Clara, California.\"),\n (\"Temporary views in Spark SQL are session-scoped and will disappear if the session that creates it terminates.\"),\n (\"Datasets are similar to RDDs, however, instead of using Java serialization or Kryo they use a specialized Encoder.\")\n).toDF(\"text\")\n\ntrainData.write.mode(SaveMode.Overwrite).format(\"csv\").save(\"/tmp/training_table.csv\")\n\nspark.sql(\"LOAD DATA INPATH '/tmp/training_table.csv' INTO TABLE training_table\")\nspark.sql(\"select * from training_table\").show()\n","user":"admin","dateUpdated":"2018-06-21T22:18:24+0000","config":{"tableHide":false,"editorSetting":{"language":"scala","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"colWidth":12,"editorMode":"ace/mode/scala","editorHide":false,"title":true,"results":{},"enabled":true,"fontSize":9},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"import org.apache.spark.sql.SparkSession\nimport spark.implicits._\nimport org.apache.spark.sql.{SaveMode, SparkSession}\nres4: org.apache.spark.sql.DataFrame = []\ntrainData: org.apache.spark.sql.DataFrame = [text: string]\nres6: org.apache.spark.sql.DataFrame = []\n+--------------------+\n| text|\n+--------------------+\n|\"Hortonworks is a...|\n|Temporary views i...|\n|\"Datasets are sim...|\n+--------------------+\n\n"}]},"runtimeInfos":{"jobUrl":{"propertyName":"jobUrl","label":"SPARK JOB","tooltip":"View in Spark web 
UI","group":"spark","values":["http://ctr-e138-1518143905142-364859-01-000004.hwx.site:4041/jobs/job?id=0","http://ctr-e138-1518143905142-364859-01-000004.hwx.site:4041/jobs/job?id=1","http://ctr-e138-1518143905142-364859-01-000004.hwx.site:4041/jobs/job?id=2"],"interpreterSettingId":"spark2"}},"apps":[],"jobName":"paragraph_1529009617489_-1522856994","id":"20180503-195630_1514310671","dateCreated":"2018-06-14T20:53:37+0000","dateStarted":"2018-06-21T22:18:24+0000","dateFinished":"2018-06-21T22:19:18+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:2471"},{"text":"","user":"admin","dateUpdated":"2018-06-18T21:57:33+0000","config":{"colWidth":12,"editorMode":"ace/mode/scala","results":{},"enabled":true,"editorSetting":{"language":"scala","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"fontSize":9},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1529009617505_1441244159","id":"20180503-200854_877477857","dateCreated":"2018-06-14T20:53:37+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:2472"}],"name":"SAC: Spark SQL","id":"2DGPRFBM9","noteParams":{},"noteForms":{},"angularObjects":{"spark2:shared_process":[]},"config":{"isZeppelinNotebookCronEnable":false,"looknfeel":"default","personalizedMode":"false"},"info":{}} -------------------------------------------------------------------------------- /examples/zeepline_notebook/SAC_ Spark Streaming and ML.json: -------------------------------------------------------------------------------- 1 | {"paragraphs":[{"text":"%conf\nspark.app.name Spark-Streaming\nspark.jars /tmp/atlas/spark-atlas-connector-assembly_2.11-0.1.0-SNAPSHOT.jar\nspark.jars.packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.0\nspark.extraListeners com.hortonworks.spark.atlas.SparkAtlasEventTracker\nspark.sql.queryExecutionListeners com.hortonworks.spark.atlas.SparkAtlasEventTracker\nspark.sql.streaming.streamingQueryListeners com.hortonworks.spark.atlas.SparkAtlasStreamingQueryEventTracker","user":"admin","dateUpdated":"2018-06-21T22:40:38+0000","config":{"colWidth":12,"fontSize":9,"enabled":true,"results":{},"editorSetting":{"language":"text","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"editorMode":"ace/mode/text"},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1529020694752_1363032102","id":"20180614-235814_690445349","dateCreated":"2018-06-14T23:58:14+0000","dateStarted":"2018-06-21T22:40:38+0000","dateFinished":"2018-06-21T22:40:38+0000","status":"FINISHED","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:3152"},{"title":"Step3: Build a Spark streaming processing pipeline with trained model for Kafka Streaming","text":"%spark2\n\nimport org.apache.spark.ml.{Pipeline, PipelineModel}\nimport org.apache.spark.sql.streaming.{OutputMode, Trigger}\n\nval kafkaServer = \"172.27.22.200:6667\"\n\nval sameModel = PipelineModel.load(\"/tmp/model_streaming_dir\")\n \nval df = spark.readStream.format(\"kafka\").option(\"kafka.bootstrap.servers\", kafkaServer).option(\"subscribe\", \"kafka_input\").load()\n\nval df2 = df.selectExpr(\"CAST(key AS STRING)\", \"CAST(value AS STRING)\").as[(String, String)].toDF(\"id\", \"text\")\n\n//sink streaming data to other kafaka \nval output = sameModel.transform(df2).toDF(\"key\", \"value\", \"words\", \"filtered\").selectExpr(\"CAST(key AS STRING)\", \"CAST(value AS 
STRING)\")\n\noutput.writeStream.format(\"kafka\").option(\"kafka.bootstrap.servers\", kafkaServer).option(\"checkpointLocation\", \"/tmp/demo/chckpnt\").option(\"topic\", \"kafka_output\").start()\n","user":"admin","dateUpdated":"2018-06-21T22:40:41+0000","config":{"editorSetting":{"language":"scala","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"colWidth":12,"editorMode":"ace/mode/scala","title":true,"results":{},"enabled":true,"fontSize":9},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"import org.apache.spark.ml.{Pipeline, PipelineModel}\nimport org.apache.spark.sql.streaming.{OutputMode, Trigger}\nkafkaServer: String = 172.27.22.200:6667\nsameModel: org.apache.spark.ml.PipelineModel = pipeline_b4b3bb3719d6\ndf: org.apache.spark.sql.DataFrame = [key: binary, value: binary ... 5 more fields]\ndf2: org.apache.spark.sql.DataFrame = [id: string, text: string]\noutput: org.apache.spark.sql.DataFrame = [key: string, value: string]\nres5: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@1d5473b0\n"}]},"runtimeInfos":{"jobUrl":{"propertyName":"jobUrl","label":"SPARK JOB","tooltip":"View in Spark web UI","group":"spark","values":["http://ctr-e138-1518143905142-364859-01-000004.hwx.site:4041/jobs/job?id=0","http://ctr-e138-1518143905142-364859-01-000004.hwx.site:4041/jobs/job?id=1","http://ctr-e138-1518143905142-364859-01-000004.hwx.site:4041/jobs/job?id=2","http://ctr-e138-1518143905142-364859-01-000004.hwx.site:4041/jobs/job?id=3","http://ctr-e138-1518143905142-364859-01-000004.hwx.site:4041/jobs/job?id=4"],"interpreterSettingId":"spark2"}},"apps":[],"jobName":"paragraph_1529009681224_1846251547","id":"20180503-201028_932250986","dateCreated":"2018-06-14T20:54:41+0000","dateStarted":"2018-06-21T22:40:41+0000","dateFinished":"2018-06-21T22:41:24+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:3153"},{"text":"%spark2\n","user":"admin","dateUpdated":"2018-06-14T20:54:41+0000","config":{"colWidth":12,"editorMode":"ace/mode/scala","results":{},"enabled":true,"editorSetting":{"language":"scala","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"fontSize":9},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1529009681226_1853252567","id":"20180506-232424_172987971","dateCreated":"2018-06-14T20:54:41+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:3154"}],"name":"SAC: Spark Streaming and ML","id":"2DJSKKUA5","noteParams":{},"noteForms":{},"angularObjects":{"spark2:shared_process":[]},"config":{"isZeppelinNotebookCronEnable":false,"looknfeel":"default","personalizedMode":"false"},"info":{}} -------------------------------------------------------------------------------- /examples/zeepline_notebook/Spark ETL_Lineage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hortonworks-spark/spark-atlas-connector/0b10e337cdfd427744a92f8505d46297afb4c295/examples/zeepline_notebook/Spark ETL_Lineage.png -------------------------------------------------------------------------------- /examples/zeepline_notebook/Spark ML_Lineage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hortonworks-spark/spark-atlas-connector/0b10e337cdfd427744a92f8505d46297afb4c295/examples/zeepline_notebook/Spark ML_Lineage.png 
-------------------------------------------------------------------------------- /examples/zeepline_notebook/Spark_ML_Streaming_Lineage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hortonworks-spark/spark-atlas-connector/0b10e337cdfd427744a92f8505d46297afb4c295/examples/zeepline_notebook/Spark_ML_Streaming_Lineage.png -------------------------------------------------------------------------------- /spark-atlas-connector-assembly/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 22 | 4.0.0 23 | 24 | com.hortonworks.spark 25 | spark-atlas-connector-main_2.11 26 | 0.1.0-SNAPSHOT 27 | ../pom.xml 28 | 29 | 30 | spark-atlas-connector-assembly 31 | jar 32 | 33 | 34 | 35 | com.hortonworks.spark 36 | spark-atlas-connector_${scala.binary.version} 37 | ${project.version} 38 | 39 | 40 | 41 | 42 | target/scala-${scala.binary.version}/classes 43 | target/scala-${scala.binary.version}/test-classes 44 | 45 | 46 | org.apache.maven.plugins 47 | maven-shade-plugin 48 | 49 | false 50 | 51 | 52 | *:* 53 | 54 | 55 | 56 | 57 | *:* 58 | 59 | META-INF/*.SF 60 | META-INF/*.DSA 61 | META-INF/*.RSA 62 | 63 | 64 | 65 | 66 | 67 | 68 | package 69 | 70 | shade 71 | 72 | 73 | 74 | 75 | 76 | reference.conf 77 | 78 | 79 | log4j.properties 80 | 81 | 82 | 83 | 84 | 85 | 86 | org.apache.hadoop.hbase 87 | com.hortonworks.spark.atlas.shade.org.apache.hbase 88 | 89 | 90 | org.apache.htrace 91 | com.hortonworks.spark.atlas.shade.org.apache.htrace 92 | 93 | 94 | org.apache.commons.configuration 95 | com.hortonworks.spark.atlas.shade.org.apache.commons.configuration 96 | 97 | 98 | com.sun.jersey 99 | com.hortonworks.spark.atlas.shade.com.sun.jersey 100 | 101 | 102 | org.codehaus.jackson 103 | com.hortonworks.spark.atlas.shade.org.codehaus.jackson 104 | 105 | 106 | javax.ws.rs 107 | com.hortonworks.spark.atlas.javax.ws.rs 108 | 109 | 110 | com.fasterxml.jackson 111 | com.hortonworks.spark.atlas.com.fasterxml.jackson 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /spark-atlas-connector/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 21 | 4.0.0 22 | 23 | 24 | com.hortonworks.spark 25 | spark-atlas-connector-main_2.11 26 | 0.1.0-SNAPSHOT 27 | ../pom.xml 28 | 29 | 30 | spark-atlas-connector_2.11 31 | jar 32 | 33 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/main/scala/com/hortonworks/spark/atlas/AbstractEventProcessor.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas 19 | 20 | import java.util.concurrent.{LinkedBlockingQueue, TimeUnit} 21 | 22 | import scala.reflect.ClassTag 23 | import scala.util.control.NonFatal 24 | import com.google.common.annotations.VisibleForTesting 25 | import com.hortonworks.spark.atlas.utils.Logging 26 | 27 | abstract class AbstractEventProcessor[T: ClassTag] extends Logging { 28 | def conf: AtlasClientConf 29 | 30 | private val capacity = conf.get(AtlasClientConf.BLOCKING_QUEUE_CAPACITY).toInt 31 | 32 | private[atlas] val eventQueue = new LinkedBlockingQueue[T](capacity) 33 | 34 | private val timeout = conf.get(AtlasClientConf.BLOCKING_QUEUE_PUT_TIMEOUT).toInt 35 | 36 | private val eventProcessThread = new Thread { 37 | override def run(): Unit = { 38 | eventProcess() 39 | } 40 | } 41 | 42 | def pushEvent(event: T): Unit = { 43 | event match { 44 | case e: T => 45 | if (!eventQueue.offer(e, timeout, TimeUnit.MILLISECONDS)) { 46 | logError(s"Fail to put event $e into queue within time limit $timeout, will throw it") 47 | } 48 | case _ => // Ignore other events 49 | } 50 | } 51 | 52 | def startThread(): Unit = { 53 | eventProcessThread.setName(this.getClass.getSimpleName + "-thread") 54 | eventProcessThread.setDaemon(true) 55 | 56 | val ctxClassLoader = Thread.currentThread().getContextClassLoader 57 | if (ctxClassLoader != null && getClass.getClassLoader != ctxClassLoader) { 58 | eventProcessThread.setContextClassLoader(ctxClassLoader) 59 | } 60 | 61 | eventProcessThread.start() 62 | } 63 | 64 | protected def process(e: T): Unit 65 | 66 | @VisibleForTesting 67 | private[atlas] def eventProcess(): Unit = { 68 | var stopped = false 69 | while (!stopped) { 70 | try { 71 | Option(eventQueue.poll(3000, TimeUnit.MILLISECONDS)).foreach { e => 72 | process(e) 73 | } 74 | } catch { 75 | case _: InterruptedException => 76 | logDebug("Thread is interrupted") 77 | stopped = true 78 | 79 | case NonFatal(f) => 80 | logWarn(s"Caught exception during parsing event", f) 81 | } 82 | } 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/main/scala/com/hortonworks/spark/atlas/AtlasClient.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas 19 | 20 | import scala.util.control.NonFatal 21 | 22 | import com.sun.jersey.core.util.MultivaluedMapImpl 23 | import org.apache.atlas.model.instance.AtlasEntity 24 | import org.apache.atlas.model.typedef.AtlasTypesDef 25 | 26 | import com.hortonworks.spark.atlas.utils.Logging 27 | 28 | trait AtlasClient extends Logging { 29 | 30 | def createAtlasTypeDefs(typeDefs: AtlasTypesDef): Unit 31 | 32 | def getAtlasTypeDefs(searchParams: MultivaluedMapImpl): AtlasTypesDef 33 | 34 | def updateAtlasTypeDefs(typeDefs: AtlasTypesDef): Unit 35 | 36 | final def createEntitiesWithDependencies( 37 | entity: SACAtlasReferenceable): Unit = this.synchronized { 38 | entity match { 39 | case e: SACAtlasEntityWithDependencies => 40 | // handle dependencies first 41 | if (e.dependencies.nonEmpty) { 42 | val deps = e.dependencies.filter(_.isInstanceOf[SACAtlasEntityWithDependencies]) 43 | .map(_.asInstanceOf[SACAtlasEntityWithDependencies]) 44 | 45 | val depsHavingAnotherDeps = deps.filter(_.dependencies.nonEmpty) 46 | val depsHavingNoDeps = deps.filterNot(_.dependencies.nonEmpty) 47 | 48 | // we should handle them one by one if they're having additional dependencies 49 | depsHavingAnotherDeps.foreach(createEntitiesWithDependencies) 50 | 51 | // otherwise, we can handle them at once 52 | createEntities(depsHavingNoDeps.map(_.entity)) 53 | } 54 | 55 | // done with dependencies, process origin entity 56 | createEntities(Seq(e.entity)) 57 | 58 | case _ => // don't request creation entity for reference 59 | } 60 | } 61 | 62 | final def createEntitiesWithDependencies( 63 | entities: Seq[SACAtlasReferenceable]): Unit = this.synchronized { 64 | entities.foreach(createEntitiesWithDependencies) 65 | } 66 | 67 | final def createEntities(entities: Seq[AtlasEntity]): Unit = this.synchronized { 68 | if (entities.isEmpty) { 69 | return 70 | } 71 | 72 | try { 73 | doCreateEntities(entities) 74 | } catch { 75 | case NonFatal(e) => 76 | logWarn(s"Failed to create entities", e) 77 | } 78 | } 79 | 80 | protected def doCreateEntities(entities: Seq[AtlasEntity]): Unit 81 | 82 | final def deleteEntityWithUniqueAttr( 83 | entityType: String, attribute: String): Unit = this.synchronized { 84 | try { 85 | doDeleteEntityWithUniqueAttr(entityType, attribute) 86 | } catch { 87 | case NonFatal(e) => 88 | logWarn(s"Failed to delete entity with type $entityType", e) 89 | } 90 | } 91 | 92 | protected def doDeleteEntityWithUniqueAttr(entityType: String, attribute: String): Unit 93 | 94 | final def updateEntityWithUniqueAttr( 95 | entityType: String, 96 | attribute: String, 97 | entity: AtlasEntity): Unit = this.synchronized { 98 | try { 99 | doUpdateEntityWithUniqueAttr(entityType, attribute, entity) 100 | } catch { 101 | case NonFatal(e) => 102 | logWarn(s"Failed to update entity $entity with type $entityType and attribute " + 103 | s"$attribute", e) 104 | } 105 | } 106 | 107 | protected def doUpdateEntityWithUniqueAttr( 108 | entityType: String, 109 | attribute: String, 110 | entity: AtlasEntity): Unit 111 | } 112 | 113 | object AtlasClient { 114 | @volatile private var client: AtlasClient = null 115 | 116 | def atlasClient(conf: AtlasClientConf): AtlasClient = { 117 | if (client == null) { 118 | AtlasClient.synchronized { 119 | if (client == null) { 120 | conf.get(AtlasClientConf.CLIENT_TYPE).trim match { 121 | case "rest" => 122 | client = new RestAtlasClient(conf) 123 | case "kafka" => 124 | client = new KafkaAtlasClient(conf) 125 | case e => 126 | client = Class.forName(e) 127 | 
.getConstructor(classOf[AtlasClientConf]) 128 | .newInstance(conf) 129 | .asInstanceOf[AtlasClient] 130 | } 131 | } 132 | } 133 | } 134 | 135 | client 136 | } 137 | } 138 | 139 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/main/scala/com/hortonworks/spark/atlas/AtlasClientConf.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas 19 | 20 | import org.apache.atlas.ApplicationProperties 21 | import com.hortonworks.spark.atlas.AtlasClientConf.ConfigEntry 22 | 23 | class AtlasClientConf { 24 | 25 | private lazy val configuration = ApplicationProperties.get() 26 | 27 | def set(key: String, value: String): AtlasClientConf = { 28 | configuration.setProperty(key, value) 29 | this 30 | } 31 | 32 | def set(key: ConfigEntry, value: String): AtlasClientConf = { 33 | configuration.setProperty(key.key, value) 34 | this 35 | } 36 | 37 | def get(key: String, defaultValue: String): String = { 38 | Option(configuration.getProperty(key).asInstanceOf[String]).getOrElse(defaultValue) 39 | } 40 | 41 | def getOption(key: String): Option[String] = { 42 | Option(configuration.getProperty(key).asInstanceOf[String]) 43 | } 44 | 45 | def getUrl(key: String): Object = { 46 | configuration.getProperty(key) 47 | } 48 | 49 | def get(t: ConfigEntry): String = { 50 | Option(configuration.getProperty(t.key).asInstanceOf[String]).getOrElse(t.defaultValue) 51 | } 52 | } 53 | 54 | object AtlasClientConf { 55 | case class ConfigEntry(key: String, defaultValue: String) 56 | 57 | val ATLAS_SPARK_ENABLED = ConfigEntry("atlas.spark.enabled", "true") 58 | 59 | val ATLAS_REST_ENDPOINT = ConfigEntry("atlas.rest.address", "localhost:21000") 60 | 61 | val BLOCKING_QUEUE_CAPACITY = ConfigEntry("atlas.blockQueue.size", "10000") 62 | val BLOCKING_QUEUE_PUT_TIMEOUT = ConfigEntry("atlas.blockQueue.putTimeout.ms", "3000") 63 | 64 | val CLIENT_TYPE = ConfigEntry("atlas.client.type", "kafka") 65 | val CLIENT_USERNAME = ConfigEntry("atlas.client.username", "admin") 66 | val CLIENT_PASSWORD = ConfigEntry("atlas.client.password", "admin123") 67 | val CLIENT_NUM_RETRIES = ConfigEntry("atlas.client.numRetries", "3") 68 | 69 | val CLUSTER_NAME = ConfigEntry("atlas.cluster.name", "primary") 70 | } 71 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/main/scala/com/hortonworks/spark/atlas/AtlasEntityCreationRequestHelper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license 
agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas 19 | 20 | import java.util.UUID 21 | 22 | import com.hortonworks.spark.atlas.types.metadata 23 | import com.hortonworks.spark.atlas.utils.Logging 24 | import org.apache.atlas.model.instance.AtlasObjectId 25 | 26 | import scala.collection.mutable 27 | 28 | class AtlasEntityCreationRequestHelper( 29 | atlasClient: AtlasClient) extends Logging { 30 | // query to (inputs, outputs) 31 | private val queryToInputsAndOutputs = new mutable.HashMap[UUID, 32 | (Set[AtlasObjectId], Set[AtlasObjectId])]() 33 | 34 | def requestCreation(entities: Seq[SACAtlasReferenceable], queryId: Option[UUID] = None): Unit = { 35 | queryId match { 36 | case Some(rid) => updateEntitiesForStreamingQuery(rid, entities) 37 | case None => updateEntitiesForBatchQuery(entities) 38 | } 39 | } 40 | 41 | private def updateEntitiesForBatchQuery(entities: Seq[SACAtlasReferenceable]): Unit = { 42 | // the query is batch, hence always create entities 43 | // create input/output entities as well as update process entity(-ies) 44 | createEntities(entities) 45 | } 46 | 47 | private def updateEntitiesForStreamingQuery( 48 | queryId: UUID, 49 | entities: Seq[SACAtlasReferenceable]): Unit = { 50 | // the query is streaming, so which partial of source/sink entities can be seen 51 | // in specific batch - need to accumulate efficiently 52 | val processes = entities 53 | .filter(en => en.typeName == metadata.PROCESS_TYPE_STRING 54 | && en.isInstanceOf[SACAtlasEntityWithDependencies]) 55 | .map(_.asInstanceOf[SACAtlasEntityWithDependencies]) 56 | 57 | val inputs = processes.flatMap { p => 58 | AtlasEntityReadHelper.getSeqAtlasObjectIdAttribute(p.entity, "inputs") 59 | }.toSet 60 | 61 | val outputs = processes.flatMap { p => 62 | AtlasEntityReadHelper.getSeqAtlasObjectIdAttribute(p.entity, "outputs") 63 | }.toSet 64 | 65 | queryToInputsAndOutputs.get(queryId) match { 66 | case Some((is, os)) if !inputs.subsetOf(is) || !outputs.subsetOf(os) => 67 | // The query is streaming, and at least either inputs or outputs is not a 68 | // subset of accumulated one. 69 | 70 | // NOTE: we leverage the 'process' model's definition: 71 | // inputs and outputs are defined as set in definition, and Atlas automatically 72 | // accumulate these values which doesn't require us to track all inputs and 73 | // outputs and always provide accumulated one. 74 | // If we need to do in our own, we should also accumulate inputs and outputs 75 | // in SparkCatalogEventProcessor and maintain full of inputs and outputs. 76 | // Here we only accumulate inputs/outputs for each streaming query (queryId). 
77 | 78 | createEntities(entities) 79 | 80 | // update inputs and outputs as accumulating current one and new inputs/outputs 81 | updateInputsAndOutputs(queryId, is.union(inputs), os.union(outputs)) 82 | 83 | case Some((_, _)) => // if inputs.subsetOf(is) && outputs.subsetOf(os) 84 | // we already updated superset of inputs/outputs, skip updating 85 | 86 | case _ => 87 | // the streaming query hasn't been examined in current session 88 | createEntities(entities) 89 | 90 | // update inputs and outputs as new inputs/outputs, as there's nothing to accumulate 91 | updateInputsAndOutputs(queryId, inputs, outputs) 92 | } 93 | } 94 | 95 | private def createEntities(entities: Seq[SACAtlasReferenceable]): Unit = { 96 | // create input/output entities as well as update process entity(-ies) 97 | atlasClient.createEntitiesWithDependencies(entities) 98 | logDebug(s"Created entities without columns") 99 | } 100 | 101 | private def updateInputsAndOutputs( 102 | queryId: UUID, 103 | newInputs: Set[AtlasObjectId], 104 | newOutputs: Set[AtlasObjectId]): Unit = { 105 | queryToInputsAndOutputs.put(queryId, (newInputs, newOutputs)) 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/main/scala/com/hortonworks/spark/atlas/AtlasEntityReadHelper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas 19 | 20 | import scala.collection.convert.Wrappers.SeqWrapper 21 | import org.apache.atlas.model.instance.{AtlasEntity, AtlasObjectId} 22 | 23 | object AtlasEntityReadHelper { 24 | def listAtlasEntitiesAsType(entities: Seq[AtlasEntity], typeStr: String): Seq[AtlasEntity] = { 25 | entities.filter(p => p.getTypeName.equals(typeStr)) 26 | } 27 | 28 | def getOnlyOneEntity(entities: Seq[AtlasEntity], typeStr: String): AtlasEntity = { 29 | val filteredEntities = entities.filter { p => 30 | p.getTypeName.equals(typeStr) 31 | } 32 | assert(filteredEntities.size == 1) 33 | filteredEntities.head 34 | } 35 | 36 | def getOnlyOneObjectId(objIds: Seq[AtlasObjectId], typeStr: String): AtlasObjectId = { 37 | val filteredObjIds = objIds.filter { p => 38 | p.getTypeName.equals(typeStr) 39 | } 40 | assert(filteredObjIds.size == 1) 41 | filteredObjIds.head 42 | } 43 | 44 | def getOnlyOneEntityOnAttribute( 45 | entities: Seq[AtlasEntity], 46 | attrName: String, 47 | attrValue: String): AtlasEntity = { 48 | val filteredEntities = entities.filter { p => 49 | p.getAttribute(attrName).equals(attrValue) 50 | } 51 | assert(filteredEntities.size == 1) 52 | filteredEntities.head 53 | } 54 | 55 | def getStringAttribute(entity: AtlasEntity, attrName: String): String = { 56 | entity.getAttribute(attrName).asInstanceOf[String] 57 | } 58 | 59 | def getQualifiedName(entity: AtlasEntity): String = { 60 | entity.getAttribute(org.apache.atlas.AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME) 61 | .asInstanceOf[String] 62 | } 63 | 64 | def getAtlasEntityAttribute(entity: AtlasEntity, attrName: String): AtlasEntity = { 65 | entity.getAttribute(attrName).asInstanceOf[AtlasEntity] 66 | } 67 | 68 | def getAtlasObjectIdAttribute(entity: AtlasEntity, attrName: String): AtlasObjectId = { 69 | entity.getAttribute(attrName).asInstanceOf[AtlasObjectId] 70 | } 71 | 72 | def getAtlasObjectIdRelationshipAttribute( 73 | entity: AtlasEntity, 74 | attrName: String): AtlasObjectId = { 75 | entity.getRelationshipAttribute(attrName).asInstanceOf[AtlasObjectId] 76 | } 77 | 78 | def getSeqAtlasEntityAttribute( 79 | entity: AtlasEntity, 80 | attrName: String): Seq[AtlasEntity] = { 81 | entity.getAttribute(attrName).asInstanceOf[SeqWrapper[AtlasEntity]].underlying 82 | } 83 | 84 | def getSeqAtlasObjectIdAttribute( 85 | entity: AtlasEntity, 86 | attrName: String): Seq[AtlasObjectId] = { 87 | entity.getAttribute(attrName).asInstanceOf[SeqWrapper[AtlasObjectId]].underlying 88 | } 89 | 90 | def getSeqAtlasObjectIdRelationshipAttribute( 91 | entity: AtlasEntity, 92 | attrName: String): Seq[AtlasObjectId] = { 93 | entity.getRelationshipAttribute(attrName).asInstanceOf[SeqWrapper[AtlasObjectId]].underlying 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/main/scala/com/hortonworks/spark/atlas/AtlasUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas 19 | 20 | import java.util.concurrent.atomic.AtomicLong 21 | 22 | import com.hortonworks.spark.atlas.utils.Logging 23 | import org.apache.atlas.model.instance.{AtlasEntity, AtlasObjectId} 24 | 25 | object AtlasUtils extends Logging { 26 | private val executionId = new AtomicLong(0L) 27 | 28 | def entityToReference(entity: AtlasEntity, useGuid: Boolean = false): AtlasObjectId = { 29 | if (useGuid) { 30 | new AtlasObjectId(entity.getGuid) 31 | } else { 32 | new AtlasObjectId(entity.getTypeName, "qualifiedName", entity.getAttribute("qualifiedName")) 33 | } 34 | } 35 | 36 | def entitiesToReferences( 37 | entities: Seq[AtlasEntity], 38 | useGuid: Boolean = false): Set[AtlasObjectId] = { 39 | entities.map(entityToReference(_, useGuid)).toSet 40 | } 41 | 42 | def issueExecutionId(): Long = executionId.getAndIncrement() 43 | 44 | def isSacEnabled(conf: AtlasClientConf): Boolean = { 45 | if (!conf.get(AtlasClientConf.ATLAS_SPARK_ENABLED).toBoolean) { 46 | logWarn("Spark Atlas Connector is disabled.") 47 | false 48 | } else { 49 | true 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/main/scala/com/hortonworks/spark/atlas/KafkaAtlasClient.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas 19 | 20 | import java.util 21 | 22 | import scala.collection.JavaConverters._ 23 | import com.sun.jersey.core.util.MultivaluedMapImpl 24 | import org.apache.atlas.hook.AtlasHook 25 | import org.apache.atlas.model.typedef.AtlasTypesDef 26 | import org.apache.atlas.model.instance.{AtlasEntity, AtlasObjectId} 27 | import org.apache.atlas.v1.model.notification.HookNotificationV1 28 | import org.apache.atlas.v1.model.notification.HookNotificationV1.{EntityCreateRequest, EntityDeleteRequest} 29 | import org.apache.atlas.v1.model.instance.Referenceable 30 | import org.apache.atlas.model.notification.HookNotification 31 | import com.hortonworks.spark.atlas.utils.SparkUtils 32 | import org.apache.atlas.AtlasClientV2.API_V2 33 | import org.apache.atlas.model.instance.AtlasEntity.{AtlasEntitiesWithExtInfo, AtlasEntityWithExtInfo} 34 | import org.apache.atlas.model.notification.HookNotification.{EntityCreateRequestV2, EntityDeleteRequestV2, EntityPartialUpdateRequestV2} 35 | 36 | class KafkaAtlasClient(atlasClientConf: AtlasClientConf) extends AtlasHook with AtlasClient { 37 | 38 | protected def getNumberOfRetriesPropertyKey: String = { 39 | AtlasClientConf.CLIENT_NUM_RETRIES.key 40 | } 41 | 42 | override def createAtlasTypeDefs(typeDefs: AtlasTypesDef): Unit = { 43 | throw new UnsupportedOperationException("Kafka atlas client doesn't support create type defs") 44 | } 45 | 46 | override def getAtlasTypeDefs(searchParams: MultivaluedMapImpl): AtlasTypesDef = { 47 | throw new UnsupportedOperationException("Kafka atlas client doesn't support get type defs") 48 | } 49 | 50 | override def updateAtlasTypeDefs(typeDefs: AtlasTypesDef): Unit = { 51 | throw new UnsupportedOperationException("Kafka atlas client doesn't support update type defs") 52 | } 53 | 54 | override protected def doCreateEntities(entities: Seq[AtlasEntity]): Unit = { 55 | val entitiesWithExtInfo = new AtlasEntitiesWithExtInfo() 56 | entities.foreach(entitiesWithExtInfo.addEntity) 57 | val createRequest = new EntityCreateRequestV2( 58 | SparkUtils.currUser(), entitiesWithExtInfo): HookNotification 59 | 60 | notifyEntities(Seq(createRequest).asJava, SparkUtils.ugi()) 61 | } 62 | 63 | override protected def doDeleteEntityWithUniqueAttr( 64 | entityType: String, 65 | attribute: String): Unit = { 66 | val deleteRequest = new EntityDeleteRequestV2( 67 | SparkUtils.currUser(), 68 | Seq(new AtlasObjectId(entityType, 69 | org.apache.atlas.AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, 70 | attribute)).asJava 71 | ): HookNotification 72 | 73 | notifyEntities(Seq(deleteRequest).asJava, SparkUtils.ugi()) 74 | } 75 | 76 | override protected def doUpdateEntityWithUniqueAttr( 77 | entityType: String, 78 | attribute: String, 79 | entity: AtlasEntity): Unit = { 80 | val partialUpdateRequest = new EntityPartialUpdateRequestV2( 81 | SparkUtils.currUser(), 82 | new AtlasObjectId(entityType, 83 | org.apache.atlas.AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, 84 | attribute), 85 | new AtlasEntityWithExtInfo(entity) 86 | ): HookNotification 87 | 88 | notifyEntities(Seq(partialUpdateRequest).asJava, SparkUtils.ugi()) 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/main/scala/com/hortonworks/spark/atlas/RestAtlasClient.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas 19 | 20 | import java.util 21 | 22 | import scala.collection.JavaConverters._ 23 | import com.sun.jersey.core.util.MultivaluedMapImpl 24 | import org.apache.atlas.AtlasClientV2 25 | import org.apache.atlas.model.SearchFilter 26 | import org.apache.atlas.model.instance.AtlasEntity 27 | import org.apache.atlas.model.instance.AtlasEntity.{AtlasEntitiesWithExtInfo, AtlasEntityWithExtInfo} 28 | import org.apache.atlas.model.typedef.AtlasTypesDef 29 | import org.apache.atlas.utils.AuthenticationUtil 30 | 31 | class RestAtlasClient(atlasClientConf: AtlasClientConf) extends AtlasClient { 32 | 33 | private val client = { 34 | if (!AuthenticationUtil.isKerberosAuthenticationEnabled) { 35 | val basicAuth = Array(atlasClientConf.get(AtlasClientConf.CLIENT_USERNAME), 36 | atlasClientConf.get(AtlasClientConf.CLIENT_PASSWORD)) 37 | new AtlasClientV2(getServerUrl(), basicAuth) 38 | } else { 39 | new AtlasClientV2(getServerUrl(): _*) 40 | } 41 | } 42 | 43 | private def getServerUrl(): Array[String] = { 44 | 45 | atlasClientConf.getUrl(AtlasClientConf.ATLAS_REST_ENDPOINT.key) match { 46 | case a: util.ArrayList[_] => a.toArray().map(b => b.toString) 47 | case s: String => Array(s) 48 | case _: Throwable => throw new IllegalArgumentException(s"Fail to get atlas.rest.address") 49 | } 50 | } 51 | 52 | override def createAtlasTypeDefs(typeDefs: AtlasTypesDef): Unit = { 53 | client.createAtlasTypeDefs(typeDefs) 54 | } 55 | 56 | override def getAtlasTypeDefs(searchParams: MultivaluedMapImpl): AtlasTypesDef = { 57 | val searchFilter = new SearchFilter(searchParams) 58 | client.getAllTypeDefs(searchFilter) 59 | } 60 | 61 | override def updateAtlasTypeDefs(typeDefs: AtlasTypesDef): Unit = { 62 | client.updateAtlasTypeDefs(typeDefs) 63 | } 64 | 65 | override protected def doCreateEntities(entities: Seq[AtlasEntity]): Unit = { 66 | val entitesWithExtInfo = new AtlasEntitiesWithExtInfo() 67 | entities.foreach(entitesWithExtInfo.addEntity) 68 | val response = client.createEntities(entitesWithExtInfo) 69 | try { 70 | logInfo(s"Entities ${response.getCreatedEntities.asScala.map(_.getGuid).mkString(", ")} " + 71 | s"created") 72 | } catch { 73 | case _: Throwable => throw new IllegalStateException(s"Fail to get create entities") 74 | } 75 | } 76 | 77 | override protected def doDeleteEntityWithUniqueAttr( 78 | entityType: String, 79 | attribute: String): Unit = { 80 | client.deleteEntityByAttribute(entityType, 81 | Map(org.apache.atlas.AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME -> attribute).asJava) 82 | } 83 | 84 | override protected def doUpdateEntityWithUniqueAttr( 85 | entityType: String, 86 | attribute: String, 87 | entity: AtlasEntity): Unit = { 88 | client.updateEntityByAttribute( 89 | entityType, 90 | 
Map(org.apache.atlas.AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME -> attribute).asJava, 91 | new AtlasEntityWithExtInfo(entity)) 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/main/scala/com/hortonworks/spark/atlas/SACAtlasEntity.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas 19 | 20 | import org.apache.atlas.model.instance.{AtlasEntity, AtlasObjectId} 21 | 22 | trait SACAtlasReferenceable { 23 | def typeName: String 24 | def qualifiedName: String 25 | def asObjectId: AtlasObjectId 26 | } 27 | 28 | case class SACAtlasEntityReference(ref: AtlasObjectId) extends SACAtlasReferenceable { 29 | require(typeName != null && !typeName.isEmpty) 30 | require(qualifiedName != null && !qualifiedName.isEmpty) 31 | 32 | override def typeName: String = ref.getTypeName 33 | 34 | override def qualifiedName: String = ref.getUniqueAttributes.get( 35 | org.apache.atlas.AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME).toString 36 | 37 | override def asObjectId: AtlasObjectId = ref 38 | } 39 | 40 | case class SACAtlasEntityWithDependencies( 41 | entity: AtlasEntity, 42 | dependencies: Seq[SACAtlasReferenceable]) extends SACAtlasReferenceable { 43 | 44 | require(typeName != null && !typeName.isEmpty) 45 | require(qualifiedName != null && !qualifiedName.isEmpty) 46 | 47 | override def typeName: String = entity.getTypeName 48 | 49 | override def qualifiedName: String = entity.getAttribute( 50 | org.apache.atlas.AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME).toString 51 | 52 | override def asObjectId: AtlasObjectId = AtlasUtils.entityToReference(entity, useGuid = false) 53 | 54 | def dependenciesAdded(deps: Seq[SACAtlasReferenceable]): SACAtlasEntityWithDependencies = { 55 | new SACAtlasEntityWithDependencies(entity, dependencies ++ deps) 56 | } 57 | } 58 | 59 | object SACAtlasEntityWithDependencies { 60 | def apply(entity: AtlasEntity): SACAtlasEntityWithDependencies = { 61 | new SACAtlasEntityWithDependencies(entity, Seq.empty) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/main/scala/com/hortonworks/spark/atlas/SparkAtlasEventTracker.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas 19 | 20 | import com.google.common.annotations.VisibleForTesting 21 | import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent} 22 | import org.apache.spark.sql.catalyst.catalog.ExternalCatalogEvent 23 | import org.apache.spark.sql.execution.QueryExecution 24 | import org.apache.spark.sql.util.QueryExecutionListener 25 | import com.hortonworks.spark.atlas.sql._ 26 | import com.hortonworks.spark.atlas.ml.MLPipelineEventProcessor 27 | import com.hortonworks.spark.atlas.utils.Logging 28 | 29 | class SparkAtlasEventTracker(atlasClient: AtlasClient, atlasClientConf: AtlasClientConf) 30 | extends SparkListener with QueryExecutionListener with Logging { 31 | 32 | def this(atlasClientConf: AtlasClientConf) = { 33 | this(AtlasClient.atlasClient(atlasClientConf), atlasClientConf) 34 | } 35 | 36 | def this() { 37 | this(new AtlasClientConf) 38 | } 39 | 40 | private val enabled: Boolean = AtlasUtils.isSacEnabled(atlasClientConf) 41 | 42 | // Processor to handle DDL related events 43 | @VisibleForTesting 44 | private[atlas] val catalogEventTracker = 45 | new SparkCatalogEventProcessor(atlasClient, atlasClientConf) 46 | catalogEventTracker.startThread() 47 | 48 | // Processor to handle DML related events 49 | private val executionPlanTracker = new SparkExecutionPlanProcessor(atlasClient, atlasClientConf) 50 | executionPlanTracker.startThread() 51 | 52 | private val mlEventTracker = new MLPipelineEventProcessor(atlasClient, atlasClientConf) 53 | mlEventTracker.startThread() 54 | 55 | override def onOtherEvent(event: SparkListenerEvent): Unit = { 56 | if (!enabled) { 57 | // No op if SAC is disabled 58 | return 59 | } 60 | 61 | // We only care about catalog (DDL) and ML related events. 62 | event match { 63 | case e: ExternalCatalogEvent => catalogEventTracker.pushEvent(e) 64 | case e: SparkListenerEvent if e.getClass.getName.contains("org.apache.spark.ml") => 65 | mlEventTracker.pushEvent(e) 66 | case _ => // Ignore other events 67 | } 68 | } 69 | 70 | override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = { 71 | if (!enabled) { 72 | // No op if SAC is disabled 73 | return 74 | } 75 | 76 | if (qe.logical.isStreaming) { 77 | // Streaming queries are tracked via SparkAtlasStreamingQueryEventTracker 78 | return 79 | } 80 | 81 | val qd = QueryDetail.fromQueryExecutionListener(qe, durationNs) 82 | executionPlanTracker.pushEvent(qd) 83 | } 84 | 85 | override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = { 86 | // No-op: SAC is only one of the listeners.
87 | } 88 | 89 | } 90 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/main/scala/com/hortonworks/spark/atlas/SparkAtlasStreamingQueryEventTracker.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas 19 | 20 | import com.hortonworks.spark.atlas.sql.{QueryDetail, SparkExecutionPlanProcessor} 21 | 22 | import scala.collection.mutable 23 | import org.apache.spark.sql.streaming.StreamingQueryListener 24 | import org.apache.spark.sql.streaming.StreamingQueryListener._ 25 | import com.hortonworks.spark.atlas.utils.Logging 26 | import org.apache.spark.sql.SparkSession 27 | import org.apache.spark.sql.execution.streaming.{StreamExecution, StreamingQueryWrapper} 28 | 29 | class SparkAtlasStreamingQueryEventTracker( 30 | atlasClient: AtlasClient, 31 | atlasClientConf: AtlasClientConf) 32 | extends StreamingQueryListener with Logging { 33 | 34 | def this(atlasClientConf: AtlasClientConf) = { 35 | this(AtlasClient.atlasClient(atlasClientConf), atlasClientConf) 36 | } 37 | 38 | def this() { 39 | this(new AtlasClientConf) 40 | } 41 | 42 | private val enabled: Boolean = AtlasUtils.isSacEnabled(atlasClientConf) 43 | 44 | private val executionPlanTracker = new SparkExecutionPlanProcessor(atlasClient, atlasClientConf) 45 | executionPlanTracker.startThread() 46 | 47 | override def onQueryStarted(event: QueryStartedEvent): Unit = { 48 | logDebug(s"Start to track the Spark Streaming query in the Spark Atlas $event") 49 | } 50 | 51 | override def onQueryProgress(event: QueryProgressEvent): Unit = { 52 | if (!enabled) { 53 | // No op if SAC is disabled 54 | return 55 | } 56 | logInfo(s"Track running Spark Streaming query in the Spark Atlas: $event") 57 | val query = SparkSession.active.streams.get(event.progress.id) 58 | if (query != null) { 59 | val qd = query match { 60 | case query: StreamingQueryWrapper => 61 | Some(QueryDetail.fromStreamingQueryListener(query.streamingQuery, event)) 62 | 63 | case query: StreamExecution => 64 | Some(QueryDetail.fromStreamingQueryListener(query, event)) 65 | 66 | case _ => 67 | logWarn(s"Unexpected type of streaming query: ${query.getClass}") 68 | None 69 | } 70 | 71 | qd.foreach { q => 72 | if (q.qe != null) { 73 | executionPlanTracker.pushEvent(q) 74 | } else { 75 | logInfo(s"Can't retrieve query execution information for query ${event.progress.id}" + 76 | " - skip and wait for next batch.") 77 | } 78 | } 79 | } else { 80 | logWarn(s"Cannot find query ${event.progress.id} from active spark session!") 81 | } 82 | } 83 | 84 | override def onQueryTerminated(event: QueryTerminatedEvent): Unit = { 85 | 
logDebug(s"Tracked Spark Streaming query terminated in Spark Atlas: $event") 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/main/scala/com/hortonworks/spark/atlas/sql/Harvester.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas.sql 19 | 20 | import com.hortonworks.spark.atlas.SACAtlasReferenceable 21 | 22 | trait Harvester[T] { 23 | def harvest(node: T, qd: QueryDetail): Seq[SACAtlasReferenceable] 24 | } 25 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/main/scala/com/hortonworks/spark/atlas/sql/KafkaTopicInformation.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas.sql 19 | 20 | case class KafkaTopicInformation(topicName: String, clusterName: Option[String] = None) 21 | 22 | object KafkaTopicInformation { 23 | def getQualifiedName(ti: KafkaTopicInformation, defaultClusterName: String): String = { 24 | val cName = ti.clusterName.getOrElse(defaultClusterName) 25 | s"${ti.topicName}@$cName" 26 | } 27 | } 28 | 29 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/main/scala/com/hortonworks/spark/atlas/sql/SparkCatalogEventProcessor.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas.sql 19 | 20 | import scala.collection.mutable 21 | import org.apache.atlas.model.instance.AtlasEntity 22 | import org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException 23 | import org.apache.spark.sql.catalyst.catalog._ 24 | import com.hortonworks.spark.atlas.{AbstractEventProcessor, AtlasClient, AtlasClientConf, AtlasEntityReadHelper} 25 | import com.hortonworks.spark.atlas.types.{AtlasEntityUtils, external} 26 | import com.hortonworks.spark.atlas.utils.{Logging, SparkUtils} 27 | 28 | class SparkCatalogEventProcessor( 29 | private[atlas] val atlasClient: AtlasClient, 30 | val conf: AtlasClientConf) 31 | extends AbstractEventProcessor[ExternalCatalogEvent] with AtlasEntityUtils with Logging { 32 | 33 | private val cachedObject = new mutable.WeakHashMap[String, Object] 34 | 35 | override protected def process(e: ExternalCatalogEvent): Unit = { 36 | if (SparkUtils.usingRemoteMetastoreService()) { 37 | // SAC will not handle any DDL events when remote HMS is used: 38 | // Hive hook will take care of all DDL events in Hive Metastore Service. 39 | // No-op here. 40 | return 41 | } 42 | 43 | e match { 44 | case CreateDatabasePreEvent(_) => // No-op 45 | 46 | case CreateDatabaseEvent(db) => 47 | val dbDefinition = SparkUtils.getExternalCatalog().getDatabase(db) 48 | val entity = sparkDbToEntity(dbDefinition) 49 | atlasClient.createEntitiesWithDependencies(entity) 50 | logDebug(s"Created db entity $db") 51 | 52 | case DropDatabasePreEvent(db) => 53 | try { 54 | cachedObject.put(sparkDbUniqueAttribute(db), 55 | SparkUtils.getExternalCatalog().getDatabase(db)) 56 | } catch { 57 | case _: NoSuchDatabaseException => 58 | logDebug(s"Spark already deleted the database: $db") 59 | } 60 | 61 | case DropDatabaseEvent(db) => 62 | atlasClient.deleteEntityWithUniqueAttr(sparkDbType, sparkDbUniqueAttribute(db)) 63 | 64 | cachedObject.remove(sparkDbUniqueAttribute(db)).foreach { o => 65 | val dbDef = o.asInstanceOf[CatalogDatabase] 66 | val path = dbDef.locationUri.toString 67 | val pathEntity = external.pathToEntity(path) 68 | 69 | atlasClient.deleteEntityWithUniqueAttr(pathEntity.entity.getTypeName, 70 | AtlasEntityReadHelper.getQualifiedName(pathEntity.entity)) 71 | } 72 | 73 | logDebug(s"Deleted db entity $db") 74 | 75 | case CreateTablePreEvent(_, _) => // No-op 76 | 77 | // TODO. 
We should also avoid creating/altering view tables in Atlas 78 | case CreateTableEvent(db, table) => 79 | val tableDefinition = SparkUtils.getExternalCatalog().getTable(db, table) 80 | val tableEntity = sparkTableToEntity(tableDefinition) 81 | atlasClient.createEntitiesWithDependencies(tableEntity) 82 | logDebug(s"Created table entity $table without columns") 83 | 84 | case DropTablePreEvent(_, _) => // No-op 85 | 86 | case DropTableEvent(db, table) => 87 | logDebug(s"Can't handle drop table event since we don't have context information for " + 88 | s"table $table in db $db. Can't delete table entity and corresponding entities.") 89 | 90 | case RenameTableEvent(db, name, newName) => 91 | // Update storageFormat's unique attribute 92 | val sdEntity = new AtlasEntity(sparkStorageFormatType) 93 | sdEntity.setAttribute(org.apache.atlas.AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, 94 | sparkStorageFormatUniqueAttribute(db, newName)) 95 | atlasClient.updateEntityWithUniqueAttr( 96 | sparkStorageFormatType, 97 | sparkStorageFormatUniqueAttribute(db, name), 98 | sdEntity) 99 | 100 | // Update Table name and Table's unique attribute 101 | val tableEntity = new AtlasEntity(sparkTableType) 102 | tableEntity.setAttribute(org.apache.atlas.AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, 103 | sparkTableUniqueAttribute(db, newName)) 104 | tableEntity.setAttribute("name", newName) 105 | atlasClient.updateEntityWithUniqueAttr( 106 | sparkTableType, 107 | sparkTableUniqueAttribute(db, name), 108 | tableEntity) 109 | 110 | logDebug(s"Renamed table entity $name to $newName") 111 | 112 | case AlterDatabaseEvent(db) => 113 | val dbDefinition = SparkUtils.getExternalCatalog().getDatabase(db) 114 | val dbEntity = sparkDbToEntity(dbDefinition) 115 | atlasClient.createEntitiesWithDependencies(dbEntity) 116 | logDebug(s"Updated DB entity $db with new properties") 117 | 118 | case AlterTableEvent(db, table, kind) => 119 | val tableDefinition = SparkUtils.getExternalCatalog().getTable(db, table) 120 | kind match { 121 | case "table" => 122 | val tableEntity = sparkTableToEntityForAlterTable(tableDefinition) 123 | atlasClient.createEntitiesWithDependencies(tableEntity) 124 | logDebug(s"Updated table entity $table without columns") 125 | 126 | case "dataSchema" => 127 | // We don't track column-level changes here 128 | logDebug("Detected table schema update but ignored it: " + 129 | "column updates will not be tracked here") 130 | 131 | case "stats" => 132 | logDebug("Stats update will not be tracked here") 133 | 134 | case _ => 135 | // No op. 136 | } 137 | 138 | case f => 139 | logDebug(s"Dropping unknown event $f") 140 | } 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/main/scala/com/hortonworks/spark/atlas/sql/SparkExtension.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License.
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas.sql 19 | 20 | import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} 21 | import org.apache.spark.sql.catalyst.expressions.Expression 22 | import org.apache.spark.sql.catalyst.parser.ParserInterface 23 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 24 | import org.apache.spark.sql.types.{DataType, StructType} 25 | import org.apache.spark.sql.{SparkSession, SparkSessionExtensions} 26 | 27 | 28 | class SparkExtension extends (SparkSessionExtensions => Unit) { 29 | def apply(e: SparkSessionExtensions): Unit = { 30 | e.injectParser(SparkAtlasConnectorParser) 31 | } 32 | } 33 | 34 | case class SparkAtlasConnectorParser(spark: SparkSession, delegate: ParserInterface) 35 | extends ParserInterface { 36 | override def parsePlan(sqlText: String): LogicalPlan = { 37 | SQLQuery.set(sqlText) 38 | delegate.parsePlan(sqlText) 39 | } 40 | 41 | override def parseExpression(sqlText: String): Expression = 42 | delegate.parseExpression(sqlText) 43 | 44 | override def parseTableIdentifier(sqlText: String): TableIdentifier = 45 | delegate.parseTableIdentifier(sqlText) 46 | 47 | override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier = 48 | delegate.parseFunctionIdentifier(sqlText) 49 | 50 | override def parseTableSchema(sqlText: String): StructType = 51 | delegate.parseTableSchema(sqlText) 52 | 53 | override def parseDataType(sqlText: String): DataType = 54 | delegate.parseDataType(sqlText) 55 | } 56 | 57 | object SQLQuery { 58 | private[this] val sqlQuery = new ThreadLocal[String] 59 | def get(): String = sqlQuery.get 60 | def set(s: String): Unit = sqlQuery.set(s) 61 | } 62 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/main/scala/com/hortonworks/spark/atlas/types/AtlasEntityUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas.types 19 | 20 | import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, CatalogStorageFormat, CatalogTable} 21 | import com.hortonworks.spark.atlas.{AtlasClientConf, SACAtlasEntityWithDependencies, SACAtlasReferenceable} 22 | import com.hortonworks.spark.atlas.utils.{Logging, SparkUtils} 23 | import org.apache.spark.ml.Pipeline 24 | 25 | trait AtlasEntityUtils extends Logging { 26 | 27 | def conf: AtlasClientConf 28 | 29 | def clusterName: String = conf.get(AtlasClientConf.CLUSTER_NAME) 30 | 31 | def sparkDbType: String = metadata.DB_TYPE_STRING 32 | 33 | def sparkDbToEntity(dbDefinition: CatalogDatabase): SACAtlasEntityWithDependencies = { 34 | internal.sparkDbToEntity(dbDefinition, clusterName, SparkUtils.currUser()) 35 | } 36 | 37 | def sparkDbUniqueAttribute(db: String): String = { 38 | internal.sparkDbUniqueAttribute(db) 39 | } 40 | 41 | def sparkStorageFormatType: String = metadata.STORAGEDESC_TYPE_STRING 42 | 43 | def sparkStorageFormatToEntity( 44 | storageFormat: CatalogStorageFormat, 45 | db: String, 46 | table: String): SACAtlasEntityWithDependencies = { 47 | internal.sparkStorageFormatToEntity(storageFormat, db, table) 48 | } 49 | 50 | def sparkStorageFormatUniqueAttribute(db: String, table: String): String = { 51 | internal.sparkStorageFormatUniqueAttribute(db, table) 52 | } 53 | 54 | def sparkTableType: String = metadata.TABLE_TYPE_STRING 55 | 56 | def tableToEntity( 57 | tableDefinition: CatalogTable, 58 | mockDbDefinition: Option[CatalogDatabase] = None): SACAtlasReferenceable = { 59 | if (SparkUtils.usingRemoteMetastoreService()) { 60 | external.hiveTableToReference(tableDefinition, clusterName, mockDbDefinition) 61 | } else { 62 | internal.sparkTableToEntity(tableDefinition, clusterName, mockDbDefinition) 63 | } 64 | } 65 | 66 | def sparkTableToEntity( 67 | tableDefinition: CatalogTable, 68 | mockDbDefinition: Option[CatalogDatabase] = None): SACAtlasReferenceable = { 69 | internal.sparkTableToEntity(tableDefinition, clusterName, mockDbDefinition) 70 | } 71 | 72 | def sparkTableToEntityForAlterTable( 73 | tableDefinition: CatalogTable, 74 | mockDbDefinition: Option[CatalogDatabase] = None): SACAtlasReferenceable = { 75 | internal.sparkTableToEntityForAlterTable(tableDefinition, clusterName, mockDbDefinition) 76 | } 77 | 78 | def sparkTableUniqueAttribute(db: String, table: String): String = { 79 | internal.sparkTableUniqueAttribute(db, table) 80 | } 81 | 82 | def pipelineUniqueAttribute(pipeline: Pipeline): String = { 83 | pipeline.uid 84 | } 85 | 86 | def processType: String = metadata.PROCESS_TYPE_STRING 87 | 88 | def processUniqueAttribute(executionId: Long): String = 89 | internal.sparkProcessUniqueAttribute(executionId) 90 | 91 | // If there is cycle, return empty output entity list 92 | def cleanOutput( 93 | inputs: Seq[SACAtlasReferenceable], 94 | outputs: Seq[SACAtlasReferenceable]): List[SACAtlasReferenceable] = { 95 | val qualifiedNames = inputs.map(_.qualifiedName) 96 | val isCycle = outputs.exists(x => qualifiedNames.contains(x.qualifiedName)) 97 | if (isCycle) { 98 | logWarn("Detected cycle - same entity observed to both input and output. 
" + 99 | "Discarding output entities as Atlas doesn't support cycle.") 100 | List.empty 101 | } else { 102 | outputs.toList 103 | } 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/main/scala/com/hortonworks/spark/atlas/types/metadata.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas.types 19 | 20 | object metadata { 21 | val METADATA_VERSION = "1.0" 22 | val DB_TYPE_STRING = "spark_db" 23 | val STORAGEDESC_TYPE_STRING = "spark_storagedesc" 24 | val TABLE_TYPE_STRING = "spark_table" 25 | val PROCESS_TYPE_STRING = "spark_process" 26 | val ML_DIRECTORY_TYPE_STRING = "spark_ml_directory" 27 | val ML_PIPELINE_TYPE_STRING = "spark_ml_pipeline" 28 | val ML_MODEL_TYPE_STRING = "spark_ml_model" 29 | } 30 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/main/scala/com/hortonworks/spark/atlas/utils/CatalogUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas.utils 19 | 20 | import java.net.URI 21 | 22 | import org.apache.spark.sql.catalyst.TableIdentifier 23 | import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, CatalogStorageFormat, CatalogTable, CatalogTableType} 24 | import org.apache.spark.sql.types.StructType 25 | 26 | 27 | object CatalogUtils { 28 | 29 | def createDB(name: String, location: String): CatalogDatabase = { 30 | CatalogDatabase(name, "", new URI(location), Map.empty) 31 | } 32 | 33 | def createStorageFormat( 34 | locationUri: Option[URI] = None, 35 | inputFormat: Option[String] = None, 36 | outputFormat: Option[String] = None, 37 | serd: Option[String] = None, 38 | compressed: Boolean = false, 39 | properties: Map[String, String] = Map.empty): CatalogStorageFormat = { 40 | CatalogStorageFormat(locationUri, inputFormat, outputFormat, serd, compressed, properties) 41 | } 42 | 43 | def createTable( 44 | db: String, 45 | table: String, 46 | schema: StructType, 47 | storage: CatalogStorageFormat, 48 | isHiveTable: Boolean = false): CatalogTable = { 49 | CatalogTable( 50 | TableIdentifier(table, Some(db)), 51 | CatalogTableType.MANAGED, 52 | storage, 53 | schema, 54 | provider = if (isHiveTable) Some("hive") else None) 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/main/scala/com/hortonworks/spark/atlas/utils/JdbcUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas.utils 19 | 20 | import com.hortonworks.spark.atlas.sql.CommandsHarvester.logWarn 21 | 22 | object JdbcUtils { 23 | 24 | private val DB2_PREFIX = "jdbc:db2" 25 | private val DERBY_PREFIX = "jdbc:derby" 26 | private val MARIADB_PREFIX = "jdbc:mariadb" 27 | private val MYSQL_PREFIX = "jdbc:mysql" 28 | private val ORACLE_PREFIX = "jdbc:oracle" 29 | private val POSTGRES_PREFIX = "jdbc:postgresql" 30 | private val SQL_SERVER_PREFIX = "jdbc:sqlserver" 31 | private val TERADATA_PREFIX = "jdbc:teradata" 32 | 33 | /** 34 | * Retrieves the database name from the URL 35 | * 36 | * @param url the URL used by the JDBC driver 37 | * @return the database name, or an empty string if the JDBC driver is not supported 38 | */ 39 | def getDatabaseName(url: String): String = url match { 40 | case url if url.startsWith(DB2_PREFIX) => getDatabaseNameEndOfUrl(url) 41 | case url if url.startsWith(DERBY_PREFIX) => getDatabaseNameDerbyFormat(url) 42 | case url if url.startsWith(MARIADB_PREFIX) => getDatabaseNameEndOfUrl(url) 43 | case url if url.startsWith(MYSQL_PREFIX) => getDatabaseNameEndOfUrl(url) 44 | case url if url.startsWith(ORACLE_PREFIX) => getDatabaseOracleFormat(url) 45 | case url if url.startsWith(POSTGRES_PREFIX) => getDatabaseNameEndOfUrl(url) 46 | case url if url.startsWith(SQL_SERVER_PREFIX) => getDatabaseSqlServerFormat(url) 47 | case url if url.startsWith(TERADATA_PREFIX) => getDatabaseNameTeradataFormat(url) 48 | case _ => 49 | logWarn(s"Unsupported JDBC driver for url: $url") 50 | "" 51 | } 52 | 53 | /** 54 | * Retrieves the database name when the URL is in host:port/dbname format 55 | */ 56 | private def getDatabaseNameEndOfUrl(url: String): String = { 57 | val parsedUrl = url.substring(url.lastIndexOf("/") + 1) 58 | if (parsedUrl.contains("?")) { 59 | return parsedUrl.substring(0, parsedUrl.indexOf("?")) 60 | } 61 | 62 | parsedUrl 63 | } 64 | 65 | /** 66 | * Retrieves the database name based on Derby format 67 | */ 68 | private def getDatabaseNameDerbyFormat(url: String): String = { 69 | val parsedUrl = url match { 70 | case url if url.contains("/") => url.substring(url.lastIndexOf("/") + 1) 71 | case _ => url.substring(url.lastIndexOf(":") + 1) 72 | } 73 | 74 | if (parsedUrl.contains(";")) { 75 | return parsedUrl.substring(0, parsedUrl.indexOf(";")) 76 | } 77 | 78 | parsedUrl 79 | } 80 | 81 | /** 82 | * Retrieves the database name based on Teradata format 83 | */ 84 | private def getDatabaseNameTeradataFormat(url: String): String = { 85 | val databaseKey = "/DATABASE=" 86 | val parsedUrl = url.substring(url.indexOf(databaseKey) + databaseKey.length) 87 | if (parsedUrl.contains("/")) { 88 | return parsedUrl.substring(0, parsedUrl.indexOf("/")) 89 | } 90 | 91 | parsedUrl 92 | } 93 | 94 | /** 95 | * Retrieves the database name based on Oracle format 96 | * e.g.
jdbc:oracle:thin:@localhost:1521:testdb 97 | */ 98 | private def getDatabaseOracleFormat(url: String): String = { 99 | url.substring(url.toUpperCase().lastIndexOf(":") + 1) 100 | } 101 | 102 | /** 103 | * Retrieves the database name based on Microsoft SQL Server format 104 | */ 105 | private def getDatabaseSqlServerFormat(url: String): String = { 106 | val databaseNameKey = ";databaseName=" 107 | val parsedUrl = url.substring(url.indexOf(databaseNameKey) + databaseNameKey.length) 108 | if (parsedUrl.contains(";")) { 109 | return parsedUrl.substring(0, parsedUrl.indexOf(";")) 110 | } 111 | 112 | parsedUrl 113 | } 114 | 115 | } 116 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/main/scala/com/hortonworks/spark/atlas/utils/Logging.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas.utils 19 | 20 | import org.slf4j.LoggerFactory 21 | 22 | trait Logging { 23 | lazy val logger = LoggerFactory.getLogger(this.getClass) 24 | 25 | def logTrace(message: => Any): Unit = { 26 | if (logger.isTraceEnabled) { 27 | logger.trace(message.toString) 28 | } 29 | } 30 | 31 | def logDebug(message: => Any): Unit = { 32 | if (logger.isDebugEnabled) { 33 | logger.debug(message.toString) 34 | } 35 | } 36 | 37 | def logInfo(message: => Any): Unit = { 38 | if (logger.isInfoEnabled) { 39 | logger.info(message.toString) 40 | } 41 | } 42 | 43 | def logWarn(message: => Any): Unit = { 44 | logger.warn(message.toString) 45 | } 46 | 47 | def logWarn(message: => Any, t: Throwable): Unit = { 48 | logger.warn(message.toString, t) 49 | } 50 | 51 | def logError(message: => Any, t: Throwable): Unit = { 52 | logger.error(message.toString, t) 53 | } 54 | 55 | def logError(message: => Any): Unit = { 56 | logger.error(message.toString) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/main/scala/com/hortonworks/spark/atlas/utils/ReflectionHelper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas.utils 19 | 20 | import scala.util.control.NonFatal 21 | 22 | object ReflectionHelper extends Logging { 23 | import scala.reflect.runtime.universe.{TermName, runtimeMirror, typeOf, TypeTag} 24 | private val currentMirror = runtimeMirror(getClass.getClassLoader) 25 | 26 | def reflectField[T, OUT](obj: Any, fieldName: String)(implicit ttag: TypeTag[T]): Option[OUT] = { 27 | val relMirror = currentMirror.reflect(obj) 28 | 29 | try { 30 | val method = typeOf[T].decl(TermName(fieldName)).asTerm.accessed.asTerm 31 | 32 | Some(relMirror.reflectField(method).get.asInstanceOf[OUT]) 33 | } catch { 34 | case NonFatal(_) => 35 | logWarn(s"Failed to reflect field $fieldName from $obj. " + 36 | s"Maybe missing to apply necessary patch?") 37 | None 38 | } 39 | } 40 | 41 | def reflectFieldWithContextClassloaderLoosenType(obj: Any, fieldName: String): Option[Any] = { 42 | val typeMirror = runtimeMirror(Thread.currentThread().getContextClassLoader) 43 | val instanceMirror = typeMirror.reflect(obj) 44 | 45 | val members = instanceMirror.symbol.typeSignature.members 46 | val field = members.find(_.name.decodedName.toString == fieldName) 47 | field match { 48 | case Some(f) => 49 | try { 50 | Some(instanceMirror.reflectField(f.asTerm).get) 51 | } catch { 52 | case NonFatal(e) => 53 | logWarn(s"Failed to reflect field $fieldName from $obj. " + 54 | s"Maybe missing to apply necessary patch? $e") 55 | None 56 | } 57 | 58 | case None => 59 | logWarn(s"Failed to reflect field $fieldName from $obj. " + 60 | s"Maybe missing to apply necessary patch?") 61 | None 62 | } 63 | } 64 | 65 | def reflectFieldWithContextClassloader[OUT](obj: Any, fieldName: String): Option[OUT] = { 66 | reflectFieldWithContextClassloaderLoosenType(obj, fieldName).map(_.asInstanceOf[OUT]) 67 | } 68 | 69 | def reflectMethodWithContextClassloaderLoosenType( 70 | obj: Any, 71 | methodName: String, 72 | params: Any*): Option[Any] = { 73 | val typeMirror = runtimeMirror(Thread.currentThread().getContextClassLoader) 74 | val instanceMirror = typeMirror.reflect(obj) 75 | 76 | val members = instanceMirror.symbol.typeSignature.members 77 | val method = members.find(_.name.decodedName.toString == methodName) 78 | method match { 79 | case Some(f) => 80 | try { 81 | Some(instanceMirror.reflectMethod(f.asMethod).apply(params: _*)) 82 | } catch { 83 | case NonFatal(_) => 84 | logWarn(s"Failed to call method $methodName from $obj via reflection. " + 85 | s"Maybe missing to apply necessary patch?") 86 | None 87 | } 88 | 89 | case None => 90 | logWarn(s"Failed to call method $methodName from $obj via reflection.
" + 91 | s"Maybe missing to apply necessary patch?") 92 | None 93 | } 94 | } 95 | 96 | def reflectMethodWithContextClassloader[OUT]( 97 | obj: Any, 98 | fieldName: String, 99 | params: Any*): Option[OUT] = { 100 | reflectMethodWithContextClassloaderLoosenType(obj, fieldName, params: _*) 101 | .map(_.asInstanceOf[OUT]) 102 | } 103 | 104 | def classForName(className: String): Class[_] = { 105 | Class.forName(className, true, getContextOrClassClassLoader) 106 | } 107 | 108 | private def getContextOrClassClassLoader: ClassLoader = 109 | Option(Thread.currentThread().getContextClassLoader).getOrElse(getClass.getClassLoader) 110 | } 111 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the file target/unit-tests.log 19 | test.appender=file 20 | log4j.rootCategory=INFO, ${test.appender} 21 | log4j.appender.file=org.apache.log4j.FileAppender 22 | log4j.appender.file.append=true 23 | log4j.appender.file.file=target/unit-tests.log 24 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 25 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 26 | 27 | # Silence some noisy libraries. 28 | log4j.logger.org.apache.http=WARN 29 | log4j.logger.org.apache.spark=INFO 30 | log4j.logger.org.eclipse.jetty=WARN 31 | log4j.logger.org.spark-project.jetty=WARN 32 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/resources/users.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hortonworks-spark/spark-atlas-connector/0b10e337cdfd427744a92f8505d46297afb4c295/spark-atlas-connector/src/test/resources/users.parquet -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/AtlasEntityCreationRequestHelperSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas 19 | 20 | import java.util.UUID 21 | 22 | import com.hortonworks.spark.atlas.sql.KafkaTopicInformation 23 | import com.hortonworks.spark.atlas.sql.testhelper.CreateEntitiesTrackingAtlasClient 24 | import com.hortonworks.spark.atlas.types.{external, internal} 25 | import org.apache.atlas.model.instance.AtlasEntity 26 | import org.scalatest.{BeforeAndAfterEach, FunSuite} 27 | 28 | class AtlasEntityCreationRequestHelperSuite 29 | extends FunSuite 30 | with WithHiveSupport 31 | with BeforeAndAfterEach { 32 | 33 | private val client = new CreateEntitiesTrackingAtlasClient 34 | private var sut: AtlasEntityCreationRequestHelper = _ 35 | 36 | override protected def beforeEach(): Unit = { 37 | client.clearEntities() 38 | sut = new AtlasEntityCreationRequestHelper(client) 39 | } 40 | 41 | test("SAC-253 partial sources presented in streaming query") { 42 | val cluster = "cl1" 43 | val queryId = UUID.randomUUID() 44 | 45 | val topic1 = KafkaTopicInformation("topic1") 46 | val topic2 = KafkaTopicInformation("topic2") 47 | val topic3 = KafkaTopicInformation("topic3") 48 | val topicSink = KafkaTopicInformation("topicSink") 49 | 50 | val source1 = external.kafkaToEntity(cluster, topic1) 51 | val source2 = external.kafkaToEntity(cluster, topic2) 52 | val source3 = external.kafkaToEntity(cluster, topic3) 53 | val sink = external.kafkaToEntity(cluster, topicSink) 54 | 55 | // source1 56 | validateInputsOutputs(queryId, Seq(source1), Seq(sink), expectNoCreationRequest = false) 57 | 58 | client.clearEntities() 59 | 60 | // source1, source2 61 | validateInputsOutputs(queryId, Seq(source1, source2), Seq(sink), 62 | expectNoCreationRequest = false) 63 | 64 | client.clearEntities() 65 | 66 | // source2, source3 67 | validateInputsOutputs(queryId, Seq(source2, source3), Seq(sink), 68 | expectNoCreationRequest = false) 69 | 70 | client.clearEntities() 71 | 72 | // source1, source2 73 | validateInputsOutputs(queryId, Seq(source1, source2), Seq(sink), expectNoCreationRequest = true) 74 | 75 | client.clearEntities() 76 | 77 | // source1, source2, source3 78 | validateInputsOutputs(queryId, Seq(source1, source2, source3), Seq(sink), 79 | expectNoCreationRequest = true) 80 | } 81 | 82 | test("SAC-253 partial sinks presented in streaming query") { 83 | val cluster = "cl1" 84 | val queryId = UUID.randomUUID() 85 | 86 | val topic1 = KafkaTopicInformation("topic1") 87 | val topic2 = KafkaTopicInformation("topic2") 88 | val topic3 = KafkaTopicInformation("topic3") 89 | val topicSource = KafkaTopicInformation("topicSource") 90 | 91 | val source = external.kafkaToEntity(cluster, topicSource) 92 | val sink1 = external.kafkaToEntity(cluster, topic1) 93 | val sink2 = external.kafkaToEntity(cluster, topic2) 94 | val sink3 = external.kafkaToEntity(cluster, topic3) 95 | 96 | // sink1 97 | validateInputsOutputs(queryId, Seq(source), Seq(sink1), expectNoCreationRequest = false) 98 | 99 | client.clearEntities() 100 | 101 | // sink1, sink2 102 | validateInputsOutputs(queryId, Seq(source), Seq(sink1, sink2), expectNoCreationRequest = false) 103 | 104 
| client.clearEntities() 105 | 106 | // sink2, sink3 107 | validateInputsOutputs(queryId, Seq(source), Seq(sink2, sink3), expectNoCreationRequest = false) 108 | 109 | client.clearEntities() 110 | 111 | // sink1, sink2 112 | validateInputsOutputs(queryId, Seq(source), Seq(sink1, sink2), expectNoCreationRequest = true) 113 | 114 | client.clearEntities() 115 | 116 | // sink1, sink2, sink3 117 | validateInputsOutputs(queryId, Seq(source), Seq(sink1, sink2, sink3), 118 | expectNoCreationRequest = true) 119 | } 120 | 121 | private def validateInputsOutputs( 122 | queryId: UUID, 123 | sources: Seq[SACAtlasEntityWithDependencies], 124 | sinks: Seq[SACAtlasEntityWithDependencies], 125 | expectNoCreationRequest: Boolean): Unit = { 126 | val process = internal.etlProcessToEntity(sources, sinks, Map()) 127 | sut.requestCreation(Seq(process), Some(queryId)) 128 | 129 | if (expectNoCreationRequest) { 130 | // no entities will be created, as both inputs and outputs are subset of 131 | // accumulated inputs and outputs 132 | assert(client.createdEntities.isEmpty) 133 | } else { 134 | val allEntities = sources ++ sinks ++ Seq(process) 135 | assert(client.createdEntities.length === allEntities.length) 136 | assert(client.createdEntities.toSet === allEntities.map(_.entity).toSet) 137 | } 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/BaseResourceIT.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas 19 | 20 | import scala.collection.JavaConverters._ 21 | import com.sun.jersey.core.util.MultivaluedMapImpl 22 | import org.apache.atlas.AtlasClientV2 23 | import org.apache.atlas.model.SearchFilter 24 | import org.apache.atlas.model.instance.AtlasEntity 25 | import org.apache.atlas.model.typedef.{AtlasStructDef, AtlasTypesDef} 26 | import org.apache.atlas.utils.AuthenticationUtil 27 | import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} 28 | 29 | abstract class BaseResourceIT extends FunSuite with BeforeAndAfterAll with BeforeAndAfterEach { 30 | 31 | protected var atlasUrls: Array[String] = null 32 | private var client: AtlasClientV2 = null 33 | protected val atlasClientConf = new AtlasClientConf 34 | private var uniquePostfix: Long = 0 35 | 36 | override protected def beforeAll(): Unit = { 37 | super.beforeAll() 38 | 39 | // set high timeouts so that tests do not fail due to read timeouts while you 40 | // are stepping through the code in a debugger 41 | atlasClientConf.set("atlas.client.readTimeoutMSecs", "100000000") 42 | atlasClientConf.set("atlas.client.connectTimeoutMSecs", "100000000") 43 | atlasUrls = Array(atlasClientConf.get(AtlasClientConf.ATLAS_REST_ENDPOINT)) 44 | } 45 | 46 | override protected def beforeEach(): Unit = { 47 | super.beforeEach() 48 | 49 | uniquePostfix = System.currentTimeMillis() 50 | } 51 | 52 | private def atlasClient(): AtlasClientV2 = { 53 | if (client == null) { 54 | if (!AuthenticationUtil.isKerberosAuthenticationEnabled) { 55 | client = new AtlasClientV2(atlasUrls, Array[String]("admin", "admin")) 56 | } else { 57 | client = new AtlasClientV2(atlasUrls: _*) 58 | } 59 | } 60 | 61 | client 62 | } 63 | 64 | protected def getTypeDef(name: String): AtlasStructDef = { 65 | require(atlasClient != null) 66 | 67 | val searchParams = new MultivaluedMapImpl() 68 | searchParams.add(SearchFilter.PARAM_NAME, name) 69 | val searchFilter = new SearchFilter(searchParams) 70 | val typesDef = atlasClient.getAllTypeDefs(searchFilter) 71 | if (!typesDef.getClassificationDefs.isEmpty) { 72 | typesDef.getClassificationDefs.get(0) 73 | } else if (!typesDef.getEntityDefs.isEmpty) { 74 | typesDef.getEntityDefs.get(0) 75 | } else if (!typesDef.getRelationshipDefs.isEmpty) { 76 | typesDef.getRelationshipDefs.get(0) 77 | } else { 78 | null 79 | } 80 | } 81 | 82 | protected def updateTypesDef(typesDef: AtlasTypesDef): Unit = { 83 | require(atlasClient != null) 84 | 85 | atlasClient.updateAtlasTypeDefs(typesDef) 86 | } 87 | 88 | protected def deleteTypesDef(typesDef: AtlasTypesDef): Unit = { 89 | require(atlasClient != null) 90 | 91 | atlasClient.deleteAtlasTypeDefs(typesDef) 92 | } 93 | 94 | protected def getEntity(typeName: String, uniqueAttr: String): AtlasEntity = { 95 | require(atlasClient != null) 96 | 97 | atlasClient.getEntityByAttribute(typeName, 98 | Map(org.apache.atlas.AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME -> uniqueAttr).asJava) 99 | .getEntity 100 | } 101 | 102 | protected def it(desc: String)(testFn: => Unit): Unit = { 103 | test(desc) { 104 | assume( 105 | sys.env.get("ATLAS_INTEGRATION_TEST").contains("true"), 106 | "integration test can be run only when env ATLAS_INTEGRATION_TEST is set and local Atlas" + 107 | " is running") 108 | testFn 109 | } 110 | } 111 | 112 | protected def uniqueName(name: String): String = { 113 | s"${name}_$uniquePostfix" 114 | } 115 | } 116 | -------------------------------------------------------------------------------- 
/spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/KafkaClientIT.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas 19 | 20 | import java.nio.file.Files 21 | 22 | import scala.concurrent.duration._ 23 | import scala.language.postfixOps 24 | 25 | import org.apache.atlas.AtlasServiceException 26 | import org.apache.spark.sql.SparkSession 27 | import org.apache.spark.sql.catalyst.catalog._ 28 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType} 29 | import org.scalatest.concurrent.Eventually.{eventually, interval, timeout} 30 | import org.scalatest.Matchers 31 | 32 | import com.hortonworks.spark.atlas.utils.SparkUtils 33 | 34 | class KafkaClientIT extends BaseResourceIT with Matchers { 35 | import TestUtils._ 36 | 37 | private var sparkSession: SparkSession = _ 38 | 39 | private var tracker: SparkAtlasEventTracker = _ 40 | 41 | override protected def beforeAll(): Unit = { 42 | super.beforeAll() 43 | 44 | sparkSession = SparkSession.builder() 45 | .master("local") 46 | .getOrCreate() 47 | 48 | tracker = new SparkAtlasEventTracker(new KafkaAtlasClient(atlasClientConf), atlasClientConf) 49 | } 50 | 51 | override protected def afterAll(): Unit = { 52 | sparkSession.stop() 53 | SparkSession.clearActiveSession() 54 | SparkSession.clearDefaultSession() 55 | 56 | super.afterAll() 57 | } 58 | 59 | it("create / update / delete new entities") { 60 | val dbName = uniqueName("db2") 61 | val tbl1Name = uniqueName("tbl1") 62 | val tbl3Name = uniqueName("tbl3") 63 | 64 | // Create new DB 65 | val tempDbPath = Files.createTempDirectory("db_") 66 | val dbDefinition = createDB(dbName, tempDbPath.normalize().toUri.toString) 67 | SparkUtils.getExternalCatalog().createDatabase(dbDefinition, ignoreIfExists = true) 68 | tracker.onOtherEvent(CreateDatabaseEvent(dbName)) 69 | eventually(timeout(30 seconds), interval(100 milliseconds)) { 70 | val entity = getEntity( 71 | tracker.catalogEventTracker.sparkDbType, 72 | tracker.catalogEventTracker.sparkDbUniqueAttribute(dbName)) 73 | entity should not be (null) 74 | entity.getAttribute("name") should be (dbName) 75 | } 76 | 77 | // Create new table 78 | val schema = new StructType() 79 | .add("user", StringType) 80 | .add("age", IntegerType) 81 | val sd = CatalogStorageFormat.empty 82 | val tableDefinition = createTable(dbName, tbl1Name, schema, sd) 83 | SparkUtils.getExternalCatalog().createTable(tableDefinition, ignoreIfExists = true) 84 | tracker.onOtherEvent(CreateTableEvent(dbName, tbl1Name)) 85 | 86 | eventually(timeout(30 seconds), interval(100 milliseconds)) { 87 | val sdEntity = 
getEntity(tracker.catalogEventTracker.sparkStorageFormatType, 88 | tracker.catalogEventTracker.sparkStorageFormatUniqueAttribute(dbName, tbl1Name)) 89 | sdEntity should not be (null) 90 | 91 | val tblEntity = getEntity(tracker.catalogEventTracker.sparkTableType, 92 | tracker.catalogEventTracker.sparkTableUniqueAttribute(dbName, tbl1Name)) 93 | tblEntity should not be (null) 94 | tblEntity.getAttribute("name") should be (tbl1Name) 95 | } 96 | 97 | // Rename table 98 | SparkUtils.getExternalCatalog().renameTable(dbName, tbl1Name, tbl3Name) 99 | tracker.onOtherEvent(RenameTableEvent(dbName, tbl1Name, tbl3Name)) 100 | val newTblDef = SparkUtils.getExternalCatalog().getTable(dbName, tbl3Name) 101 | 102 | eventually(timeout(30 seconds), interval(100 milliseconds)) { 103 | val tblEntity = getEntity(tracker.catalogEventTracker.sparkTableType, 104 | tracker.catalogEventTracker.sparkTableUniqueAttribute(dbName, tbl3Name)) 105 | tblEntity should not be (null) 106 | tblEntity.getAttribute("name") should be (tbl3Name) 107 | 108 | val sdEntity = getEntity(tracker.catalogEventTracker.sparkStorageFormatType, 109 | tracker.catalogEventTracker.sparkStorageFormatUniqueAttribute(dbName, tbl3Name)) 110 | sdEntity should not be (null) 111 | } 112 | 113 | // Drop table 114 | tracker.onOtherEvent(DropTablePreEvent(dbName, tbl3Name)) 115 | tracker.onOtherEvent(DropTableEvent(dbName, tbl3Name)) 116 | 117 | // sleeping 2 secs - we have to do this to ensure there's no call on deletion, unfortunately... 118 | Thread.sleep(2 * 1000) 119 | // deletion request should not be added 120 | val tblEntity = getEntity(tracker.catalogEventTracker.sparkTableType, 121 | tracker.catalogEventTracker.sparkTableUniqueAttribute(dbName, tbl3Name)) 122 | tblEntity should not be (null) 123 | } 124 | 125 | } 126 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/TestUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas 19 | 20 | import java.net.URI 21 | 22 | import org.apache.spark.sql.catalyst.TableIdentifier 23 | import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, CatalogStorageFormat, CatalogTable, CatalogTableType} 24 | import org.apache.spark.sql.types.StructType 25 | import com.hortonworks.spark.atlas.utils.SparkUtils 26 | import org.apache.atlas.model.instance.AtlasObjectId 27 | 28 | object TestUtils { 29 | def createDB(name: String, location: String): CatalogDatabase = { 30 | CatalogDatabase(name, "", new URI(location), Map.empty) 31 | } 32 | 33 | def createStorageFormat( 34 | locationUri: Option[URI] = None, 35 | inputFormat: Option[String] = None, 36 | outputFormat: Option[String] = None, 37 | serd: Option[String] = None, 38 | compressed: Boolean = false, 39 | properties: Map[String, String] = Map.empty): CatalogStorageFormat = { 40 | CatalogStorageFormat(locationUri, inputFormat, outputFormat, serd, compressed, properties) 41 | } 42 | 43 | def createTable( 44 | db: String, 45 | table: String, 46 | schema: StructType, 47 | storage: CatalogStorageFormat, 48 | isHiveTable: Boolean = false): CatalogTable = { 49 | CatalogTable( 50 | TableIdentifier(table, Some(db)), 51 | CatalogTableType.MANAGED, 52 | storage, 53 | schema, 54 | provider = if (isHiveTable) Some("hive") else None, 55 | bucketSpec = None, 56 | owner = SparkUtils.currUser()) 57 | } 58 | 59 | def assertSubsetOf[T](set: Set[T], subset: Set[T]): Unit = { 60 | assert(subset.subsetOf(set), s"$subset is not a subset of $set") 61 | } 62 | 63 | def findEntity( 64 | entities: Seq[SACAtlasReferenceable], 65 | objId: AtlasObjectId): Option[SACAtlasReferenceable] = { 66 | entities.find(p => p.asObjectId == objId) 67 | } 68 | 69 | def findEntities( 70 | entities: Seq[SACAtlasReferenceable], 71 | objIds: Seq[AtlasObjectId]): Seq[SACAtlasReferenceable] = { 72 | entities.filter(p => objIds.contains(p.asObjectId)) 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/WithHDFSSupport.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas 19 | 20 | import java.io.File 21 | 22 | import org.apache.hadoop.conf.Configuration 23 | import org.apache.hadoop.fs.FileUtil 24 | import org.apache.hadoop.hdfs.MiniDFSCluster 25 | import org.apache.spark.sql.SparkSession 26 | import org.scalatest.{BeforeAndAfterAll, Suite} 27 | 28 | trait WithHDFSSupport extends BeforeAndAfterAll { self: Suite => 29 | 30 | protected var sparkSession: SparkSession = _ 31 | 32 | private var hdfsCluster: MiniDFSCluster = _ 33 | protected var hdfsURI: String = _ 34 | 35 | private def cleanupAnyExistingSession(): Unit = { 36 | val session = SparkSession.getActiveSession.orElse(SparkSession.getDefaultSession) 37 | if (session.isDefined) { 38 | session.get.sessionState.catalog.reset() 39 | session.get.stop() 40 | SparkSession.clearActiveSession() 41 | SparkSession.clearDefaultSession() 42 | } 43 | } 44 | 45 | override protected def beforeAll(): Unit = { 46 | super.beforeAll() 47 | 48 | cleanupAnyExistingSession() 49 | 50 | val baseDir = new File("./target/hdfs/").getAbsoluteFile() 51 | FileUtil.fullyDelete(baseDir) 52 | 53 | val conf = new Configuration() 54 | conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, baseDir.getAbsolutePath()) 55 | val builder = new MiniDFSCluster.Builder(conf) 56 | 57 | hdfsCluster = builder.build() 58 | hdfsURI = s"hdfs://localhost:${hdfsCluster.getNameNodePort()}/" 59 | 60 | sparkSession = SparkSession.builder() 61 | .master("local") 62 | .appName(this.getClass.getCanonicalName) 63 | .enableHiveSupport() 64 | .config("spark.hadoop.fs.defaultFS", hdfsURI) 65 | .config("spark.ui.enabled", "false") 66 | .getOrCreate() 67 | } 68 | 69 | override protected def afterAll(): Unit = { 70 | try { 71 | sparkSession.sessionState.catalog.reset() 72 | sparkSession.stop() 73 | SparkSession.clearActiveSession() 74 | SparkSession.clearDefaultSession() 75 | } finally { 76 | sparkSession = null 77 | } 78 | System.clearProperty("spark.driver.port") 79 | 80 | hdfsCluster.shutdown(true) 81 | 82 | super.afterAll() 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/WithHiveSupport.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas 19 | 20 | import java.io.File 21 | import java.nio.file.Files 22 | 23 | import org.apache.commons.io.FileUtils 24 | import org.apache.spark.sql.SparkSession 25 | import org.scalatest.{BeforeAndAfterAll, Suite} 26 | 27 | trait WithHiveSupport extends BeforeAndAfterAll { self: Suite => 28 | 29 | protected var sparkSession: SparkSession = _ 30 | 31 | private var metastoreDir: String = _ 32 | private var warehouseDir: String = _ 33 | 34 | private def cleanupAnyExistingSession(): Unit = { 35 | val session = SparkSession.getActiveSession.orElse(SparkSession.getDefaultSession) 36 | if (session.isDefined) { 37 | session.get.sessionState.catalog.reset() 38 | session.get.stop() 39 | SparkSession.clearActiveSession() 40 | SparkSession.clearDefaultSession() 41 | } 42 | } 43 | 44 | override protected def beforeAll(): Unit = { 45 | super.beforeAll() 46 | 47 | cleanupAnyExistingSession() 48 | 49 | metastoreDir = Files.createTempDirectory("sac-metastore-").toString 50 | warehouseDir = Files.createTempDirectory("sac-warehouse-").toString 51 | System.setProperty("derby.system.home", metastoreDir) 52 | sparkSession = SparkSession.builder() 53 | .master("local") 54 | .appName(this.getClass.getCanonicalName) 55 | .enableHiveSupport() 56 | .config("spark.ui.enabled", "false") 57 | .config("spark.sql.warehouse.dir", warehouseDir) 58 | .getOrCreate() 59 | } 60 | 61 | override protected def afterAll(): Unit = { 62 | try { 63 | sparkSession.sessionState.catalog.reset() 64 | sparkSession.stop() 65 | SparkSession.clearActiveSession() 66 | SparkSession.clearDefaultSession() 67 | } finally { 68 | sparkSession = null 69 | FileUtils.deleteDirectory(new File(warehouseDir)) 70 | } 71 | System.clearProperty("spark.driver.port") 72 | 73 | super.afterAll() 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/WithRemoteHiveMetastoreServiceSupport.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas 19 | 20 | import java.io.File 21 | import java.nio.file.Files 22 | 23 | import com.hortonworks.spark.atlas.utils.SparkUtils 24 | import com.hotels.beeju.ThriftHiveMetaStoreTestUtil 25 | import org.apache.commons.io.FileUtils 26 | import org.apache.spark.sql.SparkSession 27 | import org.scalatest.{BeforeAndAfterAll, Suite} 28 | 29 | trait WithRemoteHiveMetastoreServiceSupport extends BeforeAndAfterAll { self: Suite => 30 | protected val dbName = "sac_hive_metastore" 31 | 32 | protected var sparkSession: SparkSession = _ 33 | 34 | private var warehouseDir: String = _ 35 | 36 | private val hive = new ThriftHiveMetaStoreTestUtil(dbName) 37 | 38 | private def cleanupAnyExistingSession(): Unit = { 39 | val session = SparkSession.getActiveSession.orElse(SparkSession.getDefaultSession) 40 | if (session.isDefined) { 41 | session.get.sessionState.catalog.reset() 42 | session.get.stop() 43 | SparkSession.clearActiveSession() 44 | SparkSession.clearDefaultSession() 45 | } 46 | } 47 | 48 | override protected def beforeAll(): Unit = { 49 | super.beforeAll() 50 | 51 | cleanupAnyExistingSession() 52 | 53 | hive.before() 54 | 55 | warehouseDir = Files.createTempDirectory("sac-warehouse-").toString 56 | sparkSession = SparkSession.builder() 57 | .master("local") 58 | .appName(this.getClass.getCanonicalName) 59 | .enableHiveSupport() 60 | .config("spark.ui.enabled", "false") 61 | .config("spark.sql.warehouse.dir", warehouseDir) 62 | .config("spark.hadoop.hive.metastore.uris", hive.getThriftConnectionUri) 63 | .getOrCreate() 64 | 65 | // reset hiveConf to make sure the configuration change takes effect 66 | SparkUtils.resetHiveConf 67 | } 68 | 69 | override protected def afterAll(): Unit = { 70 | try { 71 | hive.after() 72 | sparkSession.sessionState.catalog.reset() 73 | sparkSession.stop() 74 | SparkSession.clearActiveSession() 75 | SparkSession.clearDefaultSession() 76 | } finally { 77 | // reset hiveConf again to prevent affecting other tests 78 | SparkUtils.resetHiveConf 79 | 80 | sparkSession = null 81 | FileUtils.deleteDirectory(new File(warehouseDir)) 82 | } 83 | System.clearProperty("spark.driver.port") 84 | 85 | super.afterAll() 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/ml/MLPipelineTrackerIT.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas.ml 19 | 20 | import org.apache.spark.ml.Pipeline 21 | import org.apache.spark.ml.feature.MinMaxScaler 22 | import org.apache.spark.ml.linalg.Vectors 23 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType} 24 | import org.scalatest.Matchers 25 | import com.hortonworks.spark.atlas._ 26 | import com.hortonworks.spark.atlas.types._ 27 | import com.hortonworks.spark.atlas.TestUtils._ 28 | 29 | class MLPipelineTrackerIT extends BaseResourceIT with Matchers with WithHiveSupport { 30 | private val atlasClient = new RestAtlasClient(atlasClientConf) 31 | 32 | def clusterName: String = atlasClientConf.get(AtlasClientConf.CLUSTER_NAME) 33 | 34 | def getTableEntity(tableName: String): SACAtlasEntityWithDependencies = { 35 | val dbDefinition = createDB("db1", "hdfs:///test/db/db1") 36 | val sd = createStorageFormat() 37 | val schema = new StructType() 38 | .add("user", StringType, false) 39 | .add("age", IntegerType, true) 40 | val tableDefinition = createTable("db1", s"$tableName", schema, sd) 41 | internal.sparkTableToEntity(tableDefinition, clusterName, Some(dbDefinition)) 42 | } 43 | 44 | // Enable it to run integrated test. 45 | it("pipeline and pipeline model") { 46 | val uri = "hdfs://" 47 | val pipelineDir = "tmp/pipeline" 48 | val modelDir = "tmp/model" 49 | 50 | val pipelineDirEntity = internal.mlDirectoryToEntity(uri, pipelineDir) 51 | val modelDirEntity = internal.mlDirectoryToEntity(uri, modelDir) 52 | 53 | atlasClient.createEntitiesWithDependencies(Seq(pipelineDirEntity, modelDirEntity)) 54 | 55 | val df = sparkSession.createDataFrame(Seq( 56 | (1, Vectors.dense(0.0, 1.0, 4.0), 1.0), 57 | (2, Vectors.dense(1.0, 0.0, 4.0), 2.0), 58 | (3, Vectors.dense(1.0, 0.0, 5.0), 3.0), 59 | (4, Vectors.dense(0.0, 0.0, 5.0), 4.0) 60 | )).toDF("id", "features", "label") 61 | 62 | val scaler = new MinMaxScaler() 63 | .setInputCol("features") 64 | .setOutputCol("features_scaled") 65 | .setMin(0.0) 66 | .setMax(3.0) 67 | val pipeline = new Pipeline().setStages(Array(scaler)) 68 | 69 | val model = pipeline.fit(df) 70 | 71 | pipeline.write.overwrite().save(pipelineDir) 72 | 73 | val pipelineEntity = internal.mlPipelineToEntity(pipeline.uid, pipelineDirEntity) 74 | 75 | atlasClient.createEntitiesWithDependencies(Seq(pipelineDirEntity, pipelineEntity)) 76 | 77 | val modelEntity = internal.mlModelToEntity(model.uid, modelDirEntity) 78 | 79 | atlasClient.createEntitiesWithDependencies(Seq(modelDirEntity, modelEntity)) 80 | 81 | val tableEntities1 = getTableEntity("chris1") 82 | val tableEntities2 = getTableEntity("chris2") 83 | 84 | atlasClient.createEntitiesWithDependencies(tableEntities1) 85 | atlasClient.createEntitiesWithDependencies(tableEntities2) 86 | 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/sql/CatalogEventToAtlasIT.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas.sql 19 | 20 | import java.nio.file.Files 21 | 22 | import scala.concurrent.duration._ 23 | import scala.language.postfixOps 24 | 25 | import org.apache.atlas.AtlasServiceException 26 | import org.apache.spark.sql.SparkSession 27 | import org.apache.spark.sql.catalyst.catalog._ 28 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType} 29 | import org.scalatest.Matchers 30 | import org.scalatest.concurrent.Eventually._ 31 | 32 | import com.hortonworks.spark.atlas.utils.SparkUtils 33 | import com.hortonworks.spark.atlas.{BaseResourceIT, RestAtlasClient, TestUtils} 34 | 35 | class CatalogEventToAtlasIT extends BaseResourceIT with Matchers { 36 | import TestUtils._ 37 | 38 | private var sparkSession: SparkSession = _ 39 | 40 | private var processor: SparkCatalogEventProcessor = _ 41 | 42 | override protected def beforeAll(): Unit = { 43 | super.beforeAll() 44 | sparkSession = SparkSession.builder() 45 | .master("local") 46 | .getOrCreate() 47 | processor = 48 | new SparkCatalogEventProcessor(new RestAtlasClient(atlasClientConf), atlasClientConf) 49 | processor.startThread() 50 | } 51 | 52 | override def afterAll(): Unit = { 53 | sparkSession.stop() 54 | SparkSession.clearActiveSession() 55 | SparkSession.clearDefaultSession() 56 | super.afterAll() 57 | } 58 | 59 | it("catalog spark db event to Atlas entities") { 60 | val dbName = uniqueName("db1") 61 | 62 | // Create db entity in Atlas and make sure we get it from Atlas 63 | val tempDbPath = Files.createTempDirectory("db_") 64 | val dbDefinition = createDB(dbName, tempDbPath.normalize().toUri.toString) 65 | SparkUtils.getExternalCatalog().createDatabase(dbDefinition, ignoreIfExists = true) 66 | processor.pushEvent(CreateDatabaseEvent(dbName)) 67 | eventually(timeout(30 seconds), interval(100 milliseconds)) { 68 | val entity = getEntity(processor.sparkDbType, processor.sparkDbUniqueAttribute(dbName)) 69 | entity should not be (null) 70 | entity.getAttribute("name") should be (dbName) 71 | entity.getAttribute("owner") should be (SparkUtils.currUser()) 72 | entity.getAttribute("ownerType") should be ("USER") 73 | } 74 | 75 | // Drop DB from external catalog to make sure we also delete the corresponding Atlas entity 76 | SparkUtils.getExternalCatalog().dropDatabase(dbName, ignoreIfNotExists = true, cascade = false) 77 | processor.pushEvent(DropDatabaseEvent(dbName)) 78 | eventually(timeout(30 seconds), interval(100 milliseconds)) { 79 | intercept[AtlasServiceException]( 80 | getEntity(processor.sparkDbType, processor.sparkDbUniqueAttribute(dbName))) 81 | } 82 | } 83 | 84 | it("catalog spark table event to Atlas entities") { 85 | val dbName = uniqueName("db2") 86 | val tbl1Name = uniqueName("tbl1") 87 | val tbl2Name = uniqueName("tbl2") 88 | 89 | val tempDbPath = Files.createTempDirectory("db_") 90 | val dbDefinition = createDB(dbName, tempDbPath.normalize().toUri.toString) 91 | SparkUtils.getExternalCatalog().createDatabase(dbDefinition, ignoreIfExists = true) 92 | processor.pushEvent(CreateDatabaseEvent(dbName)) 93 | eventually(timeout(30 
seconds), interval(100 milliseconds)) { 94 | val entity = getEntity(processor.sparkDbType, processor.sparkDbUniqueAttribute(dbName)) 95 | entity should not be (null) 96 | entity.getAttribute("name") should be (dbName) 97 | } 98 | 99 | // Create new table 100 | val schema = new StructType() 101 | .add("user", StringType) 102 | .add("age", IntegerType) 103 | val sd = CatalogStorageFormat.empty 104 | val tableDefinition = createTable(dbName, tbl1Name, schema, sd) 105 | SparkUtils.getExternalCatalog().createTable(tableDefinition, ignoreIfExists = true) 106 | processor.pushEvent(CreateTableEvent(dbName, tbl1Name)) 107 | 108 | eventually(timeout(30 seconds), interval(100 milliseconds)) { 109 | val sdEntity = getEntity(processor.sparkStorageFormatType, 110 | processor.sparkStorageFormatUniqueAttribute(dbName, tbl1Name)) 111 | sdEntity should not be (null) 112 | 113 | val tblEntity = getEntity(processor.sparkTableType, 114 | processor.sparkTableUniqueAttribute(dbName, tbl1Name)) 115 | tblEntity should not be (null) 116 | tblEntity.getAttribute("name") should be (tbl1Name) 117 | } 118 | 119 | // Rename table 120 | SparkUtils.getExternalCatalog().renameTable(dbName, tbl1Name, tbl2Name) 121 | processor.pushEvent(RenameTableEvent(dbName, tbl1Name, tbl2Name)) 122 | eventually(timeout(30 seconds), interval(100 milliseconds)) { 123 | val tblEntity = getEntity(processor.sparkTableType, 124 | processor.sparkTableUniqueAttribute(dbName, tbl2Name)) 125 | tblEntity should not be (null) 126 | tblEntity.getAttribute("name") should be (tbl2Name) 127 | 128 | val sdEntity = getEntity(processor.sparkStorageFormatType, 129 | processor.sparkStorageFormatUniqueAttribute(dbName, tbl2Name)) 130 | sdEntity should not be (null) 131 | } 132 | 133 | // Drop table 134 | val tblDef2 = SparkUtils.getExternalCatalog().getTable(dbName, tbl2Name) 135 | processor.pushEvent(DropTablePreEvent(dbName, tbl2Name)) 136 | processor.pushEvent(DropTableEvent(dbName, tbl2Name)) 137 | 138 | // sleeping 2 secs - we have to do this to ensure there's no call on deletion, unfortunately... 139 | Thread.sleep(2 * 1000) 140 | // deletion request should not be added 141 | val tblEntity = getEntity(processor.sparkTableType, 142 | processor.sparkTableUniqueAttribute(dbName, tbl2Name)) 143 | tblEntity should not be (null) 144 | } 145 | } 146 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/sql/CreateDataSourceTableAsSelectHarvesterSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas.sql 19 | 20 | import com.hortonworks.spark.atlas.types.metadata 21 | 22 | import scala.util.Random 23 | import com.hortonworks.spark.atlas.{SACAtlasEntityWithDependencies, WithHiveSupport} 24 | import com.hortonworks.spark.atlas.utils.SparkUtils 25 | import org.apache.atlas.model.instance.AtlasEntity 26 | import org.apache.spark.sql.SaveMode 27 | import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType} 28 | import org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand 29 | import org.apache.spark.sql.execution.datasources.DataSource 30 | import org.apache.spark.sql.types.StructType 31 | import org.scalatest.{FunSuite, Matchers} 32 | 33 | // This is not leveraging BaseHarvesterSuite, as it doesn't need to be tested with 34 | // both non-remote HMS and remote HMS cases. 35 | class CreateDataSourceTableAsSelectHarvesterSuite 36 | extends FunSuite with Matchers with WithHiveSupport { 37 | 38 | private val sourceTblName = "source_" + Random.nextInt(100000) 39 | 40 | override protected def beforeAll(): Unit = { 41 | super.beforeAll() 42 | 43 | sparkSession.sql(s"CREATE TABLE $sourceTblName (name string, age int)") 44 | } 45 | 46 | test("saveAsTable should have output entity having table details - parquet") { 47 | testWithProvider("parquet") 48 | } 49 | 50 | test("saveAsTable should have output entity having table details - hive") { 51 | val entity = testWithProvider("hive") 52 | assert(entity.getAttribute("partitionProvider") == "Catalog") 53 | } 54 | 55 | def testWithProvider(provider: String): AtlasEntity = { 56 | val destTblName = "dest1_" + Random.nextInt(100000) 57 | val df = sparkSession.sql(s"SELECT * FROM $sourceTblName") 58 | 59 | // The code below follows the DataFrameWriter.saveAsTable code path as of Spark 2.4. 60 | // It uses internal APIs for this test. If the compatibility is broken, it would be better to 61 | // just remove this test.
62 | val tableIdent = df.sparkSession.sessionState.sqlParser.parseTableIdentifier(destTblName) 63 | val storage = DataSource.buildStorageFormatFromOptions(Map("path" -> "/tmp/foo")) 64 | val tableDesc = CatalogTable( 65 | identifier = tableIdent, 66 | tableType = CatalogTableType.EXTERNAL, 67 | storage = storage, 68 | schema = new StructType, 69 | provider = Some(provider), 70 | partitionColumnNames = Nil, 71 | bucketSpec = None) 72 | val cmd = CreateDataSourceTableAsSelectCommand( 73 | tableDesc, 74 | SaveMode.ErrorIfExists, 75 | df.queryExecution.logical, 76 | Seq("name", "age")) 77 | val newTable = tableDesc.copy( 78 | storage = tableDesc.storage.copy(), 79 | schema = df.schema) 80 | sparkSession.sessionState.catalog.createTable( 81 | newTable, ignoreIfExists = false, validateLocation = false) 82 | 83 | val qd = QueryDetail(df.queryExecution, 0L) 84 | val entities = CommandsHarvester.CreateDataSourceTableAsSelectHarvester.harvest(cmd, qd) 85 | val processDeps = entities.head.asInstanceOf[SACAtlasEntityWithDependencies].dependencies 86 | val maybeEntity = processDeps.find(_.typeName == metadata.TABLE_TYPE_STRING) 87 | .map(_.asInstanceOf[SACAtlasEntityWithDependencies].entity) 88 | 89 | assert(maybeEntity.isDefined, s"Output entity for table [$destTblName] was not found.") 90 | assert(maybeEntity.get.getAttribute("name") == destTblName) 91 | assert(maybeEntity.get.getAttribute("owner") == SparkUtils.currUser()) 92 | assert(maybeEntity.get.getAttribute("schemaDesc") == "struct") 93 | assert(maybeEntity.get.getAttribute("provider") == provider) 94 | maybeEntity.get 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/sql/CreateViewHarvesterSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas.sql 19 | 20 | import scala.util.Random 21 | import org.apache.spark.sql.execution.command.{CreateViewCommand, ExecutedCommandExec} 22 | import com.hortonworks.spark.atlas._ 23 | import com.hortonworks.spark.atlas.sql.testhelper.BaseHarvesterSuite 24 | import org.apache.spark.sql.SparkSession 25 | 26 | abstract class BaseCreateViewHarvesterSuite 27 | extends BaseHarvesterSuite { 28 | 29 | private val sourceTblName = "source_" + Random.nextInt(100000) 30 | private val destinationViewName = "destination_" + Random.nextInt(100000) 31 | private val destinationViewName2 = "destination_" + Random.nextInt(100000) 32 | 33 | protected override def initializeTestEnvironment(): Unit = { 34 | prepareDatabase() 35 | 36 | _spark.sql(s"CREATE TABLE $sourceTblName (name string)") 37 | _spark.sql(s"INSERT INTO TABLE $sourceTblName VALUES ('lucy'), ('tom')") 38 | } 39 | 40 | protected override def cleanupTestEnvironment(): Unit = { 41 | cleanupDatabase() 42 | } 43 | 44 | test("CREATE VIEW FROM TABLE") { 45 | val qe = _spark.sql(s"CREATE VIEW $destinationViewName " + 46 | s"AS SELECT * FROM $sourceTblName").queryExecution 47 | val qd = QueryDetail(qe, 0L) 48 | 49 | assert(qe.sparkPlan.isInstanceOf[ExecutedCommandExec]) 50 | val node = qe.sparkPlan.asInstanceOf[ExecutedCommandExec] 51 | assert(node.cmd.isInstanceOf[CreateViewCommand]) 52 | val cmd = node.cmd.asInstanceOf[CreateViewCommand] 53 | 54 | val entities = CommandsHarvester.CreateViewHarvester.harvest(cmd, qd) 55 | validateProcessEntity(entities.head, _ => {}, inputs => { 56 | inputs.size should be (1) 57 | assertTable(inputs.head, sourceTblName) 58 | }, outputs => { 59 | outputs.size should be (1) 60 | assertTable(outputs.head, destinationViewName) 61 | }) 62 | } 63 | 64 | test("CREATE VIEW without source") { 65 | val qe = _spark.sql(s"CREATE VIEW $destinationViewName2 " + 66 | s"AS SELECT 1").queryExecution 67 | val qd = QueryDetail(qe, 0L) 68 | 69 | assert(qe.sparkPlan.isInstanceOf[ExecutedCommandExec]) 70 | val node = qe.sparkPlan.asInstanceOf[ExecutedCommandExec] 71 | assert(node.cmd.isInstanceOf[CreateViewCommand]) 72 | val cmd = node.cmd.asInstanceOf[CreateViewCommand] 73 | 74 | val entities = CommandsHarvester.CreateViewHarvester.harvest(cmd, qd) 75 | validateProcessEntity(entities.head, _ => {}, inputs => { 76 | inputs.size should be (0) 77 | }, outputs => { 78 | outputs.size should be (1) 79 | assertTable(outputs.head, destinationViewName2) 80 | }) 81 | } 82 | } 83 | 84 | class CreateViewHarvesterSuite 85 | extends BaseCreateViewHarvesterSuite 86 | with WithHiveSupport { 87 | 88 | override def beforeAll(): Unit = { 89 | super.beforeAll() 90 | initializeTestEnvironment() 91 | } 92 | 93 | override def afterAll(): Unit = { 94 | cleanupTestEnvironment() 95 | super.afterAll() 96 | } 97 | 98 | override protected def getSparkSession: SparkSession = sparkSession 99 | 100 | override protected def getDbName: String = "sac" 101 | 102 | override protected def expectSparkTableModels: Boolean = true 103 | } 104 | 105 | class CreateViewHarvesterWithRemoteHMSSuite 106 | extends BaseCreateViewHarvesterSuite 107 | with WithRemoteHiveMetastoreServiceSupport { 108 | 109 | override def beforeAll(): Unit = { 110 | super.beforeAll() 111 | initializeTestEnvironment() 112 | } 113 | 114 | override def afterAll(): Unit = { 115 | cleanupTestEnvironment() 116 | super.afterAll() 117 | } 118 | 119 | override protected def getSparkSession: SparkSession = sparkSession 120 | 121 | override protected def getDbName: String = 
dbName 122 | 123 | override protected def expectSparkTableModels: Boolean = false 124 | } 125 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/sql/InsertIntoHiveDirHarvesterSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas.sql 19 | 20 | import java.io.File 21 | 22 | import scala.util.Random 23 | import org.apache.spark.sql.execution.command.DataWritingCommandExec 24 | import org.apache.spark.sql.hive.execution.InsertIntoHiveDirCommand 25 | import com.hortonworks.spark.atlas.{WithHiveSupport, WithRemoteHiveMetastoreServiceSupport} 26 | import com.hortonworks.spark.atlas.sql.testhelper.{BaseHarvesterSuite, FsEntityValidator} 27 | import org.apache.spark.sql.SparkSession 28 | 29 | abstract class BaseInsertIntoHiveDirHarvesterSuite 30 | extends BaseHarvesterSuite 31 | with FsEntityValidator { 32 | 33 | private val sourceTblName = "source_" + Random.nextInt(100000) 34 | 35 | protected override def initializeTestEnvironment(): Unit = { 36 | prepareDatabase() 37 | 38 | _spark.sql(s"CREATE TABLE $sourceTblName (name string)") 39 | _spark.sql(s"INSERT INTO TABLE $sourceTblName VALUES ('a'), ('b'), ('c')") 40 | } 41 | 42 | override protected def cleanupTestEnvironment(): Unit = { 43 | cleanupDatabase() 44 | } 45 | 46 | test("INSERT OVERWRITE DIRECTORY path...") { 47 | val qe = _spark.sql(s"INSERT OVERWRITE DIRECTORY 'target/dir1' " + 48 | s"SELECT * FROM $sourceTblName").queryExecution 49 | val qd = QueryDetail(qe, 0L) 50 | 51 | assert(qe.sparkPlan.isInstanceOf[DataWritingCommandExec]) 52 | val node = qe.sparkPlan.asInstanceOf[DataWritingCommandExec] 53 | assert(node.cmd.isInstanceOf[InsertIntoHiveDirCommand]) 54 | val cmd = node.cmd.asInstanceOf[InsertIntoHiveDirCommand] 55 | 56 | val entities = CommandsHarvester.InsertIntoHiveDirHarvester.harvest(cmd, qd) 57 | validateProcessEntity(entities.head, _ => {}, inputs => { 58 | inputs.size should be (1) 59 | assertTable(inputs.head, _dbName, sourceTblName, _clusterName, _useSparkTable) 60 | }, outputs => { 61 | outputs.size should be (1) 62 | val dir = new File("target/dir1").getAbsolutePath 63 | assertFsEntity(outputs.head, dir) 64 | }) 65 | } 66 | } 67 | 68 | class InsertIntoHiveDirHarvesterSuite 69 | extends BaseInsertIntoHiveDirHarvesterSuite 70 | with WithHiveSupport { 71 | 72 | override def beforeAll(): Unit = { 73 | super.beforeAll() 74 | initializeTestEnvironment() 75 | } 76 | 77 | override def afterAll(): Unit = { 78 | cleanupTestEnvironment() 79 | super.afterAll() 80 | } 81 | 82 | override protected def getSparkSession: 
SparkSession = sparkSession 83 | 84 | override protected def getDbName: String = "sac" 85 | 86 | override protected def expectSparkTableModels: Boolean = true 87 | } 88 | 89 | class InsertIntoHiveDirHarvesterWithRemoteHMSSuite 90 | extends BaseInsertIntoHiveDirHarvesterSuite 91 | with WithRemoteHiveMetastoreServiceSupport { 92 | 93 | override def beforeAll(): Unit = { 94 | super.beforeAll() 95 | initializeTestEnvironment() 96 | } 97 | 98 | override def afterAll(): Unit = { 99 | cleanupTestEnvironment() 100 | super.afterAll() 101 | } 102 | 103 | override protected def getSparkSession: SparkSession = sparkSession 104 | 105 | override protected def getDbName: String = dbName 106 | 107 | override protected def expectSparkTableModels: Boolean = false 108 | } 109 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/sql/LoadDataHarvesterSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas.sql 19 | 20 | import java.io.{FileOutputStream, PrintWriter} 21 | import java.nio.file.Files 22 | 23 | import scala.util.Random 24 | import org.apache.spark.sql.execution.LeafExecNode 25 | import org.apache.spark.sql.execution.command.{ExecutedCommandExec, LoadDataCommand} 26 | import com.hortonworks.spark.atlas.types.external 27 | import com.hortonworks.spark.atlas._ 28 | import com.hortonworks.spark.atlas.sql.testhelper.BaseHarvesterSuite 29 | import org.apache.spark.sql.SparkSession 30 | 31 | abstract class BaseLoadDataHarvesterSuite 32 | extends BaseHarvesterSuite { 33 | 34 | protected val sourceTblName = "source_" + Random.nextInt(100000) 35 | 36 | protected override def initializeTestEnvironment(): Unit = { 37 | prepareDatabase() 38 | 39 | _spark.sql(s"CREATE TABLE $sourceTblName (name string)") 40 | } 41 | 42 | override protected def cleanupTestEnvironment(): Unit = { 43 | cleanupDatabase() 44 | } 45 | 46 | test("LOAD DATA [LOCAL] INPATH path source") { 47 | val file = Files.createTempFile("input", ".txt").toFile 48 | val out = new PrintWriter(new FileOutputStream(file)) 49 | out.write("a\nb\nc\nd\n") 50 | out.close() 51 | 52 | val qe = _spark.sql(s"LOAD DATA LOCAL INPATH '${file.getAbsolutePath}' " + 53 | s"OVERWRITE INTO TABLE $sourceTblName").queryExecution 54 | val qd = QueryDetail(qe, 0L) 55 | val node = qe.sparkPlan.collect { case p: LeafExecNode => p } 56 | assert(node.size == 1) 57 | val execNode = node.head.asInstanceOf[ExecutedCommandExec] 58 | 59 | val entities = CommandsHarvester.LoadDataHarvester.harvest( 60 | execNode.cmd.asInstanceOf[LoadDataCommand], qd) 61 | validateProcessEntity(entities.head, _ => {}, inputs => { 62 | inputs.size should be (1) 63 | val inputEntity = inputs.head.asInstanceOf[SACAtlasEntityWithDependencies].entity 64 | inputEntity.getTypeName should be (external.FS_PATH_TYPE_STRING) 65 | inputEntity.getAttribute("name") should be (file.getAbsolutePath.toLowerCase) 66 | }, outputs => { 67 | outputs.size should be (1) 68 | assertTable(outputs.head, _dbName, sourceTblName, _clusterName, _useSparkTable) 69 | }) 70 | } 71 | } 72 | 73 | class LoadDataHarvesterSuite 74 | extends BaseLoadDataHarvesterSuite 75 | with WithHiveSupport { 76 | 77 | override def beforeAll(): Unit = { 78 | super.beforeAll() 79 | initializeTestEnvironment() 80 | } 81 | 82 | override def afterAll(): Unit = { 83 | cleanupTestEnvironment() 84 | super.afterAll() 85 | } 86 | 87 | override def getSparkSession: SparkSession = sparkSession 88 | 89 | override def getDbName: String = "sac" 90 | 91 | override def expectSparkTableModels: Boolean = true 92 | } 93 | 94 | class LoadDataHarvesterWithRemoteHMSSuite 95 | extends BaseLoadDataHarvesterSuite 96 | with WithRemoteHiveMetastoreServiceSupport { 97 | 98 | override def beforeAll(): Unit = { 99 | super.beforeAll() 100 | initializeTestEnvironment() 101 | } 102 | 103 | override def afterAll(): Unit = { 104 | cleanupTestEnvironment() 105 | super.afterAll() 106 | } 107 | 108 | override def getSparkSession: SparkSession = sparkSession 109 | 110 | override def expectSparkTableModels: Boolean = false 111 | 112 | override def getDbName: String = dbName 113 | } 114 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/sql/SparkExecutionPlanProcessForRdbmsQuerySuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software 
Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas.sql 19 | 20 | import org.scalatest.{BeforeAndAfter, FunSuite, Matchers} 21 | import java.sql.DriverManager 22 | 23 | import com.hortonworks.spark.atlas.{AtlasClientConf, AtlasUtils, WithHiveSupport} 24 | import com.hortonworks.spark.atlas.AtlasEntityReadHelper._ 25 | import com.hortonworks.spark.atlas.sql.testhelper.{AtlasQueryExecutionListener, CreateEntitiesTrackingAtlasClient, DirectProcessSparkExecutionPlanProcessor, ProcessEntityValidator} 26 | import com.hortonworks.spark.atlas.types.{external, metadata} 27 | import org.apache.atlas.model.instance.AtlasEntity 28 | 29 | class SparkExecutionPlanProcessForRdbmsQuerySuite 30 | extends FunSuite 31 | with Matchers 32 | with BeforeAndAfter 33 | with WithHiveSupport 34 | with ProcessEntityValidator { 35 | 36 | val sinkTableName = "sink_table" 37 | val sourceTableName = "source_table" 38 | val databaseName = "testdb" 39 | val jdbcDriver = "org.apache.derby.jdbc.EmbeddedDriver" 40 | 41 | val atlasClientConf: AtlasClientConf = new AtlasClientConf() 42 | var atlasClient: CreateEntitiesTrackingAtlasClient = _ 43 | val testHelperQueryListener = new AtlasQueryExecutionListener() 44 | 45 | before { 46 | // set up derby database and necessary tables 47 | val connectionURL = s"jdbc:derby:memory:$databaseName;create=true" 48 | Class.forName(jdbcDriver) 49 | val connection = DriverManager.getConnection(connectionURL) 50 | 51 | val createSinkTableQuery = s"CREATE TABLE $sinkTableName (NAME VARCHAR(20))" 52 | val createSourceTableQuery = s"CREATE TABLE $sourceTableName (NAME VARCHAR(20))" 53 | val insertQuery = s"INSERT INTO $sourceTableName (Name) VALUES ('A'), ('B'), ('C')" 54 | val statement = connection.createStatement 55 | statement.executeUpdate(createSinkTableQuery) 56 | statement.executeUpdate(createSourceTableQuery) 57 | statement.executeUpdate(insertQuery) 58 | 59 | // setup Atlas client 60 | atlasClient = new CreateEntitiesTrackingAtlasClient() 61 | sparkSession.listenerManager.register(testHelperQueryListener) 62 | } 63 | 64 | test("read from derby table and insert into a different derby table") { 65 | val planProcessor = new DirectProcessSparkExecutionPlanProcessor(atlasClient, atlasClientConf) 66 | 67 | val jdbcProperties = new java.util.Properties 68 | jdbcProperties.setProperty("driver", jdbcDriver) 69 | val url = s"jdbc:derby:memory:$databaseName;create=false" 70 | 71 | val readDataFrame = sparkSession.read.jdbc(url, sourceTableName, jdbcProperties) 72 | readDataFrame.write.mode("append").jdbc(url, sinkTableName, jdbcProperties) 73 | 74 | val queryDetail = testHelperQueryListener.queryDetails.last 75 | planProcessor.process(queryDetail) 76 | val entities = atlasClient.createdEntities 77 | 78 | // we're
expecting two table entities: 79 | // one from the source table and another from the sink table 80 | val tableEntities = listAtlasEntitiesAsType(entities, external.RDBMS_TABLE) 81 | assert(tableEntities.size === 2) 82 | 83 | val inputEntity = getOnlyOneEntityOnAttribute(tableEntities, "name", sourceTableName) 84 | val outputEntity = getOnlyOneEntityOnAttribute(tableEntities, "name", sinkTableName) 85 | assertTableEntity(inputEntity, sourceTableName) 86 | assertTableEntity(outputEntity, sinkTableName) 87 | 88 | // check for 'spark_process' 89 | validateProcessEntityWithAtlasEntities(entities, _ => {}, 90 | AtlasUtils.entitiesToReferences(Seq(inputEntity)), 91 | AtlasUtils.entitiesToReferences(Seq(outputEntity))) 92 | } 93 | 94 | private def assertTableEntity(entity: AtlasEntity, tableName: String): Unit = { 95 | val tableQualifiedName = getStringAttribute(entity, "qualifiedName") 96 | assert(tableQualifiedName.equals(s"$databaseName.$tableName")) 97 | } 98 | 99 | } 100 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/sql/SparkExecutionPlanProcessorForComplicatedQuerySuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas.sql 19 | 20 | import java.io.File 21 | 22 | import com.hortonworks.spark.atlas.{AtlasClientConf, AtlasUtils, WithRemoteHiveMetastoreServiceSupport} 23 | import com.hortonworks.spark.atlas.sql.testhelper._ 24 | import com.hortonworks.spark.atlas.types.external 25 | import org.apache.atlas.model.instance.AtlasObjectId 26 | import org.scalatest.{BeforeAndAfterEach, FunSuite, Matchers} 27 | 28 | class SparkExecutionPlanProcessorForComplicatedQuerySuite 29 | extends FunSuite 30 | with BeforeAndAfterEach 31 | with Matchers 32 | with WithRemoteHiveMetastoreServiceSupport 33 | with ProcessEntityValidator 34 | with TableEntityValidator 35 | with FsEntityValidator { 36 | 37 | import com.hortonworks.spark.atlas.AtlasEntityReadHelper._ 38 | 39 | val atlasClientConf: AtlasClientConf = new AtlasClientConf() 40 | var testHelperQueryListener: AtlasQueryExecutionListener = _ 41 | 42 | val clusterName: String = atlasClientConf.get(AtlasClientConf.CLUSTER_NAME) 43 | 44 | override def beforeAll(): Unit = { 45 | super.beforeAll() 46 | 47 | testHelperQueryListener = new AtlasQueryExecutionListener() 48 | sparkSession.listenerManager.register(testHelperQueryListener) 49 | } 50 | 51 | override def afterAll(): Unit = { 52 | sparkSession.listenerManager.unregister(testHelperQueryListener) 53 | super.afterAll() 54 | } 55 | 56 | test("select tbl1, tbl2 -> save to tbl3 -> select tbl3 -> save to file") { 57 | val atlasClient = new CreateEntitiesTrackingAtlasClient() 58 | val planProcessor = new DirectProcessSparkExecutionPlanProcessor(atlasClient, atlasClientConf) 59 | 60 | val rand = new scala.util.Random() 61 | val randNum = rand.nextInt(1000000000) 62 | 63 | val table1 = s"t1_$randNum" 64 | val table2 = s"t2_$randNum" 65 | val table3 = s"t3_$randNum" 66 | val outputPath = s"/tmp/hdfs_$randNum" 67 | 68 | sparkSession.sql(s"create table ${dbName}.${table1}(col1 int)") 69 | sparkSession.sql(s"create table ${dbName}.${table2}(col2 int)") 70 | 71 | testHelperQueryListener.clear() 72 | 73 | sparkSession 74 | .sql(s"select * from ${dbName}.${table1}, ${dbName}.${table2} where col1=col2") 75 | .write 76 | .saveAsTable(s"${dbName}.${table3}") 77 | 78 | val queryDetail = testHelperQueryListener.queryDetails.last 79 | planProcessor.process(queryDetail) 80 | val entities = atlasClient.createdEntities 81 | 82 | val expectedInputs = Set( 83 | new AtlasObjectId(external.HIVE_TABLE_TYPE_STRING, 84 | org.apache.atlas.AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, 85 | external.hiveTableUniqueAttribute(clusterName, dbName, table1)), 86 | new AtlasObjectId(external.HIVE_TABLE_TYPE_STRING, 87 | org.apache.atlas.AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, 88 | external.hiveTableUniqueAttribute(clusterName, dbName, table2))) 89 | 90 | val expectedOutputs = Set( 91 | new AtlasObjectId(external.HIVE_TABLE_TYPE_STRING, 92 | org.apache.atlas.AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, 93 | external.hiveTableUniqueAttribute(clusterName, dbName, table3))) 94 | 95 | validateProcessEntityWithAtlasEntities(entities, _ => {}, expectedInputs, expectedOutputs) 96 | 97 | testHelperQueryListener.clear() 98 | atlasClient.clearEntities() 99 | 100 | sparkSession 101 | .sql(s"select * from ${dbName}.${table3}") 102 | .write 103 | .mode("append") 104 | .save(outputPath) 105 | 106 | val queryDetail2 = testHelperQueryListener.queryDetails.last 107 | planProcessor.process(queryDetail2) 108 | val entities2 = atlasClient.createdEntities 109 | 110 | val expectedInputs2 = Set( 111 | new 
AtlasObjectId(external.HIVE_TABLE_TYPE_STRING, 112 | org.apache.atlas.AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, 113 | external.hiveTableUniqueAttribute(clusterName, dbName, table3))) 114 | 115 | val output = getOnlyOneEntity(entities, external.FS_PATH_TYPE_STRING) 116 | val dir = new File(outputPath).getAbsolutePath 117 | assertFsEntity(output, dir) 118 | val expectedOutputs2 = AtlasUtils.entitiesToReferences(Seq(output), useGuid = false) 119 | 120 | validateProcessEntityWithAtlasEntities(entities2, _ => {}, expectedInputs2, expectedOutputs2) 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/sql/SparkExecutionPlanProcessorForViewSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas.sql 19 | 20 | import scala.util.Random 21 | import org.scalatest.{FunSuite, Matchers} 22 | import org.apache.atlas.model.instance.AtlasEntity 23 | import com.hortonworks.spark.atlas.AtlasEntityReadHelper._ 24 | import com.hortonworks.spark.atlas.{AtlasClientConf, AtlasUtils, WithHiveSupport} 25 | import com.hortonworks.spark.atlas.sql.testhelper.{AtlasQueryExecutionListener, CreateEntitiesTrackingAtlasClient, DirectProcessSparkExecutionPlanProcessor, ProcessEntityValidator} 26 | import com.hortonworks.spark.atlas.types.metadata 27 | 28 | class SparkExecutionPlanProcessorForViewSuite 29 | extends FunSuite 30 | with Matchers 31 | with WithHiveSupport 32 | with ProcessEntityValidator { 33 | private val sourceTblName = "source_" + Random.nextInt(100000) 34 | private val destinationViewName = "destination_" + Random.nextInt(100000) 35 | private val destinationTableName = "destination_" + Random.nextInt(100000) 36 | 37 | private val testHelperQueryListener = new AtlasQueryExecutionListener() 38 | 39 | var atlasClient: CreateEntitiesTrackingAtlasClient = _ 40 | val atlasClientConf: AtlasClientConf = new AtlasClientConf() 41 | 42 | override protected def beforeAll(): Unit = { 43 | super.beforeAll() 44 | 45 | sparkSession.sql(s"CREATE TABLE $sourceTblName (name string)") 46 | sparkSession.sql(s"INSERT INTO TABLE $sourceTblName VALUES ('lucy'), ('tom')") 47 | 48 | // setup Atlas client 49 | testHelperQueryListener.clear() 50 | atlasClient = new CreateEntitiesTrackingAtlasClient() 51 | sparkSession.listenerManager.register(testHelperQueryListener) 52 | } 53 | 54 | test("CREATE TEMPORARY VIEW FROM TABLE, SAVE TEMP VIEW TO TABLE") { 55 | val planProcessor = new DirectProcessSparkExecutionPlanProcessor(atlasClient, atlasClientConf) 56 | sparkSession.sql(s"SELECT * FROM 
$sourceTblName").createOrReplaceTempView(destinationViewName) 57 | 58 | var queryDetail = testHelperQueryListener.queryDetails.last 59 | planProcessor.process(queryDetail) 60 | var entities = atlasClient.createdEntities 61 | 62 | // no entities should have been received from a creating a temporary view 63 | assert(entities.isEmpty) 64 | 65 | // we don't want to check above queries, so reset the entities in listener 66 | testHelperQueryListener.clear() 67 | 68 | sparkSession.sql(s"SELECT * FROM $destinationViewName").write.saveAsTable(destinationTableName) 69 | 70 | queryDetail = testHelperQueryListener.queryDetails.last 71 | planProcessor.process(queryDetail) 72 | entities = atlasClient.createdEntities 73 | 74 | // we're expecting two table entities: 75 | // one from the source table and another from the sink table, the temporary view is ignored 76 | assert(entities.nonEmpty) 77 | val tableEntities = listAtlasEntitiesAsType(entities, metadata.TABLE_TYPE_STRING) 78 | assert(tableEntities.size === 2) 79 | 80 | val inputEntity = getOnlyOneEntityOnAttribute(tableEntities, "name", sourceTblName) 81 | val outputEntity = getOnlyOneEntityOnAttribute(tableEntities, "name", destinationTableName) 82 | assertTableEntity(inputEntity, sourceTblName) 83 | assertTableEntity(outputEntity, destinationTableName) 84 | 85 | // check for 'spark_process' 86 | validateProcessEntityWithAtlasEntities(entities, _ => {}, 87 | AtlasUtils.entitiesToReferences(Seq(inputEntity)), 88 | AtlasUtils.entitiesToReferences(Seq(outputEntity))) 89 | } 90 | 91 | private def assertTableEntity(entity: AtlasEntity, tableName: String): Unit = { 92 | val tableQualifiedName = getStringAttribute(entity, "qualifiedName") 93 | assert(tableQualifiedName.contains(s"$tableName")) 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/sql/SparkExecutionPlanProcessorWithRemoteHiveMetastoreServiceSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas.sql 19 | 20 | import java.io.{File, FileOutputStream, PrintWriter} 21 | import java.nio.file.Files 22 | 23 | import com.hortonworks.spark.atlas._ 24 | import com.hortonworks.spark.atlas.sql.testhelper._ 25 | import com.hortonworks.spark.atlas.types.external 26 | import org.apache.atlas.model.instance.AtlasObjectId 27 | import org.apache.spark.sql.execution.command.{CreateViewCommand, ExecutedCommandExec} 28 | import org.apache.spark.sql.kafka010.KafkaTestUtils 29 | import org.scalatest.{BeforeAndAfterEach, FunSuite, Matchers} 30 | 31 | import scala.util.Random 32 | 33 | class SparkExecutionPlanProcessorWithRemoteHiveMetastoreServiceSuite 34 | extends FunSuite 35 | with BeforeAndAfterEach 36 | with Matchers 37 | with WithRemoteHiveMetastoreServiceSupport 38 | with ProcessEntityValidator 39 | with TableEntityValidator 40 | with FsEntityValidator { 41 | import com.hortonworks.spark.atlas.AtlasEntityReadHelper._ 42 | 43 | private val sourceTblName = "source_" + Random.nextInt(100000) 44 | 45 | val brokerProps: Map[String, Object] = Map[String, Object]() 46 | var kafkaTestUtils: KafkaTestUtils = _ 47 | 48 | val atlasClientConf: AtlasClientConf = new AtlasClientConf() 49 | var atlasClient: CreateEntitiesTrackingAtlasClient = _ 50 | val testHelperQueryListener = new AtlasQueryExecutionListener() 51 | 52 | val clusterName: String = atlasClientConf.get(AtlasClientConf.CLUSTER_NAME) 53 | 54 | override def beforeAll(): Unit = { 55 | super.beforeAll() 56 | 57 | sparkSession.sql(s"CREATE TABLE $dbName.$sourceTblName (name string)") 58 | sparkSession.sql(s"INSERT INTO TABLE $dbName.$sourceTblName VALUES ('a'), ('b'), ('c')") 59 | 60 | atlasClient = new CreateEntitiesTrackingAtlasClient() 61 | testHelperQueryListener.clear() 62 | sparkSession.listenerManager.register(testHelperQueryListener) 63 | } 64 | 65 | override def afterAll(): Unit = { 66 | atlasClient = null 67 | sparkSession.listenerManager.unregister(testHelperQueryListener) 68 | super.afterAll() 69 | } 70 | 71 | override def beforeEach(): Unit = { 72 | atlasClient.clearEntities() 73 | } 74 | 75 | test("CREATE EXTERNAL TABLE ... LOCATION ...") { 76 | val planProcessor = new DirectProcessSparkExecutionPlanProcessor(atlasClient, atlasClientConf) 77 | val tempDir = Files.createTempDirectory("spark-atlas-connector-temp") 78 | 79 | val rand = new scala.util.Random() 80 | val outputTableName = "employee_details_" + rand.nextInt(1000000000) 81 | 82 | sparkSession.sql(s"CREATE EXTERNAL TABLE IF NOT EXISTS $dbName.$outputTableName " + 83 | "(name STRING, age INT, emp_id INT, designation STRING) " + 84 | s"LOCATION '$tempDir'") 85 | 86 | val queryDetail = testHelperQueryListener.queryDetails.last 87 | planProcessor.process(queryDetail) 88 | val entities = atlasClient.createdEntities 89 | 90 | // The query doesn't produce a spark_process entity - it only involves table entities, 91 | // and SAC models Hive tables as references when Spark connects to a remote HMS. 92 | // The SAC Atlas client doesn't request creation for references, hence NO recorded entities. 93 | assert(entities.length === 0) 94 | } 95 | 96 | // Some tests are duplicated here to verify that no table entity is created for a Hive 97 | // table, while the process entity still holds a reference to the Hive table.
98 | 99 | // borrowed from LoadDataHarvesterSuite 100 | test("LOAD DATA [LOCAL] INPATH path source") { 101 | val file = Files.createTempFile("input", ".txt").toFile 102 | val out = new PrintWriter(new FileOutputStream(file)) 103 | out.write("a\nb\nc\nd\n") 104 | out.close() 105 | 106 | val planProcessor = new DirectProcessSparkExecutionPlanProcessor(atlasClient, atlasClientConf) 107 | 108 | sparkSession.sql(s"LOAD DATA LOCAL INPATH '${file.getAbsolutePath}' " + 109 | s"OVERWRITE INTO TABLE $dbName.$sourceTblName").queryExecution 110 | 111 | val queryDetail = testHelperQueryListener.queryDetails.last 112 | planProcessor.process(queryDetail) 113 | val entities = atlasClient.createdEntities 114 | 115 | val input = getOnlyOneEntity(entities, external.FS_PATH_TYPE_STRING) 116 | val expectedInputs = AtlasUtils.entitiesToReferences(Seq(input), useGuid = false) 117 | 118 | val expectedOutputs = Set( 119 | new AtlasObjectId(external.HIVE_TABLE_TYPE_STRING, 120 | org.apache.atlas.AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, 121 | external.hiveTableUniqueAttribute(clusterName, dbName, sourceTblName))) 122 | 123 | validateProcessEntityWithAtlasEntities(entities, _ => {}, expectedInputs, expectedOutputs) 124 | } 125 | 126 | // borrowed from InsertIntoHiveDirHarvesterSuite 127 | test("INSERT OVERWRITE DIRECTORY path...") { 128 | val planProcessor = new DirectProcessSparkExecutionPlanProcessor(atlasClient, atlasClientConf) 129 | 130 | sparkSession.sql(s"INSERT OVERWRITE DIRECTORY 'target/dir1' " + 131 | s"SELECT * FROM $dbName.$sourceTblName").queryExecution 132 | 133 | val queryDetail = testHelperQueryListener.queryDetails.last 134 | planProcessor.process(queryDetail) 135 | val entities = atlasClient.createdEntities 136 | 137 | val expectedInputs = Set( 138 | new AtlasObjectId(external.HIVE_TABLE_TYPE_STRING, 139 | org.apache.atlas.AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, 140 | external.hiveTableUniqueAttribute(clusterName, dbName, sourceTblName))) 141 | 142 | val output = getOnlyOneEntity(entities, external.FS_PATH_TYPE_STRING) 143 | val dir = new File("target/dir1").getAbsolutePath 144 | assertFsEntity(output, dir) 145 | val expectedOutputs = AtlasUtils.entitiesToReferences(Seq(output), useGuid = false) 146 | 147 | validateProcessEntityWithAtlasEntities(entities, _ => {}, expectedInputs, expectedOutputs) 148 | } 149 | 150 | } 151 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/sql/testhelper/AtlasQueryExecutionListener.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas.sql.testhelper 19 | 20 | import com.hortonworks.spark.atlas.sql.QueryDetail 21 | import org.apache.spark.sql.execution.QueryExecution 22 | import org.apache.spark.sql.util.QueryExecutionListener 23 | 24 | import scala.collection.mutable 25 | 26 | class AtlasQueryExecutionListener extends QueryExecutionListener { 27 | val queryDetails = new mutable.MutableList[QueryDetail]() 28 | 29 | override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = { 30 | if (qe.logical.isStreaming) { 31 | // streaming query will be tracked via SparkAtlasStreamingQueryEventTracker 32 | return 33 | } 34 | queryDetails += QueryDetail.fromQueryExecutionListener(qe, durationNs) 35 | } 36 | 37 | override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = { 38 | throw exception 39 | } 40 | 41 | def clear(): Unit = { 42 | queryDetails.clear() 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/sql/testhelper/AtlasStreamingQueryProgressListener.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas.sql.testhelper 19 | 20 | import com.hortonworks.spark.atlas.sql.QueryDetail 21 | import com.hortonworks.spark.atlas.utils.Logging 22 | import org.apache.spark.sql.SparkSession 23 | import org.apache.spark.sql.execution.streaming.{StreamExecution, StreamingQueryWrapper} 24 | 25 | import scala.collection.mutable 26 | import org.apache.spark.sql.streaming.StreamingQueryListener 27 | import org.apache.spark.sql.streaming.StreamingQueryListener.{QueryProgressEvent, QueryStartedEvent, QueryTerminatedEvent} 28 | 29 | class AtlasStreamingQueryProgressListener extends StreamingQueryListener with Logging { 30 | val queryDetails = new mutable.MutableList[QueryDetail]() 31 | 32 | def onQueryStarted(event: QueryStartedEvent): Unit = {} 33 | 34 | def onQueryProgress(event: QueryProgressEvent): Unit = { 35 | // FIXME: this is totally duplicated with SparkAtlasStreamingQueryEventTracker... 36 | // Extract into somewhere... 
37 | val query = SparkSession.active.streams.get(event.progress.id) 38 | if (query != null) { 39 | query match { 40 | case query: StreamingQueryWrapper => 41 | val qd = QueryDetail.fromStreamingQueryListener(query.streamingQuery, event) 42 | queryDetails += qd 43 | 44 | case query: StreamExecution => 45 | val qd = QueryDetail.fromStreamingQueryListener(query, event) 46 | queryDetails += qd 47 | 48 | case _ => logWarn(s"Unexpected type of streaming query: ${query.getClass}") 49 | } 50 | } else { 51 | logWarn(s"Cannot find query ${event.progress.id} from active spark session!") 52 | } 53 | } 54 | 55 | def onQueryTerminated(event: QueryTerminatedEvent): Unit = {} 56 | 57 | def clear(): Unit = { 58 | queryDetails.clear() 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/sql/testhelper/BaseHarvesterSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas.sql.testhelper 19 | 20 | import com.hortonworks.spark.atlas.{AtlasClientConf, SACAtlasReferenceable} 21 | import org.apache.spark.sql.SparkSession 22 | import org.scalatest.{FunSuite, Matchers} 23 | 24 | abstract class BaseHarvesterSuite 25 | extends FunSuite 26 | with Matchers 27 | with ProcessEntityValidator 28 | with TableEntityValidator { 29 | 30 | protected def getSparkSession: SparkSession 31 | 32 | protected def getDbName: String 33 | 34 | protected def expectSparkTableModels: Boolean 35 | 36 | protected def initializeTestEnvironment(): Unit = {} 37 | 38 | protected def cleanupTestEnvironment(): Unit = {} 39 | 40 | private val atlasClientConf: AtlasClientConf = new AtlasClientConf() 41 | protected val _clusterName: String = atlasClientConf.get(AtlasClientConf.CLUSTER_NAME) 42 | 43 | protected lazy val _spark: SparkSession = getSparkSession 44 | protected lazy val _dbName: String = getDbName 45 | protected lazy val _useSparkTable: Boolean = expectSparkTableModels 46 | 47 | protected def prepareDatabase(): Unit = { 48 | _spark.sql(s"DROP DATABASE IF EXISTS ${_dbName} Cascade") 49 | _spark.sql(s"CREATE DATABASE ${_dbName}") 50 | _spark.sql(s"USE ${_dbName}") 51 | } 52 | 53 | protected def cleanupDatabase(): Unit = { 54 | _spark.sql(s"DROP DATABASE IF EXISTS ${_dbName} Cascade") 55 | } 56 | 57 | protected def assertTable(ref: SACAtlasReferenceable, tableName: String): Unit = { 58 | assertTable(ref, _dbName, tableName, _clusterName, _useSparkTable) 59 | } 60 | 61 | protected def assertTableWithNamePrefix( 62 | ref: SACAtlasReferenceable, 63 | tblNamePrefix: String): Unit = { 64 | assertTableWithNamePrefix(ref, _dbName, tblNamePrefix, _clusterName, _useSparkTable) 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/sql/testhelper/CreateEntitiesTrackingAtlasClient.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas.sql.testhelper 19 | 20 | import com.hortonworks.spark.atlas.AtlasClient 21 | import com.sun.jersey.core.util.MultivaluedMapImpl 22 | import org.apache.atlas.model.instance.AtlasEntity 23 | import org.apache.atlas.model.typedef.AtlasTypesDef 24 | 25 | import scala.collection.mutable 26 | 27 | class CreateEntitiesTrackingAtlasClient extends AtlasClient { 28 | val createdEntities = new mutable.ListBuffer[AtlasEntity]() 29 | 30 | def clearEntities(): Unit = { 31 | createdEntities.clear() 32 | } 33 | 34 | override def createAtlasTypeDefs(typeDefs: AtlasTypesDef): Unit = {} 35 | 36 | override def getAtlasTypeDefs(searchParams: MultivaluedMapImpl): AtlasTypesDef = { 37 | new AtlasTypesDef() 38 | } 39 | 40 | override def updateAtlasTypeDefs(typeDefs: AtlasTypesDef): Unit = {} 41 | 42 | override protected def doCreateEntities(entities: Seq[AtlasEntity]): Unit = { 43 | createdEntities ++= entities 44 | } 45 | 46 | override protected def doDeleteEntityWithUniqueAttr(entityType: String, 47 | attribute: String): Unit = {} 48 | 49 | override protected def doUpdateEntityWithUniqueAttr(entityType: String, attribute: String, 50 | entity: AtlasEntity): Unit = {} 51 | } 52 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/sql/testhelper/DirectProcessSparkExecutionPlanProcessor.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas.sql.testhelper 19 | 20 | import com.hortonworks.spark.atlas.sql.{QueryDetail, SparkExecutionPlanProcessor} 21 | import com.hortonworks.spark.atlas.{AtlasClient, AtlasClientConf} 22 | 23 | class DirectProcessSparkExecutionPlanProcessor( 24 | atlasClient: AtlasClient, 25 | atlasClientConf: AtlasClientConf) 26 | extends SparkExecutionPlanProcessor(atlasClient, atlasClientConf) { 27 | 28 | override def process(qd: QueryDetail): Unit = super.process(qd) 29 | } 30 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/sql/testhelper/FsEntityValidator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas.sql.testhelper 19 | 20 | import java.io.File 21 | import java.util.Locale 22 | 23 | import com.hortonworks.spark.atlas.AtlasEntityReadHelper.{getStringAttribute, listAtlasEntitiesAsType} 24 | import com.hortonworks.spark.atlas.{SACAtlasEntityWithDependencies, SACAtlasReferenceable} 25 | import com.hortonworks.spark.atlas.types.external 26 | import org.apache.atlas.model.instance.AtlasEntity 27 | import org.scalatest.FunSuite 28 | 29 | trait FsEntityValidator extends FunSuite { 30 | 31 | def findFsEntities(entities: Seq[AtlasEntity], dir: File): Seq[AtlasEntity] = { 32 | entities.filter { e => 33 | getStringAttribute(e, "qualifiedName").toLowerCase(Locale.ROOT).contains( 34 | dir.getAbsolutePath.toLowerCase(Locale.ROOT)) 35 | } 36 | } 37 | 38 | def assertEntitiesFsType( 39 | dirToExpectedCount: Map[File, Int], 40 | entities: Set[AtlasEntity]): Unit = { 41 | val fsEntities = listAtlasEntitiesAsType(entities.toSeq, external.FS_PATH_TYPE_STRING) 42 | assert(fsEntities.size === dirToExpectedCount.values.sum) 43 | 44 | dirToExpectedCount.foreach { case (dir, expectedCnt) => 45 | val fsEntitiesFiltered = fsEntities.filter { e => 46 | getStringAttribute(e, "qualifiedName").toLowerCase(Locale.ROOT).contains( 47 | dir.getAbsolutePath.toLowerCase(Locale.ROOT)) 48 | } 49 | assert(fsEntitiesFiltered.length === expectedCnt) 50 | } 51 | } 52 | 53 | def assertFsEntity(ref: SACAtlasReferenceable, path: String): Unit = { 54 | val inputEntity = ref.asInstanceOf[SACAtlasEntityWithDependencies].entity 55 | assertFsEntity(inputEntity, path) 56 | } 57 | 58 | def assertFsEntity(entity: AtlasEntity, path: String): Unit = { 59 | assert(entity.getTypeName === external.FS_PATH_TYPE_STRING) 60 | assert(entity.getAttribute("name") === path.toLowerCase) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/sql/testhelper/KafkaTopicEntityValidator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas.sql.testhelper 19 | 20 | import com.hortonworks.spark.atlas.TestUtils 21 | import com.hortonworks.spark.atlas.sql.KafkaTopicInformation 22 | import org.scalatest.FunSuite 23 | import com.hortonworks.spark.atlas.AtlasEntityReadHelper.{getStringAttribute, listAtlasEntitiesAsType} 24 | import com.hortonworks.spark.atlas.types.external.KAFKA_TOPIC_STRING 25 | import org.apache.atlas.model.instance.AtlasEntity 26 | 27 | trait KafkaTopicEntityValidator extends FunSuite { 28 | 29 | def assertEntitiesKafkaTopicType( 30 | topics: Seq[KafkaTopicInformation], 31 | entities: Set[AtlasEntity]): Unit = { 32 | val kafkaTopicEntities = listAtlasEntitiesAsType(entities.toSeq, KAFKA_TOPIC_STRING) 33 | assert(kafkaTopicEntities.size === topics.size) 34 | 35 | val expectedTopicNames = topics.map(_.topicName).toSet 36 | val expectedClusterNames = topics.map(_.clusterName.getOrElse("primary")).toSet 37 | val expectedQualifiedNames = topics.map { ti => 38 | KafkaTopicInformation.getQualifiedName(ti, "primary") 39 | }.toSet 40 | 41 | assert(kafkaTopicEntities.map(_.getAttribute("name").toString()).toSet === expectedTopicNames) 42 | assert(kafkaTopicEntities.map(_.getAttribute("topic").toString()).toSet === 43 | expectedTopicNames) 44 | assert(kafkaTopicEntities.map(getStringAttribute(_, "uri")).toSet === expectedTopicNames) 45 | assert(kafkaTopicEntities.map(getStringAttribute(_, "clusterName")).toSet === 46 | expectedClusterNames) 47 | assert(kafkaTopicEntities.map(getStringAttribute(_, "qualifiedName")).toSet === 48 | expectedQualifiedNames) 49 | } 50 | 51 | def assertKafkaTopicEntities( 52 | topics: Seq[KafkaTopicInformation], entities: Seq[AtlasEntity]): Unit = { 53 | assert( 54 | topics.map(KafkaTopicInformation.getQualifiedName(_, "primary")).toSet === 55 | entities.map(getStringAttribute(_, "qualifiedName")).toSet) 56 | } 57 | 58 | def assertEntitiesAreSubsetOfTopics( 59 | topics: Seq[KafkaTopicInformation], entities: Seq[AtlasEntity]): Unit = { 60 | TestUtils.assertSubsetOf( 61 | topics.map(KafkaTopicInformation.getQualifiedName(_, "primary")).toSet, 62 | entities.map(getStringAttribute(_, "qualifiedName")).toSet) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/sql/testhelper/ProcessEntityValidator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas.sql.testhelper 19 | 20 | import java.util 21 | 22 | import com.hortonworks.spark.atlas.AtlasEntityReadHelper.getOnlyOneEntity 23 | import com.hortonworks.spark.atlas.types.metadata 24 | 25 | import scala.collection.JavaConverters._ 26 | import com.hortonworks.spark.atlas.{SACAtlasEntityWithDependencies, SACAtlasReferenceable, AtlasUtils, TestUtils} 27 | import org.apache.atlas.model.instance.{AtlasEntity, AtlasObjectId} 28 | import org.scalatest.FunSuite 29 | 30 | trait ProcessEntityValidator extends FunSuite { 31 | def validateProcessEntity( 32 | process: SACAtlasReferenceable, 33 | validateFnForProcess: AtlasEntity => Unit, 34 | validateFnForInputs: Seq[SACAtlasReferenceable] => Unit, 35 | validateFnForOutputs: Seq[SACAtlasReferenceable] => Unit): Unit = { 36 | require(process.isInstanceOf[SACAtlasEntityWithDependencies]) 37 | val pEntity = process.asInstanceOf[SACAtlasEntityWithDependencies].entity 38 | validateFnForProcess(pEntity) 39 | 40 | assert(pEntity.getAttribute("inputs").isInstanceOf[util.Collection[_]]) 41 | assert(pEntity.getAttribute("outputs").isInstanceOf[util.Collection[_]]) 42 | val inputs = pEntity.getAttribute("inputs").asInstanceOf[util.Collection[AtlasObjectId]] 43 | val outputs = pEntity.getAttribute("outputs").asInstanceOf[util.Collection[AtlasObjectId]] 44 | 45 | val pDeps = process.asInstanceOf[SACAtlasEntityWithDependencies].dependencies 46 | val inputEntities = TestUtils.findEntities(pDeps, inputs.asScala.toSeq) 47 | val outputEntities = TestUtils.findEntities(pDeps, outputs.asScala.toSeq) 48 | 49 | assert(inputs.size() === inputEntities.size) 50 | assert(outputs.size() === outputEntities.size) 51 | 52 | validateFnForInputs(inputEntities) 53 | validateFnForOutputs(outputEntities) 54 | } 55 | 56 | def validateProcessEntityWithAtlasEntities( 57 | entities: Seq[AtlasEntity], 58 | validateFnForProcess: AtlasEntity => Unit, 59 | expectedInputObjectIds: Set[AtlasObjectId], 60 | expectedOutputObjectIds: Set[AtlasObjectId]): Unit = { 61 | val pEntity = getOnlyOneEntity(entities, metadata.PROCESS_TYPE_STRING) 62 | validateFnForProcess(pEntity) 63 | 64 | assert(pEntity.getAttribute("inputs").isInstanceOf[util.Collection[_]]) 65 | assert(pEntity.getAttribute("outputs").isInstanceOf[util.Collection[_]]) 66 | val inputs = pEntity.getAttribute("inputs").asInstanceOf[util.Collection[AtlasObjectId]] 67 | val outputs = pEntity.getAttribute("outputs").asInstanceOf[util.Collection[AtlasObjectId]] 68 | 69 | assert(inputs.asScala.toSet === expectedInputObjectIds) 70 | assert(outputs.asScala.toSet === expectedOutputObjectIds) 71 | } 72 | 73 | def validateProcessEntityWithAtlasEntitiesForStreamingQuery( 74 | entities: Seq[AtlasEntity], 75 | validateFnForProcess: AtlasEntity => Unit, 76 | expectedInputEntities: Seq[AtlasEntity], 77 | expectedOutputEntities: Seq[AtlasEntity]): Unit = { 78 | val pEntity = getOnlyOneEntity(entities, metadata.PROCESS_TYPE_STRING) 79 | validateFnForProcess(pEntity) 80 | 81 | assert(pEntity.getAttribute("inputs").isInstanceOf[util.Collection[_]]) 82 | assert(pEntity.getAttribute("outputs").isInstanceOf[util.Collection[_]]) 83 | val inputs = pEntity.getAttribute("inputs").asInstanceOf[util.Collection[AtlasObjectId]] 84 | val outputs = pEntity.getAttribute("outputs").asInstanceOf[util.Collection[AtlasObjectId]] 85 | 86 | val expectedInputObjectIds = expectedInputEntities.map { entity => 87 | AtlasUtils.entityToReference(entity, useGuid = false) 88 | }.toSet 89 | val expectedOutputObjectIds = 
expectedOutputEntities.map { entity => 90 | AtlasUtils.entityToReference(entity, useGuid = false) 91 | }.toSet 92 | 93 | assert(inputs.asScala.toSet === expectedInputObjectIds) 94 | assert(outputs.asScala.toSet === expectedOutputObjectIds) 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/sql/testhelper/TableEntityValidator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas.sql.testhelper 19 | 20 | import com.hortonworks.spark.atlas.types.{external, metadata} 21 | import com.hortonworks.spark.atlas.{SACAtlasEntityReference, SACAtlasEntityWithDependencies, SACAtlasReferenceable} 22 | import org.apache.atlas.AtlasClient 23 | import org.scalatest.FunSuite 24 | 25 | trait TableEntityValidator extends FunSuite { 26 | def assertTable( 27 | ref: SACAtlasReferenceable, 28 | dbName: String, 29 | tblName: String, 30 | clusterName: String, 31 | useSparkTable: Boolean): Unit = { 32 | if (useSparkTable) { 33 | assertSparkTable(ref, dbName, tblName) 34 | } else { 35 | assertHiveTable(ref, dbName, tblName, clusterName) 36 | } 37 | } 38 | 39 | def assertTableWithNamePrefix( 40 | ref: SACAtlasReferenceable, 41 | dbName: String, 42 | tblNamePrefix: String, 43 | clusterName: String, 44 | useSparkTable: Boolean): Unit = { 45 | if (useSparkTable) { 46 | assertSparkTableWithNamePrefix(ref, dbName, tblNamePrefix) 47 | } else { 48 | assertHiveTableWithNamePrefix(ref, dbName, tblNamePrefix, clusterName) 49 | } 50 | } 51 | 52 | def assertSparkTable(ref: SACAtlasReferenceable, dbName: String, tblName: String): Unit = { 53 | assert(ref.isInstanceOf[SACAtlasEntityWithDependencies]) 54 | val entity = ref.asInstanceOf[SACAtlasEntityWithDependencies].entity 55 | assert(entity.getTypeName === metadata.TABLE_TYPE_STRING) 56 | assert(entity.getAttribute("name") === tblName) 57 | assert(entity.getAttribute(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME).toString 58 | .endsWith(s"$dbName.$tblName")) 59 | } 60 | 61 | def assertSparkTableWithNamePrefix( 62 | ref: SACAtlasReferenceable, 63 | dbName: String, 64 | tblNamePrefix: String): Unit = { 65 | assert(ref.isInstanceOf[SACAtlasEntityWithDependencies]) 66 | val entity = ref.asInstanceOf[SACAtlasEntityWithDependencies].entity 67 | assert(entity.getTypeName === metadata.TABLE_TYPE_STRING) 68 | assert(entity.getAttribute("name").toString.startsWith(tblNamePrefix)) 69 | assert(entity.getAttribute(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME).toString 70 | .contains(s"$dbName.$tblNamePrefix")) 71 | } 72 | 73 | def assertHiveTable( 74 | ref: 
SACAtlasReferenceable, 75 | dbName: String, 76 | tblName: String, 77 | clusterName: String): Unit = { 78 | assert(ref.isInstanceOf[SACAtlasEntityReference]) 79 | val outputRef = ref.asInstanceOf[SACAtlasEntityReference] 80 | assert(outputRef.typeName === external.HIVE_TABLE_TYPE_STRING) 81 | assert(outputRef.qualifiedName === s"$dbName.$tblName@$clusterName") 82 | } 83 | 84 | def assertHiveTableWithNamePrefix( 85 | ref: SACAtlasReferenceable, 86 | dbName: String, 87 | tblNamePrefix: String, 88 | clusterName: String): Unit = { 89 | assert(ref.isInstanceOf[SACAtlasEntityReference]) 90 | val outputRef = ref.asInstanceOf[SACAtlasEntityReference] 91 | assert(outputRef.typeName === external.HIVE_TABLE_TYPE_STRING) 92 | assert(outputRef.qualifiedName.startsWith(s"$dbName.$tblNamePrefix")) 93 | assert(outputRef.qualifiedName.endsWith(s"@$clusterName")) 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/types/MLAtlasEntityUtilsSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas.types 19 | 20 | import java.io.File 21 | 22 | import org.apache.atlas.{AtlasClient, AtlasConstants} 23 | import org.apache.atlas.model.instance.AtlasEntity 24 | import org.apache.commons.io.FileUtils 25 | import org.apache.spark.ml.Pipeline 26 | import org.apache.spark.ml.feature.MinMaxScaler 27 | import org.apache.spark.ml.linalg.Vectors 28 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType} 29 | import org.scalatest.{FunSuite, Matchers} 30 | import com.hortonworks.spark.atlas.TestUtils._ 31 | import com.hortonworks.spark.atlas.{AtlasUtils, WithHiveSupport} 32 | 33 | class MLAtlasEntityUtilsSuite extends FunSuite with Matchers with WithHiveSupport { 34 | 35 | def getTableEntity(tableName: String): AtlasEntity = { 36 | val dbDefinition = createDB("db1", "hdfs:///test/db/db1") 37 | val sd = createStorageFormat() 38 | val schema = new StructType() 39 | .add("user", StringType, false) 40 | .add("age", IntegerType, true) 41 | val tableDefinition = createTable("db1", s"$tableName", schema, sd) 42 | 43 | val tableEntities = internal.sparkTableToEntity( 44 | tableDefinition, AtlasConstants.DEFAULT_CLUSTER_NAME, Some(dbDefinition)) 45 | val tableEntity = tableEntities.entity 46 | 47 | tableEntity 48 | } 49 | 50 | test("pipeline, pipeline model, fit and transform") { 51 | val uri = "/" 52 | val pipelineDir = "tmp/pipeline" 53 | val modelDir = "tmp/model" 54 | 55 | val pipelineDirEntity = internal.mlDirectoryToEntity(uri, pipelineDir) 56 | pipelineDirEntity.entity.getAttribute("uri") should be (uri) 57 | pipelineDirEntity.entity.getAttribute("directory") should be (pipelineDir) 58 | pipelineDirEntity.dependencies.length should be (0) 59 | 60 | val modelDirEntity = internal.mlDirectoryToEntity(uri, modelDir) 61 | modelDirEntity.entity.getAttribute("uri") should be (uri) 62 | modelDirEntity.entity.getAttribute("directory") should be (modelDir) 63 | modelDirEntity.dependencies.length should be (0) 64 | 65 | val df = sparkSession.createDataFrame(Seq( 66 | (1, Vectors.dense(0.0, 1.0, 4.0), 1.0), 67 | (2, Vectors.dense(1.0, 0.0, 4.0), 2.0), 68 | (3, Vectors.dense(1.0, 0.0, 5.0), 3.0), 69 | (4, Vectors.dense(0.0, 0.0, 5.0), 4.0) 70 | )).toDF("id", "features", "label") 71 | 72 | val scaler = new MinMaxScaler() 73 | .setInputCol("features") 74 | .setOutputCol("features_scaled") 75 | .setMin(0.0) 76 | .setMax(3.0) 77 | val pipeline = new Pipeline().setStages(Array(scaler)) 78 | 79 | val model = pipeline.fit(df) 80 | 81 | pipeline.write.overwrite().save(pipelineDir) 82 | 83 | val pipelineEntity = internal.mlPipelineToEntity(pipeline.uid, pipelineDirEntity) 84 | pipelineEntity.entity.getTypeName should be (metadata.ML_PIPELINE_TYPE_STRING) 85 | pipelineEntity.entity.getAttribute(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME) should be ( 86 | pipeline.uid) 87 | pipelineEntity.entity.getAttribute("name") should be (pipeline.uid) 88 | pipelineEntity.entity.getRelationshipAttribute("directory") should be ( 89 | AtlasUtils.entityToReference(pipelineDirEntity.entity, useGuid = false)) 90 | pipelineEntity.dependencies should be (Seq(pipelineDirEntity)) 91 | 92 | val modelEntity = internal.mlModelToEntity(model.uid, modelDirEntity) 93 | val modelUid = model.uid.replaceAll("pipeline", "model") 94 | modelEntity.entity.getTypeName should be (metadata.ML_MODEL_TYPE_STRING) 95 | modelEntity.entity.getAttribute(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME) should be (modelUid) 96 | modelEntity.entity.getAttribute("name") should be (modelUid) 97 | 
modelEntity.entity.getRelationshipAttribute("directory") should be ( 98 | AtlasUtils.entityToReference(modelDirEntity.entity, useGuid = false)) 99 | 100 | modelEntity.dependencies should be (Seq(modelDirEntity)) 101 | 102 | FileUtils.deleteDirectory(new File("tmp")) 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/types/SparkAtlasEntityUtilsSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas.types 19 | 20 | import org.apache.atlas.{AtlasClient, AtlasConstants} 21 | import org.apache.spark.sql.SparkSession 22 | import org.apache.spark.sql.types._ 23 | import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers} 24 | import com.hortonworks.spark.atlas.{AtlasClientConf, SACAtlasEntityWithDependencies, AtlasUtils, TestUtils} 25 | import com.hortonworks.spark.atlas.utils.SparkUtils 26 | 27 | class SparkAtlasEntityUtilsSuite extends FunSuite with Matchers with BeforeAndAfterAll { 28 | import TestUtils._ 29 | 30 | private var sparkSession: SparkSession = _ 31 | 32 | private var sparkAtlasEntityUtils: AtlasEntityUtils = _ 33 | 34 | override protected def beforeAll(): Unit = { 35 | super.beforeAll() 36 | sparkSession = SparkSession.builder() 37 | .master("local") 38 | .config("spark.ui.enabled", "false") 39 | .getOrCreate() 40 | 41 | sparkAtlasEntityUtils = new AtlasEntityUtils { 42 | override def conf: AtlasClientConf = new AtlasClientConf 43 | } 44 | } 45 | 46 | override protected def afterAll(): Unit = { 47 | sparkSession.stop() 48 | SparkSession.clearActiveSession() 49 | SparkSession.clearDefaultSession() 50 | sparkSession = null 51 | sparkAtlasEntityUtils = null 52 | super.afterAll() 53 | } 54 | 55 | test("convert spark catalog db to entity") { 56 | val dbDefinition = createDB("db1", "hdfs:///test/db/db1") 57 | val dbEntity = sparkAtlasEntityUtils.sparkDbToEntity(dbDefinition) 58 | 59 | dbEntity.entity.getTypeName should be (metadata.DB_TYPE_STRING) 60 | dbEntity.entity.getAttribute("name") should be ("db1") 61 | dbEntity.entity.getAttribute(AtlasConstants.CLUSTER_NAME_ATTRIBUTE) should be ( 62 | AtlasConstants.DEFAULT_CLUSTER_NAME) 63 | dbEntity.entity.getAttribute("location") should be ("hdfs:///test/db/db1") 64 | dbEntity.entity.getAttribute(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME) should be ( 65 | sparkSession.sparkContext.applicationId + ".db1") 66 | 67 | dbEntity.dependencies.length should be (0) 68 | } 69 | 70 | test("convert spark catalog storage format to entity") { 71 | val storageFormat = createStorageFormat() 72 | val sdEntity = 73 | 
sparkAtlasEntityUtils.sparkStorageFormatToEntity(storageFormat, "db1", "tbl1") 74 | 75 | sdEntity.entity.getTypeName should be (metadata.STORAGEDESC_TYPE_STRING) 76 | sdEntity.entity.getAttribute("location") should be (null) 77 | sdEntity.entity.getAttribute("inputFormat") should be (null) 78 | sdEntity.entity.getAttribute("outputFormat") should be (null) 79 | sdEntity.entity.getAttribute("serde") should be (null) 80 | sdEntity.entity.getAttribute("compressed") should be (java.lang.Boolean.FALSE) 81 | sdEntity.entity.getAttribute(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME) should be ( 82 | sparkSession.sparkContext.applicationId + ".db1.tbl1.storageFormat") 83 | 84 | sdEntity.dependencies.length should be (0) 85 | } 86 | 87 | test("convert spark table to entity") { 88 | val dbDefinition = createDB("db1", "hdfs:///test/db/db1") 89 | val sd = createStorageFormat() 90 | val schema = new StructType() 91 | .add("user", StringType, false) 92 | .add("age", IntegerType, true) 93 | val tableDefinition = createTable("db1", "tbl1", schema, sd) 94 | 95 | val tableEnt = sparkAtlasEntityUtils.sparkTableToEntity(tableDefinition, Some(dbDefinition)) 96 | assert(tableEnt.isInstanceOf[SACAtlasEntityWithDependencies]) 97 | val tableEntity = tableEnt.asInstanceOf[SACAtlasEntityWithDependencies] 98 | 99 | val tableDeps = tableEntity.dependencies 100 | 101 | val dbEntity = tableDeps.find(_.typeName == metadata.DB_TYPE_STRING).get 102 | val sdEntity = tableDeps.find(_.typeName == metadata.STORAGEDESC_TYPE_STRING).get 103 | 104 | tableEntity.entity.getTypeName should be (metadata.TABLE_TYPE_STRING) 105 | tableEntity.entity.getAttribute("name") should be ("tbl1") 106 | tableEntity.entity.getAttribute("owner") should be (SparkUtils.currUser()) 107 | tableEntity.entity.getAttribute("ownerType") should be ("USER") 108 | 109 | tableEntity.entity.getRelationshipAttribute("db") should be (dbEntity.asObjectId) 110 | tableEntity.entity.getRelationshipAttribute("sd") should be (sdEntity.asObjectId) 111 | } 112 | } 113 | 114 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/utils/JdbcUtilsTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.hortonworks.spark.atlas.utils 19 | 20 | import org.scalatest.{FunSuite, Matchers} 21 | 22 | class JdbcUtilsTest extends FunSuite with Matchers { 23 | 24 | test("get database name from mysql url") { 25 | val dbName = JdbcUtils.getDatabaseName("jdbc:mysql://localhost:3306/testdb") 26 | dbName should be ("testdb") 27 | } 28 | 29 | test("get database name from mysql url with properties") { 30 | val dbName = JdbcUtils.getDatabaseName( 31 | "jdbc:mysql://localhost:3306/testdb?user=root&password=secret") 32 | dbName should be ("testdb") 33 | } 34 | 35 | test("get database name from mariadb url") { 36 | val dbName = JdbcUtils.getDatabaseName("jdbc:mariadb://127.0.0.1/testdb") 37 | dbName should be ("testdb") 38 | } 39 | 40 | test("get database name from db2 url") { 41 | val dbName = JdbcUtils.getDatabaseName("jdbc:db2://127.0.0.1:50000/testdb") 42 | dbName should be ("testdb") 43 | } 44 | 45 | test("get database name from derby url") { 46 | val dbName = JdbcUtils.getDatabaseName("jdbc:derby://localhost/testdb") 47 | dbName should be ("testdb") 48 | } 49 | 50 | test("get database name from derby url with properties") { 51 | val dbName = JdbcUtils.getDatabaseName("jdbc:derby://localhost/testdb;create=true") 52 | dbName should be ("testdb") 53 | } 54 | 55 | test("get database name from derby in memory format url with properties") { 56 | val dbName = JdbcUtils.getDatabaseName("jdbc:derby:memory:testdb;create=true") 57 | dbName should be ("testdb") 58 | } 59 | 60 | test("get database name from oracle url") { 61 | val dbName = JdbcUtils.getDatabaseName("jdbc:oracle:thin:root/secret@localhost:1521:testdb") 62 | dbName should be ("testdb") 63 | } 64 | 65 | test("get database name from postgres url") { 66 | val dbName = JdbcUtils.getDatabaseName("jdbc:postgresql://localhost:5432/testdb") 67 | dbName should be ("testdb") 68 | } 69 | 70 | test("get database name from sql server url") { 71 | val dbName = JdbcUtils.getDatabaseName( 72 | "jdbc:sqlserver://localhost:1433;databaseName=testdb;integratedSecurity=true;") 73 | dbName should be ("testdb") 74 | } 75 | 76 | test("get database name from sql server url with properties") { 77 | val dbName = JdbcUtils.getDatabaseName( 78 | "jdbc:sqlserver://localhost:1433;databaseName=testdb") 79 | dbName should be ("testdb") 80 | } 81 | 82 | test("get database name from teradata url") { 83 | val dbName = JdbcUtils.getDatabaseName( 84 | "jdbc:teradata://127.0.0.1/DATABASE=testdb") 85 | dbName should be ("testdb") 86 | } 87 | 88 | test("get database name from teradata url with properties") { 89 | val dbName = JdbcUtils.getDatabaseName( 90 | "jdbc:teradata://127.0.0.1/DATABASE=testdb/CHARSET=UTF8,COMPAT_DBS=true") 91 | dbName should be ("testdb") 92 | } 93 | 94 | test("unsupported database") { 95 | val dbName = JdbcUtils.getDatabaseName( 96 | "jdbc:sqlite:product.db") 97 | dbName should be ("") 98 | } 99 | 100 | } 101 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hortonworks/spark/atlas/utils/SparkUtilsSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hortonworks.spark.atlas.utils 19 | 20 | import org.apache.hadoop.hive.conf.HiveConf 21 | import org.apache.spark.sql.SparkSession 22 | import org.scalatest._ 23 | 24 | class SparkUtilsSuite extends FunSuite with Matchers with BeforeAndAfter { 25 | 26 | var sparkSession: SparkSession = _ 27 | 28 | after { 29 | if (sparkSession != null) { 30 | sparkSession.stop() 31 | SparkSession.clearActiveSession() 32 | SparkSession.clearDefaultSession() 33 | sparkSession = null 34 | } 35 | } 36 | 37 | test("get unique prefix when using in-memory catalog") { 38 | sparkSession = SparkSession.builder() 39 | .master("local") 40 | .getOrCreate() 41 | 42 | SparkUtils.getUniqueQualifiedPrefix() should be (sparkSession.sparkContext.applicationId + ".") 43 | } 44 | 45 | // TODO. Should have a better way to figure out unique name 46 | ignore("get unique prefix when using hive catalog") { 47 | sparkSession = SparkSession.builder() 48 | .master("local") 49 | .enableHiveSupport() 50 | .config("spark.ui.enabled", "false") 51 | .getOrCreate() 52 | 53 | val hiveConf = new HiveConf(sparkSession.sparkContext.hadoopConfiguration, classOf[HiveConf]) 54 | 55 | // if hive.metastore.uris is set, which means we're using metastore server. 56 | hiveConf.set("hive.metastore.uris", "thrift://localhost:10000") 57 | SparkUtils.getUniqueQualifiedPrefix(Some(hiveConf)) should be ("thrift://localhost:10000.") 58 | 59 | // if embedded mode is used 60 | hiveConf.unset("hive.metastore.uris") 61 | hiveConf.set("javax.jdo.option.ConnectionDriverName", "org.apache.derby.jdbc.EmbeddedDriver") 62 | SparkUtils.getUniqueQualifiedPrefix(Some(hiveConf)) should be ( 63 | sparkSession.sparkContext.applicationId + ".") 64 | 65 | // otherwise if local metastore backend is used 66 | hiveConf.set("javax.jdo.option.ConnectionDriverName", "com.mysql.jdbc.Driver") 67 | hiveConf.set("javax.jdo.option.ConnectionURL", 68 | "jdbc:mysql://localhost:3030/hive?createDatabaseIfNotExist=true") 69 | SparkUtils.getUniqueQualifiedPrefix(Some(hiveConf)) should be ( 70 | "jdbc:mysql://localhost:3030/hive?createDatabaseIfNotExist=true.") 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /spark-atlas-connector/src/test/scala/com/hotels/beeju/ThriftHiveMetaStoreTestUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.hotels.beeju 19 | 20 | class ThriftHiveMetaStoreTestUtil(dbName: String) 21 | extends ThriftHiveMetaStoreJUnitRule(dbName) { 22 | override def before(): Unit = { 23 | super.before() 24 | } 25 | 26 | override def after(): Unit = { 27 | super.after() 28 | } 29 | } 30 | --------------------------------------------------------------------------------