├── src ├── test │ ├── resources │ │ ├── credentials.json │ │ └── logback-test.xml │ └── java │ │ └── io │ │ └── cdap │ │ └── plugin │ │ └── cdc │ │ ├── integration │ │ ├── StructuredRecordRepresentation.java │ │ └── CDCPluginIntegrationTestBase.java │ │ ├── common │ │ └── BigtableOperations.java │ │ ├── performance │ │ ├── CDCPluginPerfTestBase.java │ │ └── CDCPipelinePerfTest.java │ │ ├── sink │ │ └── CDCBigTableConfigUnitTest.java │ │ └── source │ │ └── oracle │ │ └── GoldenGateKafkaConfigUnitTest.java └── main │ └── java │ └── io │ └── cdap │ └── plugin │ └── cdc │ ├── sink │ ├── TypeConversionException.java │ ├── CDCHBaseConfig.java │ ├── CDCBigTableConfig.java │ ├── CDCHBase.java │ ├── CDCBigTable.java │ ├── CDCKuduConfig.java │ └── CDCTableUtil.java │ ├── common │ ├── OperationType.java │ ├── DriverCleanup.java │ ├── CDCReferencePluginConfig.java │ ├── SparkConfigs.java │ ├── JDBCDriverShim.java │ ├── Schemas.java │ └── DBUtils.java │ ├── source │ ├── oracle │ │ ├── BinaryMessages.java │ │ ├── GoldenGateKafkaConfig.java │ │ ├── Normalizer.java │ │ └── GoldenGateKafka.java │ └── sqlserver │ │ ├── SQLServerConnectionFactory.java │ │ ├── ResultSetToDDLRecord.java │ │ ├── TableInformation.java │ │ ├── PluginConnectionFactory.java │ │ ├── ResultSetToDMLRecord.java │ │ ├── CTSQLServerConfig.java │ │ └── CTSQLServer.java │ └── DMLFlattener.java ├── icons └── CDCBigTable-sparksink.png ├── docker-compose └── cdc-env │ ├── GoldenGate │ ├── dirprm │ │ └── ext1.prm │ └── Dockerfile │ ├── Oracle │ ├── Dockerfile │ ├── init.sh │ ├── createDB.sh │ └── dbca.rsp.tmpl │ ├── GoldenGate-Bigdata │ ├── dirprm │ │ ├── dependencies │ │ │ └── kafka │ │ │ │ ├── lz4-java-1.4.1.jar │ │ │ │ ├── slf4j-api-1.7.25.jar │ │ │ │ ├── kafka-clients-2.1.1.jar │ │ │ │ └── snappy-java-1.1.7.2.jar │ │ ├── rconf.prm │ │ ├── kafka-producer.properties │ │ └── rconf.properties │ └── Dockerfile │ └── docker-compose.yml ├── widgets ├── DMLFlattener-transform.json ├── CDCHBase-sparksink.json ├── CDCBigTable-sparksink.json ├── CDCDatabase-streamingsource.json ├── CDCKudu-sparksink.json └── CTSQLServer-streamingsource.json ├── .gitignore ├── docs ├── CDCKudu-sparksink.md ├── CDCDatabase-streamingsource.md ├── CDCHBase-sparksink.md ├── CTSQLServer-streamingsource.md └── CDCBigTable-sparksink.md ├── suppressions.xml └── README.md /src/test/resources/credentials.json: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /icons/CDCBigTable-sparksink.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-integrations/change-data-capture/HEAD/icons/CDCBigTable-sparksink.png -------------------------------------------------------------------------------- /docker-compose/cdc-env/GoldenGate/dirprm/ext1.prm: -------------------------------------------------------------------------------- 1 | EXTRACT ext1 2 | USERIDALIAS oggadmin 3 | EXTTRAIL /u01/app/ogg/dirdat/in 4 | TABLE trans_user.*; -------------------------------------------------------------------------------- /docker-compose/cdc-env/Oracle/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM oracle/database:18.3.0-se2 2 | 3 | COPY "dbca.rsp.tmpl" "createDB.sh" $ORACLE_BASE/ 4 | COPY "init.sh" $ORACLE_BASE/scripts/setup/ -------------------------------------------------------------------------------- 
/docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/dependencies/kafka/lz4-java-1.4.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-integrations/change-data-capture/HEAD/docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/dependencies/kafka/lz4-java-1.4.1.jar -------------------------------------------------------------------------------- /docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/dependencies/kafka/slf4j-api-1.7.25.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-integrations/change-data-capture/HEAD/docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/dependencies/kafka/slf4j-api-1.7.25.jar -------------------------------------------------------------------------------- /docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/dependencies/kafka/kafka-clients-2.1.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-integrations/change-data-capture/HEAD/docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/dependencies/kafka/kafka-clients-2.1.1.jar -------------------------------------------------------------------------------- /docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/dependencies/kafka/snappy-java-1.1.7.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-integrations/change-data-capture/HEAD/docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/dependencies/kafka/snappy-java-1.1.7.2.jar -------------------------------------------------------------------------------- /docker-compose/cdc-env/GoldenGate-Bigdata/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM oracle/goldengate-standard:12.3.0.1.2 2 | 3 | COPY ./dirprm /u01/app/ogg/dirprm 4 | RUN chmod -R 777 /u01/app/ogg/dirprm \ 5 | && mkdir /u01/app/ogg/dirdat && chown -R oracle:oinstall /u01/app/ogg/dirdat -------------------------------------------------------------------------------- /docker-compose/cdc-env/GoldenGate/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM oracle/goldengate-standard:18.1.0.0.0 2 | 3 | COPY ./dirprm /u01/app/ogg/dirprm 4 | RUN chmod -R 777 /u01/app/ogg/dirprm \ 5 | && mkdir /u01/app/ogg/dirdat && chown -R oracle:oinstall /u01/app/ogg/dirdat 6 | -------------------------------------------------------------------------------- /docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/rconf.prm: -------------------------------------------------------------------------------- 1 | REPLICAT rconf 2 | TARGETDB LIBFILE libggjava.so SET property=dirprm/rconf.properties 3 | discardfile ./dirrpt/kafkax.dsc, purge 4 | GETTRUNCATES 5 | GETUPDATEBEFORES 6 | ReportCount Every 1000 Records, Rate 7 | MAP *.*, TARGET *.*; -------------------------------------------------------------------------------- /widgets/DMLFlattener-transform.json: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "spec-version": "1.5" 4 | }, 5 | "display-name": "DML Flattener", 6 | "configuration-groups": [ 7 | { 8 | "label": "Basic", 9 | "properties": [ ] 10 | } 11 | ], 12 | "outputs": [ 13 | { 14 | "name": "schema", 15 | "widget-type": "schema", 16 | "widget-attributes": { 17 | "schema-default-type": "string" 18 | } 19 | } 20 | ] 21 | } 22 | 23 | 
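Aside on the GoldenGate-to-Kafka leg of this environment: the rconf replicat above hands change records to the GoldenGate Big Data Kafka handler, which publishes them as raw Avro-wrapped bytes. Below is a minimal, hypothetical Java sketch (not part of this repository) for smoke-testing that messages are actually arriving. It assumes the docker-compose environment is running, that the broker is reachable at localhost:9092 and the topic is oggtopic (both taken from the rconf.properties, kafka-producer.properties, and docker-compose.yml files elsewhere in this repo), and that kafka-clients is on the classpath; the consumer group id cdc-smoke-test is made up for illustration.

```java
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.ByteArrayDeserializer;

import java.time.Duration;
import java.util.Collections;
import java.util.Properties;

public class OggTopicSmokeTest {
  public static void main(String[] args) {
    Properties props = new Properties();
    // Broker address and topic match the docker-compose/cdc-env configuration.
    props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
    props.put(ConsumerConfig.GROUP_ID_CONFIG, "cdc-smoke-test");
    props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
    // The GoldenGate Kafka handler writes keys and values as byte arrays, so read them as-is.
    props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName());
    props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName());

    try (KafkaConsumer<byte[], byte[]> consumer = new KafkaConsumer<>(props)) {
      consumer.subscribe(Collections.singletonList("oggtopic"));
      ConsumerRecords<byte[], byte[]> records = consumer.poll(Duration.ofSeconds(10));
      // Only report message sizes; the payloads are Avro-wrapped binary, not plain text.
      for (ConsumerRecord<byte[], byte[]> record : records) {
        System.out.printf("offset=%d, key=%d bytes, value=%d bytes%n",
            record.offset(),
            record.key() == null ? 0 : record.key().length,
            record.value().length);
      }
    }
  }
}
```

Seeing non-empty value sizes here indicates the replicat and Kafka handler are wired up before the CDCDatabase source is pointed at the topic.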
-------------------------------------------------------------------------------- /widgets/CDCHBase-sparksink.json: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "spec-version": "1.5" 4 | }, 5 | "configuration-groups": [ 6 | { 7 | "label": "HBase Sink Configuration", 8 | "properties": [ 9 | { 10 | "widget-type" : "textbox", 11 | "label" : "Reference Name", 12 | "name" : "referenceName", 13 | "description" : "Reference specifies the name to be used to track this external source" 14 | } 15 | ] 16 | } 17 | ] 18 | } -------------------------------------------------------------------------------- /docker-compose/cdc-env/Oracle/init.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | sqlplus / as sysdba << EOF 4 | alter system set enable_goldengate_replication=TRUE; 5 | alter database add supplemental log data; 6 | alter database force logging; 7 | alter system switch logfile; 8 | 9 | shutdown immediate; 10 | Startup mount; 11 | Alter database archivelog; 12 | alter database open; 13 | 14 | CREATE USER gg_extract IDENTIFIED BY gg_extract; 15 | GRANT CREATE SESSION, CONNECT, RESOURCE, ALTER ANY TABLE, ALTER SYSTEM, DBA, SELECT ANY TRANSACTION TO gg_extract; 16 | CREATE USER trans_user IDENTIFIED BY trans_user; 17 | GRANT CREATE SESSION, CONNECT, RESOURCE TO trans_user; 18 | ALTER USER trans_user QUOTA UNLIMITED ON USERS; 19 | 20 | exit; 21 | EOF -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | .*.swp 3 | .beamer 4 | # Package Files # 5 | *.jar 6 | *.war 7 | *.ear 8 | 9 | # Intellij Files & Dir # 10 | *.iml 11 | *.ipr 12 | *.iws 13 | atlassian-ide-plugin.xml 14 | out/ 15 | .DS_Store 16 | ./lib/ 17 | .idea 18 | 19 | # Gradle Files & Dir # 20 | build/ 21 | .gradle/ 22 | .stickyStorage 23 | .build/ 24 | target/ 25 | 26 | # Node log 27 | npm-*.log 28 | logs/ 29 | .nux_enabled 30 | .nux_dashboard 31 | 32 | # Singlenode and test data files. 33 | /templates/ 34 | /artifacts/ 35 | /data/ 36 | /data-fabric-tests/data/ 37 | 38 | # gateway test leftover 39 | /gateway/data/ 40 | /watchdog/data/ 41 | 42 | # Checkstyle report 43 | examples/checkstyle_report.xml 44 | 45 | # Examples Stuff 46 | dependency-reduced-pom.xml 47 | 48 | # Hive db Stuff 49 | derby.log 50 | 51 | # generated config files 52 | /cdap-web-app/conf/generated 53 | /cdap-client-tests/conf/generated 54 | 55 | # generated by docs build 56 | *.pyc 57 | !/docker-compose/cdc-env/**/*.jar 58 | -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/sink/TypeConversionException.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 
15 | */ 16 | 17 | package io.cdap.plugin.cdc.sink; 18 | 19 | /** 20 | * Exception thrown when there is issue with type conversion from CDAP pipeline schema to Kudu. 21 | */ 22 | public class TypeConversionException extends Exception { 23 | public TypeConversionException(String s) { 24 | super(s); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/sink/CDCHBaseConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package io.cdap.plugin.cdc.sink; 18 | 19 | import io.cdap.cdap.api.plugin.PluginConfig; 20 | import io.cdap.plugin.cdc.common.CDCReferencePluginConfig; 21 | 22 | /** 23 | * Defines the {@link PluginConfig} for the {@link CDCHBase}. 24 | */ 25 | public class CDCHBaseConfig extends CDCReferencePluginConfig { 26 | public CDCHBaseConfig(String referenceName) { 27 | super(referenceName); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/kafka-producer.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2019 Cask Data, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | # use this file except in compliance with the License. You may obtain a copy of 6 | # the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | # License for the specific language governing permissions and limitations under 14 | # the License. 15 | # 16 | 17 | bootstrap.servers=localhost:9092 18 | acks = 1 19 | compression.type = gzip 20 | reconnect.backoff.ms = 1000 21 | 22 | value.serializer = org.apache.kafka.common.serialization.ByteArraySerializer 23 | key.serializer = org.apache.kafka.common.serialization.ByteArraySerializer 24 | # 100KB per partition 25 | batch.size = 102400 26 | linger.ms = 0 27 | max.request.size = 1048576 28 | send.buffer.bytes = 131072 -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/common/OperationType.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. 
You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package io.cdap.plugin.cdc.common; 18 | 19 | /** 20 | * Represents change operation type. 21 | */ 22 | public enum OperationType { 23 | INSERT, UPDATE, DELETE; 24 | 25 | public static OperationType fromShortName(String name) { 26 | switch (name.toUpperCase()) { 27 | case "I": 28 | return INSERT; 29 | case "U": 30 | return UPDATE; 31 | case "D": 32 | return DELETE; 33 | default: 34 | throw new IllegalArgumentException(String.format("Unknown change operation '%s'", name)); 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /docs/CDCKudu-sparksink.md: -------------------------------------------------------------------------------- 1 | # CDC Kudu Sink 2 | 3 | Description 4 | ----------- 5 | This plugin takes input from a CDC source and writes the changes to Kudu. 6 | 7 | All CDC sink plugins are normally used in conjunction with CDC source plugins. 8 | CDC sink expects messages in CDC format as an input. 9 | 10 | Properties 11 | ---------- 12 | **Reference Name**: Name used to uniquely identify this source for lineage, annotating metadata, etc. 13 | 14 | **Master Addresses**: Comma separated list of hostname:port of Apache Kudu Masters. 15 | 16 | **No of buckets**: Specifies the number of buckets to split the table into. 17 | 18 | **Seed**: Seed to randomize the mapping of rows to hash buckets. 19 | 20 | **Compression Algorithm**: Compression algorithm to be applied on the columns. 21 | 22 | **Encoding Type**: Specifies the encoding to be applied on the schema. 23 | 24 | **User Operations Timeout**: Timeout for Kudu operations in milliseconds. 25 | 26 | **Administration Operations Timeout**: Administration operation timeout in milliseconds. 27 | 28 | **Replicas**: Specifies the number of replicas for the Kudu tables. 29 | 30 | **Rows Buffer**: Number of rows that are buffered before flushing to the tablet server. 31 | 32 | **Boss Threads**: Specifies the number of boss threads to be used by the client. -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/source/oracle/BinaryMessages.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License.
15 | */ 16 | 17 | package io.cdap.plugin.cdc.source.oracle; 18 | 19 | import io.cdap.cdap.api.common.Bytes; 20 | 21 | import java.nio.ByteBuffer; 22 | import javax.annotation.Nonnull; 23 | 24 | /** 25 | * Utility methods for dealing with binary messages. 26 | */ 27 | public class BinaryMessages { 28 | private BinaryMessages() { 29 | // utility class 30 | } 31 | 32 | @Nonnull 33 | static byte[] getBytesFromBinaryMessage(Object message) { 34 | if (message instanceof ByteBuffer) { 35 | ByteBuffer bb = (ByteBuffer) message; 36 | return Bytes.toBytes(bb); 37 | } else { 38 | return (byte[]) message; 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/rconf.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2019 Cask Data, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | # use this file except in compliance with the License. You may obtain a copy of 6 | # the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | # License for the specific language governing permissions and limitations under 14 | # the License. 15 | # 16 | 17 | gg.handlerlist = kafkahandler 18 | 19 | # Kafka handler properties 20 | gg.handler.kafkahandler.type = kafka 21 | gg.handler.kafkahandler.KafkaProducerConfigFile = kafka-producer.properties 22 | gg.handler.kafkahandler.topicMappingTemplate=oggtopic 23 | gg.handler.kafkahandler.keyMappingTemplate=${position} 24 | gg.handler.kafkahandler.format = avro_op 25 | gg.handler.kafkahandler.schemaTopicName=oggtopic 26 | gg.handler.kafkahandler.format.wrapMessageInGenericAvroMessage=true 27 | gg.handler.kafkahandler.mode = op 28 | gg.handler.kafkahandler.BlockingSend = true 29 | 30 | # Logging settings 31 | gg.log=log4j 32 | gg.log.level=INFO 33 | gg.report.time=30sec 34 | 35 | #Set the classpath here 36 | gg.classpath=dirprm/dependencies/kafka/* -------------------------------------------------------------------------------- /widgets/CDCBigTable-sparksink.json: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "spec-version": "1.5" 4 | }, 5 | "display-name": "CDC Google Cloud Bigtable Sink", 6 | "configuration-groups": [ 7 | { 8 | "label": "Cloud Bigtable Sink Configuration", 9 | "properties": [ 10 | { 11 | "widget-type": "textbox", 12 | "label": "Reference Name", 13 | "name": "referenceName", 14 | "description": "Reference specifies the name to be used to track this external source" 15 | }, 16 | { 17 | "widget-type": "textbox", 18 | "label": "Instance Id", 19 | "name": "instance", 20 | "description": "The Instance Id the Cloud Bigtable is in." 
21 | }, 22 | { 23 | "widget-type": "textbox", 24 | "label": "Project Id", 25 | "name": "project", 26 | "description": "The Project Id the Cloud Bigtable table is in.", 27 | "widget-attributes": { 28 | "default": "auto-detect" 29 | } 30 | }, 31 | { 32 | "widget-type": "textbox", 33 | "label": "Service Account File Path", 34 | "name": "serviceFilePath", 35 | "description": "Path to service account file (local to host running on).", 36 | "widget-attributes": { 37 | "default": "auto-detect" 38 | } 39 | } 40 | ] 41 | } 42 | ] 43 | } 44 | 45 | -------------------------------------------------------------------------------- /src/test/java/io/cdap/plugin/cdc/integration/StructuredRecordRepresentation.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package io.cdap.plugin.cdc.integration; 18 | 19 | import io.cdap.cdap.api.data.format.StructuredRecord; 20 | import io.cdap.cdap.format.StructuredRecordStringConverter; 21 | import org.assertj.core.presentation.StandardRepresentation; 22 | 23 | import java.io.IOException; 24 | 25 | public class StructuredRecordRepresentation extends StandardRepresentation { 26 | @Override 27 | public String toStringOf(Object object) { 28 | try { 29 | if (object instanceof StructuredRecord) { 30 | return StructuredRecordStringConverter.toJsonString((StructuredRecord) object); 31 | } 32 | return super.toStringOf(object); 33 | } catch (IOException e) { 34 | throw new IllegalArgumentException(e); 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/common/DriverCleanup.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 
15 | */ 16 | package io.cdap.plugin.cdc.common; 17 | 18 | import com.google.common.base.Throwables; 19 | import io.cdap.cdap.etl.api.Destroyable; 20 | 21 | import java.sql.DriverManager; 22 | import java.sql.SQLException; 23 | import javax.annotation.Nullable; 24 | 25 | /** 26 | * class to de-register driver 27 | */ 28 | public class DriverCleanup implements Destroyable { 29 | private final JDBCDriverShim driverShim; 30 | 31 | DriverCleanup(@Nullable JDBCDriverShim driverShim) { 32 | this.driverShim = driverShim; 33 | } 34 | 35 | public void destroy() { 36 | if (driverShim != null) { 37 | try { 38 | DriverManager.deregisterDriver(driverShim); 39 | } catch (SQLException e) { 40 | throw Throwables.propagate(e); 41 | } 42 | } 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /widgets/CDCDatabase-streamingsource.json: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "spec-version": "1.5" 4 | }, 5 | "configuration-groups": [ 6 | { 7 | "label": "Kafka Configuration", 8 | "properties": [ 9 | { 10 | "widget-type": "textbox", 11 | "label": "Reference Name", 12 | "name": "referenceName", 13 | "description": "Reference specifies the name to be used to track this external source" 14 | }, 15 | { 16 | "widget-type": "textbox", 17 | "label": "Kafka Broker", 18 | "name": "broker" 19 | }, 20 | { 21 | "widget-type": "textbox", 22 | "label": "Kafka Topic", 23 | "name": "topic" 24 | }, 25 | { 26 | "widget-type": "textbox", 27 | "label": "Default Initial Offset", 28 | "name": "defaultInitialOffset" 29 | }, 30 | { 31 | "widget-type": "textbox", 32 | "label": "Max Rate Per Partition", 33 | "name": "maxRatePerPartition", 34 | "widget-attributes": { 35 | "default": "1000" 36 | } 37 | } 38 | ] 39 | } 40 | ], 41 | "outputs": [ 42 | { 43 | "widget-type": "non-editable-schema-editor", 44 | "schema": { 45 | "name": "CDCRecord", 46 | "type": "record", 47 | "fields": [ 48 | { 49 | "name": "cdcMessage", 50 | "type": "bytes" 51 | } 52 | ] 53 | } 54 | } 55 | ] 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/common/CDCReferencePluginConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package io.cdap.plugin.cdc.common; 18 | 19 | import io.cdap.cdap.etl.api.validation.InvalidConfigPropertyException; 20 | import io.cdap.plugin.common.Constants; 21 | import io.cdap.plugin.common.IdUtils; 22 | import io.cdap.plugin.common.ReferencePluginConfig; 23 | 24 | /** 25 | * {@link ReferencePluginConfig} with reference name validation. 
26 | */ 27 | public class CDCReferencePluginConfig extends ReferencePluginConfig { 28 | public CDCReferencePluginConfig(String referenceName) { 29 | super(referenceName); 30 | } 31 | 32 | public void validate() { 33 | if (!containsMacro(Constants.Reference.REFERENCE_NAME)) { 34 | try { 35 | IdUtils.validateId(referenceName); 36 | } catch (IllegalArgumentException e) { 37 | throw new InvalidConfigPropertyException(e.getMessage(), Constants.Reference.REFERENCE_NAME); 38 | } 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /docker-compose/cdc-env/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | mssql: 4 | image: mcr.microsoft.com/mssql/server:2017-latest 5 | ports: 6 | - 1433:1433 7 | environment: 8 | - SA_PASSWORD=123Qwe123 9 | - ACCEPT_EULA=Y 10 | 11 | zookeeper: 12 | image: zookeeper:3.4.13 13 | network_mode: host 14 | ports: 15 | - "2181:2181" 16 | environment: 17 | - ZOOKEEPER_TICK_TIME=2000 18 | 19 | kafka: 20 | image: wurstmeister/kafka:2.12-2.2.0 21 | network_mode: host 22 | depends_on: 23 | - zookeeper 24 | environment: 25 | - JVM_OPTS=-Xmx2g -XX:+UseG1GC 26 | - KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS=0 27 | - KAFKA_UNCLEAN_LEADER_ELECTION_ENABLE=true 28 | - KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://localhost:9092 29 | - KAFKA_LISTENERS=PLAINTEXT://0.0.0.0:9092 30 | - KAFKA_ZOOKEEPER_CONNECT=localhost:2181 31 | - KAFKA_CREATE_TOPICS=oggtopic:1:1 32 | 33 | oracledb: 34 | build: Oracle 35 | ports: 36 | - 1521:1521 37 | - 8080:8080 38 | - 5500:5500 39 | environment: 40 | - ORACLE_SID=XE 41 | - ORACLE_PWD=123Qwe123 42 | volumes: 43 | - "oracleData:/opt/oracle/oradata" 44 | 45 | goldengate_oracle: 46 | build: GoldenGate 47 | cap_add: 48 | - SYS_RESOURCE 49 | volumes: 50 | - "oracleData:/opt/oracle/oradata" 51 | - "goldengateDirdat:/u01/app/ogg/dirdat" 52 | 53 | goldengate_bigdata: 54 | build: GoldenGate-Bigdata 55 | network_mode: host 56 | volumes: 57 | - "goldengateDirdat:/u01/app/ogg/dirdat" 58 | 59 | 60 | volumes: 61 | oracleData: 62 | goldengateDirdat: 63 | -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/common/SparkConfigs.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package io.cdap.plugin.cdc.common; 18 | 19 | import org.apache.spark.api.java.JavaRDD; 20 | 21 | import java.util.HashMap; 22 | import java.util.Iterator; 23 | import java.util.Map; 24 | 25 | /** 26 | * Utility methods for dealing with Spark configuration and data. 
27 | */ 28 | public class SparkConfigs { 29 | private SparkConfigs() { 30 | // utility class 31 | } 32 | 33 | /** 34 | * Get the hadoop configurations and passed it as a Map to the closure 35 | * 36 | * @param javaRDD Spark RDD object 37 | * @return configuration Map 38 | */ 39 | public static Map getHadoopConfigs(JavaRDD javaRDD) { 40 | Iterator> iterator = javaRDD.context().hadoopConfiguration().iterator(); 41 | Map configs = new HashMap<>(); 42 | while (iterator.hasNext()) { 43 | Map.Entry next = iterator.next(); 44 | configs.put(next.getKey(), next.getValue()); 45 | } 46 | return configs; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | %d{ISO8601} - %-5p [%t:%C{1}@%L] - %m%n 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/source/sqlserver/SQLServerConnectionFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package io.cdap.plugin.cdc.source.sqlserver; 18 | 19 | import com.microsoft.sqlserver.jdbc.SQLServerDriver; 20 | import org.apache.spark.rdd.JdbcRDD; 21 | 22 | import java.sql.Connection; 23 | import java.sql.DriverManager; 24 | 25 | /** 26 | * A class which can provide a {@link Connection} using {@link SQLServerDriver} which is 27 | * serializable. 28 | * Note: This class does not do any connection management. Its the responsibility of the client 29 | * to manage/close the connection. 
30 | */ 31 | class SQLServerConnectionFactory implements JdbcRDD.ConnectionFactory { 32 | private final String connectionUrl; 33 | private final String userName; 34 | private final String password; 35 | 36 | SQLServerConnectionFactory(String connectionUrl, String userName, String password) { 37 | this.connectionUrl = connectionUrl; 38 | this.userName = userName; 39 | this.password = password; 40 | } 41 | 42 | @Override 43 | public Connection getConnection() throws Exception { 44 | Class.forName(SQLServerDriver.class.getName()); 45 | return DriverManager.getConnection(connectionUrl, userName, password); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /suppressions.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 18 | 19 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/source/sqlserver/ResultSetToDDLRecord.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package io.cdap.plugin.cdc.source.sqlserver; 18 | 19 | import com.google.common.base.Joiner; 20 | import io.cdap.cdap.api.data.format.StructuredRecord; 21 | import io.cdap.cdap.api.data.schema.Schema; 22 | import io.cdap.plugin.cdc.common.DBUtils; 23 | import io.cdap.plugin.cdc.common.Schemas; 24 | import org.apache.spark.api.java.function.Function; 25 | 26 | import java.sql.ResultSet; 27 | import java.sql.SQLException; 28 | 29 | /** 30 | * A serializable class to allow invoking {@link scala.Function1} from Java. The function converts {@link ResultSet} 31 | * to {@link StructuredRecord} for DDL i.e. schema changes 32 | */ 33 | public class ResultSetToDDLRecord implements Function { 34 | 35 | private final String schemaName; 36 | private final String tableName; 37 | 38 | ResultSetToDDLRecord(String schemaName, String tableName) { 39 | this.schemaName = schemaName; 40 | this.tableName = tableName; 41 | } 42 | 43 | @Override 44 | public StructuredRecord call(ResultSet row) throws SQLException { 45 | Schema tableSchema = Schema.recordOf(Schemas.SCHEMA_RECORD, DBUtils.getSchemaFields(row)); 46 | return StructuredRecord.builder(Schemas.DDL_SCHEMA) 47 | .set(Schemas.TABLE_FIELD, Joiner.on(".").join(schemaName, tableName)) 48 | .set(Schemas.SCHEMA_FIELD, tableSchema.toString()) 49 | .build(); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/test/java/io/cdap/plugin/cdc/common/BigtableOperations.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package io.cdap.plugin.cdc.common; 18 | 19 | import com.google.cloud.bigtable.hbase.BigtableConfiguration; 20 | import com.google.cloud.bigtable.hbase.BigtableOptionsFactory; 21 | import org.apache.hadoop.conf.Configuration; 22 | import org.apache.hadoop.hbase.TableName; 23 | import org.apache.hadoop.hbase.client.Connection; 24 | 25 | import java.io.IOException; 26 | import javax.annotation.Nullable; 27 | 28 | /** 29 | * Utility methods for common Bigtable operations. 30 | */ 31 | public class BigtableOperations { 32 | private BigtableOperations() { 33 | // utility class 34 | } 35 | 36 | public static Connection connect(String projectId, String instanceId, @Nullable String serviceAccountFilepath) { 37 | Configuration configuration = BigtableConfiguration.configure(projectId, instanceId); 38 | if (serviceAccountFilepath != null) { 39 | configuration.set(BigtableOptionsFactory.BIGTABLE_SERVICE_ACCOUNT_JSON_KEYFILE_LOCATION_KEY, 40 | serviceAccountFilepath); 41 | } 42 | return BigtableConfiguration.connect(configuration); 43 | } 44 | 45 | public static void dropTableIfExists(Connection connection, String dbTableName) throws IOException { 46 | TableName tableName = TableName.valueOf(dbTableName); 47 | if (connection.getAdmin().tableExists(tableName)) { 48 | connection.getAdmin().disableTable(tableName); 49 | connection.getAdmin().deleteTable(tableName); 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /docs/CDCDatabase-streamingsource.md: -------------------------------------------------------------------------------- 1 | # CDC Golden Gate Kafka Streaming Source 2 | 3 | Description 4 | ----------- 5 | This plugin reads Change Data Capture (CDC) events from a Golden Gate Kafka topic. 6 | 7 | All CDC source plugins are normally used in conjunction with CDC sink plugins. 8 | CDC source produces messages in CDC format. 9 | 10 | Useful links: 11 | * [Goldengate site](https://www.oracle.com/middleware/technologies/goldengate.html) 12 | * [Installing Oracle GoldenGate](https://docs.oracle.com/goldengate/1212/gg-winux/GIORA/install.htm#GIORA162). 13 | * [Using Oracle GoldenGate for Oracle Database](https://www.oracle.com/pls/topic/lookup?ctx=en/middleware/goldengate/core/18.1&id=GGODB-GUID-110CD372-2F7E-4262-B8D2-DC0A80422806). 14 | * [Using Oracle GoldenGate for BigData](https://docs.oracle.com/goldengate/bd123210/gg-bd/GADBD/introduction-oracle-goldengate-big-data.htm#GADBD114). 15 | 16 | Properties 17 | ---------- 18 | **Reference Name**: Name used to uniquely identify this source for lineage, annotating metadata, etc. 19 | 20 | **Kafka Broker**: Kafka broker specified in host:port form. For example, example.com:9092. 21 | 22 | **Kafka Topic**: Name of the topic to which Golden Gate publishes the DDL and DML changes. 23 | 24 | **Default Initial Offset**: The default initial offset to read from. 
25 | An offset of -2 means the smallest offset (the beginning of the topic). 26 | An offset of -1 means the latest offset (the end of the topic). 27 | Defaults to -1. Offsets are inclusive. 28 | If an offset of 5 is used, the message at offset 5 will be read. 29 | 30 | **Max Rate Per Partition**: Max number of records to read per second per partition. 0 means there is no limit. 31 | Defaults to 1000. 32 | 33 | Required GoldenGate Settings 34 | ---------- 35 | * GoldenGate should push data using Kafka handler 36 | * Generic Wrapper Functionality should be enabled ("gg.handler.kafkahandler.format.wrapMessageInGenericAvroMessage"). 37 | * Schema topic ("gg.handler.kafkahandler.schemaTopicName") should be equal to DML changes topic. 38 | * Handler should send events in "OP" mode ("gg.handler.kafkahandler.mode"). 39 | * Handler should send events in "avro_op" format ("gg.handler.kafkahandler.format"). -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/source/sqlserver/TableInformation.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 
15 | */ 16 | 17 | package io.cdap.plugin.cdc.source.sqlserver; 18 | 19 | import com.google.common.collect.ImmutableSet; 20 | import com.google.common.collect.Sets; 21 | import scala.Serializable; 22 | 23 | import java.util.Set; 24 | 25 | /** 26 | * Represents SQL Server Table information 27 | */ 28 | class TableInformation implements Serializable { 29 | private final String schemaName; 30 | private final String name; 31 | private final Set columnSchema; 32 | private final Set primaryKeys; 33 | private final Set valueColumns; 34 | 35 | TableInformation(String schemaName, String name, Set columnSchema, Set primaryKeys) { 36 | this.schemaName = schemaName; 37 | this.name = name; 38 | this.columnSchema = columnSchema; 39 | this.primaryKeys = primaryKeys; 40 | this.valueColumns = ImmutableSet.copyOf(Sets.difference(columnSchema, primaryKeys)); 41 | } 42 | 43 | @Override 44 | public String toString() { 45 | return "TableInformation{" + 46 | "schemaName='" + schemaName + '\'' + 47 | ", name='" + name + '\'' + 48 | ", columnSchema=" + columnSchema + 49 | ", primaryKeys=" + primaryKeys + 50 | ", valueColumns=" + valueColumns + 51 | '}'; 52 | } 53 | 54 | String getSchemaName() { 55 | return schemaName; 56 | } 57 | 58 | String getName() { 59 | return name; 60 | } 61 | 62 | Set getColumnSchema() { 63 | return columnSchema; 64 | } 65 | 66 | Set getPrimaryKeys() { 67 | return primaryKeys; 68 | } 69 | 70 | Set getValueColumnNames() { 71 | return valueColumns; 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/source/sqlserver/PluginConnectionFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | 18 | package io.cdap.plugin.cdc.source.sqlserver; 19 | 20 | import io.cdap.cdap.api.plugin.PluginContext; 21 | import io.cdap.plugin.cdc.common.DBUtils; 22 | import org.apache.spark.rdd.JdbcRDD; 23 | 24 | import java.io.Serializable; 25 | import java.sql.Connection; 26 | import java.sql.Driver; 27 | import java.sql.DriverManager; 28 | import java.util.Map; 29 | 30 | /** 31 | * Serializable jdbc connection factory that uses CDAP plugin context to instantiate the jdbc driver. 
32 | */ 33 | public class PluginConnectionFactory implements JdbcRDD.ConnectionFactory, Serializable { 34 | private static final long serialVersionUID = -7897960584858589314L; 35 | private final String stageName; 36 | private final String connectionString; 37 | private final PluginContext pluginContext; 38 | private transient String user; 39 | private transient String password; 40 | private transient boolean initialized; 41 | 42 | PluginConnectionFactory(PluginContext pluginContext, String stageName, String connectionString) { 43 | this.stageName = stageName; 44 | this.connectionString = connectionString; 45 | this.pluginContext = pluginContext; 46 | } 47 | 48 | @Override 49 | public Connection getConnection() throws Exception { 50 | if (!initialized) { 51 | Class driverClass = pluginContext.loadPluginClass(stageName + ":" + CTSQLServer.JDBC_PLUGIN_ID); 52 | DBUtils.ensureJDBCDriverIsAvailable(driverClass, connectionString); 53 | Map stageProperties = pluginContext.getPluginProperties(stageName).getProperties(); 54 | user = stageProperties.get(CTSQLServerConfig.USERNAME); 55 | password = stageProperties.get(CTSQLServerConfig.PASSWORD); 56 | initialized = true; 57 | } 58 | return DriverManager.getConnection(connectionString, user, password); 59 | } 60 | 61 | private void initialize() { 62 | 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/test/java/io/cdap/plugin/cdc/performance/CDCPluginPerfTestBase.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 
15 | */ 16 | 17 | package io.cdap.plugin.cdc.performance; 18 | 19 | import io.cdap.cdap.api.artifact.ArtifactScope; 20 | import io.cdap.cdap.api.artifact.ArtifactSummary; 21 | import io.cdap.cdap.common.UnauthenticatedException; 22 | import io.cdap.cdap.datastreams.DataStreamsSparkLauncher; 23 | import io.cdap.cdap.etl.proto.v2.DataStreamsConfig; 24 | import io.cdap.cdap.etl.proto.v2.ETLPlugin; 25 | import io.cdap.cdap.etl.proto.v2.ETLStage; 26 | import io.cdap.cdap.proto.artifact.AppRequest; 27 | import io.cdap.cdap.proto.id.ApplicationId; 28 | import io.cdap.cdap.proto.id.NamespaceId; 29 | import io.cdap.cdap.test.ApplicationManager; 30 | import io.cdap.cdap.test.IntegrationTestBase; 31 | import io.cdap.cdap.test.SparkManager; 32 | 33 | import java.io.IOException; 34 | 35 | public abstract class CDCPluginPerfTestBase extends IntegrationTestBase { 36 | protected SparkManager deployETL(ETLPlugin sourcePlugin, ETLPlugin sinkPlugin, String appName) throws Exception { 37 | ETLStage source = new ETLStage("source", sourcePlugin); 38 | ETLStage sink = new ETLStage("sink", sinkPlugin); 39 | DataStreamsConfig etlConfig = DataStreamsConfig.builder() 40 | .addStage(source) 41 | .addStage(sink) 42 | .addConnection(source.getName(), sink.getName()) 43 | .setBatchInterval("1s") 44 | .build(); 45 | 46 | AppRequest appRequest = getStreamingAppRequest(etlConfig); 47 | ApplicationId appId = NamespaceId.DEFAULT.app(appName); 48 | ApplicationManager applicationManager = deployApplication(appId, appRequest); 49 | return applicationManager.getSparkManager(DataStreamsSparkLauncher.NAME); 50 | } 51 | 52 | private AppRequest getStreamingAppRequest(DataStreamsConfig config) 53 | throws IOException, UnauthenticatedException { 54 | String version = getMetaClient().getVersion().getVersion(); 55 | return new AppRequest<>(new ArtifactSummary("cdap-data-streams", version, ArtifactScope.SYSTEM), config); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/common/JDBCDriverShim.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package io.cdap.plugin.cdc.common; 18 | 19 | import java.sql.Connection; 20 | import java.sql.Driver; 21 | import java.sql.DriverPropertyInfo; 22 | import java.sql.SQLException; 23 | import java.sql.SQLFeatureNotSupportedException; 24 | import java.util.Properties; 25 | import java.util.logging.Logger; 26 | 27 | /** 28 | * Shim for JDBC driver as a better alternative to mere Class.forName to load the JDBC Driver class. 29 | * 30 | * From http://www.kfu.com/~nsayer/Java/dyn-jdbc.html 31 | * One problem with using
{@code Class.forName()}
to find and load the JDBC Driver class is that it 32 | * presumes that your driver is in the classpath. This means either packaging the driver in your jar, or having to 33 | * stick the driver somewhere (probably unpacking it too), or modifying your classpath. 34 | * But why not use something like URLClassLoader and the overload of Class.forName() that lets you specify the 35 | * ClassLoader?" Because the DriverManager will refuse to use a driver not loaded by the system ClassLoader. 36 | * The workaround for this is to create a shim class that implements java.sql.Driver. 37 | * This shim class will do nothing but call the methods of an instance of a JDBC driver that we loaded dynamically. 38 | */ 39 | public class JDBCDriverShim implements Driver { 40 | 41 | private final Driver delegate; 42 | 43 | public JDBCDriverShim(Driver delegate) { 44 | this.delegate = delegate; 45 | } 46 | 47 | @Override 48 | public boolean acceptsURL(String url) throws SQLException { 49 | return delegate.acceptsURL(url); 50 | } 51 | 52 | @Override 53 | public Connection connect(String url, Properties info) throws SQLException { 54 | return delegate.connect(url, info); 55 | } 56 | 57 | @Override 58 | public int getMajorVersion() { 59 | return delegate.getMajorVersion(); 60 | } 61 | 62 | @Override 63 | public int getMinorVersion() { 64 | return delegate.getMinorVersion(); 65 | } 66 | 67 | @Override 68 | public DriverPropertyInfo[] getPropertyInfo(String url, Properties info) throws SQLException { 69 | return delegate.getPropertyInfo(url, info); 70 | } 71 | 72 | @Override 73 | public boolean jdbcCompliant() { 74 | return delegate.jdbcCompliant(); 75 | } 76 | 77 | @Override 78 | public Logger getParentLogger() throws SQLFeatureNotSupportedException { 79 | return delegate.getParentLogger(); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /docker-compose/cdc-env/Oracle/createDB.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # LICENSE UPL 1.0 3 | # 4 | # Copyright (c) 1982-2018 Oracle and/or its affiliates. All rights reserved. 5 | # 6 | # Since: November, 2016 7 | # Author: gerald.venzl@oracle.com 8 | # Description: Creates an Oracle Database based on following parameters: 9 | # $ORACLE_SID: The Oracle SID and CDB name 10 | # $ORACLE_PDB: The PDB name 11 | # $ORACLE_PWD: The Oracle password 12 | # 13 | # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 
14 | # 15 | 16 | set -e 17 | 18 | # Check whether ORACLE_SID is passed on 19 | export ORACLE_SID=${1:-ORCLCDB} 20 | 21 | # Check whether ORACLE_PDB is passed on 22 | export ORACLE_PDB=${2:-ORCLPDB1} 23 | 24 | # Auto generate ORACLE PWD if not passed on 25 | export ORACLE_PWD=${3:-"`openssl rand -base64 8`1"} 26 | echo "ORACLE PASSWORD FOR SYS, SYSTEM AND PDBADMIN: $ORACLE_PWD"; 27 | 28 | # Replace place holders in response file 29 | cp $ORACLE_BASE/$CONFIG_RSP $ORACLE_BASE/dbca.rsp 30 | sed -i -e "s|###ORACLE_SID###|$ORACLE_SID|g" $ORACLE_BASE/dbca.rsp 31 | sed -i -e "s|###ORACLE_PDB###|$ORACLE_PDB|g" $ORACLE_BASE/dbca.rsp 32 | sed -i -e "s|###ORACLE_PWD###|$ORACLE_PWD|g" $ORACLE_BASE/dbca.rsp 33 | sed -i -e "s|###ORACLE_CHARACTERSET###|$ORACLE_CHARACTERSET|g" $ORACLE_BASE/dbca.rsp 34 | 35 | # If there is greater than 8 CPUs default back to dbca memory calculations 36 | # dbca will automatically pick 40% of available memory for Oracle DB 37 | # The minimum of 2G is for small environments to guarantee that Oracle has enough memory to function 38 | # However, bigger environment can and should use more of the available memory 39 | # This is due to Github Issue #307 40 | if [ `nproc` -gt 8 ]; then 41 | sed -i -e "s|totalMemory=2048||g" $ORACLE_BASE/dbca.rsp 42 | fi; 43 | 44 | # Create network related config files (sqlnet.ora, tnsnames.ora, listener.ora) 45 | mkdir -p $ORACLE_HOME/network/admin 46 | echo "NAME.DIRECTORY_PATH= (TNSNAMES, EZCONNECT, HOSTNAME)" > $ORACLE_HOME/network/admin/sqlnet.ora 47 | 48 | # Listener.ora 49 | echo "LISTENER = 50 | (DESCRIPTION_LIST = 51 | (DESCRIPTION = 52 | (ADDRESS = (PROTOCOL = IPC)(KEY = EXTPROC1)) 53 | (ADDRESS = (PROTOCOL = TCP)(HOST = 0.0.0.0)(PORT = 1521)) 54 | ) 55 | ) 56 | 57 | DEDICATED_THROUGH_BROKER_LISTENER=ON 58 | DIAG_ADR_ENABLED = off 59 | " > $ORACLE_HOME/network/admin/listener.ora 60 | 61 | # Start LISTENER and run DBCA 62 | lsnrctl start && 63 | dbca -silent -createDatabase -responseFile $ORACLE_BASE/dbca.rsp || 64 | cat /opt/oracle/cfgtoollogs/dbca/$ORACLE_SID/$ORACLE_SID.log || 65 | cat /opt/oracle/cfgtoollogs/dbca/$ORACLE_SID.log 66 | 67 | echo "$ORACLE_SID=localhost:1521/$ORACLE_SID" > $ORACLE_HOME/network/admin/tnsnames.ora 68 | echo "$ORACLE_PDB= 69 | (DESCRIPTION = 70 | (ADDRESS = (PROTOCOL = TCP)(HOST = 0.0.0.0)(PORT = 1521)) 71 | (CONNECT_DATA = 72 | (SERVER = DEDICATED) 73 | (SERVICE_NAME = $ORACLE_PDB) 74 | ) 75 | )" >> $ORACLE_HOME/network/admin/tnsnames.ora 76 | 77 | # Remove second control file, fix local_listener, make PDB auto open 78 | sqlplus / as sysdba << EOF 79 | ALTER SYSTEM SET control_files='$ORACLE_BASE/oradata/$ORACLE_SID/control01.ctl' scope=spfile; 80 | ALTER SYSTEM SET local_listener=''; 81 | exit; 82 | EOF 83 | 84 | # Remove temporary response file 85 | rm $ORACLE_BASE/dbca.rsp 86 | -------------------------------------------------------------------------------- /docs/CDCHBase-sparksink.md: -------------------------------------------------------------------------------- 1 | # CDC HBase Sink 2 | 3 | Description 4 | ----------- 5 | This plugin takes input from a CDC source and writes the changes to HBase. 6 | It will write to the HBase instance running on the cluster. 7 | 8 | All CDC sink plugins are normally used in conjunction with CDC source plugins. 9 | CDC sink expects messages in CDC format as an input. 10 | 11 | Properties 12 | ---------- 13 | **Reference Name**: Name used to uniquely identify this source for lineage, annotating metadata, etc. 
14 | 15 | Usage Notes 16 | ----------- 17 | This plugin supports table creation and table modification on an HBase server. We recommend placing a normalizer transformation plugin before this plugin. It converts inputs into standard Data Definition Language (DDL) and Data Manipulation Language (DML) records that can be parsed by this plugin. 18 | 19 | Table Creation 20 | -------------- 21 | When the plugin receives a DDL record, it creates a table in the target HBase database. The name of the table is specified in the DDL record. Below is a sample DDL Record that creates a table in namespace `GGTEST` with name `TESTANOTHER`. 22 | ```{ 23 | "schema": { 24 | "type": "RECORD", 25 | "recordName": "DDLRecord", 26 | "fieldMap": { 27 | "table": { 28 | "name": "table", 29 | "schema": { 30 | "type": "STRING", 31 | "unionSchemas": [] 32 | } 33 | }, 34 | "schema": { 35 | "name": "schema", 36 | "schema": { 37 | "type": "STRING", 38 | "unionSchemas": [] 39 | } 40 | } 41 | }, 42 | "fields": [ 43 | { 44 | "name": "table", 45 | "schema": { 46 | "type": "STRING", 47 | "unionSchemas": [] 48 | } 49 | }, 50 | { 51 | "name": "schema", 52 | "schema": { 53 | "type": "STRING", 54 | "unionSchemas": [] 55 | } 56 | } 57 | ], 58 | "unionSchemas": [] 59 | }, 60 | "fields": { 61 | "schema": "{\"type\":\"record\",\"name\":\"columns\",\"fields\":[{\"name\":\"CID\",\"type\":[\"null\",\"long\"]},{\"name\":\"CNAME\",\"type\":[\"null\",\"string\"]}]}", 62 | "table": "GGTEST.TESTANOTHER" 63 | } 64 | } 65 | ``` 66 | 67 | Table Modification 68 | -------------- 69 | When the plugin receives a DML record, it modifies the corresponding table according to the operation specified in `op_type`. 70 | 71 | | op\_type | Operation | 72 | | :--------------: | :--------------: | 73 | | I | Insert | 74 | | U | Update | 75 | | D | Delete | 76 | 77 | The content of the changes is listed in the `change` field. The `primary_keys` field specifies the fields in `change` that will be used to name a row in the table. Below is a sample DML record that creates a row for `Scott` and inserts his information into the row. 
78 | ``` 79 | { 80 | "table": "GGTEST_EMPLOYEE", 81 | "schema": "{\"type\":\"record\",\"name\":\"columns\",\"fields\":[{\"name\":\"EMPNO\",\"type\":[\"null\",\"long\"]},{\"name\":\"ENAME\",\"type\":[\"null\",\"string\"]},{\"name\":\"JOB\",\"type\":[\"null\",\"string\"]},{\"name\":\"MGR\",\"type\":[\"null\",\"long\"]},{\"name\":\"HIREDATE\",\"type\":[\"null\",\"string\"]},{\"name\":\"SAL\",\"type\":[\"null\",\"long\"]},{\"name\":\"COMM\",\"type\":[\"null\",\"long\"]},{\"name\":\"DEPTNO\",\"type\":[\"null\",\"long\"]},{\"name\":\"EMP_ADDRESS\",\"type\":[\"null\",\"string\"]}]}", 82 | "op_type": "I", 83 | "primary_keys": [ 84 | "ENAME" 85 | ], 86 | "change": { 87 | "HIREDATE": "03-DEC-2015", 88 | "JOB": "Software Engineer", 89 | "MGR": 991, 90 | "SAL": 1234, 91 | "DEPTNO": 1, 92 | "EMP_ADDRESS": "San Jose", 93 | "ENAME": "Scott", 94 | "EMPNO": 1, 95 | "COMM": 1 96 | } 97 | } 98 | ``` -------------------------------------------------------------------------------- /widgets/CDCKudu-sparksink.json: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "spec-version": "1.5" 4 | }, 5 | "configuration-groups": [ 6 | { 7 | "label": "Kudu Configuration", 8 | "properties": [ 9 | { 10 | "widget-type" : "textbox", 11 | "label" : "Reference Name", 12 | "name" : "referenceName", 13 | "description" : "Reference specifies the name to be used to track this external source" 14 | }, 15 | { 16 | "widget-type": "textbox", 17 | "label": "Master Addresses", 18 | "name": "master", 19 | "description": "Comma separated list of hostname:port of Apache Kudu Masters" 20 | } 21 | ] 22 | }, 23 | { 24 | "label": "Kudu Advanced Options", 25 | "properties" : [ 26 | { 27 | "widget-type": "number", 28 | "label": "No of buckets", 29 | "name": "buckets", 30 | "widget-attributes" : { 31 | "default" : 16 32 | } 33 | }, 34 | { 35 | "widget-type": "number", 36 | "label": "Seed", 37 | "name": "seed", 38 | "description": "Seed to randomize the mapping of rows to hash buckets", 39 | "widget-attributes" : { 40 | "default" : 0 41 | } 42 | }, 43 | { 44 | "widget-type": "select", 45 | "label": "Compression Algorithm", 46 | "name": "compression-algo", 47 | "description": "Compression Algorithm. All fields will be applied same compression", 48 | "widget-attributes" : { 49 | "values" : [ 50 | "Snappy", 51 | "LZ4", 52 | "ZLib", 53 | "Backend configured", 54 | "No Compression" 55 | ], 56 | "default": "Snappy" 57 | } 58 | }, 59 | { 60 | "widget-type": "select", 61 | "label": "Encoding Type", 62 | "name": "encoding", 63 | "description": "Encoding Type. 
The same encoding will be applied to all fields", 64 | "widget-attributes" : { 65 | "values": [ 66 | "Auto", 67 | "Plain", 68 | "Prefix", 69 | "Group Variant", 70 | "RLE", 71 | "Dictionary", 72 | "Bit Shuffle" 73 | ], 74 | "default": "Auto" 75 | } 76 | }, 77 | { 78 | "widget-type": "number", 79 | "label": "User Operations Timeout", 80 | "name": "opt-timeout", 81 | "description": "User operations timeout in milliseconds", 82 | "widget-attributes" : { 83 | "default" : 30000 84 | } 85 | }, 86 | { 87 | "widget-type": "number", 88 | "label": "Administration Operations Timeout", 89 | "name": "admin-timeout", 90 | "description": "Administration operation timeout in milliseconds", 91 | "widget-attributes" : { 92 | "default" : 30000 93 | } 94 | }, 95 | { 96 | "widget-type": "number", 97 | "label": "Replicas", 98 | "name": "replicas", 99 | "description": "Specifies the number of replicas for the Kudu tables", 100 | "widget-attributes" : { 101 | "default" : 1 102 | } 103 | }, 104 | { 105 | "widget-type": "number", 106 | "label": "Rows Buffer", 107 | "name" : "row-flush", 108 | "description": "Number of rows that are buffered before flushing to the tablet server", 109 | "widget-attributes" : { 110 | "default" : 1000 111 | } 112 | }, 113 | { 114 | "widget-type": "number", 115 | "label" : "Boss Threads", 116 | "name": "boss-threads", 117 | "description" : "Specifies the number of boss threads to be used by the client", 118 | "widget-attributes" : { 119 | "default" : 1 120 | } 121 | } 122 | ] 123 | } 124 | ] 125 | } 126 | -------------------------------------------------------------------------------- /docs/CTSQLServer-streamingsource.md: -------------------------------------------------------------------------------- 1 | # CDC SQL Server Streaming Source 2 | 3 | Description 4 | ----------- 5 | This plugin reads Change Data Capture (CDC) events from SQL Server through Change Tracking. 6 | 7 | All CDC source plugins are normally used in conjunction with CDC sink plugins. 8 | CDC source produces messages in CDC format. 9 | 10 | Properties 11 | ---------- 12 | **Reference Name**: Name used to uniquely identify this source for lineage, annotating metadata, etc. 13 | 14 | **Hostname**: Hostname of the SQL Server from which the data needs to be offloaded. 15 | Ex: mysqlserver.net or 12.123.12.123. 16 | 17 | **Port**: SQL Server Port. 18 | 19 | **Username**: Username to use to connect to the specified database. Required for databases that need authentication. 20 | Optional for databases that do not require authentication. 21 | 22 | **Password**: Password to use to connect to the specified database. Required for databases that need authentication. 23 | Optional for databases that do not require authentication. 24 | 25 | **Database name**: SQL Server database name which needs to be tracked. 26 | Note: Change Tracking must be enabled on the database for the source to read the change data. 27 | 28 | **Max Retry Seconds**: Maximum number of seconds to retry failures when reading change events. 29 | If no retries should be done, this should be set to 0. 30 | If there should not be a retry limit, this should be set to a negative number or left empty. 31 | 32 | SQL Server Change Tracking 33 | -------------------------- 34 | Change Tracking allows you to identify the rows that have changed. The Change Tracking SQL Server Streaming Source leverages 35 | this to retrieve just the minimum information needed to keep a SQL Server database in sync with a downstream sink.
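As a rough illustration (not the plugin's actual implementation), the JDBC sketch below shows how changed rows can be pulled with Change Tracking; the table `dbo.EMPLOYEE`, its key column `EMPNO`, the `ENAME` column, and the connection string are illustrative assumptions.
```
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class ChangeTrackingPollExample {
  public static void main(String[] args) throws Exception {
    // Illustrative connection details; the plugin builds its own connection from the
    // Hostname, Port, Username, Password and Database name properties instead.
    String url = "jdbc:sqlserver://localhost:1433;databaseName=testdb;user=sa;password=secret";
    long lastSyncVersion = 0L; // version persisted after the previous poll

    try (Connection conn = DriverManager.getConnection(url);
         Statement stmt = conn.createStatement()) {
      // CHANGETABLE(CHANGES ...) returns only the primary key and the operation (I/U/D) of rows
      // changed since lastSyncVersion; joining back to the tracked table picks up current values.
      String query =
          "SELECT CT.SYS_CHANGE_VERSION, CT.SYS_CHANGE_OPERATION, CT.EMPNO, T.ENAME "
              + "FROM CHANGETABLE(CHANGES dbo.EMPLOYEE, " + lastSyncVersion + ") AS CT "
              + "LEFT JOIN dbo.EMPLOYEE AS T ON T.EMPNO = CT.EMPNO";
      try (ResultSet rs = stmt.executeQuery(query)) {
        while (rs.next()) {
          // For deleted rows the joined columns are NULL; only the key remains in CT.EMPNO.
          System.out.printf("op=%s version=%d empno=%d ename=%s%n",
              rs.getString("SYS_CHANGE_OPERATION"),
              rs.getLong("SYS_CHANGE_VERSION"),
              rs.getLong("EMPNO"),
              rs.getString("ENAME"));
        }
      }
      // A real consumer would read CHANGE_TRACKING_CURRENT_VERSION() here and pass it
      // as lastSyncVersion on the next poll.
    }
  }
}
```
The `SYS_CHANGE_OPERATION` values `I`, `U` and `D` correspond to the insert, update and delete operations that this source surfaces in the `op_type` field of its DML records.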
You can 36 | read more about SQL Server Change Tracking 37 | [here](https://docs.microsoft.com/en-us/sql/relational-databases/track-changes/about-change-tracking-sql-server) 38 | 39 | ### Change Tracking and Change Data Capture 40 | SQL Server also allows capturing the changed data through Change Data Capture. Change Data Capture provides historical 41 | information about the changes. This plugin uses Change Tracking instead of Change Data Capture for the following 42 | reasons: 43 | 44 | 1. Historical Information: For a pipeline whose purpose is to offload data from a database and/or to keep a database 45 | in sync with some external storage, historical information is not critical. 46 | 2. Schema Changes: Change Data Capture has very limited support for schema changes in the table being tracked. 47 | New columns added to a tracked table are not automatically tracked. For more details, please refer to 48 | [Handling Changes to Source Data](https://docs.microsoft.com/en-us/sql/relational-databases/track-changes/about-change-data-capture-sql-server#handling-changes-to-source-tables) 49 | 3. Supported Editions: Change Data Capture is only available in Datacenter and Enterprise editions, whereas 50 | Change Tracking is supported in Express, Workgroup, Web, Standard, Enterprise and Datacenter. 51 | You can read more about the differences between SQL Server CT and CDC 52 | [here](https://technet.microsoft.com/en-us/library/cc280519(v=sql.105).aspx) 53 | 54 | ### Enable Change Tracking for a Database 55 | Before you start using the Change Tracking SQL Server Source to track changes in your database, you will need to 56 | enable Change Tracking on the database. Change Tracking can be enabled on a database by: 57 | 58 | > ALTER DATABASE dbName SET CHANGE_TRACKING = ON (CHANGE_RETENTION = 2 DAYS, AUTO_CLEANUP = ON) 59 | 60 | Refer to [Enable Change Tracking for a Database](https://docs.microsoft.com/en-us/sql/relational-databases/track-changes/enable-and-disable-change-tracking-sql-server#enable-change-tracking-for-a-database) for more details. 61 | 62 | ### Enable Change Tracking for a Table 63 | The Change Tracking SQL Server Streaming Source will sync all the tables in the database that have Change Tracking enabled. 64 | Change Tracking can be enabled for a table by: 65 | 66 | > ALTER TABLE tableName ENABLE CHANGE_TRACKING WITH (TRACK_COLUMNS_UPDATED = OFF) 67 | 68 | Refer to [Enable Change Tracking for a Table](https://docs.microsoft.com/en-us/sql/relational-databases/track-changes/enable-and-disable-change-tracking-sql-server#enable-change-tracking-for-a-table) for more details. 69 | 70 | -------------------------------------------------------------------------------- /src/test/java/io/cdap/plugin/cdc/integration/CDCPluginIntegrationTestBase.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License.
15 | */ 16 | 17 | package io.cdap.plugin.cdc.integration; 18 | 19 | import com.codahale.metrics.MetricRegistry; 20 | import io.cdap.cdap.api.artifact.ArtifactSummary; 21 | import io.cdap.cdap.common.conf.Constants; 22 | import io.cdap.cdap.datastreams.DataStreamsApp; 23 | import io.cdap.cdap.datastreams.DataStreamsSparkLauncher; 24 | import io.cdap.cdap.etl.mock.test.HydratorTestBase; 25 | import io.cdap.cdap.etl.proto.v2.DataStreamsConfig; 26 | import io.cdap.cdap.etl.proto.v2.ETLPlugin; 27 | import io.cdap.cdap.etl.proto.v2.ETLStage; 28 | import io.cdap.cdap.etl.spark.Compat; 29 | import io.cdap.cdap.proto.artifact.AppRequest; 30 | import io.cdap.cdap.proto.id.ApplicationId; 31 | import io.cdap.cdap.proto.id.ArtifactId; 32 | import io.cdap.cdap.proto.id.NamespaceId; 33 | import io.cdap.cdap.test.ApplicationManager; 34 | import io.cdap.cdap.test.SparkManager; 35 | import io.cdap.cdap.test.TestConfiguration; 36 | import io.cdap.plugin.cdc.sink.CDCBigTable; 37 | import io.cdap.plugin.cdc.sink.CDCHBase; 38 | import io.cdap.plugin.cdc.sink.CDCKudu; 39 | import io.cdap.plugin.cdc.source.oracle.GoldenGateKafka; 40 | import io.cdap.plugin.cdc.source.sqlserver.CTSQLServer; 41 | import kafka.serializer.DefaultDecoder; 42 | import org.apache.spark.streaming.kafka.KafkaUtils; 43 | import org.junit.BeforeClass; 44 | import org.junit.ClassRule; 45 | import org.slf4j.Logger; 46 | import org.slf4j.LoggerFactory; 47 | 48 | public abstract class CDCPluginIntegrationTestBase extends HydratorTestBase { 49 | private static final Logger LOG = LoggerFactory.getLogger(CDCPluginIntegrationTestBase.class); 50 | private static final ArtifactId APP_ARTIFACT_ID = NamespaceId.DEFAULT.artifact("data-streams", "1.0.0"); 51 | private static final ArtifactSummary APP_ARTIFACT = new ArtifactSummary("data-streams", "1.0.0"); 52 | 53 | @ClassRule 54 | public static final TestConfiguration CONFIG = 55 | new TestConfiguration(Constants.Explore.EXPLORE_ENABLED, false, 56 | Constants.AppFabric.SPARK_COMPAT, Compat.SPARK_COMPAT); 57 | 58 | @BeforeClass 59 | public static void setupTest() throws Exception { 60 | LOG.info("Setting up application"); 61 | 62 | setupStreamingArtifacts(APP_ARTIFACT_ID, DataStreamsApp.class); 63 | 64 | LOG.info("Setting up plugins"); 65 | 66 | addPluginArtifact(NamespaceId.DEFAULT.artifact("cdc-plugins", "1.0.0"), 67 | APP_ARTIFACT_ID, 68 | GoldenGateKafka.class, CTSQLServer.class, 69 | CDCBigTable.class, CDCHBase.class, CDCKudu.class, 70 | // Bigtable plugin dependencies 71 | MetricRegistry.class, 72 | // GoldenGateKafka plugin dependencies 73 | KafkaUtils.class, DefaultDecoder.class); 74 | } 75 | 76 | protected SparkManager deployETL(ETLPlugin sourcePlugin, ETLPlugin sinkPlugin, String appName) throws Exception { 77 | ETLStage source = new ETLStage("source", sourcePlugin); 78 | ETLStage sink = new ETLStage("sink", sinkPlugin); 79 | DataStreamsConfig etlConfig = DataStreamsConfig.builder() 80 | .addStage(source) 81 | .addStage(sink) 82 | .addConnection(source.getName(), sink.getName()) 83 | .setBatchInterval("1s") 84 | .build(); 85 | 86 | AppRequest appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig); 87 | ApplicationId appId = NamespaceId.DEFAULT.app(appName); 88 | ApplicationManager applicationManager = deployApplication(appId, appRequest); 89 | return getProgramManager(applicationManager); 90 | } 91 | 92 | private SparkManager getProgramManager(ApplicationManager appManager) { 93 | return appManager.getSparkManager(DataStreamsSparkLauncher.NAME); 94 | } 95 | } 96 | 
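For reference, a hypothetical subclass of this test base might deploy a source/sink pair through `deployETL` roughly as sketched below; the plugin property values and the commented-out verification steps are placeholders, not part of the repository.
```
package io.cdap.plugin.cdc.integration;

import com.google.common.collect.ImmutableMap;
import io.cdap.cdap.etl.api.batch.SparkSink;
import io.cdap.cdap.etl.api.streaming.StreamingSource;
import io.cdap.cdap.etl.proto.v2.ETLPlugin;
import io.cdap.cdap.test.SparkManager;
import org.junit.Test;

public class ExampleCdcPipelineTest extends CDCPluginIntegrationTestBase {

  @Test
  public void testSqlServerToHBaseReplication() throws Exception {
    // Property values are illustrative; a real test would read them from test configuration.
    ETLPlugin source = new ETLPlugin("CTSQLServer", StreamingSource.PLUGIN_TYPE,
                                     ImmutableMap.of("referenceName", "source",
                                                     "hostname", "localhost",
                                                     "port", "1433",
                                                     "dbname", "testdb"));
    ETLPlugin sink = new ETLPlugin("CDCHBase", SparkSink.PLUGIN_TYPE,
                                   ImmutableMap.of("referenceName", "sink"));

    // Deploys a two-stage data-streams pipeline (source -> sink) and returns its Spark program.
    SparkManager program = deployETL(source, sink, "example-cdc-pipeline");
    program.start();
    // ... wait for the pipeline to reach the RUNNING state, apply some changes to the
    // tracked SQL Server tables, assert that the rows appear in HBase, then:
    program.stop();
  }
}
```
Keeping the deployment logic in the base class lets each concrete test focus only on source- and sink-specific setup and assertions.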
-------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/sink/CDCBigTableConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package io.cdap.plugin.cdc.sink; 18 | 19 | import com.google.bigtable.repackaged.com.google.cloud.ServiceOptions; 20 | import com.google.common.base.Strings; 21 | import io.cdap.cdap.api.annotation.Description; 22 | import io.cdap.cdap.api.annotation.Macro; 23 | import io.cdap.cdap.api.annotation.Name; 24 | import io.cdap.cdap.api.plugin.PluginConfig; 25 | import io.cdap.cdap.etl.api.validation.InvalidConfigPropertyException; 26 | import io.cdap.plugin.cdc.common.CDCReferencePluginConfig; 27 | 28 | import java.io.File; 29 | import javax.annotation.Nullable; 30 | 31 | /** 32 | * Defines the {@link PluginConfig} for the {@link CDCBigTable}. 33 | */ 34 | public class CDCBigTableConfig extends CDCReferencePluginConfig { 35 | public static final String AUTO_DETECT = "auto-detect"; 36 | 37 | public static final String INSTANCE = "instance"; 38 | public static final String PROJECT = "project"; 39 | public static final String SERVICE_ACCOUNT_FILE_PATH = "serviceFilePath"; 40 | 41 | @Name(INSTANCE) 42 | @Description("BigTable instance id. " + 43 | "Uniquely identifies BigTable instance within your Google Cloud Platform project.") 44 | @Macro 45 | public final String instance; 46 | 47 | @Name(PROJECT) 48 | @Description("Google Cloud Project ID, which uniquely identifies a project. " 49 | + "It can be found on the Dashboard in the Google Cloud Platform Console.") 50 | @Macro 51 | @Nullable 52 | public final String project; 53 | 54 | @Name(SERVICE_ACCOUNT_FILE_PATH) 55 | @Description("Path on the local file system of the service account key used " 56 | + "for authorization. Can be set to 'auto-detect' when running on a Dataproc cluster. 
" 57 | + "When running on other clusters, the file must be present on every node in the cluster.") 58 | @Macro 59 | @Nullable 60 | public final String serviceAccountFilePath; 61 | 62 | public CDCBigTableConfig(String referenceName, String instance, @Nullable String project, 63 | @Nullable String serviceAccountFilePath) { 64 | super(referenceName); 65 | this.instance = instance; 66 | this.project = project; 67 | this.serviceAccountFilePath = serviceAccountFilePath; 68 | } 69 | 70 | @Nullable 71 | public String resolveProject() { 72 | if (project == null || project.isEmpty() || AUTO_DETECT.equals(project)) { 73 | return ServiceOptions.getDefaultProjectId(); 74 | } 75 | return project; 76 | } 77 | 78 | @Nullable 79 | public String resolveServiceAccountFilePath() { 80 | if (serviceAccountFilePath == null || serviceAccountFilePath.isEmpty() 81 | || AUTO_DETECT.equals(serviceAccountFilePath)) { 82 | return null; 83 | } 84 | return serviceAccountFilePath; 85 | } 86 | 87 | @Override 88 | public void validate() { 89 | super.validate(); 90 | if (!containsMacro(PROJECT) && resolveProject() == null) { 91 | throw new InvalidConfigPropertyException("Could not detect Google Cloud project id from the environment. " + 92 | "Please specify a project id.", PROJECT); 93 | } 94 | if (!containsMacro(INSTANCE) && Strings.isNullOrEmpty(instance)) { 95 | throw new InvalidConfigPropertyException("Instance ID cannot be null or empty", INSTANCE); 96 | } 97 | String serviceAccountFilePath = resolveServiceAccountFilePath(); 98 | if (!containsMacro(SERVICE_ACCOUNT_FILE_PATH) && serviceAccountFilePath != null) { 99 | File serviceAccountFile = new File(serviceAccountFilePath); 100 | if (!serviceAccountFile.exists()) { 101 | throw new InvalidConfigPropertyException(String.format("File '%s' does not exist", serviceAccountFilePath), 102 | SERVICE_ACCOUNT_FILE_PATH); 103 | } 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/common/Schemas.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package io.cdap.plugin.cdc.common; 18 | 19 | import io.cdap.cdap.api.data.format.StructuredRecord; 20 | import io.cdap.cdap.api.data.schema.Schema; 21 | import io.cdap.cdap.api.data.schema.Schema.Field; 22 | import io.cdap.cdap.api.data.schema.Schema.Type; 23 | 24 | import java.util.Arrays; 25 | import java.util.Objects; 26 | import java.util.stream.Collectors; 27 | 28 | /** 29 | * Helper class with common cdc schemes definitions. 
30 | */ 31 | public class Schemas { 32 | 33 | private static final Schema SIMPLE_TYPES = Schema.unionOf(Arrays.stream(Type.values()) 34 | .filter(Type::isSimpleType) 35 | .map(Schema::of) 36 | .collect(Collectors.toList())); 37 | 38 | public static final String SCHEMA_RECORD = "schema"; 39 | public static final String TABLE_FIELD = "table"; 40 | public static final String SCHEMA_FIELD = "schema"; 41 | public static final String OP_TYPE_FIELD = "op_type"; 42 | public static final String PRIMARY_KEYS_FIELD = "primary_keys"; 43 | public static final String DDL_FIELD = "ddl"; 44 | public static final String DML_FIELD = "dml"; 45 | public static final String UPDATE_SCHEMA_FIELD = "rows_schema"; 46 | public static final String UPDATE_VALUES_FIELD = "rows_values"; 47 | public static final String CHANGE_TRACKING_VERSION = "change_tracking_version"; 48 | public static final String CDC_CURRENT_TIMESTAMP = "cdc_current_timestamp"; 49 | 50 | public static final Schema DDL_SCHEMA = Schema.recordOf( 51 | "DDLRecord", 52 | Field.of(TABLE_FIELD, Schema.of(Type.STRING)), 53 | Field.of(SCHEMA_FIELD, Schema.of(Type.STRING)) 54 | ); 55 | 56 | public static final Schema DML_SCHEMA = Schema.recordOf( 57 | "DMLRecord", 58 | Field.of(OP_TYPE_FIELD, enumWith(OperationType.class)), 59 | Field.of(TABLE_FIELD, Schema.of(Type.STRING)), 60 | Field.of(PRIMARY_KEYS_FIELD, Schema.arrayOf(Schema.of(Type.STRING))), 61 | Field.of(UPDATE_SCHEMA_FIELD, Schema.of(Type.STRING)), 62 | Field.of(UPDATE_VALUES_FIELD, Schema.mapOf(Schema.of(Type.STRING), SIMPLE_TYPES)), 63 | Field.of(CHANGE_TRACKING_VERSION, Schema.of(Type.STRING)), 64 | Field.of(CDC_CURRENT_TIMESTAMP, Schema.of(Schema.LogicalType.TIME_MICROS)) 65 | ); 66 | 67 | public static final Schema CHANGE_SCHEMA = Schema.recordOf( 68 | "changeRecord", 69 | Field.of(DDL_FIELD, Schema.nullableOf(DDL_SCHEMA)), 70 | Field.of(DML_FIELD, Schema.nullableOf(DML_SCHEMA)) 71 | ); 72 | 73 | public static StructuredRecord toCDCRecord(StructuredRecord changeRecord) { 74 | String recordName = changeRecord.getSchema().getRecordName(); 75 | if (Objects.equals(recordName, DDL_SCHEMA.getRecordName())) { 76 | return StructuredRecord.builder(CHANGE_SCHEMA) 77 | .set(DDL_FIELD, changeRecord) 78 | .build(); 79 | } else if (Objects.equals(recordName, DML_SCHEMA.getRecordName())) { 80 | return StructuredRecord.builder(CHANGE_SCHEMA) 81 | .set(DML_FIELD, changeRecord) 82 | .build(); 83 | } 84 | throw new IllegalArgumentException(String.format("Wrong schema name '%s' for record", recordName)); 85 | } 86 | 87 | public static String getTableName(String namespacedTableName) { 88 | return namespacedTableName.split("\\.")[1]; 89 | } 90 | 91 | private static Schema enumWith(Class> enumClass) { 92 | // this method may be removed when Schema.enumWith() method signature fixed 93 | Enum[] enumConstants = enumClass.getEnumConstants(); 94 | String[] names = new String[enumConstants.length]; 95 | for (int i = 0; i < enumConstants.length; i++) { 96 | names[i] = enumConstants[i].name(); 97 | } 98 | return Schema.enumWith(names); 99 | } 100 | 101 | private Schemas() { 102 | // utility class 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/sink/CDCHBase.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. 
You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package io.cdap.plugin.cdc.sink; 18 | 19 | import io.cdap.cdap.api.annotation.Name; 20 | import io.cdap.cdap.api.annotation.Plugin; 21 | import io.cdap.cdap.api.data.format.StructuredRecord; 22 | import io.cdap.cdap.etl.api.PipelineConfigurer; 23 | import io.cdap.cdap.etl.api.batch.SparkExecutionPluginContext; 24 | import io.cdap.cdap.etl.api.batch.SparkPluginContext; 25 | import io.cdap.cdap.etl.api.batch.SparkSink; 26 | import io.cdap.cdap.etl.api.validation.InvalidStageException; 27 | import io.cdap.plugin.cdc.common.Schemas; 28 | import io.cdap.plugin.cdc.common.SparkConfigs; 29 | import io.cdap.plugin.common.batch.JobUtils; 30 | import org.apache.hadoop.conf.Configuration; 31 | import org.apache.hadoop.hbase.TableName; 32 | import org.apache.hadoop.hbase.client.Admin; 33 | import org.apache.hadoop.hbase.client.Connection; 34 | import org.apache.hadoop.hbase.client.ConnectionFactory; 35 | import org.apache.hadoop.hbase.client.Table; 36 | import org.apache.hadoop.mapreduce.Job; 37 | import org.apache.spark.api.java.JavaRDD; 38 | 39 | import java.io.IOException; 40 | import java.util.Map; 41 | 42 | /** 43 | * HBase sink for CDC 44 | */ 45 | @Plugin(type = SparkSink.PLUGIN_TYPE) 46 | @Name("CDCHBase") 47 | public class CDCHBase extends SparkSink { 48 | private final CDCHBaseConfig config; 49 | 50 | public CDCHBase(CDCHBaseConfig config) { 51 | this.config = config; 52 | } 53 | 54 | @Override 55 | public void prepareRun(SparkPluginContext context) throws Exception { 56 | } 57 | 58 | @Override 59 | public void configurePipeline(PipelineConfigurer pipelineConfigurer) { 60 | config.validate(); 61 | if (!Schemas.CHANGE_SCHEMA.isCompatible(pipelineConfigurer.getStageConfigurer().getInputSchema())) { 62 | throw new InvalidStageException("Input schema is incompatible with change record schema"); 63 | } 64 | } 65 | 66 | @Override 67 | public void run(SparkExecutionPluginContext context, JavaRDD javaRDD) throws Exception { 68 | Map hadoopConfigs = SparkConfigs.getHadoopConfigs(javaRDD); 69 | // maps data sets to each block of computing resources 70 | javaRDD.foreachPartition(structuredRecordIterator -> { 71 | try (Connection conn = getConnection(hadoopConfigs); 72 | Admin hBaseAdmin = conn.getAdmin()) { 73 | while (structuredRecordIterator.hasNext()) { 74 | StructuredRecord input = structuredRecordIterator.next(); 75 | StructuredRecord ddlRecord = input.get(Schemas.DDL_FIELD); 76 | if (ddlRecord != null) { 77 | String tableName = Schemas.getTableName(ddlRecord.get(Schemas.TABLE_FIELD)); 78 | CDCTableUtil.createHBaseTable(hBaseAdmin, tableName); 79 | } 80 | StructuredRecord dmlRecord = input.get(Schemas.DML_FIELD); 81 | if (dmlRecord != null) { 82 | String tableName = Schemas.getTableName(dmlRecord.get(Schemas.TABLE_FIELD)); 83 | Table table = hBaseAdmin.getConnection().getTable(TableName.valueOf(tableName)); 84 | CDCTableUtil.updateHBaseTable(table, dmlRecord); 85 | } 86 | } 87 | } 88 | }); 89 | } 90 | 91 | private Connection getConnection(Map hadoopConfigs) throws IOException { 92 | ClassLoader oldClassLoader = 
Thread.currentThread().getContextClassLoader(); 93 | // Switch the context classloader to plugin class' classloader (PluginClassLoader) so that 94 | // when Job/Configuration is created, it uses PluginClassLoader to load resources (hbase-default.xml) 95 | // which is present in the plugin jar and is not visible in the CombineClassLoader (which is what oldClassLoader 96 | // points to). 97 | Thread.currentThread().setContextClassLoader(getClass().getClassLoader()); 98 | Job job; 99 | try { 100 | job = JobUtils.createInstance(); 101 | } finally { 102 | // Switch back to the original 103 | Thread.currentThread().setContextClassLoader(oldClassLoader); 104 | } 105 | Configuration conf = job.getConfiguration(); 106 | 107 | for (Map.Entry configEntry : hadoopConfigs.entrySet()) { 108 | conf.set(configEntry.getKey(), configEntry.getValue()); 109 | } 110 | 111 | return ConnectionFactory.createConnection(conf); 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /docs/CDCBigTable-sparksink.md: -------------------------------------------------------------------------------- 1 | # CDC Google Cloud Bigtable Sink 2 | 3 | Description 4 | ----------- 5 | This plugin takes input from a CDC source and writes the changes to Cloud Bigtable. 6 | 7 | All CDC sink plugins are normally used in conjunction with CDC source plugins. 8 | CDC sink expects messages in CDC format as an input. 9 | 10 | Credentials 11 | ----------- 12 | If the plugin is run on a Google Cloud Dataproc cluster, the service account key does not need to be 13 | provided and can be set to 'auto-detect'. 14 | Credentials will be automatically read from the cluster environment. 15 | 16 | If the plugin is not run on a Dataproc cluster, the path to a service account key must be provided. 17 | The service account key can be found on the Dashboard in the Cloud Platform Console. 18 | Make sure the account key has permission to access Cloud Bigtable. 19 | The service account key file needs to be available on every node in your cluster and 20 | must be readable by all users running the job. 21 | 22 | Properties 23 | ---------- 24 | **Reference Name**: Name used to uniquely identify this source for lineage, annotating metadata, etc. 25 | 26 | **Instance ID**: The ID of the Cloud Bigtable instance, which uniquely identifies it within your Google Cloud Platform project. 27 | 28 | **Project ID**: Google Cloud Project ID, which uniquely identifies a project. 29 | It can be found on the Dashboard in the Google Cloud Platform Console. This is the project 30 | that contains the Cloud Bigtable instance. To create and update tables, the service account 31 | must have the appropriate Bigtable permissions in this project. 32 | 33 | **Service Account File Path**: Path on the local file system of the service account key used for 34 | authorization. Can be set to 'auto-detect' when running on a Dataproc cluster. 35 | When running on other clusters, the file must be present on every node in the cluster. 36 | 37 | Usage Notes 38 | ----------- 39 | This plugin supports table creation and table modification in a Cloud Bigtable instance. 40 | We recommend placing a normalizer transformation plugin before this plugin. 41 | It converts inputs into standard Data Definition Language (DDL) and Data Manipulation Language (DML) records that 42 | can be parsed by this plugin. 43 | 44 | Table Creation 45 | -------------- 46 | When the plugin receives a DDL record, it creates a table in the target Cloud Bigtable instance. The name of the table 47 | is specified in the DDL record. 
Below is a sample DDL Record that creates a table with name `TESTANOTHER`. 48 | ```{ 49 | "schema": { 50 | "type": "RECORD", 51 | "recordName": "DDLRecord", 52 | "fieldMap": { 53 | "table": { 54 | "name": "table", 55 | "schema": { 56 | "type": "STRING", 57 | "unionSchemas": [] 58 | } 59 | }, 60 | "schema": { 61 | "name": "schema", 62 | "schema": { 63 | "type": "STRING", 64 | "unionSchemas": [] 65 | } 66 | } 67 | }, 68 | "fields": [ 69 | { 70 | "name": "table", 71 | "schema": { 72 | "type": "STRING", 73 | "unionSchemas": [] 74 | } 75 | }, 76 | { 77 | "name": "schema", 78 | "schema": { 79 | "type": "STRING", 80 | "unionSchemas": [] 81 | } 82 | } 83 | ], 84 | "unionSchemas": [] 85 | }, 86 | "fields": { 87 | "schema": "{\"type\":\"record\",\"name\":\"columns\",\"fields\":[{\"name\":\"CID\",\"type\":[\"null\",\"long\"]},{\"name\":\"CNAME\",\"type\":[\"null\",\"string\"]}]}", 88 | "table": "TESTANOTHER" 89 | } 90 | } 91 | ``` 92 | 93 | Table Modification 94 | -------------- 95 | When the plugin receives a DML record, it modifies the corresponding table according to the operation specified in 96 | `op_type`. 97 | 98 | | op\_type | Operation | 99 | | :--------------: | :--------------: | 100 | | I | Insert | 101 | | U | Update | 102 | | D | Delete | 103 | 104 | The content of the changes is listed in the `change` field. The `primary_keys` field specifies the fields in `change` 105 | that will be used to name a row in the table. Below is a sample DML record that creates a row for `Scott` and inserts 106 | his information into the row. 107 | ``` 108 | { 109 | "table": "EMPLOYEE", 110 | "schema": "{\"type\":\"record\",\"name\":\"columns\",\"fields\":[{\"name\":\"EMPNO\",\"type\":[\"null\",\"long\"]},{\"name\":\"ENAME\",\"type\":[\"null\",\"string\"]},{\"name\":\"JOB\",\"type\":[\"null\",\"string\"]},{\"name\":\"MGR\",\"type\":[\"null\",\"long\"]},{\"name\":\"HIREDATE\",\"type\":[\"null\",\"string\"]},{\"name\":\"SAL\",\"type\":[\"null\",\"long\"]},{\"name\":\"COMM\",\"type\":[\"null\",\"long\"]},{\"name\":\"DEPTNO\",\"type\":[\"null\",\"long\"]},{\"name\":\"EMP_ADDRESS\",\"type\":[\"null\",\"string\"]}]}", 111 | "op_type": "I", 112 | "primary_keys": [ 113 | "ENAME" 114 | ], 115 | "change": { 116 | "HIREDATE": "03-DEC-2015", 117 | "JOB": "Software Engineer", 118 | "MGR": 991, 119 | "SAL": 1234, 120 | "DEPTNO": 1, 121 | "EMP_ADDRESS": "San Jose", 122 | "ENAME": "Scott", 123 | "EMPNO": 1, 124 | "COMM": 1 125 | } 126 | } 127 | ``` -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/source/oracle/GoldenGateKafkaConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 
15 | */ 16 | package io.cdap.plugin.cdc.source.oracle; 17 | 18 | import com.google.common.base.Strings; 19 | import io.cdap.cdap.api.annotation.Description; 20 | import io.cdap.cdap.api.annotation.Macro; 21 | import io.cdap.cdap.api.annotation.Name; 22 | import io.cdap.cdap.etl.api.validation.InvalidConfigPropertyException; 23 | import io.cdap.plugin.cdc.common.CDCReferencePluginConfig; 24 | import org.apache.commons.lang3.ObjectUtils; 25 | 26 | import javax.annotation.Nullable; 27 | 28 | /** 29 | * Configurations to be used for Golden Gate Kafka source. 30 | */ 31 | public class GoldenGateKafkaConfig extends CDCReferencePluginConfig { 32 | 33 | private static final long serialVersionUID = 8069169417140954175L; 34 | 35 | public static final String BROKER = "broker"; 36 | public static final String TOPIC = "topic"; 37 | public static final String DEFAULT_INITIAL_OFFSET = "defaultInitialOffset"; 38 | public static final String MAX_RATE_PER_PARTITION = "maxRatePerPartition"; 39 | 40 | @Name(BROKER) 41 | @Description("Kafka broker specified in host:port form. For example, example.com:9092") 42 | @Macro 43 | private final String broker; 44 | 45 | @Name(TOPIC) 46 | @Description("Name of the topic to which Golden Gate publishes the DDL and DML changes.") 47 | @Macro 48 | private final String topic; 49 | 50 | @Name(DEFAULT_INITIAL_OFFSET) 51 | @Description("The default initial offset to read from. " + 52 | "An offset of -2 means the smallest offset. An offset of -1 means the latest offset. Defaults to -1. " + 53 | "Offsets are inclusive. If an offset of 5 is used, the message at offset 5 will be read. ") 54 | @Macro 55 | @Nullable 56 | private final Long defaultInitialOffset; 57 | 58 | @Name(MAX_RATE_PER_PARTITION) 59 | @Description("Max number of records to read per second per partition. 0 means there is no limit. Defaults to 1000.") 60 | @Macro 61 | @Nullable 62 | private final Integer maxRatePerPartition; 63 | 64 | public GoldenGateKafkaConfig(String referenceName, @Nullable String broker, @Nullable String topic, 65 | @Nullable Long defaultInitialOffset, @Nullable Integer maxRatePerPartition) { 66 | super(referenceName); 67 | this.broker = broker; 68 | this.topic = topic; 69 | this.defaultInitialOffset = defaultInitialOffset; 70 | this.maxRatePerPartition = maxRatePerPartition; 71 | } 72 | 73 | @Nullable 74 | public String getBroker() { 75 | return broker; 76 | } 77 | 78 | public String getHost() { 79 | return broker.split(":")[0]; 80 | } 81 | 82 | public int getPort() { 83 | return Integer.valueOf(broker.split(":")[1]); 84 | } 85 | 86 | @Nullable 87 | public String getTopic() { 88 | return topic; 89 | } 90 | 91 | public Long getDefaultInitialOffset() { 92 | return ObjectUtils.defaultIfNull(defaultInitialOffset, -1L); 93 | } 94 | 95 | public Integer getMaxRatePerPartition() { 96 | return ObjectUtils.defaultIfNull(maxRatePerPartition, 1000); 97 | } 98 | 99 | /** 100 | * Method to validate the broker address which should be in the form 'host:port'. 
101 | * throws IllegalArgumentException if validation fails 102 | */ 103 | @Override 104 | public void validate() { 105 | super.validate(); 106 | if (!containsMacro(BROKER)) { 107 | if (Strings.isNullOrEmpty(broker)) { 108 | throw new InvalidConfigPropertyException("Broker address cannot be null or empty", BROKER); 109 | } 110 | try { 111 | getHost(); 112 | getPort(); 113 | } catch (Exception e) { 114 | throw new InvalidConfigPropertyException( 115 | String.format("Broker address '%s' should be in the form of 'host:port'.", broker), e, BROKER); 116 | } 117 | } 118 | if (!containsMacro(TOPIC) && Strings.isNullOrEmpty(topic)) { 119 | throw new InvalidConfigPropertyException("Topic cannot be null or empty", TOPIC); 120 | } 121 | if (!containsMacro(DEFAULT_INITIAL_OFFSET) && defaultInitialOffset != null && defaultInitialOffset < -2) { 122 | throw new InvalidConfigPropertyException("'defaultInitialOffset' should be equal to -2, -1, 0 or positive number", 123 | DEFAULT_INITIAL_OFFSET); 124 | } 125 | if (!containsMacro(MAX_RATE_PER_PARTITION) && maxRatePerPartition != null && maxRatePerPartition < 0) { 126 | throw new InvalidConfigPropertyException("'maxRatePerPartition' should be equal to 0 or positive number", 127 | MAX_RATE_PER_PARTITION); 128 | } 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /src/test/java/io/cdap/plugin/cdc/sink/CDCBigTableConfigUnitTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 
15 | */ 16 | 17 | package io.cdap.plugin.cdc.sink; 18 | 19 | import com.google.bigtable.repackaged.com.google.cloud.ServiceOptions; 20 | import io.cdap.cdap.etl.api.validation.InvalidConfigPropertyException; 21 | import io.cdap.plugin.common.Constants; 22 | import org.junit.Assert; 23 | import org.junit.Assume; 24 | import org.junit.Test; 25 | 26 | public class CDCBigTableConfigUnitTest { 27 | private static final String VALID_REF = "test-ref"; 28 | private static final String VALID_PROJECT = "test-project"; 29 | private static final String VALID_INSTANCE = "test-instance"; 30 | private static final String VALID_ACCOUNT_FILE_PATH 31 | = CDCBigTableConfigUnitTest.class.getResource("/credentials.json").getPath(); 32 | 33 | @Test 34 | public void testValidateValidConfig() { 35 | CDCBigTableConfig config = new CDCBigTableConfig( 36 | VALID_REF, 37 | VALID_INSTANCE, 38 | VALID_PROJECT, 39 | VALID_ACCOUNT_FILE_PATH 40 | ); 41 | 42 | config.validate(); 43 | } 44 | 45 | @Test 46 | public void testValidateReference() { 47 | CDCBigTableConfig config = new CDCBigTableConfig( 48 | "", 49 | VALID_INSTANCE, 50 | VALID_PROJECT, 51 | VALID_ACCOUNT_FILE_PATH 52 | ); 53 | 54 | try { 55 | config.validate(); 56 | Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName())); 57 | } catch (InvalidConfigPropertyException e) { 58 | Assert.assertEquals(Constants.Reference.REFERENCE_NAME, e.getProperty()); 59 | } 60 | } 61 | 62 | @Test 63 | public void testValidateMissingCredentialsFile() { 64 | CDCBigTableConfig config = new CDCBigTableConfig( 65 | VALID_REF, 66 | VALID_INSTANCE, 67 | VALID_PROJECT, 68 | "/tmp/non_existing_file" 69 | ); 70 | 71 | try { 72 | config.validate(); 73 | Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName())); 74 | } catch (InvalidConfigPropertyException e) { 75 | Assert.assertEquals(CDCBigTableConfig.SERVICE_ACCOUNT_FILE_PATH, e.getProperty()); 76 | } 77 | } 78 | 79 | @Test 80 | public void testValidateMissingProjectId() { 81 | Assume.assumeTrue(ServiceOptions.getDefaultProjectId() == null); 82 | 83 | CDCBigTableConfig config = new CDCBigTableConfig( 84 | VALID_REF, 85 | VALID_INSTANCE, 86 | null, 87 | VALID_ACCOUNT_FILE_PATH 88 | ); 89 | 90 | try { 91 | config.validate(); 92 | Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName())); 93 | } catch (InvalidConfigPropertyException e) { 94 | Assert.assertEquals(CDCBigTableConfig.PROJECT, e.getProperty()); 95 | } 96 | } 97 | 98 | @Test 99 | public void testValidateMissingInstanceId() { 100 | CDCBigTableConfig config = new CDCBigTableConfig( 101 | VALID_REF, 102 | null, 103 | VALID_PROJECT, 104 | VALID_ACCOUNT_FILE_PATH 105 | ); 106 | 107 | try { 108 | config.validate(); 109 | Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName())); 110 | } catch (InvalidConfigPropertyException e) { 111 | Assert.assertEquals(CDCBigTableConfig.INSTANCE, e.getProperty()); 112 | } 113 | } 114 | 115 | @Test 116 | public void testResolveProjectId() { 117 | CDCBigTableConfig config = new CDCBigTableConfig( 118 | VALID_REF, 119 | VALID_INSTANCE, 120 | null, 121 | VALID_ACCOUNT_FILE_PATH 122 | ); 123 | 124 | Assert.assertEquals(ServiceOptions.getDefaultProjectId(), config.resolveProject()); 125 | } 126 | 127 | @Test 128 | public void testResolveProjectIdAutoDetect() { 129 | CDCBigTableConfig config = new CDCBigTableConfig( 130 | VALID_REF, 131 | VALID_INSTANCE, 132 | CDCBigTableConfig.AUTO_DETECT, 133 | 
VALID_ACCOUNT_FILE_PATH 134 | ); 135 | 136 | Assert.assertEquals(ServiceOptions.getDefaultProjectId(), config.resolveProject()); 137 | } 138 | 139 | @Test 140 | public void testServiceAccountFilePath() { 141 | CDCBigTableConfig config = new CDCBigTableConfig( 142 | VALID_REF, 143 | VALID_INSTANCE, 144 | VALID_PROJECT, 145 | null 146 | ); 147 | 148 | Assert.assertNull(config.resolveServiceAccountFilePath()); 149 | } 150 | 151 | @Test 152 | public void testServiceAccountFilePathAutoDetect() { 153 | CDCBigTableConfig config = new CDCBigTableConfig( 154 | VALID_REF, 155 | VALID_INSTANCE, 156 | VALID_PROJECT, 157 | CDCBigTableConfig.AUTO_DETECT 158 | ); 159 | 160 | Assert.assertNull(config.resolveServiceAccountFilePath()); 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/source/sqlserver/ResultSetToDMLRecord.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package io.cdap.plugin.cdc.source.sqlserver; 18 | 19 | import com.google.common.base.Joiner; 20 | import com.google.common.collect.Lists; 21 | import io.cdap.cdap.api.data.format.StructuredRecord; 22 | import io.cdap.cdap.api.data.schema.Schema; 23 | import io.cdap.plugin.cdc.common.DBUtils; 24 | import io.cdap.plugin.cdc.common.OperationType; 25 | import io.cdap.plugin.cdc.common.Schemas; 26 | import org.apache.spark.api.java.function.Function; 27 | import org.slf4j.Logger; 28 | import org.slf4j.LoggerFactory; 29 | 30 | import java.sql.Date; 31 | import java.sql.ResultSet; 32 | import java.sql.ResultSetMetaData; 33 | import java.sql.SQLException; 34 | import java.sql.Time; 35 | import java.sql.Timestamp; 36 | import java.time.Instant; 37 | import java.util.HashMap; 38 | import java.util.List; 39 | import java.util.Map; 40 | import java.util.concurrent.TimeUnit; 41 | 42 | /** 43 | * A serializable class to allow invoking {@link scala.Function1} from Java. 
The function converts {@link ResultSet} 44 | * to {@link StructuredRecord} for dml records 45 | */ 46 | public class ResultSetToDMLRecord implements Function { 47 | private static final Logger LOG = LoggerFactory.getLogger(ResultSetToDMLRecord.class); 48 | private static final int CHANGE_TABLE_COLUMNS_SIZE = 4; 49 | private final TableInformation tableInformation; 50 | 51 | ResultSetToDMLRecord(TableInformation tableInformation) { 52 | this.tableInformation = tableInformation; 53 | } 54 | 55 | @Override 56 | public StructuredRecord call(ResultSet row) throws SQLException { 57 | Schema changeSchema = getChangeSchema(row); 58 | String operation = row.getString("SYS_CHANGE_OPERATION"); 59 | OperationType operationType = OperationType.fromShortName(operation); 60 | return StructuredRecord.builder(Schemas.DML_SCHEMA) 61 | .set(Schemas.TABLE_FIELD, Joiner.on(".").join(tableInformation.getSchemaName(), tableInformation.getName())) 62 | .set(Schemas.PRIMARY_KEYS_FIELD, Lists.newArrayList(tableInformation.getPrimaryKeys())) 63 | .set(Schemas.OP_TYPE_FIELD, operationType.name()) 64 | .set(Schemas.UPDATE_SCHEMA_FIELD, changeSchema.toString()) 65 | .set(Schemas.UPDATE_VALUES_FIELD, getChangeData(row, changeSchema)) 66 | .set(Schemas.CHANGE_TRACKING_VERSION, row.getString("CHANGE_TRACKING_VERSION")) 67 | .set(Schemas.CDC_CURRENT_TIMESTAMP, row.getTimestamp("CDC_CURRENT_TIMESTAMP").getTime() * 1000) 68 | .build(); 69 | } 70 | 71 | private static Map getChangeData(ResultSet resultSet, Schema changeSchema) throws SQLException { 72 | ResultSetMetaData metadata = resultSet.getMetaData(); 73 | Map changes = new HashMap<>(); 74 | for (int i = 0; i < changeSchema.getFields().size(); i++) { 75 | Schema.Field field = changeSchema.getFields().get(i); 76 | // Ignore the first CHANGE_TABLE_COLUMN_SIZE columns since those are change tracking data and not the 77 | // actual row data. Add 1 because ResultSetMetaData starts from 1, not 0. 
78 | int column = 1 + i + CHANGE_TABLE_COLUMNS_SIZE; 79 | int sqlType = metadata.getColumnType(column); 80 | int sqlPrecision = metadata.getPrecision(column); 81 | int sqlScale = metadata.getScale(column); 82 | Object sqlValue = DBUtils.transformValue(sqlType, sqlPrecision, sqlScale, resultSet, field.getName()); 83 | Object javaValue = transformSQLToJavaType(sqlValue); 84 | changes.put(field.getName(), javaValue); 85 | } 86 | return changes; 87 | } 88 | 89 | private static Schema getChangeSchema(ResultSet resultSet) throws SQLException { 90 | List schemaFields = DBUtils.getSchemaFields(resultSet); 91 | // drop first four columns as they are from change tracking tables and does not represent the change data 92 | return Schema.recordOf(Schemas.SCHEMA_RECORD, 93 | schemaFields.subList(CHANGE_TABLE_COLUMNS_SIZE, schemaFields.size())); 94 | } 95 | 96 | private static Object transformSQLToJavaType(Object sqlValue) { 97 | if (sqlValue instanceof Date) { 98 | Date d = (Date) sqlValue; 99 | // dates are number of days since the epoch 100 | return (int) d.toLocalDate().toEpochDay(); 101 | } else if (sqlValue instanceof Time) { 102 | // times are microseconds since midnight 103 | Time t = (Time) sqlValue; 104 | return TimeUnit.NANOSECONDS.toMicros(t.toLocalTime().toNanoOfDay()); 105 | } else if (sqlValue instanceof Timestamp) { 106 | // timestamps are in microseconds 107 | Instant instant = ((Timestamp) sqlValue).toInstant(); 108 | long micros = TimeUnit.SECONDS.toMicros(instant.getEpochSecond()); 109 | return micros + TimeUnit.NANOSECONDS.toMicros(instant.getNano()); 110 | } else { 111 | return sqlValue; 112 | } 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /widgets/CTSQLServer-streamingsource.json: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "spec-version": "1.5" 4 | }, 5 | "configuration-groups": [ 6 | { 7 | "label": "Basic", 8 | "properties": [ 9 | { 10 | "widget-type": "textbox", 11 | "label": "Reference Name", 12 | "name": "referenceName", 13 | "description": "Reference specifies the name to be used to track this external source" 14 | }, 15 | { 16 | "widget-type": "textbox", 17 | "label": "Username", 18 | "name": "username", 19 | "description": "Username to use to connect to the specified database. Required for databases that need authentication. Optional for databases that do not require authentication" 20 | }, 21 | { 22 | "widget-type": "password", 23 | "label": "Password", 24 | "name": "password", 25 | "description": "Password to use to connect to the specified database. Required for databases that need authentication. Optional for databases that do not require authentication" 26 | }, 27 | { 28 | "widget-type": "textbox", 29 | "label": "Database name", 30 | "name": "dbname", 31 | "description": "SQL Server database name which needs to be tracked. Note: Change Tracking must be enabled on the database for the source to read the chage data" 32 | }, 33 | { 34 | "widget-type": "csv", 35 | "label": "Table Whitelist", 36 | "name": "tableWhitelist" 37 | } 38 | ] 39 | }, 40 | { 41 | "label": "Connection", 42 | "properties": [ 43 | { 44 | "widget-type": "textbox", 45 | "label": "Hostname", 46 | "name": "hostname", 47 | "widget-attributes": { 48 | "placeholder": "SQL Server hostname" 49 | } 50 | }, 51 | { 52 | "widget-type": "textbox", 53 | "label": "Port", 54 | "name": "port", 55 | "widget-attributes": { 56 | "placeholder": "SQL Server Port. 
Ex: 1433" 57 | } 58 | } 59 | ] 60 | }, 61 | { 62 | "label": "Custom JDBC Connection", 63 | "properties": [ 64 | { 65 | "widget-type": "textbox", 66 | "label": "JDBC Plugin Name", 67 | "name": "jdbcPluginName" 68 | }, 69 | { 70 | "widget-type": "textbox", 71 | "label": "Connection String", 72 | "name": "connectionString" 73 | } 74 | ] 75 | }, 76 | { 77 | "label": "Advanced", 78 | "properties": [ 79 | { 80 | "widget-type": "textbox", 81 | "label": "Max Retry Seconds", 82 | "name": "maxRetrySeconds" 83 | }, 84 | { 85 | "widget-type": "textbox", 86 | "label": "Max Batch Size", 87 | "name": "maxBatchSize", 88 | "widget-attributes": { 89 | "default": "100000" 90 | } 91 | }, 92 | { 93 | "widget-type": "textbox", 94 | "label": "Starting Sequence Number", 95 | "name": "sequenceStartNum", 96 | "widget-attributes": { 97 | "default": "0" 98 | } 99 | } 100 | ] 101 | } 102 | ], 103 | "outputs": [ 104 | { 105 | "widget-type": "non-editable-schema-editor", 106 | "schema": { 107 | "name": "changeRecord", 108 | "type": "record", 109 | "fields": [ 110 | { 111 | "name": "ddl", 112 | "type": [ 113 | { 114 | "type": "record", 115 | "name": "DDLRecord", 116 | "fields": [ 117 | { "name": "table", "type": "string" }, 118 | { "name": "schema", "type": "string" } 119 | ] 120 | }, 121 | "null" 122 | ] 123 | }, 124 | { 125 | "name": "dml", 126 | "type": [ 127 | { 128 | "type": "record", 129 | "name": "DMLRecord", 130 | "fields": [ 131 | { 132 | "name": "op_type", 133 | "type": { 134 | "symbols": [ "INSERT", "UPDATE", "DELETE" ], 135 | "type": "enum" 136 | } 137 | }, 138 | { "name": "table", "type": "string" }, 139 | { "name": "primary_keys", "type": { "type": "array", "items": "string" } }, 140 | { "name": "rows_schema", "type": "string" }, 141 | { 142 | "name": "rows_values", 143 | "type": { 144 | "type": "map", 145 | "keys": "string", 146 | "values": [ 147 | "null", 148 | "boolean", 149 | "int", 150 | "long", 151 | "float", 152 | "double", 153 | "bytes", 154 | "string" 155 | ] 156 | } 157 | }, 158 | { "name": "change_tracking_version", "type": "string" } 159 | ] 160 | }, 161 | "null" 162 | ] 163 | } 164 | ] 165 | } 166 | } 167 | ] 168 | } 169 | -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/DMLFlattener.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 
15 | */ 16 | 17 | package io.cdap.plugin.cdc; 18 | 19 | import io.cdap.cdap.api.annotation.Description; 20 | import io.cdap.cdap.api.annotation.Name; 21 | import io.cdap.cdap.api.annotation.Plugin; 22 | import io.cdap.cdap.api.data.format.StructuredRecord; 23 | import io.cdap.cdap.api.data.schema.Schema; 24 | import io.cdap.cdap.api.plugin.PluginConfig; 25 | import io.cdap.cdap.etl.api.Emitter; 26 | import io.cdap.cdap.etl.api.PipelineConfigurer; 27 | import io.cdap.cdap.etl.api.Transform; 28 | import io.cdap.cdap.etl.api.TransformContext; 29 | 30 | import java.io.IOException; 31 | import java.util.ArrayList; 32 | import java.util.HashMap; 33 | import java.util.List; 34 | import java.util.Map; 35 | import javax.annotation.Nullable; 36 | 37 | /** 38 | * Extracts the DML record from the output of a cdc source for direct manipulation. 39 | */ 40 | @Plugin(type = Transform.PLUGIN_TYPE) 41 | @Name("DMLFlattener") 42 | @Description("Flattens DML records output by a CDC source.") 43 | public class DMLFlattener extends Transform { 44 | private static final String OP_TYPE = "CDC_OP_TYPE"; 45 | private static final String CHANGE_TRACKING_VERSION = "CHANGE_TRACKING_VERSION"; 46 | private static final String CDC_TIMESTAMP = "CDC_CURRENT_TIMESTAMP"; 47 | private final Conf conf; 48 | private Map schemaCache; 49 | private Schema configuredOutputSchema; 50 | private boolean addOpType = false; 51 | private boolean addTrackingVersion = false; 52 | private boolean addTimestamp = false; 53 | 54 | public DMLFlattener(Conf conf) { 55 | this.conf = conf; 56 | } 57 | 58 | @Override 59 | public void configurePipeline(PipelineConfigurer pipelineConfigurer) { 60 | if (conf.schema != null) { 61 | try { 62 | pipelineConfigurer.getStageConfigurer().setOutputSchema(Schema.parseJson(conf.schema)); 63 | } catch (IOException e) { 64 | throw new IllegalArgumentException("Unable to parse configured schema: " + e.getMessage(), e); 65 | } 66 | } 67 | } 68 | 69 | @Override 70 | public void initialize(TransformContext context) throws IOException { 71 | configuredOutputSchema = conf.schema == null ? 
null : Schema.parseJson(conf.schema); 72 | addOpType = configuredOutputSchema.getField(OP_TYPE) != null; 73 | addTrackingVersion = configuredOutputSchema.getField(CHANGE_TRACKING_VERSION) != null; 74 | addTimestamp = configuredOutputSchema.getField(CDC_TIMESTAMP) != null; 75 | schemaCache = new HashMap<>(); 76 | } 77 | 78 | @Override 79 | public void transform(StructuredRecord record, Emitter emitter) throws Exception { 80 | StructuredRecord dml = record.get("dml"); 81 | if (dml == null) { 82 | return; 83 | } 84 | 85 | Schema rowSchema = Schema.parseJson((String) dml.get("rows_schema")); 86 | Schema outputSchema = schemaCache.computeIfAbsent(rowSchema, this::createOutputSchema); 87 | 88 | StructuredRecord.Builder output = StructuredRecord.builder(outputSchema); 89 | if (addOpType) { 90 | output.set(OP_TYPE, dml.get("op_type").toString()); 91 | } 92 | if (addTrackingVersion) { 93 | output.set(CHANGE_TRACKING_VERSION, dml.get("change_tracking_version")); 94 | } 95 | if (addTimestamp) { 96 | output.set(CDC_TIMESTAMP, dml.get("cdc_current_timestamp")); 97 | } 98 | Map valueMap = dml.get("rows_values"); 99 | if (valueMap == null) { 100 | valueMap = new HashMap<>(); 101 | } 102 | for (Map.Entry entry : valueMap.entrySet()) { 103 | output.set(entry.getKey(), entry.getValue()); 104 | } 105 | emitter.emit(output.build()); 106 | } 107 | 108 | private Schema createOutputSchema(Schema rowSchema) { 109 | // the transform optionally adds a OP_TYPE field and CHANGE_TRACKING_VERSION field that do not come from the 110 | // actual row data, but from general change tracking information. 111 | int numFields = rowSchema.getFields().size() + (addOpType ? 1 : 0) + 112 | (addTrackingVersion ? 1 : 0) + (addTimestamp ? 1 : 0); 113 | List fields = new ArrayList<>(numFields); 114 | fields.addAll(rowSchema.getFields()); 115 | if (addOpType) { 116 | fields.add(Schema.Field.of(OP_TYPE, Schema.of(Schema.Type.STRING))); 117 | } 118 | if (addTrackingVersion) { 119 | fields.add(Schema.Field.of(CHANGE_TRACKING_VERSION, Schema.of(Schema.Type.STRING))); 120 | } 121 | if (addTimestamp) { 122 | fields.add(Schema.Field.of(CDC_TIMESTAMP, Schema.of(Schema.LogicalType.TIMESTAMP_MICROS))); 123 | } 124 | return Schema.recordOf(rowSchema + ".added", fields); 125 | } 126 | 127 | /** 128 | * plugin config. 129 | */ 130 | public static class Conf extends PluginConfig { 131 | 132 | @Nullable 133 | @Description("The output schema of DML records. This should only be set if the source has been configured to read " 134 | + "from a single table whose schema will never change.") 135 | private String schema; 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/sink/CDCBigTable.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 
15 | */ 16 | 17 | package io.cdap.plugin.cdc.sink; 18 | 19 | import com.google.cloud.bigtable.hbase.BigtableConfiguration; 20 | import com.google.cloud.bigtable.hbase.BigtableOptionsFactory; 21 | import io.cdap.cdap.api.annotation.Name; 22 | import io.cdap.cdap.api.annotation.Plugin; 23 | import io.cdap.cdap.api.data.format.StructuredRecord; 24 | import io.cdap.cdap.etl.api.PipelineConfigurer; 25 | import io.cdap.cdap.etl.api.batch.SparkExecutionPluginContext; 26 | import io.cdap.cdap.etl.api.batch.SparkPluginContext; 27 | import io.cdap.cdap.etl.api.batch.SparkSink; 28 | import io.cdap.cdap.etl.api.validation.InvalidStageException; 29 | import io.cdap.plugin.cdc.common.Schemas; 30 | import io.cdap.plugin.cdc.common.SparkConfigs; 31 | import io.cdap.plugin.common.batch.JobUtils; 32 | import org.apache.hadoop.conf.Configuration; 33 | import org.apache.hadoop.hbase.TableName; 34 | import org.apache.hadoop.hbase.client.Admin; 35 | import org.apache.hadoop.hbase.client.Connection; 36 | import org.apache.hadoop.hbase.client.Table; 37 | import org.apache.hadoop.mapreduce.Job; 38 | import org.apache.spark.api.java.JavaRDD; 39 | 40 | import java.io.IOException; 41 | import java.util.Map; 42 | 43 | /** 44 | * BigTable sink for CDC 45 | */ 46 | @Plugin(type = SparkSink.PLUGIN_TYPE) 47 | @Name("CDCBigTable") 48 | public class CDCBigTable extends SparkSink { 49 | private final CDCBigTableConfig config; 50 | 51 | public CDCBigTable(CDCBigTableConfig config) { 52 | this.config = config; 53 | } 54 | 55 | @Override 56 | public void prepareRun(SparkPluginContext context) throws Exception { 57 | } 58 | 59 | @Override 60 | public void configurePipeline(PipelineConfigurer pipelineConfigurer) { 61 | config.validate(); 62 | if (!Schemas.CHANGE_SCHEMA.isCompatible(pipelineConfigurer.getStageConfigurer().getInputSchema())) { 63 | throw new InvalidStageException("Input schema is incompatible with change record schema"); 64 | } 65 | } 66 | 67 | @Override 68 | public void run(SparkExecutionPluginContext context, JavaRDD javaRDD) throws Exception { 69 | Map hadoopConfigs = SparkConfigs.getHadoopConfigs(javaRDD); 70 | // maps data sets to each block of computing resources 71 | javaRDD.foreachPartition(structuredRecordIterator -> { 72 | try (Connection conn = getConnection(hadoopConfigs); 73 | Admin hBaseAdmin = conn.getAdmin()) { 74 | while (structuredRecordIterator.hasNext()) { 75 | StructuredRecord input = structuredRecordIterator.next(); 76 | StructuredRecord ddlRecord = input.get(Schemas.DDL_FIELD); 77 | if (ddlRecord != null) { 78 | // Notes: In BigTable, there no such thing as namespace. 79 | // Dots are allowed in table names, but colons are not. 
80 | // If you try a table name with a colon in it, you will get: 81 | // io.grpc.StatusRuntimeException: INVALID_ARGUMENT: Invalid id for collection tables : \ 82 | // Should match [_a-zA-Z0-9][-_.a-zA-Z0-9]* but found 'ns:abcd' 83 | String tableName = Schemas.getTableName(ddlRecord.get(Schemas.TABLE_FIELD)); 84 | CDCTableUtil.createHBaseTable(hBaseAdmin, tableName); 85 | } 86 | StructuredRecord dmlRecord = input.get(Schemas.DML_FIELD); 87 | if (dmlRecord != null) { 88 | String tableName = Schemas.getTableName(dmlRecord.get(Schemas.TABLE_FIELD)); 89 | Table table = hBaseAdmin.getConnection().getTable(TableName.valueOf(tableName)); 90 | CDCTableUtil.updateHBaseTable(table, dmlRecord); 91 | } 92 | } 93 | } 94 | }); 95 | } 96 | 97 | private Connection getConnection(Map hadoopConfigs) throws IOException { 98 | ClassLoader oldClassLoader = Thread.currentThread().getContextClassLoader(); 99 | // Switch the context classloader to plugin class' classloader (PluginClassLoader) so that 100 | // when Job/Configuration is created, it uses PluginClassLoader to load resources (hbase-default.xml) 101 | // which is present in the plugin jar and is not visible in the CombineClassLoader (which is what oldClassLoader 102 | // points to). 103 | Thread.currentThread().setContextClassLoader(getClass().getClassLoader()); 104 | try { 105 | Job job = JobUtils.createInstance(); 106 | 107 | Configuration conf = job.getConfiguration(); 108 | 109 | for (Map.Entry configEntry : hadoopConfigs.entrySet()) { 110 | conf.set(configEntry.getKey(), configEntry.getValue()); 111 | } 112 | 113 | String projectId = config.resolveProject(); 114 | String serviceAccountFilePath = config.resolveServiceAccountFilePath(); 115 | BigtableConfiguration.configure(conf, projectId, config.instance); 116 | if (serviceAccountFilePath != null) { 117 | conf.set(BigtableOptionsFactory.BIGTABLE_SERVICE_ACCOUNT_JSON_KEYFILE_LOCATION_KEY, serviceAccountFilePath); 118 | } 119 | 120 | return BigtableConfiguration.connect(conf); 121 | } finally { 122 | // Switch back to the original 123 | Thread.currentThread().setContextClassLoader(oldClassLoader); 124 | } 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /src/test/java/io/cdap/plugin/cdc/source/oracle/GoldenGateKafkaConfigUnitTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 
15 | */ 16 | 17 | package io.cdap.plugin.cdc.source.oracle; 18 | 19 | import io.cdap.cdap.etl.api.validation.InvalidConfigPropertyException; 20 | import io.cdap.plugin.common.Constants; 21 | import org.junit.Assert; 22 | import org.junit.Test; 23 | 24 | public class GoldenGateKafkaConfigUnitTest { 25 | private static final String VALID_REF = "test-ref"; 26 | private static final String VALID_BROKER = "localhost:9092"; 27 | private static final String VALID_TOPIC = "topic1"; 28 | private static final Long VALID_DEFAULT_INITIAL_OFFSET = 0L; 29 | private static final Integer VALID_MAX_RATE_PER_PARTITION = 0; 30 | 31 | @Test 32 | public void testValidateValidConfig() { 33 | GoldenGateKafkaConfig config = new GoldenGateKafkaConfig( 34 | VALID_REF, 35 | VALID_BROKER, 36 | VALID_TOPIC, 37 | VALID_DEFAULT_INITIAL_OFFSET, 38 | VALID_MAX_RATE_PER_PARTITION 39 | ); 40 | 41 | config.validate(); 42 | } 43 | 44 | @Test 45 | public void testValidateReference() { 46 | GoldenGateKafkaConfig config = new GoldenGateKafkaConfig( 47 | "", 48 | VALID_BROKER, 49 | VALID_TOPIC, 50 | VALID_DEFAULT_INITIAL_OFFSET, 51 | VALID_MAX_RATE_PER_PARTITION 52 | ); 53 | 54 | try { 55 | config.validate(); 56 | Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName())); 57 | } catch (InvalidConfigPropertyException e) { 58 | Assert.assertEquals(Constants.Reference.REFERENCE_NAME, e.getProperty()); 59 | } 60 | } 61 | 62 | @Test 63 | public void testValidateMissingBroker() { 64 | GoldenGateKafkaConfig config = new GoldenGateKafkaConfig( 65 | VALID_REF, 66 | null, 67 | VALID_TOPIC, 68 | VALID_DEFAULT_INITIAL_OFFSET, 69 | VALID_MAX_RATE_PER_PARTITION 70 | ); 71 | 72 | try { 73 | config.validate(); 74 | Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName())); 75 | } catch (InvalidConfigPropertyException e) { 76 | Assert.assertEquals(GoldenGateKafkaConfig.BROKER, e.getProperty()); 77 | } 78 | } 79 | 80 | @Test 81 | public void testValidateEmptyBroker() { 82 | GoldenGateKafkaConfig config = new GoldenGateKafkaConfig( 83 | VALID_REF, 84 | "", 85 | VALID_TOPIC, 86 | VALID_DEFAULT_INITIAL_OFFSET, 87 | VALID_MAX_RATE_PER_PARTITION 88 | ); 89 | 90 | try { 91 | config.validate(); 92 | Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName())); 93 | } catch (InvalidConfigPropertyException e) { 94 | Assert.assertEquals(GoldenGateKafkaConfig.BROKER, e.getProperty()); 95 | } 96 | } 97 | 98 | @Test 99 | public void testValidateWronglyFormattedBroker() { 100 | GoldenGateKafkaConfig config = new GoldenGateKafkaConfig( 101 | VALID_REF, 102 | "localhost", 103 | VALID_TOPIC, 104 | VALID_DEFAULT_INITIAL_OFFSET, 105 | VALID_MAX_RATE_PER_PARTITION 106 | ); 107 | 108 | try { 109 | config.validate(); 110 | Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName())); 111 | } catch (InvalidConfigPropertyException e) { 112 | Assert.assertEquals(GoldenGateKafkaConfig.BROKER, e.getProperty()); 113 | } 114 | } 115 | 116 | @Test 117 | public void testValidateMissingTopic() { 118 | GoldenGateKafkaConfig config = new GoldenGateKafkaConfig( 119 | VALID_REF, 120 | VALID_BROKER, 121 | null, 122 | VALID_DEFAULT_INITIAL_OFFSET, 123 | VALID_MAX_RATE_PER_PARTITION 124 | ); 125 | 126 | try { 127 | config.validate(); 128 | Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName())); 129 | } catch (InvalidConfigPropertyException e) { 130 | 
Assert.assertEquals(GoldenGateKafkaConfig.TOPIC, e.getProperty()); 131 | } 132 | } 133 | 134 | @Test 135 | public void testValidateEmptyTopic() { 136 | GoldenGateKafkaConfig config = new GoldenGateKafkaConfig( 137 | VALID_REF, 138 | VALID_BROKER, 139 | "", 140 | VALID_DEFAULT_INITIAL_OFFSET, 141 | VALID_MAX_RATE_PER_PARTITION 142 | ); 143 | 144 | try { 145 | config.validate(); 146 | Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName())); 147 | } catch (InvalidConfigPropertyException e) { 148 | Assert.assertEquals(GoldenGateKafkaConfig.TOPIC, e.getProperty()); 149 | } 150 | } 151 | 152 | @Test 153 | public void testValidateDefaultInitialOffset() { 154 | GoldenGateKafkaConfig config = new GoldenGateKafkaConfig( 155 | VALID_REF, 156 | VALID_BROKER, 157 | VALID_TOPIC, 158 | -3L, 159 | VALID_MAX_RATE_PER_PARTITION 160 | ); 161 | 162 | try { 163 | config.validate(); 164 | Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName())); 165 | } catch (InvalidConfigPropertyException e) { 166 | Assert.assertEquals(GoldenGateKafkaConfig.DEFAULT_INITIAL_OFFSET, e.getProperty()); 167 | } 168 | } 169 | 170 | @Test 171 | public void testValidateMaxRatePerPartition() { 172 | GoldenGateKafkaConfig config = new GoldenGateKafkaConfig( 173 | VALID_REF, 174 | VALID_BROKER, 175 | VALID_TOPIC, 176 | VALID_DEFAULT_INITIAL_OFFSET, 177 | -1 178 | ); 179 | 180 | try { 181 | config.validate(); 182 | Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName())); 183 | } catch (InvalidConfigPropertyException e) { 184 | Assert.assertEquals(GoldenGateKafkaConfig.MAX_RATE_PER_PARTITION, e.getProperty()); 185 | } 186 | } 187 | } 188 | -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/sink/CDCKuduConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package io.cdap.plugin.cdc.sink; 18 | 19 | import io.cdap.cdap.api.annotation.Description; 20 | import io.cdap.cdap.api.annotation.Macro; 21 | import io.cdap.cdap.api.annotation.Name; 22 | import io.cdap.plugin.cdc.common.CDCReferencePluginConfig; 23 | import org.apache.kudu.ColumnSchema; 24 | 25 | import javax.annotation.Nullable; 26 | 27 | /** 28 | * Configurations for the Kudu. 29 | */ 30 | public class CDCKuduConfig extends CDCReferencePluginConfig { 31 | 32 | // Required Fields. 33 | 34 | @Name("master") 35 | @Description("Comma-separated list of hostname:port for Kudu masters") 36 | @Macro 37 | public String optMasterAddresses; 38 | 39 | // Options Fields 40 | @Name("opt-timeout") 41 | @Description("Timeout for Kudu operations in milliseconds. 
Default is '30000 ms'.") 42 | @Nullable 43 | public String optOperationTimeoutMs; 44 | 45 | @Name("admin-timeout") 46 | @Description("Administration operation timeout. Default is '30000 ms'.") 47 | @Nullable 48 | public String optAdminTimeoutMs; 49 | 50 | @Name("seed") 51 | @Description("Seed to be used for hashing. Default is 0") 52 | @Nullable 53 | public String optSeed; 54 | 55 | @Name("replicas") 56 | @Description("Specifies the number of replicas for the Kudu tables") 57 | @Nullable 58 | public String optReplicas; 59 | 60 | @Name("compression-algo") 61 | @Description("Compression algorithm to be applied on the columns. Default is 'snappy'") 62 | @Nullable 63 | public String optCompressionAlgorithm; 64 | 65 | @Name("encoding") 66 | @Description("Specifies the encoding to be applied on the schema. Default is 'auto'") 67 | @Nullable 68 | public String optEncoding; 69 | 70 | @Name("row-flush") 71 | @Description("Number of rows that are buffered before flushing to the tablet server") 72 | @Nullable 73 | public String optFlushRows; 74 | 75 | @Name("buckets") 76 | @Description("Specifies the number of buckets to split the table into.") 77 | @Nullable 78 | public String optBucketsCounts; 79 | 80 | @Name("boss-threads") 81 | @Description("Specifies the number of boss threads to be used by the client.") 82 | @Nullable 83 | private String optBossThreads; 84 | 85 | public CDCKuduConfig(ColumnSchema.CompressionAlgorithm compression) { 86 | this("kudu"); 87 | } 88 | 89 | public CDCKuduConfig(String referenceName) { 90 | super(referenceName); 91 | } 92 | 93 | /** 94 | * @return cleaned up master address. 95 | */ 96 | public String getMasterAddress() { 97 | return optMasterAddresses.trim(); 98 | } 99 | 100 | /** 101 | * @return Compression algorithm to be associated with all the fields. 102 | */ 103 | public ColumnSchema.CompressionAlgorithm getCompression() { 104 | ColumnSchema.CompressionAlgorithm algorithm = ColumnSchema.CompressionAlgorithm.SNAPPY; 105 | 106 | switch(optCompressionAlgorithm.toLowerCase()) { 107 | case "snappy": 108 | algorithm = ColumnSchema.CompressionAlgorithm.SNAPPY; 109 | break; 110 | 111 | case "lz4": 112 | algorithm = ColumnSchema.CompressionAlgorithm.LZ4; 113 | break; 114 | 115 | case "zlib": 116 | algorithm = ColumnSchema.CompressionAlgorithm.ZLIB; 117 | break; 118 | 119 | case "backend configured": 120 | algorithm = ColumnSchema.CompressionAlgorithm.DEFAULT_COMPRESSION; 121 | break; 122 | 123 | case "no compression": 124 | algorithm = ColumnSchema.CompressionAlgorithm.NO_COMPRESSION; 125 | break; 126 | } 127 | return algorithm; 128 | } 129 | 130 | /** 131 | * @return Encoding to be applied to all the columns. 
132 | */ 133 | public ColumnSchema.Encoding getEncoding() { 134 | ColumnSchema.Encoding encoding = ColumnSchema.Encoding.AUTO_ENCODING; 135 | switch(optEncoding.toLowerCase()) { 136 | case "auto": 137 | encoding = ColumnSchema.Encoding.AUTO_ENCODING; 138 | break; 139 | 140 | case "plain": 141 | encoding = ColumnSchema.Encoding.PLAIN_ENCODING; 142 | break; 143 | 144 | case "prefix": 145 | encoding = ColumnSchema.Encoding.PREFIX_ENCODING; 146 | break; 147 | 148 | case "group variant": 149 | encoding = ColumnSchema.Encoding.GROUP_VARINT; 150 | break; 151 | 152 | case "rle": 153 | encoding = ColumnSchema.Encoding.RLE; 154 | break; 155 | 156 | case "dictionary": 157 | encoding = ColumnSchema.Encoding.DICT_ENCODING; 158 | break; 159 | 160 | case "bit shuffle": 161 | encoding = ColumnSchema.Encoding.BIT_SHUFFLE; 162 | break; 163 | } 164 | return encoding; 165 | } 166 | 167 | /** 168 | * @return Number of replicas of a table on tablet servers. 169 | */ 170 | public int getReplicas() { 171 | return (optReplicas != null) ? Integer.parseInt(optReplicas) : 1; 172 | } 173 | 174 | /** 175 | * @return Timeout for user operations. 176 | */ 177 | public int getOperationTimeout() { 178 | return (optOperationTimeoutMs != null) ? Integer.parseInt(optOperationTimeoutMs) : 30000; 179 | } 180 | 181 | /** 182 | * @return Number of rows to be cached before being flushed. 183 | */ 184 | public int getCacheRowCount() { 185 | return (optFlushRows != null) ? Integer.parseInt(optFlushRows) : 30000; 186 | } 187 | 188 | /** 189 | * @return Timeout for admin operations. 190 | */ 191 | public int getAdministrationTimeout() { 192 | return (optAdminTimeoutMs != null) ? Integer.parseInt(optAdminTimeoutMs) : 30000; 193 | } 194 | 195 | /** 196 | * @return Number of buckets to be used for storing the rows. 197 | */ 198 | public int getBuckets() { 199 | return (optBucketsCounts != null) ? Integer.parseInt(optBucketsCounts) : 16; 200 | } 201 | 202 | /** 203 | * @return Seed to be used for randomizing rows into hashed buckets. 204 | */ 205 | public int getSeed() { 206 | return (optSeed != null) ? Integer.parseInt(optSeed) : 0; 207 | } 208 | 209 | /** 210 | * @return Number of boss threads to be used. 211 | */ 212 | public int getThreads() { 213 | return (optBossThreads != null) ? Integer.parseInt(optBossThreads) : 1; 214 | } 215 | } 216 | -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/sink/CDCTableUtil.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 
15 | */ 16 | 17 | package io.cdap.plugin.cdc.sink; 18 | 19 | import com.google.common.base.Preconditions; 20 | import io.cdap.cdap.api.common.Bytes; 21 | import io.cdap.cdap.api.data.format.StructuredRecord; 22 | import io.cdap.cdap.api.data.schema.Schema; 23 | import io.cdap.plugin.cdc.common.OperationType; 24 | import io.cdap.plugin.cdc.common.Schemas; 25 | import org.apache.hadoop.hbase.HColumnDescriptor; 26 | import org.apache.hadoop.hbase.HTableDescriptor; 27 | import org.apache.hadoop.hbase.TableName; 28 | import org.apache.hadoop.hbase.client.Admin; 29 | import org.apache.hadoop.hbase.client.Delete; 30 | import org.apache.hadoop.hbase.client.Put; 31 | import org.apache.hadoop.hbase.client.Table; 32 | import org.slf4j.Logger; 33 | import org.slf4j.LoggerFactory; 34 | 35 | import java.io.IOException; 36 | import java.nio.ByteBuffer; 37 | import java.util.List; 38 | import java.util.Map; 39 | import java.util.stream.Collectors; 40 | import javax.annotation.Nullable; 41 | 42 | /** 43 | * Utility methods for dealing with Tables, for CDC use cases. 44 | */ 45 | public class CDCTableUtil { 46 | 47 | private static final Logger LOG = LoggerFactory.getLogger(CDCTableUtil.class); 48 | 49 | public static final String CDC_COLUMN_FAMILY = "cdc"; 50 | 51 | /** 52 | * Creates a table using the HBase Admin API. 53 | * 54 | * @param admin the HBase Admin to use to create the table 55 | * @param tableName the name of the table 56 | */ 57 | public static void createHBaseTable(Admin admin, String tableName) throws IOException { 58 | if (!admin.tableExists(TableName.valueOf(tableName))) { 59 | HTableDescriptor descriptor = new HTableDescriptor(TableName.valueOf(tableName)); 60 | descriptor.addFamily(new HColumnDescriptor(CDC_COLUMN_FAMILY)); 61 | LOG.debug("Creating HBase table {}.", tableName); 62 | admin.createTable(descriptor); 63 | } 64 | } 65 | 66 | /** 67 | * Updates an HBase API Table with a CDC record. 
68 | * 69 | * @param table the HBase API Table to update 70 | * @param dmlRecord the StructuredRecord containing the CDC data 71 | */ 72 | public static void updateHBaseTable(Table table, StructuredRecord dmlRecord) throws Exception { 73 | OperationType operationType = OperationType.valueOf(dmlRecord.get(Schemas.OP_TYPE_FIELD)); 74 | List primaryKeys = dmlRecord.get(Schemas.PRIMARY_KEYS_FIELD); 75 | Schema updateSchema = Schema.parseJson((String) dmlRecord.get(Schemas.UPDATE_SCHEMA_FIELD)); 76 | Map changes = dmlRecord.get(Schemas.UPDATE_VALUES_FIELD); 77 | 78 | switch (operationType) { 79 | case INSERT: 80 | case UPDATE: 81 | Put put = new Put(getRowKey(primaryKeys, changes)); 82 | for (Schema.Field field : updateSchema.getFields()) { 83 | setPutField(put, field, changes.get(field.getName())); 84 | } 85 | table.put(put); 86 | LOG.debug("Putting row {}", Bytes.toString(getRowKey(primaryKeys, changes))); 87 | break; 88 | case DELETE: 89 | Delete delete = new Delete(getRowKey(primaryKeys, changes)); 90 | table.delete(delete); 91 | LOG.debug("Deleting row {}", Bytes.toString(getRowKey(primaryKeys, changes))); 92 | break; 93 | default: 94 | LOG.warn("Operation of type '{}' will be ignored.", operationType); 95 | } 96 | } 97 | 98 | private static byte[] getRowKey(List primaryKeys, Map changes) { 99 | // the primary keys are always in sorted order 100 | String joinedValue = primaryKeys.stream() 101 | .sorted() 102 | .map(primaryKey -> changes.get(primaryKey).toString()) 103 | .collect(Collectors.joining(":")); 104 | return Bytes.toBytes(joinedValue); 105 | } 106 | 107 | // get the non-nullable type of the field and check that it's a simple type. 108 | private static Schema.Type validateAndGetType(Schema.Field field) { 109 | Schema.Type type; 110 | if (field.getSchema().isNullable()) { 111 | type = field.getSchema().getNonNullable().getType(); 112 | } else { 113 | type = field.getSchema().getType(); 114 | } 115 | Preconditions.checkArgument(type.isSimpleType(), 116 | "only simple types are supported (boolean, int, long, float, double, bytes)."); 117 | return type; 118 | } 119 | 120 | private static void setPutField(Put put, Schema.Field field, @Nullable Object val) { 121 | Schema.Type type = validateAndGetType(field); 122 | String column = field.getName(); 123 | if (val == null) { 124 | put.addColumn(Bytes.toBytes(CDC_COLUMN_FAMILY), Bytes.toBytes(column), null); 125 | return; 126 | } 127 | 128 | switch (type) { 129 | case BOOLEAN: 130 | put.addColumn(Bytes.toBytes(CDC_COLUMN_FAMILY), Bytes.toBytes(column), Bytes.toBytes((Boolean) val)); 131 | break; 132 | case INT: 133 | put.addColumn(Bytes.toBytes(CDC_COLUMN_FAMILY), Bytes.toBytes(column), 134 | Bytes.toBytes(((Number) val).intValue())); 135 | break; 136 | case LONG: 137 | put.addColumn(Bytes.toBytes(CDC_COLUMN_FAMILY), Bytes.toBytes(column), 138 | Bytes.toBytes(((Number) val).longValue())); 139 | break; 140 | case FLOAT: 141 | put.addColumn(Bytes.toBytes(CDC_COLUMN_FAMILY), Bytes.toBytes(column), 142 | Bytes.toBytes(((Number) val).floatValue())); 143 | break; 144 | case DOUBLE: 145 | put.addColumn(Bytes.toBytes(CDC_COLUMN_FAMILY), Bytes.toBytes(column), 146 | Bytes.toBytes(((Number) val).doubleValue())); 147 | break; 148 | case BYTES: 149 | if (val instanceof ByteBuffer) { 150 | put.addColumn(Bytes.toBytes(CDC_COLUMN_FAMILY), Bytes.toBytes(column), Bytes.toBytes((ByteBuffer) val)); 151 | } else { 152 | put.addColumn(Bytes.toBytes(CDC_COLUMN_FAMILY), Bytes.toBytes(column), (byte[]) val); 153 | } 154 | break; 155 | case STRING: 156 | 
put.addColumn(Bytes.toBytes(CDC_COLUMN_FAMILY), Bytes.toBytes(column), Bytes.toBytes((String) val)); 157 | break; 158 | default: 159 | throw new IllegalArgumentException("Field " + field.getName() + " is of unsupported type " + type); 160 | } 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/source/sqlserver/CTSQLServerConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package io.cdap.plugin.cdc.source.sqlserver; 18 | 19 | import io.cdap.cdap.api.annotation.Description; 20 | import io.cdap.cdap.api.annotation.Name; 21 | import io.cdap.cdap.api.plugin.PluginConfig; 22 | import io.cdap.cdap.etl.api.validation.InvalidConfigPropertyException; 23 | import io.cdap.plugin.cdc.common.CDCReferencePluginConfig; 24 | 25 | import java.util.Arrays; 26 | import java.util.Collections; 27 | import java.util.Set; 28 | import java.util.stream.Collectors; 29 | import javax.annotation.Nullable; 30 | 31 | /** 32 | * Defines the {@link PluginConfig} for the {@link CTSQLServer}. 33 | */ 34 | public class CTSQLServerConfig extends CDCReferencePluginConfig { 35 | 36 | public static final String HOST_NAME = "hostname"; 37 | public static final String PORT = "port"; 38 | public static final String USERNAME = "username"; 39 | public static final String PASSWORD = "password"; 40 | public static final String DATABASE_NAME = "dbname"; 41 | public static final String SEQUENCE_START_NUM = "sequenceStartNum"; 42 | public static final String MAX_RETRY_SECONDS = "maxRetrySeconds"; 43 | public static final String MAX_BATCH_SIZE = "maxBatchSize"; 44 | public static final String TABLE_WHITELIST = "tableWhitelist"; 45 | public static final String JDBC_PLUGIN_NAME = "jdbcPluginName"; 46 | public static final String CONNECTION_STRING = "connectionString"; 47 | 48 | @Name(HOST_NAME) 49 | @Description("SQL Server hostname. This is not required if a connection string was specified.") 50 | @Nullable 51 | private final String hostname; 52 | 53 | @Name(PORT) 54 | @Description("SQL Server port. This is not required if a connection string was specified.") 55 | @Nullable 56 | private final Integer port; 57 | 58 | @Name(DATABASE_NAME) 59 | @Description("SQL Server database name. Note: CT must be enabled on the database for change tracking.") 60 | @Nullable 61 | private String dbName; 62 | 63 | @Name(USERNAME) 64 | @Description("User to use to connect to the specified database. Required for databases that " + 65 | "need authentication. Optional for databases that do not require authentication.") 66 | @Nullable 67 | private final String username; 68 | 69 | @Name(PASSWORD) 70 | @Description("Password to use to connect to the specified database. Required for databases that " + 71 | "need authentication. 
Optional for databases that do not require authentication.") 72 | @Nullable 73 | private final String password; 74 | 75 | @Name(MAX_RETRY_SECONDS) 76 | @Description("Maximum amount of time to retry reading change events if there is an error. " 77 | + "If no retries should be done, this should be set to 0. " 78 | + "If there should not be a retry limit, this should be set to a negative number or left empty.") 79 | @Nullable 80 | private final Long maxRetrySeconds; 81 | 82 | @Name(SEQUENCE_START_NUM) 83 | @Description("The Change Tracking sequence number to start from.") 84 | @Nullable 85 | private final Long sequenceStartNum; 86 | 87 | @Name(MAX_BATCH_SIZE) 88 | @Description("Maximum number of changes to consume in a single batch interval.") 89 | @Nullable 90 | private final Integer maxBatchSize; 91 | 92 | @Name(TABLE_WHITELIST) 93 | @Description("A whitelist of tables to consume changes from. " 94 | + "If none is specified, changes from all tables will be consumed.") 95 | @Nullable 96 | private final String tableWhitelist; 97 | 98 | @Description("Name of the JDBC plugin to use if something different than the built-in sql server driver is required.") 99 | @Nullable 100 | private final String jdbcPluginName; 101 | 102 | @Description("Connection string to use when connecting to the database through JDBC. " 103 | + "This is required if a JDBC plugin was specified.") 104 | @Nullable 105 | private final String connectionString; 106 | 107 | public CTSQLServerConfig() { 108 | super(""); 109 | this.hostname = null; 110 | this.port = 1433; 111 | this.dbName = null; 112 | this.username = null; 113 | this.password = null; 114 | this.sequenceStartNum = 0L; 115 | this.maxRetrySeconds = -1L; 116 | this.maxBatchSize = 100000; 117 | this.tableWhitelist = null; 118 | this.jdbcPluginName = null; 119 | this.connectionString = null; 120 | } 121 | 122 | public String getHostname() { 123 | return hostname; 124 | } 125 | 126 | public int getPort() { 127 | return port; 128 | } 129 | 130 | public String getDbName() { 131 | return dbName; 132 | } 133 | 134 | @Nullable 135 | public String getUsername() { 136 | return username; 137 | } 138 | 139 | @Nullable 140 | public String getPassword() { 141 | return password; 142 | } 143 | 144 | public long getSequenceStartNum() { 145 | return sequenceStartNum == null ? 0L : sequenceStartNum; 146 | } 147 | 148 | public long getMaxRetrySeconds() { 149 | return maxRetrySeconds == null ? -1L : maxRetrySeconds; 150 | } 151 | 152 | public int getMaxBatchSize() { 153 | return maxBatchSize == null ? 100000 : maxBatchSize; 154 | } 155 | 156 | public Set getTableWhitelist() { 157 | return tableWhitelist == null ? 
Collections.emptySet() : 158 | Arrays.stream(tableWhitelist.split(",")).map(String::trim).collect(Collectors.toSet()); 159 | } 160 | 161 | @Nullable 162 | public String getJdbcPluginName() { 163 | return jdbcPluginName; 164 | } 165 | 166 | public String getConnectionString() { 167 | if (connectionString != null) { 168 | return connectionString; 169 | } 170 | return String.format("jdbc:sqlserver://%s:%s;DatabaseName=%s", hostname, port, dbName); 171 | } 172 | 173 | @Override 174 | public void validate() { 175 | super.validate(); 176 | if (jdbcPluginName != null && connectionString == null) { 177 | throw new InvalidConfigPropertyException( 178 | "A connection string must be specified when a custom jdbc driver is used.", CONNECTION_STRING); 179 | } 180 | 181 | if (dbName == null) { 182 | throw new InvalidConfigPropertyException("A database name must be specified", DATABASE_NAME); 183 | } 184 | 185 | if (connectionString == null) { 186 | if (hostname == null) { 187 | throw new InvalidConfigPropertyException("A hostname must be specified", HOST_NAME); 188 | } 189 | if (port == null) { 190 | throw new InvalidConfigPropertyException("A port must be specified", PORT); 191 | } 192 | } 193 | 194 | if (port != null && (port < 0 || port > 65535)) { 195 | throw new InvalidConfigPropertyException("Port number should be in range 0-65535", PORT); 196 | } 197 | } 198 | } 199 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 2 | [![Join CDAP community](https://cdap-users.herokuapp.com/badge.svg?t=wrangler)](https://cdap-users.herokuapp.com?t=1) 3 | 4 | Change Data Capture (Alpha) 5 | =========================== 6 | 7 | In databases, Change Data Capture (CDC) is used to determine and track the data that has changed so that 8 | action can be taken using the changed data. This repository contains CDAP plugins that capture 9 | changes from databases such as Oracle and Microsoft SQL Server and push those changes in real time 10 | to sinks such as Kudu, HBase, and Google Cloud Bigtable. 11 | 12 | * [Oracle](docs/oracle/Oracle.md) 13 | * [Microsoft SQL Server](docs/CTSQLServer.md) 14 | 15 | # Overview 16 | 17 | The following plugins are available in this repository. 18 | 19 | * [Google Cloud BigTable Sink](docs/CDCBigTable-sparksink.md) 20 | * [HBase Sink](docs/CDCHBase-sparksink.md) 21 | * Kudu Sink 22 | * [Golden Gate Kafka Source](docs/oracle/Oracle.md) 23 | * [SQL Server Change Tracking Streaming Source](docs/CTSQLServer.md) 24 | 25 | # Development 26 | 27 | ## Run Integration Tests 28 | It is possible to run integration tests against **local** (see [Setup Local Environment](#setup-local-environment)) 29 | or **remote environment**. 30 | 31 | Run tests against **local** environment: 32 | ```bash 33 | mvn clean test 34 | ``` 35 | 36 | Run tests against **remote** environment: 37 | ```bash 38 | mvn clean test -DinstanceUri= 39 | ``` 40 | 41 | To use **remote environment** you may configure the following system properties: 42 | * **test.sql-server.host** - SQL Server host. Default: localhost. 43 | * **test.sql-server.port** - SQL Server port. Default: 1433. 44 | * **test.sql-server.username** - SQL Server username. This user should have permissions to create databases. 45 | Default: SA. 46 | * **test.sql-server.password** - SQL Server password. Default: 123Qwe123. 
47 | * **test.sql-server.namespace** - SQL Server namespace for test databases. Default: dbo. 48 | * **test.bigtable.project** - Google Cloud Project ID. Default: lookup from local environment. 49 | * **test.bigtable.instance** - Bigtable Instance ID. Default: null. 50 | * **test.bigtable.serviceFilePath** - Path on the local file system of the service account key used for 51 | authorization. Default: lookup from local environment. 52 | * **test.oracle-db.host** - Oracle DB host. Default: localhost. 53 | * **test.oracle-db.port** - Oracle DB port. Default: 1521. 54 | * **test.oracle-db.service** - Oracle DB service name. Default: XE. 55 | * **test.oracle-db.username** - Oracle DB username. Default: trans_user. 56 | * **test.oracle-db.password** - Oracle DB password. Default: trans_user. 57 | * **test.oracle-db.driver.jar** - Path to Oracle Java Driver jar file. Default: null. 58 | * **test.oracle-db.driver.class** - Oracle Java Driver class name. Default: oracle.jdbc.OracleDriver. 59 | * **test.goldengate.broker** - Kafka broker specified in host:port form. Default: localhost:9092. 60 | * **test.goldengate.topic** - Name of the topic to which Golden Gate publishes the DDL and DML changes. 61 | Default: oggtopic. 62 | 63 | **NOTE:** Bigtable Sink tests will be skipped without provided properties. 64 | **NOTE:** Golden Gate Kafka Source tests will be skipped without provided properties. 65 | 66 | ## Run Performance Tests 67 | It is possible to run performance tests against **local** (see [Setup Local Environment](#setup-local-environment)) 68 | or **remote environment**. 69 | 70 | Run tests against **local** environment: 71 | ```bash 72 | mvn clean test -P perf-tests 73 | ``` 74 | 75 | Run tests against **remote** environment: 76 | ```bash 77 | mvn clean test -P perf-tests -DinstanceUri= 78 | ``` 79 | 80 | Common system properties for tests: 81 | * **ptest.test-data.load** - Prepare and load test data to source storage. Default: true. 82 | * **ptest.test-data.inserts** - Number of records to prepare. Default: 5000. 83 | * **ptest.target-table-created-timeout.seconds** - Timeout for table creation in sink storage. Default: 300. 84 | * **ptest.data-transferred-timeout.seconds** - Timeout for data transfer to target storage. Default: 600. 85 | 86 | To use **remote environment** you may configure the following system properties: 87 | * **ptest.sql-server.host** - SQL Server host. Default: localhost. 88 | * **ptest.sql-server.port** - SQL Server port. Default: 1433. 89 | * **ptest.sql-server.username** - SQL Server username. This user should have permissions to create databases. 90 | Default: SA. 91 | * **ptest.sql-server.password** - SQL Server password. Default: 123Qwe123. 92 | * **ptest.bigtable.project** - Google Cloud Project ID. Default: lookup from local environment. 93 | * **ptest.bigtable.instance** - Bigtable Instance ID. Default: null. 94 | * **ptest.bigtable.serviceFilePath** - Path on the local file system of the service account key used for 95 | authorization. Default: lookup from local environment. 96 | 97 | **NOTE:** Bigtable Sink tests will be skipped without provided properties. 
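
For illustration, a remote performance-test run usually combines the `perf-tests` profile with a handful of the properties listed above. The instance URI, host names, credentials, and key file path in this sketch are placeholders, not defaults shipped with this repository:

```bash
# Placeholder values only: substitute your own CDAP instance URI, SQL Server
# host and credentials, and Bigtable project, instance, and service account key.
mvn clean test -P perf-tests \
  -DinstanceUri=<instance-uri> \
  -Dptest.sql-server.host=sqlserver.example.com \
  -Dptest.sql-server.username=SA \
  -Dptest.sql-server.password=<password> \
  -Dptest.bigtable.project=<gcp-project-id> \
  -Dptest.bigtable.instance=<bigtable-instance-id> \
  -Dptest.bigtable.serviceFilePath=/path/to/service-account.json
```

If the Bigtable properties are omitted, the Bigtable Sink performance tests are skipped, as noted above.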
98 | 99 | ## Setup Local Environment 100 | To start local environment you should: 101 | * [Install Docker Compose](https://docs.docker.com/compose/install/) 102 | * Build local docker images 103 | * [Build Oracle DB docker image](https://github.com/oracle/docker-images/tree/master/OracleDatabase/SingleInstance) 104 | * [Build Oracle GoldenGate docker image](https://github.com/oracle/docker-images/tree/master/OracleGoldenGate) 105 | * Start environment by running commands: 106 | ```bash 107 | cd docker-compose/cdc-env/ 108 | docker-compose up -d 109 | ``` 110 | * Configure GoldenGate for Oracle: 111 | * Start ggsci: 112 | ```bash 113 | docker-compose exec --user oracle goldengate_oracle ggsci 114 | ``` 115 | * Configure user credentials: 116 | ```bash 117 | ADD credentialstore 118 | alter credentialstore add user gg_extract@oracledb:1521/xe password gg_extract alias oggadmin 119 | ``` 120 | * Change source schema configuration: 121 | ```bash 122 | DBLOGIN USERIDALIAS oggadmin 123 | add schematrandata trans_user ALLCOLS 124 | ``` 125 | * Define the Extract and start it 126 | (all EXTRACT params are defined in docker-compose/cdc-env/GoldenGate/dirprm/ext1.prm): 127 | ```bash 128 | ADD EXTRACT ext1, TRANLOG, BEGIN NOW 129 | ADD EXTTRAIL /u01/app/ogg/dirdat/in, EXTRACT ext1 130 | START ext1 131 | ``` 132 | * Check its status: 133 | ```bash 134 | INFO ext1 135 | ``` 136 | * Configure GoldenGate for BigData: 137 | * Start ggsci: 138 | ```bash 139 | docker-compose exec --user oracle goldengate_bigdata ggsci 140 | ``` 141 | * Define the Replicat and start it 142 | (all REPLICAT params are defined in docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/rconf.prm): 143 | ```bash 144 | ADD REPLICAT rconf, EXTTRAIL /u01/app/ogg/dirdat/in 145 | START rconf 146 | ``` 147 | * Check its status: 148 | ```bash 149 | INFO RCONF 150 | ``` 151 | NOTE: More info about *.prm files - https://docs.oracle.com/goldengate/1212/gg-winux/GWURF/gg_parameters.htm#GWURF394 152 | 153 | # Contact 154 | 155 | ## Mailing Lists 156 | 157 | CDAP User Group and Development Discussions: 158 | 159 | * [cdap-user@googlegroups.com](https://groups.google.com/d/forum/cdap-user) 160 | 161 | The *cdap-user* mailing list is primarily for users using the product to develop 162 | applications or building plugins for appplications. You can expect questions from 163 | users, release announcements, and any other discussions that we think will be helpful 164 | to the users. 165 | 166 | # License and Trademarks 167 | 168 | Copyright © 2016-2019 Cask Data, Inc. 169 | 170 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 171 | in compliance with the License. You may obtain a copy of the License at 172 | 173 | http://www.apache.org/licenses/LICENSE-2.0 174 | 175 | Unless required by applicable law or agreed to in writing, software distributed under the 176 | License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 177 | either express or implied. See the License for the specific language governing permissions 178 | and limitations under the License. 179 | 180 | Cask is a trademark of Cask Data, Inc. All rights reserved. 181 | 182 | Apache, Apache HBase, and HBase are trademarks of The Apache Software Foundation. Used with 183 | permission. No endorsement by The Apache Software Foundation is implied by the use of these marks. 
184 | 185 | 186 | 187 | 188 | -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/source/sqlserver/CTSQLServer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package io.cdap.plugin.cdc.source.sqlserver; 18 | 19 | import io.cdap.cdap.api.annotation.Description; 20 | import io.cdap.cdap.api.annotation.Name; 21 | import io.cdap.cdap.api.annotation.Plugin; 22 | import io.cdap.cdap.api.data.format.StructuredRecord; 23 | import io.cdap.cdap.api.dataset.DatasetProperties; 24 | import io.cdap.cdap.api.plugin.PluginProperties; 25 | import io.cdap.cdap.etl.api.PipelineConfigurer; 26 | import io.cdap.cdap.etl.api.streaming.StreamingContext; 27 | import io.cdap.cdap.etl.api.streaming.StreamingSource; 28 | import io.cdap.cdap.etl.api.validation.InvalidConfigPropertyException; 29 | import io.cdap.cdap.etl.api.validation.InvalidStageException; 30 | import io.cdap.plugin.cdc.common.DBUtils; 31 | import io.cdap.plugin.cdc.common.DriverCleanup; 32 | import io.cdap.plugin.cdc.common.Schemas; 33 | import io.cdap.plugin.common.Constants; 34 | import org.apache.spark.api.java.Optional; 35 | import org.apache.spark.api.java.function.Function4; 36 | import org.apache.spark.rdd.JdbcRDD; 37 | import org.apache.spark.streaming.State; 38 | import org.apache.spark.streaming.StateSpec; 39 | import org.apache.spark.streaming.Time; 40 | import org.apache.spark.streaming.api.java.JavaDStream; 41 | import org.slf4j.Logger; 42 | import org.slf4j.LoggerFactory; 43 | import scala.Tuple2; 44 | import scala.reflect.ClassTag; 45 | import scala.reflect.ClassTag$; 46 | 47 | import java.sql.Connection; 48 | import java.sql.Driver; 49 | import java.sql.DriverManager; 50 | import java.sql.PreparedStatement; 51 | import java.sql.ResultSet; 52 | import java.sql.SQLException; 53 | import java.util.HashMap; 54 | import java.util.Map; 55 | 56 | /** 57 | * Streaming source for reading changes from SQL Server. 
58 | */ 59 | @Plugin(type = StreamingSource.PLUGIN_TYPE) 60 | @Name("CTSQLServer") 61 | @Description("SQL Server Change Tracking Streaming Source") 62 | public class CTSQLServer extends StreamingSource { 63 | private static final Logger LOG = LoggerFactory.getLogger(CTSQLServer.class); 64 | static final String JDBC_PLUGIN_ID = "jdbc"; 65 | private final CTSQLServerConfig conf; 66 | 67 | public CTSQLServer(CTSQLServerConfig conf) { 68 | this.conf = conf; 69 | } 70 | 71 | @Override 72 | public void configurePipeline(PipelineConfigurer pipelineConfigurer) { 73 | conf.validate(); 74 | pipelineConfigurer.createDataset(conf.referenceName, Constants.EXTERNAL_DATASET_TYPE, DatasetProperties.EMPTY); 75 | pipelineConfigurer.getStageConfigurer().setOutputSchema(Schemas.CHANGE_SCHEMA); 76 | 77 | DriverCleanup driverCleanup = null; 78 | JdbcRDD.ConnectionFactory connectionFactory; 79 | if (conf.getJdbcPluginName() != null) { 80 | Class driverClass = pipelineConfigurer.usePluginClass("jdbc", conf.getJdbcPluginName(), 81 | JDBC_PLUGIN_ID, 82 | PluginProperties.builder().build()); 83 | if (driverClass == null) { 84 | throw new InvalidConfigPropertyException("Unable to find jdbc driver plugin", 85 | CTSQLServerConfig.JDBC_PLUGIN_NAME); 86 | } 87 | try { 88 | driverCleanup = DBUtils.ensureJDBCDriverIsAvailable(driverClass, conf.getConnectionString()); 89 | } catch (IllegalAccessException | InstantiationException | SQLException e) { 90 | throw new IllegalArgumentException("Unable to instantiate jdbc driver plugin: " + e.getMessage(), e); 91 | } 92 | connectionFactory = (JdbcRDD.ConnectionFactory) () -> DriverManager.getConnection(conf.getConnectionString(), 93 | conf.getUsername(), 94 | conf.getPassword()); 95 | } else { 96 | connectionFactory = new SQLServerConnectionFactory(conf.getConnectionString(), 97 | conf.getUsername(), conf.getPassword()); 98 | } 99 | 100 | if (conf.getUsername() != null && conf.getPassword() != null) { 101 | LOG.info("Creating connection with url {}, username {}, password *****", 102 | getConnectionString(), conf.getUsername()); 103 | } else { 104 | LOG.info("Creating connection with url {}", getConnectionString()); 105 | } 106 | 107 | try (Connection connection = connectionFactory.getConnection()) { 108 | // check that CDC is enabled on the database 109 | checkDBCTEnabled(connection, conf.getDbName()); 110 | } catch (InvalidStageException e) { 111 | // rethrow validation exception 112 | throw e; 113 | } catch (Exception e) { 114 | throw new InvalidStageException(String.format("Failed to check tracking status. Error: %s", e.getMessage()), e); 115 | } finally { 116 | if (driverCleanup != null) { 117 | driverCleanup.destroy(); 118 | } 119 | } 120 | } 121 | 122 | @Override 123 | public JavaDStream getStream(StreamingContext context) throws Exception { 124 | context.registerLineage(conf.referenceName); 125 | 126 | 127 | JdbcRDD.ConnectionFactory connectionFactory; 128 | if (conf.getJdbcPluginName() != null) { 129 | connectionFactory = new PluginConnectionFactory(context.getSparkExecutionContext().getPluginContext(), 130 | context.getStageName(), conf.getConnectionString()); 131 | } else { 132 | connectionFactory = new SQLServerConnectionFactory(conf.getConnectionString(), 133 | conf.getUsername(), conf.getPassword()); 134 | } 135 | 136 | // get change information dtream. 
This dstream has both schema and data changes 137 | LOG.info("Creating change information dstream"); 138 | ClassTag tag = ClassTag$.MODULE$.apply(StructuredRecord.class); 139 | CTInputDStream dstream = new CTInputDStream(context.getSparkStreamingContext().ssc(), connectionFactory, 140 | conf.getTableWhitelist(), conf.getSequenceStartNum(), 141 | conf.getMaxRetrySeconds(), conf.getMaxBatchSize()); 142 | return JavaDStream.fromDStream(dstream, tag) 143 | .mapToPair(structuredRecord -> new Tuple2<>("", structuredRecord)) 144 | // map the dstream with schema state store to detect changes in schema 145 | // filter out the ddl record whose schema hasn't changed and then drop all the keys 146 | .mapWithState(StateSpec.function(schemaStateFunction())) 147 | .map(Schemas::toCDCRecord); 148 | } 149 | 150 | private void checkDBCTEnabled(Connection connection, String dbName) throws SQLException { 151 | String query = "SELECT * FROM sys.change_tracking_databases WHERE database_id=DB_ID(?)"; 152 | try (PreparedStatement preparedStatement = connection.prepareStatement(query)) { 153 | preparedStatement.setString(1, dbName); 154 | try (ResultSet resultSet = preparedStatement.executeQuery()) { 155 | if (resultSet.next()) { 156 | // if resultset is not empty it means that our select with where clause returned data meaning ct is enabled. 157 | return; 158 | } 159 | } 160 | } 161 | throw new InvalidStageException(String.format("Change Tracking is not enabled on the specified database '%s'." + 162 | " Please enable it first.", dbName)); 163 | } 164 | 165 | private String getConnectionString() { 166 | return String.format("jdbc:sqlserver://%s:%s;DatabaseName=%s", conf.getHostname(), conf.getPort(), 167 | conf.getDbName()); 168 | } 169 | 170 | private static Function4, State>, 171 | Optional> schemaStateFunction() { 172 | return (time, key, value, state) -> { 173 | if (!value.isPresent()) { 174 | return Optional.empty(); 175 | } 176 | StructuredRecord input = value.get(); 177 | // for dml record we don't need to maintain any state so skip it 178 | if (Schemas.DML_SCHEMA.getRecordName().equals(input.getSchema().getRecordName())) { 179 | return Optional.of(input); 180 | } 181 | 182 | // we know now that its a ddl record so process it 183 | String tableName = input.get(Schemas.TABLE_FIELD); 184 | String tableSchemaStructure = input.get(Schemas.SCHEMA_FIELD); 185 | Map newState; 186 | if (state.exists()) { 187 | newState = state.get(); 188 | if (newState.containsKey(tableName) && newState.get(tableName).equals(tableSchemaStructure)) { 189 | // schema hasn't changed so emit with false so that we can later filter this record out 190 | return Optional.empty(); 191 | } 192 | } else { 193 | newState = new HashMap<>(); 194 | } 195 | // update the state 196 | newState.put(tableName, tableSchemaStructure); 197 | state.update(newState); 198 | LOG.debug("Update schema state store for table {}. New schema will be emitted.", tableName); 199 | return Optional.of(input); 200 | }; 201 | } 202 | } 203 | -------------------------------------------------------------------------------- /docker-compose/cdc-env/Oracle/dbca.rsp.tmpl: -------------------------------------------------------------------------------- 1 | ############################################################################## 2 | ## ## 3 | ## DBCA response file ## 4 | ## ------------------ ## 5 | ## Copyright(c) Oracle Corporation 1998,2017. All rights reserved. ## 6 | ## ## 7 | ## Specify values for the variables listed below to customize ## 8 | ## your installation. 
## 9 | ## ## 10 | ## Each variable is associated with a comment. The comment ## 11 | ## can help to populate the variables with the appropriate ## 12 | ## values. ## 13 | ## ## 14 | ## IMPORTANT NOTE: This file contains plain text passwords and ## 15 | ## should be secured to have read permission only by oracle user ## 16 | ## or db administrator who owns this installation. ## 17 | ############################################################################## 18 | #------------------------------------------------------------------------------- 19 | # Do not change the following system generated value. 20 | #------------------------------------------------------------------------------- 21 | responseFileVersion=/oracle/assistants/rspfmt_dbca_response_schema_v18.0.0 22 | 23 | #----------------------------------------------------------------------------- 24 | # Name : gdbName 25 | # Datatype : String 26 | # Description : Global database name of the database 27 | # Valid values : . - when database domain isn't NULL 28 | # - when database domain is NULL 29 | # Default value : None 30 | # Mandatory : Yes 31 | #----------------------------------------------------------------------------- 32 | gdbName=###ORACLE_SID### 33 | 34 | #----------------------------------------------------------------------------- 35 | # Name : sid 36 | # Datatype : String 37 | # Description : System identifier (SID) of the database 38 | # Valid values : Check Oracle12c Administrator's Guide 39 | # Default value : specified in GDBNAME 40 | # Mandatory : No 41 | #----------------------------------------------------------------------------- 42 | sid=###ORACLE_SID### 43 | 44 | #----------------------------------------------------------------------------- 45 | # Name : createAsContainerDatabase 46 | # Datatype : boolean 47 | # Description : flag to create database as container database 48 | # Valid values : Check Oracle12c Administrator's Guide 49 | # Default value : false 50 | # Mandatory : No 51 | #----------------------------------------------------------------------------- 52 | createAsContainerDatabase=false 53 | 54 | #----------------------------------------------------------------------------- 55 | # Name : numberOfPDBs 56 | # Datatype : Number 57 | # Description : Specify the number of pdb to be created 58 | # Valid values : 0 to 4094 59 | # Default value : 0 60 | # Mandatory : No 61 | #----------------------------------------------------------------------------- 62 | numberOfPDBs=0 63 | 64 | #----------------------------------------------------------------------------- 65 | # Name : pdbName 66 | # Datatype : String 67 | # Description : Specify the pdbname/pdbanme prefix if one or more pdb need to be created 68 | # Valid values : Check Oracle12c Administrator's Guide 69 | # Default value : None 70 | # Mandatory : No 71 | #----------------------------------------------------------------------------- 72 | pdbName=None 73 | 74 | #----------------------------------------------------------------------------- 75 | # Name : pdbAdminPassword 76 | # Datatype : String 77 | # Description : PDB Administrator user password 78 | # Valid values : Check Oracle12c Administrator's Guide 79 | # Default value : None 80 | # Mandatory : No 81 | #----------------------------------------------------------------------------- 82 | pdbAdminPassword=None 83 | 84 | #----------------------------------------------------------------------------- 85 | # Name : templateName 86 | # Datatype : String 87 | # Description : Name of the template 88 | # Valid 
values : Template file name 89 | # Default value : None 90 | # Mandatory : Yes 91 | #----------------------------------------------------------------------------- 92 | templateName=General_Purpose.dbc 93 | 94 | #----------------------------------------------------------------------------- 95 | # Name : sysPassword 96 | # Datatype : String 97 | # Description : Password for SYS user 98 | # Valid values : Check Oracle12c Administrator's Guide 99 | # Default value : None 100 | # Mandatory : Yes 101 | #----------------------------------------------------------------------------- 102 | sysPassword=###ORACLE_PWD### 103 | 104 | #----------------------------------------------------------------------------- 105 | # Name : systemPassword 106 | # Datatype : String 107 | # Description : Password for SYSTEM user 108 | # Valid values : Check Oracle12c Administrator's Guide 109 | # Default value : None 110 | # Mandatory : Yes 111 | #----------------------------------------------------------------------------- 112 | systemPassword=###ORACLE_PWD### 113 | 114 | #----------------------------------------------------------------------------- 115 | # Name : emConfiguration 116 | # Datatype : String 117 | # Description : Enterprise Manager Configuration Type 118 | # Valid values : CENTRAL|DBEXPRESS|BOTH|NONE 119 | # Default value : NONE 120 | # Mandatory : No 121 | #----------------------------------------------------------------------------- 122 | emConfiguration=NONE 123 | 124 | #----------------------------------------------------------------------------- 125 | # Name : emExpressPort 126 | # Datatype : Number 127 | # Description : Enterprise Manager Configuration Type 128 | # Valid values : Check Oracle12c Administrator's Guide 129 | # Default value : NONE 130 | # Mandatory : No, will be picked up from DBEXPRESS_HTTPS_PORT env variable 131 | # or auto generates a free port between 5500 and 5599 132 | #----------------------------------------------------------------------------- 133 | #emExpressPort=NONE 134 | 135 | #----------------------------------------------------------------------------- 136 | # Name : dbsnmpPassword 137 | # Datatype : String 138 | # Description : Password for DBSNMP user 139 | # Valid values : Check Oracle12c Administrator's Guide 140 | # Default value : None 141 | # Mandatory : Yes, if emConfiguration is specified or 142 | # the value of runCVUChecks is TRUE 143 | #----------------------------------------------------------------------------- 144 | dbsnmpPassword=###ORACLE_PWD### 145 | 146 | #----------------------------------------------------------------------------- 147 | # Name : characterSet 148 | # Datatype : String 149 | # Description : Character set of the database 150 | # Valid values : Check Oracle12c National Language Support Guide 151 | # Default value : "US7ASCII" 152 | # Mandatory : NO 153 | #----------------------------------------------------------------------------- 154 | characterSet=###ORACLE_CHARACTERSET### 155 | 156 | #----------------------------------------------------------------------------- 157 | # Name : nationalCharacterSet 158 | # Datatype : String 159 | # Description : National Character set of the database 160 | # Valid values : "UTF8" or "AL16UTF16". 
For details, check Oracle12c National Language Support Guide 161 | # Default value : "AL16UTF16" 162 | # Mandatory : No 163 | #----------------------------------------------------------------------------- 164 | nationalCharacterSet=AL16UTF16 165 | 166 | #----------------------------------------------------------------------------- 167 | # Name : initParams 168 | # Datatype : String 169 | # Description : comma separated list of name=value pairs. Overrides initialization parameters defined in templates 170 | # Default value : None 171 | # Mandatory : NO 172 | #----------------------------------------------------------------------------- 173 | initParams=audit_trail=none,audit_sys_operations=false 174 | 175 | #----------------------------------------------------------------------------- 176 | # Name : listeners 177 | # Datatype : String 178 | # Description : Specifies list of listeners to register the database with. 179 | # By default the database is configured for all the listeners specified in the 180 | # $ORACLE_HOME/network/admin/listener.ora 181 | # Valid values : The list should be comma separated like "listener1,listener2". 182 | # Mandatory : NO 183 | #----------------------------------------------------------------------------- 184 | #listeners=LISTENER 185 | 186 | #----------------------------------------------------------------------------- 187 | # Name : automaticMemoryManagement 188 | # Datatype : Boolean 189 | # Description : flag to indicate Automatic Memory Management is used 190 | # Valid values : TRUE/FALSE 191 | # Default value : TRUE 192 | # Mandatory : NO 193 | #----------------------------------------------------------------------------- 194 | automaticMemoryManagement=FALSE 195 | 196 | #----------------------------------------------------------------------------- 197 | # Name : totalMemory 198 | # Datatype : String 199 | # Description : total memory in MB to allocate to Oracle 200 | # Valid values : 201 | # Default value : 202 | # Mandatory : NO 203 | #----------------------------------------------------------------------------- 204 | totalMemory=2048 -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/common/DBUtils.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 
15 | */ 16 | 17 | package io.cdap.plugin.cdc.common; 18 | 19 | import com.google.common.collect.Lists; 20 | import io.cdap.cdap.api.data.schema.Schema; 21 | import io.cdap.cdap.api.data.schema.UnsupportedTypeException; 22 | import org.slf4j.Logger; 23 | import org.slf4j.LoggerFactory; 24 | 25 | import java.lang.reflect.Field; 26 | import java.math.BigDecimal; 27 | import java.sql.Blob; 28 | import java.sql.Clob; 29 | import java.sql.Driver; 30 | import java.sql.DriverManager; 31 | import java.sql.ResultSet; 32 | import java.sql.ResultSetMetaData; 33 | import java.sql.SQLException; 34 | import java.sql.Types; 35 | import java.util.List; 36 | import javax.annotation.Nullable; 37 | 38 | /** 39 | * Utility methods shared by Database plugins. 40 | */ 41 | public final class DBUtils { 42 | private static final Logger LOG = LoggerFactory.getLogger(DBUtils.class); 43 | 44 | /** 45 | * Ensures that the JDBC Driver specified in configuration is available and can be loaded. Also registers it with 46 | * {@link DriverManager} if it is not already registered. 47 | */ 48 | public static DriverCleanup ensureJDBCDriverIsAvailable(Class<? extends Driver> jdbcDriverClass, 49 | String connectionString) 50 | throws IllegalAccessException, InstantiationException, SQLException { 51 | 52 | try { 53 | DriverManager.getDriver(connectionString); 54 | return new DriverCleanup(null); 55 | } catch (SQLException e) { 56 | // Driver not found. We will try to register it with the DriverManager. 57 | final JDBCDriverShim driverShim = new JDBCDriverShim(jdbcDriverClass.newInstance()); 58 | try { 59 | DBUtils.deregisterAllDrivers(jdbcDriverClass); 60 | } catch (NoSuchFieldException | ClassNotFoundException e1) { 61 | LOG.error("Unable to deregister JDBC Driver class {}", jdbcDriverClass); 62 | } 63 | DriverManager.registerDriver(driverShim); 64 | return new DriverCleanup(driverShim); 65 | } 66 | } 67 | 68 | /** 69 | * Given the result set, get the metadata of the result set and return 70 | * list of {@link io.cdap.cdap.api.data.schema.Schema.Field}.
71 | * 72 | * @param resultSet result set of executed query 73 | * @return list of schema fields 74 | * @throws SQLException 75 | */ 76 | public static List<Schema.Field> getSchemaFields(ResultSet resultSet) throws SQLException { 77 | List<Schema.Field> schemaFields = Lists.newArrayList(); 78 | ResultSetMetaData metadata = resultSet.getMetaData(); 79 | // ResultSetMetadata columns are numbered starting with 1 80 | for (int i = 1; i <= metadata.getColumnCount(); i++) { 81 | String columnName = metadata.getColumnName(i); 82 | int columnSqlType = metadata.getColumnType(i); 83 | int columnSqlPrecision = metadata.getPrecision(i); // total number of digits 84 | int columnSqlScale = metadata.getScale(i); // digits after the decimal point 85 | String columnTypeName = metadata.getColumnTypeName(i); 86 | Schema columnSchema = getSchema(columnTypeName, columnSqlType, columnSqlPrecision, columnSqlScale); 87 | if (ResultSetMetaData.columnNullable == metadata.isNullable(i)) { 88 | columnSchema = Schema.nullableOf(columnSchema); 89 | } 90 | Schema.Field field = Schema.Field.of(columnName, columnSchema); 91 | schemaFields.add(field); 92 | } 93 | return schemaFields; 94 | } 95 | 96 | // given a sql type return schema type 97 | private static Schema getSchema(String typeName, int sqlType, int precision, int scale) throws SQLException { 98 | // Type.STRING covers sql types - VARCHAR,CHAR,CLOB,LONGNVARCHAR,LONGVARCHAR,NCHAR,NCLOB,NVARCHAR 99 | Schema.Type type = Schema.Type.STRING; 100 | switch (sqlType) { 101 | case Types.NULL: 102 | type = Schema.Type.NULL; 103 | break; 104 | 105 | case Types.ROWID: 106 | break; 107 | 108 | case Types.BOOLEAN: 109 | case Types.BIT: 110 | type = Schema.Type.BOOLEAN; 111 | break; 112 | 113 | case Types.TINYINT: 114 | case Types.SMALLINT: 115 | type = Schema.Type.INT; 116 | break; 117 | case Types.INTEGER: 118 | // CDAP-12211 - handling unsigned integers in mysql 119 | type = "int unsigned".equalsIgnoreCase(typeName) ? Schema.Type.LONG : Schema.Type.INT; 120 | break; 121 | 122 | case Types.BIGINT: 123 | type = Schema.Type.LONG; 124 | break; 125 | 126 | case Types.REAL: 127 | case Types.FLOAT: 128 | type = Schema.Type.FLOAT; 129 | break; 130 | 131 | case Types.NUMERIC: 132 | case Types.DECIMAL: 133 | // if there are no digits after the point, use integer types 134 | type = scale != 0 ? Schema.Type.DOUBLE : 135 | // with 10 digits we can represent 2^32 and LONG is required 136 | precision > 9 ?
Schema.Type.LONG : Schema.Type.INT; 137 | break; 138 | 139 | case Types.DOUBLE: 140 | type = Schema.Type.DOUBLE; 141 | break; 142 | 143 | case Types.DATE: 144 | return Schema.of(Schema.LogicalType.DATE); 145 | case Types.TIME: 146 | return Schema.of(Schema.LogicalType.TIME_MICROS); 147 | case Types.TIMESTAMP: 148 | return Schema.of(Schema.LogicalType.TIMESTAMP_MICROS); 149 | 150 | case Types.BINARY: 151 | case Types.VARBINARY: 152 | case Types.LONGVARBINARY: 153 | case Types.BLOB: 154 | type = Schema.Type.BYTES; 155 | break; 156 | 157 | case Types.ARRAY: 158 | case Types.DATALINK: 159 | case Types.DISTINCT: 160 | case Types.JAVA_OBJECT: 161 | case Types.OTHER: 162 | case Types.REF: 163 | case Types.SQLXML: 164 | case Types.STRUCT: 165 | throw new SQLException(new UnsupportedTypeException("Unsupported SQL Type: " + sqlType)); 166 | } 167 | 168 | return Schema.of(type); 169 | } 170 | 171 | @Nullable 172 | public static Object transformValue(int sqlType, int precision, int scale, 173 | ResultSet resultSet, String fieldName) throws SQLException { 174 | Object original = resultSet.getObject(fieldName); 175 | if (original != null) { 176 | switch (sqlType) { 177 | case Types.SMALLINT: 178 | case Types.TINYINT: 179 | return ((Number) original).intValue(); 180 | case Types.NUMERIC: 181 | case Types.DECIMAL: 182 | BigDecimal decimal = (BigDecimal) original; 183 | if (scale != 0) { 184 | // if there are digits after the point, use double types 185 | return decimal.doubleValue(); 186 | } else if (precision > 9) { 187 | // with 10 digits we can represent 2^32 and LONG is required 188 | return decimal.longValue(); 189 | } else { 190 | return decimal.intValue(); 191 | } 192 | case Types.DATE: 193 | return resultSet.getDate(fieldName); 194 | case Types.TIME: 195 | return resultSet.getTime(fieldName); 196 | case Types.TIMESTAMP: 197 | return resultSet.getTimestamp(fieldName); 198 | case Types.ROWID: 199 | return resultSet.getString(fieldName); 200 | case Types.BLOB: 201 | Blob blob = (Blob) original; 202 | return blob.getBytes(1, (int) blob.length()); 203 | case Types.CLOB: 204 | Clob clob = (Clob) original; 205 | return clob.getSubString(1, (int) clob.length()); 206 | } 207 | } 208 | return original; 209 | } 210 | 211 | /** 212 | * De-registers all SQL drivers that are associated with the given driver class. 213 | */ 214 | public static void deregisterAllDrivers(Class<? extends Driver> driverClass) 215 | throws NoSuchFieldException, IllegalAccessException, ClassNotFoundException { 216 | Field field = DriverManager.class.getDeclaredField("registeredDrivers"); 217 | field.setAccessible(true); 218 | List<?> list = (List<?>) field.get(null); 219 | for (Object driverInfo : list) { 220 | Class<?> driverInfoClass = DBUtils.class.getClassLoader().loadClass("java.sql.DriverInfo"); 221 | Field driverField = driverInfoClass.getDeclaredField("driver"); 222 | driverField.setAccessible(true); 223 | Driver d = (Driver) driverField.get(driverInfo); 224 | if (d == null) { 225 | LOG.trace("Could not find driver {}", driverInfo); 226 | continue; 227 | } 228 | LOG.trace("Removing non-null driver object from drivers list."); 229 | ClassLoader registeredDriverClassLoader = d.getClass().getClassLoader(); 230 | if (registeredDriverClassLoader == null) { 231 | LOG.trace("Found null classloader for default driver {}. Ignoring since this may be using system classloader.", 232 | d.getClass().getName()); 233 | continue; 234 | } 235 | // Remove all objects in this list that were created using the classloader of the caller.
236 | if (d.getClass().getClassLoader().equals(driverClass.getClassLoader())) { 237 | LOG.trace("Removing default driver {} from registeredDrivers", d.getClass().getName()); 238 | list.remove(driverInfo); 239 | } 240 | } 241 | } 242 | 243 | private DBUtils() { 244 | throw new AssertionError("Should not instantiate static utility class."); 245 | } 246 | } 247 | 248 | -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/source/oracle/Normalizer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package io.cdap.plugin.cdc.source.oracle; 18 | 19 | import com.google.gson.Gson; 20 | import com.google.gson.JsonObject; 21 | import io.cdap.cdap.api.data.format.StructuredRecord; 22 | import io.cdap.cdap.api.data.schema.Schema; 23 | import io.cdap.plugin.cdc.common.AvroConverter; 24 | import io.cdap.plugin.cdc.common.OperationType; 25 | import io.cdap.plugin.cdc.common.Schemas; 26 | import org.apache.avro.generic.GenericDatumReader; 27 | import org.apache.avro.generic.GenericRecord; 28 | import org.apache.avro.io.DecoderFactory; 29 | import org.slf4j.Logger; 30 | import org.slf4j.LoggerFactory; 31 | 32 | import java.io.IOException; 33 | import java.nio.charset.StandardCharsets; 34 | import java.util.ArrayList; 35 | import java.util.Collections; 36 | import java.util.LinkedHashMap; 37 | import java.util.List; 38 | import java.util.Map; 39 | import java.util.Objects; 40 | 41 | /** 42 | * Class responsible for normalizing the StructuredRecords to be sent to the CDC sinks. 43 | */ 44 | public class Normalizer { 45 | private static final Logger LOG = LoggerFactory.getLogger(Normalizer.class); 46 | private static final Gson GSON = new Gson(); 47 | 48 | private static final String INPUT_FIELD = "message"; 49 | 50 | /** 51 | * Normalize the input StructuredRecord containing a byte array into DDL or DML records. 52 | * One input record can result in multiple output records. For example, in the case of a primary key 53 | * update, the output consists of two StructuredRecords: one represents a delete and the other represents 54 | * an insert.
55 | * 56 | * @param input record containing message as byte array to be normalized 57 | * @return {@link List} of normalized records 58 | */ 59 | public List transform(StructuredRecord input) throws Exception { 60 | Object message = input.get(INPUT_FIELD); 61 | if (message == null) { 62 | throw new IllegalStateException(String.format("Input record does not contain the field '%s'.", INPUT_FIELD)); 63 | } 64 | 65 | if ("GenericWrapperSchema".equals(input.getSchema().getRecordName())) { 66 | // Do nothing for the generic wrapper schema message 67 | // Return empty list 68 | return Collections.emptyList(); 69 | } 70 | 71 | byte[] messageBytes = BinaryMessages.getBytesFromBinaryMessage(message); 72 | 73 | if (input.getSchema().getRecordName().equals(Schemas.DDL_SCHEMA.getRecordName())) { 74 | String messageBody = new String(messageBytes, StandardCharsets.UTF_8); 75 | JsonObject schemaObj = GSON.fromJson(messageBody, JsonObject.class); 76 | String namespaceName = schemaObj.get("namespace").getAsString(); 77 | String tableName = schemaObj.get("name").getAsString(); 78 | StructuredRecord ddlRecord = StructuredRecord.builder(Schemas.DDL_SCHEMA) 79 | .set(Schemas.TABLE_FIELD, namespaceName + "." + tableName) 80 | .set(Schemas.SCHEMA_FIELD, getNormalizedDDLSchema(messageBody)) 81 | .build(); 82 | return Collections.singletonList(ddlRecord); 83 | } 84 | 85 | // Current message is the Wrapped Avro binary message 86 | // Get the state map 87 | StructuredRecord stateRecord = input.get("staterecord"); 88 | Map schemaCacheMap = stateRecord.get("data"); 89 | org.apache.avro.Schema avroGenericWrapperSchema = getGenericWrapperMessageSchema(); 90 | 91 | GenericRecord genericRecord = getRecord(messageBytes, avroGenericWrapperSchema); 92 | String tableName = genericRecord.get("table_name").toString(); 93 | long schameHashId = (Long) genericRecord.get("schema_fingerprint"); 94 | 95 | byte[] payload = BinaryMessages.getBytesFromBinaryMessage(genericRecord.get("payload")); 96 | 97 | org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(schemaCacheMap.get(schameHashId)); 98 | LOG.debug("Avro schema {} for table {} with fingerprint {}", avroSchema, tableName, schameHashId); 99 | 100 | StructuredRecord structuredRecord = AvroConverter.fromAvroRecord(getRecord(payload, avroSchema), 101 | AvroConverter.fromAvroSchema(avroSchema)); 102 | 103 | return getNormalizedDMLRecord(structuredRecord); 104 | } 105 | 106 | private String getNormalizedDDLSchema(String jsonSchema) { 107 | org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(jsonSchema); 108 | Schema schema = AvroConverter.fromAvroSchema(avroSchema); 109 | Schema column = schema.getField("before").getSchema().getNonNullable(); 110 | 111 | List columnFields = new ArrayList<>(); 112 | for (Schema.Field field : column.getFields()) { 113 | if (!field.getName().endsWith("_isMissing")) { 114 | columnFields.add(field); 115 | } 116 | } 117 | 118 | Schema ddlSchema = Schema.recordOf(Schemas.SCHEMA_RECORD, columnFields); 119 | LOG.debug("Schema for DDL {}", ddlSchema); 120 | return ddlSchema.toString(); 121 | } 122 | 123 | private org.apache.avro.Schema getGenericWrapperMessageSchema() { 124 | String avroGenericWrapperSchema = "{\n" + 125 | " \"type\" : \"record\",\n" + 126 | " \"name\" : \"generic_wrapper\",\n" + 127 | " \"namespace\" : \"oracle.goldengate\",\n" + 128 | " \"fields\" : [ {\n" + 129 | " \"name\" : \"table_name\",\n" + 130 | " \"type\" : \"string\"\n" + 131 | " }, {\n" + 132 | " \"name\" : 
\"schema_fingerprint\",\n" + 133 | " \"type\" : \"long\"\n" + 134 | " }, {\n" + 135 | " \"name\" : \"payload\",\n" + 136 | " \"type\" : \"bytes\"\n" + 137 | " } ]\n" + 138 | " }"; 139 | return new org.apache.avro.Schema.Parser().parse(avroGenericWrapperSchema); 140 | } 141 | 142 | private GenericRecord getRecord(byte[] message, org.apache.avro.Schema schema) throws IOException { 143 | GenericDatumReader datumReader = new GenericDatumReader<>(schema); 144 | return datumReader.read(null, DecoderFactory.get().binaryDecoder(message, null)); 145 | } 146 | 147 | private List getNormalizedDMLRecord(StructuredRecord record) throws IOException { 148 | List normalizedRecords = new ArrayList<>(); 149 | // This table name contains "." in it already 150 | String tableName = record.get("table"); 151 | List primaryKeys = record.get("primary_keys"); 152 | OperationType opType = OperationType.fromShortName(record.get("op_type")); 153 | Map suppliedFieldValues = new LinkedHashMap<>(); 154 | switch (opType) { 155 | case INSERT: 156 | StructuredRecord insertRecord = record.get("after"); 157 | for (Schema.Field field : insertRecord.getSchema().getFields()) { 158 | if (!field.getName().endsWith("_isMissing")) { 159 | suppliedFieldValues.put(field, insertRecord.get(field.getName())); 160 | } 161 | } 162 | break; 163 | case UPDATE: 164 | StructuredRecord afterUpdateRecord = record.get("after"); 165 | StructuredRecord beforeUpdateRecord = record.get("before"); 166 | boolean pkChanged = primaryKeyChanged(primaryKeys, beforeUpdateRecord, afterUpdateRecord); 167 | 168 | if (pkChanged) { 169 | // We need to emit two records 170 | // One for DELETE and one for INSERT 171 | suppliedFieldValues = addDeleteFields(record); 172 | normalizedRecords.add(createDMLRecord(tableName, OperationType.DELETE, primaryKeys, suppliedFieldValues)); 173 | } 174 | 175 | suppliedFieldValues.clear(); 176 | for (Schema.Field field : afterUpdateRecord.getSchema().getFields()) { 177 | if (!field.getName().endsWith("_isMissing")) { 178 | String fieldName = field.getName(); 179 | if (!((boolean) afterUpdateRecord.get(fieldName + "_isMissing"))) { 180 | suppliedFieldValues.put(field, afterUpdateRecord.get(field.getName())); 181 | } else { 182 | // Field is not updated, use the field value from the before record 183 | suppliedFieldValues.put(field, beforeUpdateRecord.get(field.getName())); 184 | } 185 | } 186 | } 187 | if (pkChanged) { 188 | // Change the operation type to Insert if the primary key is changed 189 | opType = OperationType.INSERT; 190 | } 191 | break; 192 | case DELETE: 193 | suppliedFieldValues = addDeleteFields(record); 194 | break; 195 | default: 196 | break; 197 | } 198 | 199 | normalizedRecords.add(createDMLRecord(tableName, opType, primaryKeys, suppliedFieldValues)); 200 | return normalizedRecords; 201 | } 202 | 203 | private boolean primaryKeyChanged(List primaryKeys, StructuredRecord before, StructuredRecord after) { 204 | for (String key : primaryKeys) { 205 | if (!Objects.equals(before.get(key), after.get(key))) { 206 | return true; 207 | } 208 | } 209 | return false; 210 | } 211 | 212 | private Map addDeleteFields(StructuredRecord record) { 213 | Map fieldValues = new LinkedHashMap<>(); 214 | StructuredRecord deleteRecord = record.get("before"); 215 | for (Schema.Field field : deleteRecord.getSchema().getFields()) { 216 | if (!field.getName().endsWith("_isMissing")) { 217 | fieldValues.put(field, deleteRecord.get(field.getName())); 218 | } 219 | } 220 | return fieldValues; 221 | } 222 | 223 | private StructuredRecord 
createDMLRecord(String tableName, OperationType opType, List primaryKeys, 224 | Map changedFields) throws IOException { 225 | Schema changeSchema = Schema.recordOf(Schemas.SCHEMA_RECORD, changedFields.keySet()); 226 | Map changes = new LinkedHashMap<>(); 227 | for (Map.Entry entry : changedFields.entrySet()) { 228 | changes.put(entry.getKey().getName(), entry.getValue()); 229 | } 230 | return StructuredRecord.builder(Schemas.DML_SCHEMA) 231 | .set(Schemas.TABLE_FIELD, tableName) 232 | .set(Schemas.OP_TYPE_FIELD, opType.name()) 233 | .set(Schemas.PRIMARY_KEYS_FIELD, primaryKeys) 234 | .set(Schemas.UPDATE_SCHEMA_FIELD, changeSchema.toString()) 235 | .set(Schemas.UPDATE_VALUES_FIELD, changes) 236 | .build(); 237 | } 238 | } 239 | -------------------------------------------------------------------------------- /src/main/java/io/cdap/plugin/cdc/source/oracle/GoldenGateKafka.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | 17 | package io.cdap.plugin.cdc.source.oracle; 18 | 19 | import io.cdap.cdap.api.annotation.Description; 20 | import io.cdap.cdap.api.annotation.Name; 21 | import io.cdap.cdap.api.annotation.Plugin; 22 | import io.cdap.cdap.api.data.format.StructuredRecord; 23 | import io.cdap.cdap.api.data.schema.Schema; 24 | import io.cdap.cdap.api.dataset.DatasetProperties; 25 | import io.cdap.cdap.etl.api.PipelineConfigurer; 26 | import io.cdap.cdap.etl.api.streaming.StreamingContext; 27 | import io.cdap.cdap.etl.api.streaming.StreamingSource; 28 | import io.cdap.plugin.cdc.common.Schemas; 29 | import io.cdap.plugin.common.Constants; 30 | import kafka.api.OffsetRequest; 31 | import kafka.api.PartitionOffsetRequestInfo; 32 | import kafka.common.TopicAndPartition; 33 | import kafka.javaapi.OffsetResponse; 34 | import kafka.javaapi.PartitionMetadata; 35 | import kafka.javaapi.TopicMetadata; 36 | import kafka.javaapi.TopicMetadataRequest; 37 | import kafka.javaapi.TopicMetadataResponse; 38 | import kafka.javaapi.consumer.SimpleConsumer; 39 | import kafka.message.MessageAndMetadata; 40 | import kafka.serializer.DefaultDecoder; 41 | import org.apache.avro.SchemaNormalization; 42 | import org.apache.spark.api.java.Optional; 43 | import org.apache.spark.api.java.function.Function3; 44 | import org.apache.spark.streaming.State; 45 | import org.apache.spark.streaming.StateSpec; 46 | import org.apache.spark.streaming.api.java.JavaDStream; 47 | import org.apache.spark.streaming.api.java.JavaInputDStream; 48 | import org.apache.spark.streaming.kafka.KafkaUtils; 49 | import org.slf4j.Logger; 50 | import org.slf4j.LoggerFactory; 51 | import scala.Tuple2; 52 | 53 | import java.nio.charset.StandardCharsets; 54 | import java.util.Collections; 55 | import java.util.HashMap; 56 | import java.util.HashSet; 57 | import java.util.Map; 58 | import java.util.Set; 59 | 60 | /** 61 | * Streaming source for reading from Golden Gate Kafka 
topic. 62 | */ 63 | @Plugin(type = StreamingSource.PLUGIN_TYPE) 64 | @Name("CDCDatabase") 65 | @Description("Streaming source for reading through Golden Gate Kafka topic") 66 | public class GoldenGateKafka extends StreamingSource<StructuredRecord> { 67 | private static final Logger LOG = LoggerFactory.getLogger(GoldenGateKafka.class); 68 | private static final Schema GENERIC_WRAPPER_SCHEMA_MESSAGE 69 | = Schema.recordOf("GenericWrapperSchema", Schema.Field.of("message", Schema.of(Schema.Type.BYTES))); 70 | private static final Schema DDL_SCHEMA_MESSAGE 71 | = Schema.recordOf("DDLRecord", Schema.Field.of("message", Schema.of(Schema.Type.BYTES))); 72 | private static final Schema TRANSFORMED_MESSAGE 73 | = Schema.recordOf("Message", Schema.Field.of("message", Schema.of(Schema.Type.BYTES))); 74 | 75 | private static final Schema STATE_SCHEMA 76 | = Schema.recordOf("state", 77 | Schema.Field.of("data", 78 | Schema.mapOf(Schema.of(Schema.Type.LONG), 79 | Schema.of(Schema.Type.STRING)))); 80 | 81 | private static final Schema DML_SCHEMA = Schema.recordOf("DMLRecord", 82 | Schema.Field.of("message", Schema.of(Schema.Type.BYTES)), 83 | Schema.Field.of("staterecord", STATE_SCHEMA)); 84 | 85 | private static final Normalizer NORMALIZER = new Normalizer(); 86 | 87 | private final GoldenGateKafkaConfig conf; 88 | 89 | 90 | public GoldenGateKafka(GoldenGateKafkaConfig conf) { 91 | this.conf = conf; 92 | } 93 | 94 | @Override 95 | public void configurePipeline(PipelineConfigurer pipelineConfigurer) throws IllegalArgumentException { 96 | conf.validate(); 97 | pipelineConfigurer.createDataset(conf.referenceName, Constants.EXTERNAL_DATASET_TYPE, DatasetProperties.EMPTY); 98 | pipelineConfigurer.getStageConfigurer().setOutputSchema(Schemas.CHANGE_SCHEMA); 99 | 100 | // Make sure that the Golden Gate Kafka topic only has a single partition 101 | SimpleConsumer consumer = new SimpleConsumer(conf.getHost(), conf.getPort(), 20 * 1000, 128 * 1024, 102 | "partitionLookup"); 103 | try { 104 | getPartitionId(consumer); 105 | } finally { 106 | consumer.close(); 107 | } 108 | 109 | if (conf.getMaxRatePerPartition() > 0) { 110 | Map<String, String> pipelineProperties = new HashMap<>(); 111 | pipelineProperties.put("spark.streaming.kafka.maxRatePerPartition", conf.getMaxRatePerPartition().toString()); 112 | pipelineConfigurer.setPipelineProperties(pipelineProperties); 113 | } 114 | } 115 | 116 | @Override 117 | public JavaDStream<StructuredRecord> getStream(StreamingContext context) throws Exception { 118 | context.registerLineage(conf.referenceName); 119 | 120 | SimpleConsumer consumer = new SimpleConsumer(conf.getHost(), conf.getPort(), 20 * 1000, 128 * 1024, 121 | "partitionLookup"); 122 | Map<TopicAndPartition, Long> offsets; 123 | try { 124 | offsets = loadOffsets(consumer); 125 | } finally { 126 | consumer.close(); 127 | } 128 | 129 | LOG.info("Using initial offsets {}", offsets); 130 | Map<String, String> kafkaParams = new HashMap<>(); 131 | kafkaParams.put("metadata.broker.list", conf.getBroker()); 132 | JavaInputDStream<StructuredRecord> directStream = KafkaUtils.createDirectStream( 133 | context.getSparkStreamingContext(), byte[].class, byte[].class, DefaultDecoder.class, DefaultDecoder.class, 134 | StructuredRecord.class, kafkaParams, offsets, this::kafkaMessageToRecord); 135 | return directStream 136 | .mapToPair(record -> new Tuple2<>("", record)) 137 | .mapWithState(StateSpec.function(schemaStateFunction())) 138 | .flatMap(record -> NORMALIZER.transform(record).iterator()) 139 | .map(Schemas::toCDCRecord); 140 | } 141 | 142 | private Map<TopicAndPartition, Long> loadOffsets(SimpleConsumer consumer) { 143 | // KafkaUtils doesn't understand -1
and -2 as latest offset and smallest offset. 144 | // so we have to replace them with the actual smallest and latest 145 | String topicName = conf.getTopic(); 146 | int partitionId = getPartitionId(consumer); 147 | long initialOffset = conf.getDefaultInitialOffset(); 148 | 149 | TopicAndPartition topicAndPartition = new TopicAndPartition(topicName, partitionId); 150 | 151 | Map<TopicAndPartition, PartitionOffsetRequestInfo> offsetsToRequest = new HashMap<>(); 152 | if (initialOffset == OffsetRequest.EarliestTime() || initialOffset == OffsetRequest.LatestTime()) { 153 | offsetsToRequest.put(topicAndPartition, new PartitionOffsetRequestInfo(initialOffset, 1)); 154 | } 155 | 156 | kafka.javaapi.OffsetRequest offsetRequest = 157 | new kafka.javaapi.OffsetRequest(offsetsToRequest, OffsetRequest.CurrentVersion(), "offsetLookup"); 158 | OffsetResponse response = consumer.getOffsetsBefore(offsetRequest); 159 | 160 | if (response.errorCode(topicName, partitionId) != 0) { 161 | throw new IllegalStateException(String.format( 162 | "Could not find offset for topic '%s' and partition '%s'. Please check all brokers were included in the " + 163 | "broker list.", topicName, partitionId)); 164 | } 165 | 166 | Map<TopicAndPartition, Long> offsets = new HashMap<>(); 167 | offsets.put(topicAndPartition, response.offsets(topicName, partitionId)[0]); 168 | return offsets; 169 | } 170 | 171 | private StructuredRecord kafkaMessageToRecord(MessageAndMetadata<byte[], byte[]> messageAndMetadata) { 172 | return StructuredRecord.builder(TRANSFORMED_MESSAGE) 173 | .set("message", messageAndMetadata.message()) 174 | .build(); 175 | } 176 | 177 | private int getPartitionId(SimpleConsumer consumer) { 178 | Set<Integer> partitions = new HashSet<>(); 179 | TopicMetadataRequest topicMetadataRequest = new TopicMetadataRequest(Collections.singletonList(conf.getTopic())); 180 | TopicMetadataResponse response = consumer.send(topicMetadataRequest); 181 | 182 | for (TopicMetadata topicMetadata : response.topicsMetadata()) { 183 | for (PartitionMetadata partitionMetadata : topicMetadata.partitionsMetadata()) { 184 | partitions.add(partitionMetadata.partitionId()); 185 | } 186 | } 187 | 188 | if (partitions.size() != 1) { 189 | throw new IllegalArgumentException( 190 | String.format("Topic '%s' should only have one partition.
Found '%s' partitions.", 191 | conf.getTopic(), partitions.size())); 192 | } 193 | return partitions.iterator().next(); 194 | } 195 | 196 | private static Function3<String, Optional<StructuredRecord>, State<Map<Long, String>>, StructuredRecord> 197 | schemaStateFunction() { 198 | return (key, value, state) -> { 199 | StructuredRecord input = value.get(); 200 | Object message = input.get("message"); 201 | 202 | byte[] messageBytes = BinaryMessages.getBytesFromBinaryMessage(message); 203 | String messageBody = new String(messageBytes, StandardCharsets.UTF_8); 204 | 205 | if (messageBody.contains("generic_wrapper") && messageBody.contains("oracle.goldengate")) { 206 | StructuredRecord.Builder builder = StructuredRecord.builder(GENERIC_WRAPPER_SCHEMA_MESSAGE); 207 | builder.set("message", message); 208 | return builder.build(); 209 | } 210 | 211 | if (messageBody.contains("\"type\" : \"record\"")) { 212 | org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(messageBody); 213 | long schemaFingerPrint = SchemaNormalization.parsingFingerprint64(avroSchema); 214 | Map<Long, String> newState; 215 | if (state.exists()) { 216 | newState = state.get(); 217 | } else { 218 | newState = new HashMap<>(); 219 | } 220 | newState.put(schemaFingerPrint, messageBody); 221 | state.update(newState); 222 | LOG.debug("Schema mapping updated to {}", state.get()); 223 | 224 | StructuredRecord.Builder builder = StructuredRecord.builder(DDL_SCHEMA_MESSAGE); 225 | builder.set("message", message); 226 | return builder.build(); 227 | } 228 | 229 | StructuredRecord.Builder stateBuilder = StructuredRecord.builder(STATE_SCHEMA); 230 | stateBuilder.set("data", state.get()); 231 | 232 | StructuredRecord.Builder builder = StructuredRecord.builder(DML_SCHEMA); 233 | builder.set("message", message); 234 | builder.set("staterecord", stateBuilder.build()); 235 | return builder.build(); 236 | }; 237 | } 238 | } 239 | -------------------------------------------------------------------------------- /src/test/java/io/cdap/plugin/cdc/performance/CDCPipelinePerfTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019 Cask Data, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License.
15 | */ 16 | 17 | package io.cdap.plugin.cdc.performance; 18 | 19 | import com.google.bigtable.repackaged.com.google.cloud.ServiceOptions; 20 | import com.google.bigtable.repackaged.com.google.cloud.bigtable.grpc.io.IOExceptionWithStatus; 21 | import com.google.bigtable.repackaged.io.grpc.StatusRuntimeException; 22 | import com.google.common.collect.ImmutableMap; 23 | import io.cdap.cdap.etl.api.batch.SparkSink; 24 | import io.cdap.cdap.etl.api.streaming.StreamingSource; 25 | import io.cdap.cdap.etl.proto.v2.ETLPlugin; 26 | import io.cdap.cdap.proto.ProgramRunStatus; 27 | import io.cdap.cdap.test.SparkManager; 28 | import io.cdap.plugin.cdc.common.BigtableOperations; 29 | import io.cdap.plugin.cdc.sink.CDCBigTableConfig; 30 | import io.cdap.plugin.cdc.source.sqlserver.CTSQLServerConfig; 31 | import io.cdap.plugin.common.Constants; 32 | import org.apache.hadoop.hbase.TableName; 33 | import org.apache.hadoop.hbase.client.Connection; 34 | import org.apache.hadoop.hbase.client.Result; 35 | import org.apache.hadoop.hbase.client.ResultScanner; 36 | import org.apache.hadoop.hbase.client.Scan; 37 | import org.apache.hadoop.hbase.client.Table; 38 | import org.awaitility.Awaitility; 39 | import org.awaitility.Duration; 40 | import org.junit.After; 41 | import org.junit.Assert; 42 | import org.junit.Assume; 43 | import org.junit.Before; 44 | import org.junit.Rule; 45 | import org.junit.Test; 46 | import org.junit.rules.TestName; 47 | import org.slf4j.Logger; 48 | import org.slf4j.LoggerFactory; 49 | 50 | import java.io.IOException; 51 | import java.sql.DriverManager; 52 | import java.sql.SQLException; 53 | import java.sql.Statement; 54 | import java.util.Map; 55 | import java.util.concurrent.TimeUnit; 56 | 57 | public class CDCPipelinePerfTest extends CDCPluginPerfTestBase { 58 | private static final Logger LOG = LoggerFactory.getLogger(CDCPipelinePerfTest.class); 59 | private static final String APP_NAME = CDCPipelinePerfTest.class.getSimpleName(); 60 | 61 | // Common properties 62 | private static final boolean TEST_DATA_LOAD = 63 | Boolean.parseBoolean(System.getProperty("ptest.test-data.load", "true")); 64 | private static final int TEST_DATA_INSERTS = 65 | Integer.parseInt(System.getProperty("ptest.test-data.inserts", "5000")); 66 | private static final int TEST_TARGET_TABLE_CREATED_TIMEOUT_SECONDS = 67 | Integer.parseInt(System.getProperty("ptest.target-table-created-timeout.seconds", "300")); 68 | private static final int TEST_DATA_TRANSFERED_TIMEOUT_SECONDS = 69 | Integer.parseInt(System.getProperty("ptest.data-transferred-timeout.seconds", "600")); 70 | 71 | // Bigtable properties 72 | private static final String BIGTABLE_PROJECT 73 | = System.getProperty("ptest.bigtable.project", ServiceOptions.getDefaultProjectId()); 74 | private static final String BIGTABLE_INSTANCE = System.getProperty("ptest.bigtable.instance"); 75 | private static final String BIGTABLE_SERVICE_ACCOUNT_FILE_PATH 76 | = System.getProperty("ptest.bigtable.serviceFilePath", System.getenv("CREDENTIAL_ENV_NAME")); 77 | 78 | // SQL Server properties 79 | private static final String SQL_HOST = System.getProperty("ptest.sql-server.host", "localhost"); 80 | private static final String SQL_PORT = System.getProperty("ptest.sql-server.port", "1433"); 81 | private static final String SQL_USERNAME = System.getProperty("ptest.sql-server.username", "SA"); 82 | private static final String SQL_PASSWORD = System.getProperty("ptest.sql-server.password", "123Qwe123"); 83 | 84 | @Rule 85 | public TestName testName = new TestName(); 86 | 87 | 
private String dbName; 88 | private String dbTableName; 89 | private SparkManager programManager; 90 | private Connection connection; 91 | 92 | @Before 93 | @Override 94 | public void setUp() throws Exception { 95 | Assume.assumeNotNull(BIGTABLE_PROJECT); 96 | Assume.assumeNotNull(BIGTABLE_INSTANCE); 97 | 98 | super.setUp(); 99 | 100 | dbName = CDCPipelinePerfTest.class.getSimpleName() + '_' + testName.getMethodName(); 101 | dbTableName = testName.getMethodName(); 102 | 103 | connection = BigtableOperations.connect(BIGTABLE_PROJECT, BIGTABLE_INSTANCE, BIGTABLE_SERVICE_ACCOUNT_FILE_PATH); 104 | 105 | // cleanup Bigtable 106 | LOG.info("Cleaning up Bigtable"); 107 | BigtableOperations.dropTableIfExists(connection, dbTableName); 108 | 109 | if (TEST_DATA_LOAD) { 110 | LOG.info("Preparing test data"); 111 | // cleanup SQL Server 112 | LOG.info("Cleaning up SQL Server"); 113 | dropDatabaseIfExists(dbName); 114 | // prepare data 115 | LOG.info("Inserting test data ({} records)", TEST_DATA_INSERTS); 116 | createDatabaseWithTracking(dbName); 117 | createTableWithTracking(dbTableName); 118 | try (java.sql.Connection connection = getConnectionToDb(); 119 | Statement statement = connection.createStatement()) { 120 | for (int i = 0; i < TEST_DATA_INSERTS; i++) { 121 | statement.executeUpdate(String.format("insert into %s(value) values ('initial value')", dbTableName)); 122 | } 123 | } 124 | } 125 | 126 | LOG.info("Deploying application"); 127 | 128 | ImmutableMap sourceProps = ImmutableMap.builder() 129 | .put(CTSQLServerConfig.HOST_NAME, SQL_HOST) 130 | .put(CTSQLServerConfig.PORT, SQL_PORT) 131 | .put(CTSQLServerConfig.USERNAME, SQL_USERNAME) 132 | .put(CTSQLServerConfig.PASSWORD, SQL_PASSWORD) 133 | .put(CTSQLServerConfig.DATABASE_NAME, dbName) 134 | .put(Constants.Reference.REFERENCE_NAME, "CTSQLServerSource") 135 | .build(); 136 | ETLPlugin sourceConfig = new ETLPlugin("CTSQLServer", StreamingSource.PLUGIN_TYPE, sourceProps); 137 | 138 | Map sinkProps = ImmutableMap.builder() 139 | .put(CDCBigTableConfig.PROJECT, BIGTABLE_PROJECT) 140 | .put(CDCBigTableConfig.INSTANCE, BIGTABLE_INSTANCE) 141 | .put(CDCBigTableConfig.SERVICE_ACCOUNT_FILE_PATH, BIGTABLE_SERVICE_ACCOUNT_FILE_PATH) 142 | .put(Constants.Reference.REFERENCE_NAME, "CDCBigTableSink") 143 | .build(); 144 | ETLPlugin sinkConfig = new ETLPlugin("CDCBigTable", SparkSink.PLUGIN_TYPE, sinkProps); 145 | 146 | programManager = deployETL(sourceConfig, sinkConfig, APP_NAME); 147 | } 148 | 149 | @After 150 | @Override 151 | public void tearDown() throws Exception { 152 | if (programManager != null) { 153 | programManager.stop(); 154 | programManager.waitForStopped(10, TimeUnit.SECONDS); 155 | programManager.waitForRun(ProgramRunStatus.KILLED, 10, TimeUnit.SECONDS); 156 | } 157 | super.tearDown(); 158 | if (connection != null) { 159 | connection.close(); 160 | } 161 | } 162 | 163 | @Test 164 | public void testSqlServerToBigtablePipeline() throws Exception { 165 | long testStart = System.currentTimeMillis(); 166 | LOG.info("Starting pipeline"); 167 | programManager.startAndWaitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS); 168 | 169 | LOG.info("Waiting until {} records are present in Bigtable", TEST_DATA_INSERTS); 170 | 171 | Awaitility.await() 172 | .atMost(TEST_TARGET_TABLE_CREATED_TIMEOUT_SECONDS, TimeUnit.SECONDS) 173 | .pollInterval(Duration.TEN_SECONDS) 174 | .ignoreException(StatusRuntimeException.class) 175 | .untilAsserted(() -> { 176 | TableName tableName = TableName.valueOf(dbTableName); 177 | 
Assert.assertTrue(String.format("Table '%s' was not created", tableName), 178 | connection.getAdmin().tableExists(tableName)); 179 | Assert.assertTrue(connection.getAdmin().isTableAvailable(tableName)); 180 | Assert.assertTrue(connection.getAdmin().isTableEnabled(tableName)); 181 | }); 182 | 183 | Awaitility.await() 184 | .atMost(TEST_DATA_TRANSFERED_TIMEOUT_SECONDS, TimeUnit.SECONDS) 185 | .pollInterval(Duration.TEN_SECONDS) 186 | .ignoreException(IOExceptionWithStatus.class) 187 | .untilAsserted(() -> { 188 | TableName tableName = TableName.valueOf(dbTableName); 189 | Table table = connection.getTable(tableName); 190 | int rowCount = getRowCount(table); 191 | LOG.info("Currently {} records are present in Bigtable", rowCount); 192 | Assert.assertEquals(TEST_DATA_INSERTS, rowCount); 193 | }); 194 | 195 | long testEnd = System.currentTimeMillis(); 196 | long elapsedSeconds = (testEnd - testStart) / 1000; 197 | long recordsPerSecond = TEST_DATA_INSERTS / elapsedSeconds; 198 | LOG.info("Test finished. Transferred '{}' records. Elapsed time is '{} seconds' ({} records/second)", 199 | TEST_DATA_INSERTS, elapsedSeconds, recordsPerSecond); 200 | } 201 | 202 | private static int getRowCount(Table table) throws IOException { 203 | int rowCount = 0; 204 | try (ResultScanner scanner = table.getScanner(new Scan())) { 205 | for (Result rs = scanner.next(); rs != null; rs = scanner.next()) { 206 | rowCount++; 207 | } 208 | } 209 | return rowCount; 210 | } 211 | 212 | private static void dropDatabaseIfExists(String dbName) throws SQLException { 213 | try (java.sql.Connection connection = getConnectionToRoot(); 214 | Statement statement = connection.createStatement()) { 215 | statement.executeUpdate(String.format("drop database if exists %s", dbName)); 216 | } 217 | } 218 | 219 | private static java.sql.Connection getConnectionToRoot() throws SQLException { 220 | String connectionString = String.format("jdbc:sqlserver://%s:%s", SQL_HOST, SQL_PORT); 221 | return DriverManager.getConnection(connectionString, SQL_USERNAME, SQL_PASSWORD); 222 | } 223 | 224 | private static void createDatabaseWithTracking(String dbName) throws SQLException { 225 | try (java.sql.Connection connection = getConnectionToRoot(); 226 | Statement statement = connection.createStatement()) { 227 | statement.executeUpdate(String.format("create database %s", dbName)); 228 | statement.executeUpdate(String.format("alter database %s set change_tracking = ON", dbName)); 229 | } 230 | } 231 | 232 | private void createTableWithTracking(String tableName) throws SQLException { 233 | try (java.sql.Connection connection = getConnectionToDb(); 234 | Statement statement = connection.createStatement()) { 235 | statement.executeUpdate(String.format("create table %s (id bigint identity primary key, value text)", 236 | tableName)); 237 | statement.executeUpdate(String.format("alter table %s enable change_tracking", tableName)); 238 | } 239 | } 240 | 241 | private java.sql.Connection getConnectionToDb() throws SQLException { 242 | String connectionString = String.format("jdbc:sqlserver://%s:%s;DatabaseName=%s", SQL_HOST, SQL_PORT, dbName); 243 | return DriverManager.getConnection(connectionString, SQL_USERNAME, SQL_PASSWORD); 244 | } 245 | } 246 | --------------------------------------------------------------------------------
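A note on the DBUtils helpers listed above: they are only exercised indirectly by the plugins in this repository, so the stand-alone sketch below shows how ensureJDBCDriverIsAvailable, getSchemaFields, and transformValue might fit together. The class name, JDBC URL, credentials, table name, and the SQL Server driver class are illustrative assumptions and are not part of this repository.

import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.plugin.cdc.common.DBUtils;
import io.cdap.plugin.cdc.common.DriverCleanup;

import java.sql.Connection;
import java.sql.Driver;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.Statement;
import java.util.List;

public final class DBUtilsUsageSketch {
  public static void main(String[] args) throws Exception {
    // Assumed connection details; adjust to your environment.
    String connectionString = "jdbc:sqlserver://localhost:1433;DatabaseName=demo";
    @SuppressWarnings("unchecked")
    Class<? extends Driver> driverClass =
      (Class<? extends Driver>) Class.forName("com.microsoft.sqlserver.jdbc.SQLServerDriver");

    // Registers the driver with DriverManager if it is not already available.
    DriverCleanup cleanup = DBUtils.ensureJDBCDriverIsAvailable(driverClass, connectionString);

    try (Connection connection = DriverManager.getConnection(connectionString, "SA", "123Qwe123");
         Statement statement = connection.createStatement();
         ResultSet resultSet = statement.executeQuery("select * from demo_table")) {
      // Derive a CDAP schema from the JDBC result set metadata.
      List<Schema.Field> fields = DBUtils.getSchemaFields(resultSet);
      Schema schema = Schema.recordOf("row", fields);
      System.out.println("Derived schema: " + schema);

      // Convert each column value into the representation implied by that schema.
      ResultSetMetaData metadata = resultSet.getMetaData();
      while (resultSet.next()) {
        for (int i = 1; i <= metadata.getColumnCount(); i++) {
          String name = metadata.getColumnName(i);
          Object value = DBUtils.transformValue(metadata.getColumnType(i), metadata.getPrecision(i),
                                                metadata.getScale(i), resultSet, name);
          System.out.println(name + " = " + value);
        }
      }
    }
    // The returned DriverCleanup holds the registered driver shim so the caller can
    // release it once the connection work is finished.
  }
}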